Redesign the kanban runner stages from inbox->doing->done/failed to inbox->building->review->testing->shipped (+ failed): - worker: agent rc=0 lands in review/, then runs the configurable verify command (frontmatter verify: / AGENT_QUEUE_VERIFY) in cwd; pass -> testing/ (QA), fail -> failed/, none -> parks in review/ - new commands: ship (testing->shipped, manual gate), promote (advance one stage), reject (review/testing->failed); requeue now also pulls from review/testing - status + dashboard.mjs render all six stages; RECENT panel labels shipped/testing/review/verify_failed/timeout/rejected - README: new lifecycle diagram, verify: frontmatter, result= glossary, command table + folder layout - selftest: assert no-verify->review, verify-pass->testing->ship->shipped, verify-fail->failed - rename queue/doing->building, queue/done->review; add testing/ shipped/
711 lines
28 KiB
Bash
Executable File
711 lines
28 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
# agent-queue — a folder-based "kanban" runner for headless coding-agent CLIs.
#
# Drop a prompt .md file into queue/inbox/, and `agent-queue run` will:
#   1. pick the oldest file (respecting --max concurrency),
#   2. move it inbox/ -> building/,
#   3. launch the chosen agent CLI (devin | claude | codex) in --yolo mode,
#   4. on agent rc=0 move building/ -> review/, then run the auto-QA verify gate:
#        verify pass -> testing/   verify fail -> failed/   (no verify -> stays in review/)
#   5. on agent failure/timeout move building/ -> failed/,
#   6. you manually `ship` testing/ -> shipped/ (the human gate),
#   7. write a per-job log + live state so `status`/`watch` can show progress.
#
# Lifecycle: inbox -> building -> review -> testing -> shipped (+ failed)
#
# Per-task config travels in YAML-ish frontmatter at the top of the .md:
#   ---
#   engine: devin          # devin | claude | codex (default: $DEFAULT_ENGINE)
#   cwd: /abs/path/repo    # where the agent runs (default: $PWD of the run loop;
#                          #   `add` with flags records $PWD at add time)
#   yolo: true             # auto-approve all tools (default: true)
#   ---
# Optional keys `lock:`, `timeout:` and `verify:` are described in `help`.
#
# Subcommands: init | add | run | status | watch | dash | stop | logs |
#              promote | ship | reject | requeue | clean | help
#
|
|
# Shell options: -u turns use of an unset variable into an error; pipefail makes
# a pipeline report the failure of any stage, not just the last one.
# NOTE(review): -e is not enabled — presumably because helpers in this file use
# non-zero returns as ordinary results; confirm before adding it.
set -uo pipefail
|
|
|
|
# ── Resolve paths ───────────────────────────────────────────────────
# SCRIPT_DIR is the directory holding this script. The queue lives in
# "$SCRIPT_DIR/queue" unless AGENT_QUEUE_ROOT points somewhere else.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
QUEUE_ROOT="${AGENT_QUEUE_ROOT:-$SCRIPT_DIR/queue}"

# Kanban stage folders (one .md file per job).
INBOX="$QUEUE_ROOT/inbox"
BUILDING="$QUEUE_ROOT/building"
REVIEW="$QUEUE_ROOT/review"
TESTING="$QUEUE_ROOT/testing"
SHIPPED="$QUEUE_ROOT/shipped"
FAILED="$QUEUE_ROOT/failed"

# Bookkeeping folders: per-job logs, per-job .meta state, and lock files.
LOGS="$QUEUE_ROOT/logs"
STATE="$QUEUE_ROOT/.state"
LOCKS="$QUEUE_ROOT/locks"
|
|
|
|
# ── Config (env-overridable) ────────────────────────────────────────
MAX_CONCURRENCY="${AGENT_QUEUE_MAX:-2}"
DEFAULT_ENGINE="${AGENT_QUEUE_ENGINE:-devin}"
POLL_SECONDS="${AGENT_QUEUE_POLL:-3}"
# A running worker is flagged "stalled" if its log has not changed in this many
# minutes (no new agent output) — surfaced in status + dash.
STALL_MIN="${AGENT_QUEUE_STALL_MIN:-10}"
# Auto-QA verify command. After an agent exits 0 the job lands in review/; if a
# verify command is set (frontmatter `verify:` overrides this default) it runs in
# the job's cwd: pass -> testing/ (QA), fail -> failed/. Empty default = jobs park
# in review/ for manual `promote`. Shipping (testing -> shipped) is always manual.
DEFAULT_VERIFY="${AGENT_QUEUE_VERIFY:-}"

# flock is used for cross-process lock hardening when available (Linux). macOS
# has no flock; mutual exclusion there relies on the single run-loop (see cmd_run).
FLOCK_BIN="${FLOCK_BIN:-$(command -v flock || true)}"
# timeout/gtimeout give hard process-tree kills for per-job timeouts; if absent
# (stock macOS) a pure-bash watchdog is used as a best-effort fallback.
TIMEOUT_BIN="${TIMEOUT_BIN:-$(command -v timeout || command -v gtimeout || true)}"

# Agent CLI binaries: an explicit *_BIN wins, then whatever is on PATH, then a
# fallback name/path that is only resolved when the job is launched.
DEVIN_BIN="${DEVIN_BIN:-$(command -v devin || echo "$HOME/.local/bin/devin")}"
CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude || echo claude)}"
CODEX_BIN="${CODEX_BIN:-$(command -v codex || echo codex)}"
|
|
|
|
# ── Colors ──────────────────────────────────────────────────────────
# ANSI escapes are only defined when stdout is a terminal; otherwise every
# C_* variable is empty so piped/redirected output stays plain text.
if [[ -t 1 ]]; then
  C_RESET=$'\033[0m'; C_DIM=$'\033[2m'; C_BOLD=$'\033[1m'
  C_BLUE=$'\033[34m'; C_GREEN=$'\033[32m'; C_RED=$'\033[31m'; C_YEL=$'\033[33m'; C_CYAN=$'\033[36m'
else
  C_RESET=""; C_DIM=""; C_BOLD=""; C_BLUE=""; C_GREEN=""; C_RED=""; C_YEL=""; C_CYAN=""
fi
|
|
|
|
log() {
  # Informational line on stdout, tagged with a cyan [agent-queue] prefix.
  local msg="$*"
  printf '%s[agent-queue]%s %s\n' "$C_CYAN" "$C_RESET" "$msg"
}
|
|
err() {
  # Error line on stderr, tagged with a red [agent-queue] prefix.
  local msg="$*"
  printf '%s[agent-queue]%s %s\n' "$C_RED" "$C_RESET" "$msg" 1>&2
}
|
|
die() {
  # Report the message through err, then terminate the current shell with status 1.
  err "$*"
  exit 1
}
|
|
|
|
# ── Init ────────────────────────────────────────────────────────────
|
|
ensure_dirs() {
  # Create every stage and bookkeeping folder in one mkdir -p call (idempotent).
  local -a queue_dirs=(
    "$INBOX" "$BUILDING" "$REVIEW" "$TESTING" "$SHIPPED" "$FAILED"
    "$LOGS" "$STATE" "$LOCKS"
  )
  mkdir -p "${queue_dirs[@]}"
}
|
|
|
|
# ── Frontmatter parsing ─────────────────────────────────────────────
|
|
# fm_get <file> <key> <default>
#   Print the value of <key> from the leading ---...--- frontmatter block of
#   <file>. Prints <default> when the file has no leading frontmatter, the key
#   is absent, or its value is empty. Nothing outside the leading block is read.
#   The key is matched literally. Surrounding quotes/whitespace are trimmed, and
#   an unquoted trailing "# comment" is dropped so that annotated lines such as
#   `timeout: 45m # optional` yield just `45m`. A trailing CR on any line is
#   ignored so CRLF-saved files parse the same way.
fm_get() {
  local file=$1 key=$2 def=${3:-}
  local val
  # only scan a leading --- ... --- block
  val=$(awk -v k="$key" -v sq="'" '
    # Cut a trailing comment: a # at the start of the value or preceded by
    # whitespace, and not inside a quoted span. An unterminated quote leaves
    # the value untouched.
    function strip_comment(s,    i, c, q) {
      q = ""
      for (i = 1; i <= length(s); i++) {
        c = substr(s, i, 1)
        if (q != "") {
          if (q == "\"" && c == "\\") i++
          else if (c == q) q = ""
        } else if (c == "\"" || c == sq) {
          q = c
        } else if (c == "#" && (i == 1 || substr(s, i - 1, 1) ~ /[ \t]/)) {
          return substr(s, 1, i - 1)
        }
      }
      return s
    }
    { sub(/\r$/, "") }
    NR == 1 && $0 != "---" { exit }
    NR == 1 { infm = 1; next }
    infm && $0 == "---" { exit }
    infm {
      line = $0
      sub(/^[ \t]*/, "", line)
      if (substr(line, 1, length(k)) != k) next
      rest = substr(line, length(k) + 1)
      if (rest !~ /^[ \t]*:/) next
      sub(/^[ \t]*:[ \t]*/, "", rest)
      rest = strip_comment(rest)
      gsub(/^["'\''[:space:]]+|["'\''[:space:]]+$/, "", rest)
      print rest; exit
    }' "$file" 2>/dev/null)
  if [[ -n "$val" ]]; then
    printf '%s' "$val"
  else
    printf '%s' "$def"
  fi
}
|
|
|
|
# strip_frontmatter <file> -> prints the body (everything after a leading ---..--- block)
#   A file that does not start with a `---` line is printed unchanged. Only the
#   first block is removed; later `---` lines in the body are kept. Delimiter
#   lines may carry a trailing CR so CRLF-saved prompts are recognised too; body
#   lines are printed exactly as they appear in the file.
strip_frontmatter() {
  awk '
    function is_delim(s) { sub(/\r$/, "", s); return s == "---" }
    NR == 1 && is_delim($0) { infm = 1; next }
    infm && is_delim($0)    { infm = 0; next }
    !infm                   { print }
  ' "$1"
}
|
|
|
|
# lock_key_for <file> -> the mutual-exclusion key for a job: frontmatter `lock:`
# if set, otherwise the cwd. Jobs sharing a key never run concurrently.
# When the file has neither `lock:` nor `cwd:`, the caller's $PWD is the key.
lock_key_for() {
  local f=$1 k
  k=$(fm_get "$f" lock "")
  if [[ -n "$k" ]]; then
    printf '%s' "$k"
    return
  fi
  fm_get "$f" cwd "$PWD"
}
|
|
|
|
# _keyhash <key> -> stable filename-safe token for a lock key
|
|
_keyhash() {
  # cksum prints "<crc> <byte-count>"; only the CRC is used as the token.
  local crc _bytes
  read -r crc _bytes < <(printf '%s' "$1" | cksum)
  printf '%s\n' "$crc"
}
|
|
|
|
# _mtime <file> -> file modification time in epoch seconds (GNU or BSD stat); empty if missing
#   The GNU form (`stat -c %Y`) is tried first. The BSD form (`stat -f %m`) must
#   not go first: GNU stat reads -f as --file-system, prints a filesystem report
#   for <file> on stdout and only then fails, which would pollute the result.
#   Anything that is not a plain integer is discarded, so callers always get
#   either digits or an empty line.
_mtime() {
  local path=$1 ts
  [[ -e "$path" ]] || { echo ""; return; }
  ts=$(stat -c %Y "$path" 2>/dev/null) \
    || ts=$(stat -f %m "$path" 2>/dev/null) \
    || ts=""
  [[ "$ts" =~ ^[0-9]+$ ]] || ts=""
  echo "$ts"
}
|
|
|
|
# _pidstart <pid> -> the process start time as reported by ps (whitespace-normalized).
|
|
# Used as an identity token so a recycled pid is never mistaken for our worker.
|
|
_pidstart() {
  local target=$1
  # Assigning $1 to itself makes awk rebuild the record with single spaces,
  # which also trims leading/trailing blanks; the bare 1 prints the result.
  ps -o lstart= -p "$target" 2>/dev/null | awk '{ $1 = $1 } 1'
}
|
|
|
|
# _pid_alive <pid> <pidstart> -> 0 if the pid is live AND (when a start time was
# recorded) its current start time still matches — defeating pid reuse.
#   An empty pid is never alive. An empty/omitted <pidstart> skips the identity
#   check, so liveness then rests on `kill -0` alone.
_pid_alive() {
  local pid=${1:-} want=${2:-} cur
  [[ -n "$pid" ]] || return 1
  kill -0 "$pid" 2>/dev/null || return 1
  [[ -z "$want" ]] && return 0
  cur=$(_pidstart "$pid")
  [[ "$cur" == "$want" ]]
}
|
|
|
|
# _dur_to_secs <dur> -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0.
#   The number is always read as base 10, so a zero-padded value such as 08m is
#   480 seconds rather than an invalid-octal arithmetic error.
_dur_to_secs() {
  local d=${1:-}
  if [[ "$d" =~ ^([0-9]+)([smhd]?)$ ]]; then
    local n=$(( 10#${BASH_REMATCH[1]} )) mult=1
    case "${BASH_REMATCH[2]}" in
      m) mult=60 ;;
      h) mult=3600 ;;
      d) mult=86400 ;;
    esac
    echo $(( n * mult ))
  else
    echo 0
  fi
}
|
|
|
|
# _meta_active <metafile> -> 0 if the job is occupying a concurrency slot.
# Active = no `ended=` AND (pid is live, OR pid not yet written but the meta was
# created moments ago — the reserved-slot window between meta-write and launch).
# The <30s guard prevents a meta orphaned mid-launch (daemon killed in the gap)
# from pinning a slot forever.
# A meta file that no longer exists (e.g. removed by requeue between the glob
# and this call) is simply not active.
_meta_active() {
  local f=$1 pid pidstart mt age
  [[ -f "$f" ]] || return 1
  grep -q '^ended=' "$f" && return 1
  pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
  if [[ -n "$pid" ]]; then
    pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
    _pid_alive "$pid" "$pidstart"
    return
  fi
  # No pid yet: treat as a reservation only while the file is fresh. A missing
  # mtime counts as epoch 0, i.e. stale.
  mt=$(_mtime "$f")
  age=$(( $(date +%s) - ${mt:-0} ))
  [[ "$age" -lt 30 ]]
}
|
|
|
|
# active_workers -> count of jobs occupying a concurrency slot (reservation-aware).
#   Prints how many $STATE/*.meta files _meta_active accepts; prints 0 when
#   there are none.
active_workers() {
  local n=0 f
  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    if _meta_active "$f"; then
      n=$((n + 1))
    fi
  done
  echo "$n"
}
|
|
|
|
# busy_keys -> newline list of lock keys currently held by active workers.
#   One line per active meta: the text after the first `lock=` in that file.
busy_keys() {
  local f
  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    _meta_active "$f" || continue
    grep '^lock=' "$f" | head -1 | cut -d= -f2-
  done
}
|
|
|
|
# ── Engine driver: builds argv into AGENT_CMD[]; sets AGENT_STDIN if the ──
# prompt should be fed on stdin (claude/codex) rather than a flag. $pf is the
# frontmatter-STRIPPED body file, so a body starting with '--' is never
# misparsed as a CLI option.
#
# build_agent_cmd <engine> <prompt-file> <yolo>
#   engine  devin | claude | codex; anything else calls die.
#   yolo    the literal string "true" adds the engine's auto-approve flag.
#   Sets the globals AGENT_CMD (array) and AGENT_STDIN (path or empty) and
#   returns 0 for every known engine.
build_agent_cmd() {
  local engine=$1 pf=$2 yolo=$3
  AGENT_CMD=(); AGENT_STDIN=""
  case "$engine" in
    devin)
      AGENT_CMD=( "$DEVIN_BIN" -p --prompt-file "$pf" )
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=( --permission-mode dangerous )
      fi
      ;;
    claude)
      AGENT_CMD=( "$CLAUDE_BIN" -p )
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=( --dangerously-skip-permissions )
      fi
      AGENT_STDIN="$pf"
      ;;
    codex)
      AGENT_CMD=( "$CODEX_BIN" exec )
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=( --dangerously-bypass-approvals-and-sandbox )
      fi
      AGENT_STDIN="$pf"
      ;;
    *) die "unknown engine '$engine' (use: devin | claude | codex)";;
  esac
  return 0
}
|
|
|
|
# ── Worker: runs one job to completion (invoked in background) ───────
# run_worker <building-file>
#   Reads engine/cwd/yolo/timeout/verify from the job's frontmatter, runs the
#   agent with its output appended to $LOGS/<job>.log, moves the .md to its
#   resting stage and appends exit/result/ended lines to $STATE/<job>.meta:
#     cwd missing / unknown engine   -> failed/   result=failed
#     timeout expired                -> failed/   result=timeout
#     agent rc != 0                  -> failed/   result=failed
#     agent rc == 0, no verify       -> review/   result=review
#     agent rc == 0, verify passes   -> testing/  result=testing
#     agent rc == 0, verify fails    -> failed/   result=verify_failed
#     flock busy (rc 75)             -> inbox/    result=requeued
#   Returns 1 for the two pre-flight failures, otherwise the status of the
#   final bookkeeping write.
run_worker() {
  local doing_file=$1
  local job; job=$(basename "$doing_file")
  job=${job%.md}
  local engine cwd yolo logf metaf
  engine=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
  cwd=$(fm_get "$doing_file" cwd "$PWD")
  yolo=$(fm_get "$doing_file" yolo "true")
  logf="$LOGS/$job.log"
  metaf="$STATE/$job.meta"
  # NOTE: the parent (cmd_run) creates $metaf with job/engine/cwd/started/pid.
  # The worker only ever APPENDS (ended/exit/result) to avoid a truncation race.

  {
    echo "===== agent-queue job: $job ====="
    echo "engine=$engine cwd=$cwd yolo=$yolo"
    echo "started: $(date)"
    echo "================================="
  } >> "$logf"

  if [[ ! -d "$cwd" ]]; then
    echo "FATAL: cwd does not exist: $cwd" >> "$logf"
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"; echo "ended=$(date +%s)" >> "$metaf"
    return 1
  fi

  # Strip our frontmatter so the agent only sees the task body.
  local bodyf="$STATE/$job.body.md"
  strip_frontmatter "$doing_file" > "$bodyf"

  # build_agent_cmd calls die (exit) on an unknown engine. Probe it in a
  # subshell first so that exit cannot kill this worker before the job has been
  # moved out of building/ and its meta closed with ended=. The trailing
  # `exit 0` makes the probe depend only on whether die fired.
  if ! ( build_agent_cmd "$engine" "$bodyf" "$yolo"; exit 0 ) >> "$logf" 2>&1; then
    echo "FATAL: cannot build agent command for engine '$engine'" >> "$logf"
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"; echo "ended=$(date +%s)" >> "$metaf"
    return 1
  fi
  build_agent_cmd "$engine" "$bodyf" "$yolo"

  # Runs the agent inside $cwd, feeding the prompt on stdin when the engine
  # wants it there.
  _run_agent() {
    if [[ -n "$AGENT_STDIN" ]]; then
      ( cd "$cwd" && "${AGENT_CMD[@]}" < "$AGENT_STDIN" )
    else
      ( cd "$cwd" && "${AGENT_CMD[@]}" )
    fi
  }

  local rc=0 lockkey tmo timed_out=false
  lockkey=$(lock_key_for "$doing_file")
  tmo=$(_dur_to_secs "$(fm_get "$doing_file" timeout "0")")
  local tmo_flag="$STATE/$job.timedout"; rm -f "$tmo_flag"
  local lf="$LOCKS/$(_keyhash "$lockkey").lock"

  if [[ "$tmo" -gt 0 && -n "$TIMEOUT_BIN" ]]; then
    # Hard timeout via timeout/gtimeout (kills the whole process tree).
    local t0=$SECONDS
    AQ_STDIN="$AGENT_STDIN" "$TIMEOUT_BIN" -k 5 "${tmo}s" bash -c '
      cd "$1" || exit 97; shift
      if [ -n "${AQ_STDIN:-}" ]; then exec "$@" < "$AQ_STDIN"; else exec "$@"; fi
    ' _ "$cwd" "${AGENT_CMD[@]}" >> "$logf" 2>&1
    rc=$?
    # 124 = timed out and stopped on TERM. When the -k escalation has to send
    # KILL the status is 137 instead; count that as a timeout only if the
    # deadline had really passed, since 137 can also be an unrelated SIGKILL.
    if [[ $rc -eq 124 ]]; then
      timed_out=true
    elif [[ $rc -eq 137 ]] && (( SECONDS - t0 >= tmo )); then
      timed_out=true
    fi
  elif [[ "$tmo" -gt 0 ]]; then
    # Portable watchdog fallback (no timeout binary). Flags the timeout and
    # signals the worker; install coreutils (gtimeout) for hard tree kills.
    _run_agent >> "$logf" 2>&1 &
    local apid=$!
    ( sleep "$tmo"; : > "$tmo_flag"
      pkill -TERM -P "$apid" 2>/dev/null; kill -TERM "$apid" 2>/dev/null
      sleep 5; pkill -KILL -P "$apid" 2>/dev/null; kill -KILL "$apid" 2>/dev/null ) &
    local wpid=$!
    wait "$apid" 2>/dev/null; rc=$?
    kill "$wpid" 2>/dev/null; wait "$wpid" 2>/dev/null
    [[ -f "$tmo_flag" ]] && timed_out=true
  elif [[ -n "$FLOCK_BIN" ]]; then
    # Cross-process hardening where flock exists (Linux CI). The single run-loop
    # already serializes by lock key; this guards against a stray second launcher.
    ( "$FLOCK_BIN" -n 9 || exit 75; _run_agent ) 9>"$lf" >> "$logf" 2>&1
    rc=$?
    if [[ $rc -eq 75 ]]; then
      echo "lock busy (key=$lockkey) — requeued to inbox" >> "$logf"
      mv "$doing_file" "$INBOX/" 2>/dev/null
      { echo "ended=$(date +%s)"; echo "result=requeued"; } >> "$metaf"
      return 0
    fi
  else
    _run_agent >> "$logf" 2>&1
    rc=$?
  fi
  rm -f "$tmo_flag"

  echo "exit=$rc" >> "$metaf"
  if $timed_out; then
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=timeout" >> "$metaf"
    echo "ended=$(date +%s)" >> "$metaf"
    echo "TIMED OUT after ${tmo}s (rc=$rc): $(date)" >> "$logf"
  elif [[ $rc -eq 0 ]]; then
    # Agent succeeded: land in review/, then run the auto-QA verify gate. The
    # worker is still alive here so the concurrency slot stays held through
    # verification — `ended=` is written only once we reach a resting stage.
    mv "$doing_file" "$REVIEW/" 2>/dev/null
    local review_file="$REVIEW/$job.md"
    echo "completed OK (rc=0): landed in review — $(date)" >> "$logf"
    local verify; verify=$(fm_get "$review_file" verify "$DEFAULT_VERIFY")
    if [[ -z "$verify" ]]; then
      echo "result=review" >> "$metaf"
      echo "ended=$(date +%s)" >> "$metaf"
      echo "no verify command — parked in review for manual promote: $(date)" >> "$logf"
    else
      echo "----- verify: $verify -----" >> "$logf"
      local vrc=0
      ( cd "$cwd" && bash -c "$verify" ) >> "$logf" 2>&1 || vrc=$?
      echo "verify_exit=$vrc" >> "$metaf"
      if [[ $vrc -eq 0 ]]; then
        mv "$review_file" "$TESTING/" 2>/dev/null
        echo "result=testing" >> "$metaf"
        echo "ended=$(date +%s)" >> "$metaf"
        echo "VERIFY PASSED — promoted to testing (QA): $(date)" >> "$logf"
      else
        mv "$review_file" "$FAILED/" 2>/dev/null
        echo "result=verify_failed" >> "$metaf"
        echo "ended=$(date +%s)" >> "$metaf"
        echo "VERIFY FAILED (rc=$vrc): $(date)" >> "$logf"
      fi
    fi
  else
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"
    echo "ended=$(date +%s)" >> "$metaf"
    echo "FAILED (rc=$rc): $(date)" >> "$logf"
  fi
}
|
|
|
|
# ── Commands ────────────────────────────────────────────────────────
|
|
cmd_init() {
  # Create the queue folders, then report where the queue lives.
  ensure_dirs
  log "queue initialized at $C_BOLD$QUEUE_ROOT$C_RESET"
}
|
|
|
|
# add <file.md> [--engine E] [--cwd PATH] [--yolo|--no-yolo]
#   Copy a prompt file into inbox/ as "<YYYYmmdd-HHMMSS>__<basename>".
#   When any flag is given and the file does not start with `---`, a
#   frontmatter block is written ahead of the body (unset fields fall back to
#   $DEFAULT_ENGINE, $PWD and yolo=true). A file that already has frontmatter is
#   copied unchanged and the flags are reported as ignored.
#   An existing --cwd directory is stored as an absolute path, because the
#   worker later resolves it from the run loop's working directory, not from
#   the directory `add` was run in.
cmd_add() {
  ensure_dirs
  local file="" engine="" cwd="" yolo=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --engine)
        [[ $# -ge 2 ]] || die "add: --engine requires a value"
        engine=$2; shift 2;;
      --cwd)
        [[ $# -ge 2 ]] || die "add: --cwd requires a value"
        cwd=$2; shift 2;;
      --yolo) yolo=true; shift;;
      --no-yolo) yolo=false; shift;;
      *) file=$1; shift;;
    esac
  done
  [[ -n "$file" && -f "$file" ]] || die "usage: add <file.md> [--engine devin|claude|codex] [--cwd PATH] [--yolo|--no-yolo]"

  # A --cwd that is not (yet) a directory is kept verbatim; the worker reports
  # the missing directory when the job runs.
  if [[ -n "$cwd" && -d "$cwd" ]]; then
    cwd=$(cd -- "$cwd" >/dev/null 2>&1 && pwd) || die "add: cannot resolve --cwd"
  fi

  local base; base=$(basename "$file")
  local stamp; stamp=$(date +%Y%m%d-%H%M%S)
  local dest="$INBOX/${stamp}__${base}"

  # If user passed flags AND the file has no frontmatter, inject one.
  if [[ -n "$engine$cwd$yolo" ]] && [[ "$(head -1 "$file")" != "---" ]]; then
    {
      echo "---"
      echo "engine: ${engine:-$DEFAULT_ENGINE}"
      echo "cwd: ${cwd:-$PWD}"
      echo "yolo: ${yolo:-true}"
      echo "---"
      echo
      cat "$file"
    } > "$dest"
  else
    if [[ -n "$engine$cwd$yolo" ]]; then
      err "add: $base already has frontmatter — --engine/--cwd/--yolo flags ignored"
    fi
    cp "$file" "$dest"
  fi
  log "queued $C_BOLD$(basename "$dest")$C_RESET (engine=$(fm_get "$dest" engine "$DEFAULT_ENGINE"), cwd=$(fm_get "$dest" cwd "$PWD"))"
}
|
|
|
|
# run [--max N] [--engine E] [--once|--drain]
#   Foreground scheduler loop. While fewer than MAX_CONCURRENCY jobs are
#   active it takes the first inbox/*.md (glob order) whose lock key is not
#   busy, moves it to building/, writes $STATE/<job>.meta and launches
#   run_worker in the background. It then sleeps POLL_SECONDS and repeats.
#   With --once/--drain it exits 0 once inbox/ is empty and nothing is active.
#   Only one loop per queue is allowed: a live pid in $STATE/daemon.pid aborts.
cmd_run() {
  ensure_dirs
  local once=false
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --max)
        [[ $# -ge 2 ]] || die "run: --max requires a value"
        MAX_CONCURRENCY=$2; shift 2;;
      --engine)
        [[ $# -ge 2 ]] || die "run: --engine requires a value"
        DEFAULT_ENGINE=$2; shift 2;;
      --once|--drain) once=true; shift;;
      *) die "run: unknown arg '$1'";;
    esac
  done
  # The limit feeds an integer comparison on every pass; reject garbage up
  # front and normalise zero-padded input so it is not read as octal.
  [[ "$MAX_CONCURRENCY" =~ ^[0-9]+$ ]] \
    || die "run: max concurrency must be a non-negative integer (got '$MAX_CONCURRENCY')"
  MAX_CONCURRENCY=$(( 10#$MAX_CONCURRENCY ))

  # Refuse to start a second run loop against the same queue — two daemons would
  # break the single-launcher invariant that per-cwd locking relies on.
  local dpid=""
  [[ -f "$STATE/daemon.pid" ]] && dpid=$(cat "$STATE/daemon.pid" 2>/dev/null)
  if [[ -n "$dpid" ]] && kill -0 "$dpid" 2>/dev/null; then
    die "a run loop is already active (pid $dpid). Use 'stop' first, or a different AGENT_QUEUE_ROOT."
  fi
  [[ -n "$dpid" ]] && log "clearing stale daemon.pid ($dpid)"
  echo "$$" > "$STATE/daemon.pid"
  trap 'rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
  log "run loop started (max=$MAX_CONCURRENCY, default engine=$DEFAULT_ENGINE). Ctrl-C to stop."

  local running busy next cand cand_key job doing_file
  local w_eng w_cwd w_yolo w_key pending p
  while true; do
    running=$(active_workers)
    # launch jobs while we have capacity and an eligible inbox file
    while [[ "$running" -lt "$MAX_CONCURRENCY" ]]; do
      # pick the first inbox file whose lock key is not currently busy, so two
      # jobs sharing a cwd (or `lock:` key) never run at once, regardless of --max.
      busy=$(busy_keys)
      next=""
      for cand in "$INBOX"/*.md; do
        [[ -e "$cand" ]] || continue
        cand_key=$(lock_key_for "$cand")
        if printf '%s\n' "$busy" | grep -qxF -- "$cand_key"; then continue; fi
        next="$cand"; break
      done
      [[ -z "$next" ]] && break

      job=$(basename "$next"); job=${job%.md}
      doing_file="$BUILDING/$(basename "$next")"
      # Never launch a worker for a file that did not actually move; leave it
      # for the next poll instead.
      if ! mv "$next" "$doing_file"; then
        err "could not move $(basename "$next") to building/ — will retry"
        break
      fi
      w_eng=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
      w_cwd=$(fm_get "$doing_file" cwd "$PWD")
      w_yolo=$(fm_get "$doing_file" yolo "true")
      w_key=$(lock_key_for "$doing_file")
      # write meta BEFORE launch (no pid yet), then append the worker pid from $!
      {
        echo "job=$job"
        echo "engine=$w_eng"
        echo "cwd=$w_cwd"
        echo "yolo=$w_yolo"
        echo "lock=$w_key"
        echo "started=$(date +%s)"
      } > "$STATE/$job.meta"
      run_worker "$doing_file" &
      { echo "pid=$!"; echo "pidstart=$(_pidstart "$!")"; } >> "$STATE/$job.meta"
      log "▶ launching $C_BOLD$job$C_RESET (engine=$w_eng, lock=$w_key)"
      sleep 1
      running=$(active_workers)
    done

    if $once; then
      pending=false
      for p in "$INBOX"/*.md; do
        if [[ -e "$p" ]]; then pending=true; break; fi
      done
      if [[ "$(active_workers)" -eq 0 ]] && ! $pending; then
        log "drain complete — inbox empty, no workers running"
        rm -f "$STATE/daemon.pid"
        exit 0
      fi
    fi
    sleep "$POLL_SECONDS"
  done
}
|
|
|
|
_count() {
  # Print how many *.md entries exist directly inside directory $1 (0 when the
  # directory is empty or missing). Counting glob matches avoids parsing `ls`,
  # which miscounts names containing newlines and lists the contents of any
  # directory whose name ends in .md.
  local n=0 f
  for f in "$1"/*.md; do
    if [[ -e "$f" || -L "$f" ]]; then
      n=$((n + 1))
    fi
  done
  echo "$n"
}
|
|
|
|
# status — print the per-stage job counts, then one row per running worker.
#   A worker row needs a meta without `ended=` whose pid passes _pid_alive. It
#   shows job, engine, elapsed time since `started=`, pid and the last log line
#   (first 60 characters). A row is marked "stalled" when the job log has not
#   been modified for more than STALL_MIN minutes.
cmd_status() {
  ensure_dirs
  local ib bd rv ts sh fl
  ib=$(_count "$INBOX"); bd=$(_count "$BUILDING"); rv=$(_count "$REVIEW")
  ts=$(_count "$TESTING"); sh=$(_count "$SHIPPED"); fl=$(_count "$FAILED")
  local running; running=$(active_workers)
  echo
  printf '%s AGENT QUEUE %s %s\n' "$C_BOLD" "$C_DIM$QUEUE_ROOT$C_RESET" ""
  printf ' %sinbox%s %-3s %sbuilding%s %-3s %sreview%s %-3s %stesting%s %-3s %sshipped%s %-3s %sfailed%s %-3s %srunning%s %s/%s\n\n' \
    "$C_BLUE" "$C_RESET" "$ib" "$C_YEL" "$C_RESET" "$bd" \
    "$C_CYAN" "$C_RESET" "$rv" "$C_CYAN" "$C_RESET" "$ts" \
    "$C_GREEN" "$C_RESET" "$sh" "$C_RED" "$C_RESET" "$fl" \
    "$C_BOLD" "$C_RESET" "$running" "$MAX_CONCURRENCY"

  # running table
  local f printed=false
  local pid pidstart job eng start now el last lmt age stall
  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    grep -q '^ended=' "$f" && continue
    # First match only, the same way _meta_active reads these fields.
    pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
    pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
    _pid_alive "$pid" "$pidstart" || continue
    if ! $printed; then printf ' %sRUNNING%s\n' "$C_BOLD" "$C_RESET"; printed=true; fi
    stall=""
    job=$(grep '^job=' "$f" | head -1 | cut -d= -f2)
    eng=$(grep '^engine=' "$f" | head -1 | cut -d= -f2)
    start=$(grep '^started=' "$f" | head -1 | cut -d= -f2)
    now=$(date +%s); el=$(( now - ${start:-$now} ))
    last=$(tail -n 1 "$LOGS/$job.log" 2>/dev/null | cut -c1-60)
    # A log with no readable mtime counts as just-modified (age 0).
    lmt=$(_mtime "$LOGS/$job.log"); age=$(( now - ${lmt:-$now} ))
    [[ "$age" -gt $(( STALL_MIN * 60 )) ]] && stall="${C_RED}⚠ stalled${C_RESET} "
    printf ' %s%-26s%s %-7s %3dm%02ds pid %-6s %s%s%s%s\n' \
      "$C_BOLD" "$job" "$C_RESET" "$eng" $((el/60)) $((el%60)) "$pid" "$stall" "$C_DIM" "$last" "$C_RESET"
  done
  $printed || printf ' %sno workers running%s\n' "$C_DIM" "$C_RESET"
  echo
}
|
|
|
|
# watch [interval] — clear the screen and redraw `status` every <interval>
# seconds (default 2). Loops until the process is interrupted.
cmd_watch() {
  local interval="${1:-2}"
  while true; do
    clear
    cmd_status
    sleep "$interval"
  done
}
|
|
|
|
# dash [args...] — replace this process with `node dashboard.mjs <args>`,
# passing the queue location in AGENT_QUEUE_ROOT. dashboard.mjs is looked up
# next to this script. Dies with a pointer to `watch` when node or the
# dashboard file is missing, instead of surfacing a raw node module error.
cmd_dash() {
  command -v node >/dev/null 2>&1 || die "node not found — use 'watch' for the bash status view"
  local dash_js="$SCRIPT_DIR/dashboard.mjs"
  [[ -f "$dash_js" ]] || die "dashboard.mjs not found at $dash_js — use 'watch' for the bash status view"
  AGENT_QUEUE_ROOT="$QUEUE_ROOT" exec node "$dash_js" "$@"
}
|
|
|
|
# _descendant_pids <pid> — print every descendant pid of <pid>, one per line
# (children, then their children, ...). Prints nothing when pgrep is missing.
_descendant_pids() {
  local child
  command -v pgrep >/dev/null 2>&1 || return 0
  # pids are plain integers, so word-splitting the pgrep output is safe here.
  for child in $(pgrep -P "$1" 2>/dev/null); do
    printf '%s\n' "$child"
    _descendant_pids "$child"
  done
}

# stop — terminate every live worker (metas without `ended=` whose pid passes
# _pid_alive) and the run loop recorded in $STATE/daemon.pid, then remove
# daemon.pid.
#   A worker is only the bash subshell; the agent CLI is its descendant and
#   would keep running if just the worker were signalled. The descendants are
#   therefore listed first, then the worker is killed, then each descendant.
#   Job files and meta files are not touched: a stopped job stays in building/.
cmd_stop() {
  ensure_dirs
  local killed=0 f pid pidstart kids k
  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    grep -q '^ended=' "$f" && continue
    pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
    pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
    _pid_alive "$pid" "$pidstart" || continue
    # Snapshot before killing: once the worker dies its children are
    # re-parented and can no longer be found through it.
    kids=$(_descendant_pids "$pid")
    if kill "$pid" 2>/dev/null; then
      killed=$((killed + 1))
    fi
    for k in $kids; do
      kill "$k" 2>/dev/null
    done
  done
  [[ -f "$STATE/daemon.pid" ]] && kill "$(cat "$STATE/daemon.pid")" 2>/dev/null
  rm -f "$STATE/daemon.pid"
  log "stopped $killed running worker(s) + run loop"
}
|
|
|
|
# logs <job> [-f]   (also accepted: logs -f <job>)
#   Print, or with -f follow, a job's log. An exact "$LOGS/<job>.log" wins;
#   otherwise the most recently modified log whose name contains <job> is used.
#   Dies when no job is given or no log matches.
cmd_logs() {
  local job="${1:-}" follow=""
  [[ "${2:-}" == "-f" || "$job" == "-f" ]] && follow="-f"
  [[ "$job" == "-f" ]] && job="${2:-}"
  [[ -n "$job" ]] || die "usage: logs <job> [-f]"
  local lf="$LOGS/$job.log"
  [[ -f "$lf" ]] || lf=$(ls -1t "$LOGS"/*"$job"*.log 2>/dev/null | head -1)
  [[ -f "$lf" ]] || die "no log found for '$job'"
  if [[ -n "$follow" ]]; then
    tail -f "$lf"
  else
    cat "$lf"
  fi
}
|
|
|
|
# _find_job <job> <dir...> — echo the first matching .md across the given dirs
# (exact "<job>.md" preferred, else newest fuzzy match). Empty if none found.
#   Every directory is checked for an exact name before any fuzzy match is
#   tried; fuzzy matching takes the most recently modified *<job>*.md in the
#   first directory that has one. Returns 1 when nothing matched.
_find_job() {
  local job=$1; shift
  local d f
  for d in "$@"; do
    if [[ -f "$d/$job.md" ]]; then
      printf '%s' "$d/$job.md"
      return
    fi
  done
  for d in "$@"; do
    f=$(ls -1t "$d"/*"$job"*.md 2>/dev/null | head -1)
    if [[ -f "$f" ]]; then
      printf '%s' "$f"
      return
    fi
  done
  return 1
}
|
|
|
|
# requeue <job> — move a job back to inbox/ for a fresh run (from failed/review/testing).
#   The job's meta, stripped body and timeout flag are deleted so the next run
#   starts clean; the log file is kept. State is only dropped after the move
#   has succeeded — a failed move dies and leaves everything as it was.
cmd_requeue() {
  ensure_dirs
  local job="${1:-}"
  [[ -n "$job" ]] || die "usage: requeue <job>"
  local f; f=$(_find_job "$job" "$FAILED" "$REVIEW" "$TESTING")
  [[ -n "$f" ]] || die "no failed/review/testing job matching '$job'"
  local base name from
  base=$(basename "$f"); name=${base%.md}; from=$(basename "$(dirname "$f")")
  mv "$f" "$INBOX/$base" || die "requeue: could not move $base to inbox/"
  # drop stale state so it re-runs cleanly
  rm -f "$STATE/$name.meta" "$STATE/$name.body.md" "$STATE/$name.timedout"
  log "requeued $C_BOLD$base$C_RESET ($from → inbox)"
}
|
|
|
|
# ship <job> — manual promotion testing/ (QA) → shipped/. The human gate.
#   Only looks in testing/. Appends result=shipped to the job's meta when one
#   exists. A failed move dies before anything is recorded.
cmd_ship() {
  ensure_dirs
  local job="${1:-}"
  [[ -n "$job" ]] || die "usage: ship <job>"
  local f; f=$(_find_job "$job" "$TESTING")
  [[ -n "$f" ]] || die "no job in testing/ matching '$job' (only QA-passed jobs can ship)"
  local base name
  base=$(basename "$f"); name=${base%.md}
  mv "$f" "$SHIPPED/$base" || die "ship: could not move $base to shipped/"
  if [[ -f "$STATE/$name.meta" ]]; then
    echo "result=shipped" >> "$STATE/$name.meta"
  fi
  log "shipped $C_BOLD$base$C_RESET (testing → shipped)"
}
|
|
|
|
# promote <job> — advance one stage forward: review → testing → shipped.
#   The job is looked up in review/ first, then testing/. The new stage name is
#   appended to the job's meta as result=<stage> when a meta exists. A failed
#   move dies before anything is recorded.
cmd_promote() {
  ensure_dirs
  local job="${1:-}"
  [[ -n "$job" ]] || die "usage: promote <job>"
  local f; f=$(_find_job "$job" "$REVIEW" "$TESTING")
  [[ -n "$f" ]] || die "no job in review/ or testing/ matching '$job'"
  local base name from dest result
  base=$(basename "$f"); name=${base%.md}
  from=$(basename "$(dirname "$f")")
  case "$from" in
    review)  dest="$TESTING"; result="testing";;
    testing) dest="$SHIPPED"; result="shipped";;
    *) die "promote: '$base' is in '$from' — nothing to promote";;
  esac
  mv "$f" "$dest/$base" || die "promote: could not move $base to $result/"
  if [[ -f "$STATE/$name.meta" ]]; then
    echo "result=$result" >> "$STATE/$name.meta"
  fi
  log "promoted $C_BOLD$base$C_RESET ($from → $result)"
}
|
|
|
|
# reject <job> — move a review/testing job to failed/ (manual gate rejection).
#   Appends result=rejected to the job's meta when one exists. A failed move
#   dies before anything is recorded.
cmd_reject() {
  ensure_dirs
  local job="${1:-}"
  [[ -n "$job" ]] || die "usage: reject <job>"
  local f; f=$(_find_job "$job" "$REVIEW" "$TESTING")
  [[ -n "$f" ]] || die "no job in review/ or testing/ matching '$job'"
  local base name from
  base=$(basename "$f"); name=${base%.md}; from=$(basename "$(dirname "$f")")
  mv "$f" "$FAILED/$base" || die "reject: could not move $base to failed/"
  if [[ -f "$STATE/$name.meta" ]]; then
    echo "result=rejected" >> "$STATE/$name.meta"
  fi
  log "rejected $C_BOLD$base$C_RESET ($from → failed)"
}
|
|
|
|
# clean [--keep N] — archive finished jobs' logs+meta beyond the newest N
# (default 50) into queue/.archive/<ts>/. Running jobs and the .md kanban
# records in the stage folders are left untouched.
#   "Finished" means the meta contains an `ended=` line; "newest" is by meta
#   mtime. For each archived job the meta, its log and its stripped body file
#   are moved together. Paths are handled whole, so a queue root containing
#   spaces works.
cmd_clean() {
  ensure_dirs
  local keep=50
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --keep)
        [[ $# -ge 2 ]] || die "clean: --keep requires a value"
        keep=$2; shift 2;;
      *) die "clean: unknown arg '$1'";;
    esac
  done
  [[ "$keep" =~ ^[0-9]+$ ]] || die "clean: --keep must be a number"
  local arch="$QUEUE_ROOT/.archive/$(date +%Y%m%d-%H%M%S)"
  # finished metas (have ended=), newest-first by mtime. Each line is
  # "<mtime> <path>"; everything after the first space is the path.
  local metas
  metas=$(grep -l '^ended=' "$STATE"/*.meta 2>/dev/null \
    | while IFS= read -r m; do printf '%s %s\n' "$(_mtime "$m")" "$m"; done \
    | sort -rn | cut -d' ' -f2-)
  local i=0 moved=0 m name
  while IFS= read -r m; do
    [[ -n "$m" ]] || continue
    i=$((i + 1))
    [[ "$i" -le "$keep" ]] && continue
    name=$(basename "$m"); name=${name%.meta}
    mkdir -p "$arch"
    mv "$m" "$arch/" 2>/dev/null
    [[ -f "$LOGS/$name.log" ]] && mv "$LOGS/$name.log" "$arch/" 2>/dev/null
    [[ -f "$STATE/$name.body.md" ]] && mv "$STATE/$name.body.md" "$arch/" 2>/dev/null
    moved=$((moved + 1))
  done <<< "$metas"
  if [[ "$moved" -gt 0 ]]; then
    log "archived $moved finished job(s) to $C_BOLD$arch$C_RESET (kept newest $keep)"
  else
    log "nothing to clean (≤$keep finished jobs)"
  fi
}
|
|
|
|
# usage — print the help text to stdout. The heredoc is unquoted on purpose so
# the colour variables and the current values of the queue settings expand.
usage() {
  cat <<EOF
${C_BOLD}agent-queue${C_RESET} — folder kanban runner for devin | claude | codex

${C_BOLD}USAGE${C_RESET}
  agent-queue.sh <command> [args]

${C_BOLD}COMMANDS${C_RESET}
  init                    create the queue/ folders
  add <file.md> [opts]    queue a prompt file into inbox/
                            --engine devin|claude|codex  --cwd PATH  --yolo | --no-yolo
  run [--max N] [--engine E] [--once]
                          process inbox/ (foreground loop; Ctrl-C to stop)
  status                  show kanban counts + running workers
  watch [interval]        live status (default 2s, bash)
  dash [--interval N]     richer live Node dashboard (recent shipped/failed too)
  stop                    kill running workers + the run loop
  logs <job> [-f]         print (or follow) a job's log
  promote <job>           advance one stage (review → testing → shipped)
  ship <job>              manual gate: testing (QA) → shipped
  reject <job>            send a review/testing job to failed/
  requeue <job>           move a failed/review/testing job back to inbox/
  clean [--keep N]        archive finished logs+meta beyond newest N (default 50)
  help                    this message

${C_BOLD}KANBAN${C_RESET}  inbox → building → review → testing → shipped (+ failed; logs/ + .state/ alongside)
  auto:   agent rc=0 → review; verify pass → testing; verify fail → failed
  manual: ship (testing → shipped)

${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md)
  ---
  engine: devin
  cwd: /Users/you/code/repo
  yolo: true
  lock: my-repo          # optional; defaults to cwd. Jobs sharing a key run serially
  timeout: 45m           # optional; 90s|45m|2h|1d. On expiry -> failed (result=timeout)
  verify: pnpm -s test   # optional; auto-QA gate. pass -> testing, fail -> failed
  ---

${C_BOLD}ENV${C_RESET}
  AGENT_QUEUE_ROOT (=$QUEUE_ROOT)   AGENT_QUEUE_MAX (=$MAX_CONCURRENCY)
  AGENT_QUEUE_ENGINE (=$DEFAULT_ENGINE)   AGENT_QUEUE_VERIFY (default verify cmd)
  AGENT_QUEUE_POLL (=$POLL_SECONDS)   AGENT_QUEUE_STALL_MIN (=$STALL_MIN)
  DEVIN_BIN / CLAUDE_BIN / CODEX_BIN
  FLOCK_BIN / TIMEOUT_BIN
EOF
}
|
|
|
|
# main [command] [args...] — dispatch to the matching cmd_* function.
#   No command, or help/-h/--help, prints usage. An unknown command prints an
#   error and the usage text, then exits 1.
main() {
  local cmd="${1:-help}"; shift || true
  case "$cmd" in
    init)           cmd_init "$@";;
    add)            cmd_add "$@";;
    run)            cmd_run "$@";;
    status)         cmd_status "$@";;
    watch)          cmd_watch "$@";;
    dash|dashboard) cmd_dash "$@";;
    stop)           cmd_stop "$@";;
    logs)           cmd_logs "$@";;
    promote)        cmd_promote "$@";;
    ship)           cmd_ship "$@";;
    reject)         cmd_reject "$@";;
    requeue)        cmd_requeue "$@";;
    clean)          cmd_clean "$@";;
    help|-h|--help) usage;;
    *) err "unknown command: $cmd"; echo; usage; exit 1;;
  esac
}
|
|
|
|
# Entry point: hand every command-line argument, unchanged, to main for dispatch.
main "$@"
|