bytelyst-devops-tools/agent-queue/agent-queue.sh
saravanakumardb1 a849a30e11 feat(agent-queue): refuse a second run when a daemon is already active
cmd_run now checks daemon.pid liveness up front: if a run loop is alive it exits
with an error (protecting the single-launcher invariant locking depends on); a
stale daemon.pid (dead pid) is cleared and the run proceeds.
2026-05-28 22:21:31 -07:00

530 lines
20 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# agent-queue — a folder-based "kanban" runner for headless coding-agent CLIs.
#
# Drop a prompt .md file into queue/inbox/, and `agent-queue run` will:
# 1. pick the oldest file (respecting --max concurrency),
# 2. move it inbox/ -> doing/,
# 3. launch the chosen agent CLI (devin | claude | codex) in --yolo mode,
# 4. on success move doing/ -> done/, on failure -> failed/,
# 5. write a per-job log + live state so `status`/`watch` can show progress.
#
# Per-task config travels in YAML-ish frontmatter at the top of the .md:
# ---
# engine: devin # devin | claude | codex (default: $DEFAULT_ENGINE)
# cwd: /abs/path/repo # where the agent runs (default: $PWD when added)
# yolo: true # auto-approve all tools (default: true)
# ---
#
# Subcommands: init | add | run | status | watch | dash | stop | logs | help
#
# Error on unset variables (-u) and make a pipeline fail if any stage fails
# (pipefail). errexit (-e) is NOT enabled, so failures must be checked explicitly.
set -uo pipefail
# ── Resolve paths ───────────────────────────────────────────────────
# Directory containing this script; the default queue root lives beside it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Everything below hangs off QUEUE_ROOT; override it with AGENT_QUEUE_ROOT.
QUEUE_ROOT="${AGENT_QUEUE_ROOT:-$SCRIPT_DIR/queue}"
# Kanban columns: a job file moves inbox -> doing -> done | failed.
INBOX="$QUEUE_ROOT/inbox"
DOING="$QUEUE_ROOT/doing"
DONE="$QUEUE_ROOT/done"
FAILED="$QUEUE_ROOT/failed"
# Per-job agent output (<job>.log).
LOGS="$QUEUE_ROOT/logs"
# Bookkeeping: <job>.meta, <job>.body.md, <job>.timedout and daemon.pid.
STATE="$QUEUE_ROOT/.state"
# flock lock files, one per hashed lock key.
LOCKS="$QUEUE_ROOT/locks"
# ── Config (env-overridable) ────────────────────────────────────────
# Maximum number of jobs occupying a slot at once (also settable via `run --max`).
MAX_CONCURRENCY="${AGENT_QUEUE_MAX:-2}"
# Engine used when a job's frontmatter has no `engine:` key.
DEFAULT_ENGINE="${AGENT_QUEUE_ENGINE:-devin}"
# Seconds the run loop sleeps between inbox scans.
POLL_SECONDS="${AGENT_QUEUE_POLL:-3}"
# A running worker is flagged "stalled" if its log has not changed in this many
# minutes (no new agent output) — surfaced in status + dash.
STALL_MIN="${AGENT_QUEUE_STALL_MIN:-10}"
# flock is used for cross-process lock hardening when available (Linux). macOS
# has no flock; mutual exclusion there relies on the single run-loop (see cmd_run).
# Empty when flock is not on PATH.
FLOCK_BIN="${FLOCK_BIN:-$(command -v flock || true)}"
# timeout/gtimeout give hard process-tree kills for per-job timeouts; if absent
# (stock macOS) a pure-bash watchdog is used as a best-effort fallback.
# Empty when neither binary is on PATH.
TIMEOUT_BIN="${TIMEOUT_BIN:-$(command -v timeout || command -v gtimeout || true)}"
# Agent CLI locations. devin falls back to ~/.local/bin/devin when not on PATH;
# claude and codex fall back to the bare command name.
DEVIN_BIN="${DEVIN_BIN:-$(command -v devin || echo "$HOME/.local/bin/devin")}"
CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude || echo claude)}"
CODEX_BIN="${CODEX_BIN:-$(command -v codex || echo codex)}"
# ── Colors ──────────────────────────────────────────────────────────
# Every C_* variable starts out empty; ANSI escapes are filled in only when
# stdout is a terminal, so piped or redirected output carries no control codes.
C_RESET="" C_DIM="" C_BOLD=""
C_BLUE="" C_GREEN="" C_RED="" C_YEL="" C_CYAN=""
if [[ -t 1 ]]; then
  C_RESET=$'\e[0m'
  C_DIM=$'\e[2m'
  C_BOLD=$'\e[1m'
  C_BLUE=$'\e[34m'
  C_GREEN=$'\e[32m'
  C_RED=$'\e[31m'
  C_YEL=$'\e[33m'
  C_CYAN=$'\e[36m'
fi
# _tagged <color> <msg...> -> one "[agent-queue] msg" line with the tag colored.
_tagged() {
  local color=$1
  shift
  printf '%s[agent-queue]%s %s\n' "$color" "$C_RESET" "$*"
}
# log <msg...> -> informational line on stdout (cyan tag).
log() {
  _tagged "$C_CYAN" "$@"
}
# err <msg...> -> error line on stderr (red tag).
err() {
  _tagged "$C_RED" "$@" >&2
}
# die <msg...> -> report the error on stderr, then exit the shell with status 1.
die() {
  err "$*"
  exit 1
}
# ── Init ────────────────────────────────────────────────────────────
# ensure_dirs -> create every queue directory (kanban columns, logs, state,
# locks) if it does not exist yet. Safe to call repeatedly.
ensure_dirs() {
  local -a queue_dirs=(
    "$INBOX" "$DOING" "$DONE" "$FAILED"
    "$LOGS" "$STATE" "$LOCKS"
  )
  mkdir -p "${queue_dirs[@]}"
}
# ── Frontmatter parsing ─────────────────────────────────────────────
# fm_get <file> <key> <default>
#   Prints (without a trailing newline) the value of <key> from the leading
#   ---...--- frontmatter block of <file>, or <default> when the file has no
#   frontmatter, cannot be read, lacks the key, or the value is empty.
#
#   Value handling:
#     * a trailing inline comment (whitespace followed by '#') is removed from
#       unquoted values, so `timeout: 45m   # optional` yields `45m`;
#     * a value that starts with ' or " is taken up to its closing quote, so a
#       '#' inside quotes is preserved — quote any value containing " #";
#     * surrounding quotes and whitespace are trimmed.
fm_get() {
  local file=$1 key=$2 def=${3:-}
  local val
  # only scan a leading --- ... --- block
  val=$(awk -v k="$key" '
    NR==1 && $0!="---" { exit }
    NR==1 { infm=1; next }
    infm && $0=="---" { exit }
    infm {
      line=$0
      sub(/^[ \t]*/,"",line)
      if (line ~ "^" k "[ \t]*:") {
        sub("^" k "[ \t]*:","",line)
        # Look at the first non-blank character to see whether the value is quoted.
        probe=line
        sub(/^[ \t]+/,"",probe)
        q=substr(probe,1,1)
        if (q=="\"" || q=="\047") {
          # Quoted: keep everything up to the closing quote, drop what follows.
          close_at=index(substr(probe,2),q)
          if (close_at>0) probe=substr(probe,1,close_at+1)
          line=probe
        } else {
          # Unquoted: whitespace followed by # starts a comment.
          sub(/[ \t]+#.*$/,"",line)
        }
        gsub(/^["'\''[:space:]]+|["'\''[:space:]]+$/,"",line)
        print line; exit
      }
    }' "$file" 2>/dev/null)
  if [[ -n "$val" ]]; then
    printf '%s' "$val"
  else
    printf '%s' "$def"
  fi
}
# strip_frontmatter <file> -> prints the task body: every line after a leading
# ---...--- block. A file that does not start with --- is printed unchanged;
# an unterminated block yields no output.
strip_frontmatter() {
  awk '
    NR == 1 && $0 == "---" { in_header = 1; next }
    in_header {
      if ($0 == "---") in_header = 0
      next
    }
    { print }
  ' "$1"
}
# lock_key_for <file> -> the job's mutual-exclusion key. An explicit frontmatter
# `lock:` wins; otherwise the job's cwd (defaulting to $PWD) is the key. Jobs
# that share a key are never run at the same time.
lock_key_for() {
  local task=$1
  local explicit
  explicit=$(fm_get "$task" lock "")
  if [[ -z "$explicit" ]]; then
    fm_get "$task" cwd "$PWD"
  else
    printf '%s' "$explicit"
  fi
}
# _keyhash <key> -> the cksum CRC of <key>, used as a stable, filename-safe
# token for a lock key.
_keyhash() {
  local crc _rest
  # cksum prints "<crc> <byte-count>"; only the first field is wanted.
  read -r crc _rest < <(printf '%s' "$1" | cksum)
  printf '%s\n' "$crc"
}
# _mtime <file> -> file modification time in epoch seconds; empty if the file
# is missing or no usable stat is available.
#
# GNU stat is probed first (`-c %Y`), then BSD/macOS stat (`-f %m`). The order
# matters: on GNU, `-f` means --file-system, so `stat -f %m FILE` prints a
# multi-line filesystem report for FILE and fails, which would pollute the
# result. BSD stat rejects `-c` without printing anything, so probing GNU first
# is safe on both. The output is only accepted when it is purely digits.
_mtime() {
  local path=$1 ts
  [[ -e "$path" ]] || { echo ""; return; }
  if ts=$(stat -c %Y "$path" 2>/dev/null) && [[ "$ts" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$ts"
  elif ts=$(stat -f %m "$path" 2>/dev/null) && [[ "$ts" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$ts"
  else
    echo ""
  fi
}
# _dur_to_secs <dur> -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0.
# The number is always read as base 10, so zero-padded input such as `08m` or
# `090` is handled instead of being rejected as invalid octal, and the result
# never carries leading zeros (callers compare it with `-gt`).
_dur_to_secs() {
  local d=${1:-}
  if [[ "$d" =~ ^([0-9]+)([smhd]?)$ ]]; then
    local n=$(( 10#${BASH_REMATCH[1]} )) u=${BASH_REMATCH[2]}
    case "$u" in
      ""|s) echo "$n" ;;
      m) echo $(( n * 60 )) ;;
      h) echo $(( n * 3600 )) ;;
      d) echo $(( n * 86400 )) ;;
    esac
  else
    echo 0
  fi
}
# _meta_active <metafile> -> status 0 when the job still occupies a concurrency
# slot, non-zero otherwise.
#   * a meta with an `ended=` line is never active;
#   * a meta with a pid is active exactly while that pid is alive;
#   * a meta with no pid yet is a reserved slot (written just before launch) and
#     counts as active only while the file is younger than 30 seconds, so a meta
#     orphaned mid-launch cannot pin a slot forever.
_meta_active() {
  local meta=$1 worker_pid created now
  if grep -q '^ended=' "$meta"; then
    return 1
  fi
  worker_pid=$(awk -F= '/^pid=/ { print $2; exit }' "$meta")
  if [[ -z "$worker_pid" ]]; then
    # Reservation window: judge by the age of the meta file itself.
    created=$(_mtime "$meta")
    now=$(date +%s)
    (( now - ${created:-0} < 30 ))
    return
  fi
  kill -0 "$worker_pid" 2>/dev/null
}
# active_workers -> prints how many jobs currently occupy a concurrency slot,
# including slots reserved for a worker that is about to launch.
active_workers() {
  local -i count=0
  local meta
  for meta in "$STATE"/*.meta; do
    # An unmatched glob leaves the literal pattern behind; -e filters it out.
    if [[ -e "$meta" ]] && _meta_active "$meta"; then
      count+=1
    fi
  done
  echo "$count"
}
# busy_keys -> prints, one per line, the lock key recorded in the meta file of
# every job that currently occupies a slot.
busy_keys() {
  local meta
  for meta in "$STATE"/*.meta; do
    [[ -e "$meta" ]] || continue
    # Only the first lock= line counts; the key is everything after "lock=".
    _meta_active "$meta" && awk '/^lock=/ { print substr($0, 6); exit }' "$meta"
  done
}
# ── Engine driver ───────────────────────────────────────────────────
# build_agent_cmd <engine> <prompt-file> <yolo>
#   Fills the global array AGENT_CMD with the argv for the chosen engine and
#   sets the global AGENT_STDIN to the prompt file when the engine reads its
#   prompt from stdin (claude, codex); devin receives it via --prompt-file and
#   AGENT_STDIN stays empty. When <yolo> is exactly "true" the engine's
#   auto-approve flag is appended. <prompt-file> is the frontmatter-STRIPPED
#   body, so a body starting with '--' is never misparsed as a CLI option.
#   An unknown engine is fatal (die).
build_agent_cmd() {
  local engine=$1 pf=$2 yolo=$3
  local -a approve_all=()
  AGENT_CMD=()
  AGENT_STDIN=""
  if [[ "$engine" == "devin" ]]; then
    AGENT_CMD=( "$DEVIN_BIN" -p --prompt-file "$pf" )
    approve_all=( --permission-mode dangerous )
  elif [[ "$engine" == "claude" ]]; then
    AGENT_CMD=( "$CLAUDE_BIN" -p )
    approve_all=( --dangerously-skip-permissions )
    AGENT_STDIN="$pf"
  elif [[ "$engine" == "codex" ]]; then
    AGENT_CMD=( "$CODEX_BIN" exec )
    approve_all=( --dangerously-bypass-approvals-and-sandbox )
    AGENT_STDIN="$pf"
  else
    die "unknown engine '$engine' (use: devin | claude | codex)"
  fi
  if [[ "$yolo" == "true" ]]; then
    AGENT_CMD+=( "${approve_all[@]}" )
  fi
}
# ── Worker: runs one job to completion (invoked in background) ───────
# run_worker <doing_file>
#   <doing_file> is the job's prompt file, already moved into doing/.
#   Reads engine / cwd / yolo / lock / timeout from its frontmatter, hands the
#   frontmatter-stripped body to the agent CLI inside <cwd>, and appends all
#   agent output to $LOGS/<job>.log.
#
#   On completion the file is moved to done/ (agent exit 0) or failed/, and
#   ended= / exit= / result= lines are appended to $STATE/<job>.meta, where
#   result is one of: done | failed | timeout | requeued.
#
#   Execution mode, first match wins:
#     1. timeout set and timeout/gtimeout available -> hard timeout
#     2. timeout set, no timeout binary             -> bash watchdog fallback
#     3. no timeout, flock available                -> run under a non-blocking
#        flock on the job's lock key; if the lock is busy the job is requeued
#     4. otherwise                                  -> run directly
#   Note that the flock hardening is only applied to jobs without a timeout.
#
#   Returns 1 when <cwd> does not exist and 0 when the job was requeued.
run_worker() {
  local doing_file=$1
  local job; job=$(basename "$doing_file")
  job=${job%.md}
  local engine cwd yolo logf metaf
  engine=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
  cwd=$(fm_get "$doing_file" cwd "$PWD")
  yolo=$(fm_get "$doing_file" yolo "true")
  logf="$LOGS/$job.log"
  metaf="$STATE/$job.meta"
  # NOTE: the parent (cmd_run) creates $metaf with job/engine/cwd/started/pid.
  # The worker only ever APPENDS (ended/exit/result) to avoid a truncation race.
  {
    echo "===== agent-queue job: $job ====="
    echo "engine=$engine cwd=$cwd yolo=$yolo"
    echo "started: $(date)"
    echo "================================="
  } >> "$logf"
  if [[ ! -d "$cwd" ]]; then
    echo "FATAL: cwd does not exist: $cwd" >> "$logf"
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"; echo "ended=$(date +%s)" >> "$metaf"
    return 1
  fi
  # Strip our frontmatter so the agent only sees the task body.
  local bodyf="$STATE/$job.body.md"
  strip_frontmatter "$doing_file" > "$bodyf"
  build_agent_cmd "$engine" "$bodyf" "$yolo"
  # Runs the agent in a subshell so the cd never leaks into the worker; the
  # prompt goes in on stdin only for engines that asked for it.
  _run_agent() {
    if [[ -n "$AGENT_STDIN" ]]; then
      ( cd "$cwd" && "${AGENT_CMD[@]}" < "$AGENT_STDIN" )
    else
      ( cd "$cwd" && "${AGENT_CMD[@]}" )
    fi
  }
  local rc=0 lockkey tmo timed_out=false
  lockkey=$(lock_key_for "$doing_file")
  tmo=$(_dur_to_secs "$(fm_get "$doing_file" timeout "0")")
  # Marker written by the watchdog fallback; clear any leftover from a prior run.
  local tmo_flag="$STATE/$job.timedout"; rm -f "$tmo_flag"
  local lf="$LOCKS/$(_keyhash "$lockkey").lock"
  if [[ "$tmo" -gt 0 && -n "$TIMEOUT_BIN" ]]; then
    # Hard timeout via timeout/gtimeout (kills the whole process tree).
    # The inner bash does the cd (exit 97 if it fails) and then exec's the
    # agent, so the timeout binary supervises the agent process itself.
    local t0=$SECONDS
    AQ_STDIN="$AGENT_STDIN" "$TIMEOUT_BIN" -k 5 "${tmo}s" bash -c '
      cd "$1" || exit 97; shift
      if [ -n "${AQ_STDIN:-}" ]; then exec "$@" < "$AQ_STDIN"; else exec "$@"; fi
    ' _ "$cwd" "${AGENT_CMD[@]}" >> "$logf" 2>&1
    rc=$?
    # 124 = the time limit was hit and the agent exited after TERM.
    # 137 = the agent ignored TERM and the `-k 5` grace period ended in KILL.
    # 137 is also what an unrelated SIGKILL looks like, so it only counts as a
    # timeout when the whole time budget really elapsed.
    if [[ $rc -eq 124 ]] || { [[ $rc -eq 137 ]] && (( SECONDS - t0 >= tmo )); }; then
      timed_out=true
    fi
  elif [[ "$tmo" -gt 0 ]]; then
    # Portable watchdog fallback (no timeout binary). Flags the timeout and
    # signals the worker; install coreutils (gtimeout) for hard tree kills.
    _run_agent >> "$logf" 2>&1 &
    local apid=$!
    # Watchdog: after the budget, drop the flag file, TERM the agent (children
    # first), wait 5s, then KILL whatever is left.
    ( sleep "$tmo"; : > "$tmo_flag"
      pkill -TERM -P "$apid" 2>/dev/null; kill -TERM "$apid" 2>/dev/null
      sleep 5; pkill -KILL -P "$apid" 2>/dev/null; kill -KILL "$apid" 2>/dev/null ) &
    local wpid=$!
    wait "$apid" 2>/dev/null; rc=$?
    # The agent is finished: retire the watchdog so it cannot fire later.
    kill "$wpid" 2>/dev/null; wait "$wpid" 2>/dev/null
    [[ -f "$tmo_flag" ]] && timed_out=true
  elif [[ -n "$FLOCK_BIN" ]]; then
    # Cross-process hardening where flock exists (Linux CI). The single run-loop
    # already serializes by lock key; this guards against a stray second launcher.
    # Exit 75 is the "lock busy" marker from the subshell.
    # NOTE(review): an agent that itself exits 75 is indistinguishable from a
    # busy lock here and would be requeued.
    ( "$FLOCK_BIN" -n 9 || exit 75; _run_agent ) 9>"$lf" >> "$logf" 2>&1
    rc=$?
    if [[ $rc -eq 75 ]]; then
      echo "lock busy (key=$lockkey) — requeued to inbox" >> "$logf"
      mv "$doing_file" "$INBOX/" 2>/dev/null
      { echo "ended=$(date +%s)"; echo "result=requeued"; } >> "$metaf"
      return 0
    fi
  else
    _run_agent >> "$logf" 2>&1
    rc=$?
  fi
  rm -f "$tmo_flag"
  echo "ended=$(date +%s)" >> "$metaf"
  echo "exit=$rc" >> "$metaf"
  if $timed_out; then
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=timeout" >> "$metaf"
    echo "TIMED OUT after ${tmo}s (rc=$rc): $(date)" >> "$logf"
  elif [[ $rc -eq 0 ]]; then
    mv "$doing_file" "$DONE/" 2>/dev/null
    echo "result=done" >> "$metaf"
    echo "completed OK (rc=0): $(date)" >> "$logf"
  else
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"
    echo "FAILED (rc=$rc): $(date)" >> "$logf"
  fi
}
# ── Commands ────────────────────────────────────────────────────────
# cmd_init -> create the queue directory tree and report where it lives.
cmd_init() {
  ensure_dirs
  log "queue initialized at ${C_BOLD}${QUEUE_ROOT}${C_RESET}"
}
# cmd_add <file.md> [--engine E] [--cwd PATH] [--yolo|--no-yolo]
#   Copies a prompt file into inbox/ as <timestamp>__<basename>.
#   * If any flag is given AND the file has no frontmatter, a frontmatter block
#     (engine / cwd / yolo) is generated in front of the body.
#   * If the file already has frontmatter it is copied verbatim; flags are then
#     not applied and a note is printed on stderr saying so.
#   * If the destination name is already taken (same basename queued within the
#     same second) a numeric suffix is added to the timestamp instead of
#     overwriting the queued job.
#   Dies with a usage message on a missing file, a missing option value, or a
#   failed write.
cmd_add() {
  ensure_dirs
  local usage_msg="usage: add <file.md> [--engine devin|claude|codex] [--cwd PATH] [--yolo|--no-yolo]"
  local file="" engine="" cwd="" yolo=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --engine|--cwd)
        # Without this guard a trailing --engine/--cwd trips `set -u` on $2.
        [[ $# -ge 2 ]] || die "add: $1 needs a value"
        if [[ "$1" == "--engine" ]]; then engine=$2; else cwd=$2; fi
        shift 2
        ;;
      --yolo) yolo=true; shift ;;
      --no-yolo) yolo=false; shift ;;
      *) file=$1; shift ;;
    esac
  done
  [[ -n "$file" && -f "$file" ]] || die "$usage_msg"
  local base; base=$(basename "$file")
  local stamp; stamp=$(date +%Y%m%d-%H%M%S)
  local dest="$INBOX/${stamp}__${base}"
  # The timestamp only has one-second resolution: never clobber a queued job.
  local n=0
  while [[ -e "$dest" ]]; do
    n=$((n + 1))
    dest="$INBOX/${stamp}-${n}__${base}"
  done
  local has_frontmatter=false
  if [[ "$(head -1 "$file")" == "---" ]]; then has_frontmatter=true; fi
  # If user passed flags AND the file has no frontmatter, inject one.
  if [[ -n "$engine$cwd$yolo" ]] && ! $has_frontmatter; then
    {
      echo "---"
      echo "engine: ${engine:-$DEFAULT_ENGINE}"
      echo "cwd: ${cwd:-$PWD}"
      echo "yolo: ${yolo:-true}"
      echo "---"
      echo
      cat "$file"
    } > "$dest" || die "add: could not write $dest"
  else
    if [[ -n "$engine$cwd$yolo" ]]; then
      err "note: $base already has frontmatter — command-line flags were ignored"
    fi
    cp "$file" "$dest" || die "add: could not write $dest"
  fi
  log "queued $C_BOLD$(basename "$dest")$C_RESET (engine=$(fm_get "$dest" engine "$DEFAULT_ENGINE"), cwd=$(fm_get "$dest" cwd "$PWD"))"
}
# cmd_run [--max N] [--engine E] [--once|--drain]
#   The foreground run loop. Repeatedly picks the oldest inbox file whose lock
#   key is not held by an active job, moves it to doing/, writes its meta file
#   and launches run_worker in the background, up to MAX_CONCURRENCY slots.
#   --max N     override the concurrency limit (non-negative integer)
#   --engine E  override the default engine for jobs that do not name one
#   --once      exit once the inbox is empty and no worker is active
#   Only one run loop may serve a queue: the loop's pid is recorded in
#   $STATE/daemon.pid and a second `run` is refused while that pid is alive.
cmd_run() {
  ensure_dirs
  local once=false
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --max)
        # Validate before use: a missing value would trip `set -u`, and a
        # non-numeric or zero-padded one would break the -lt test below.
        [[ "${2:-}" =~ ^[0-9]+$ ]] || die "run: --max needs a non-negative integer"
        MAX_CONCURRENCY=$(( 10#$2 )); shift 2
        ;;
      --engine)
        [[ -n "${2:-}" ]] || die "run: --engine needs a value"
        DEFAULT_ENGINE=$2; shift 2
        ;;
      --once|--drain) once=true; shift ;;
      *) die "run: unknown arg '$1'" ;;
    esac
  done
  # Refuse to start a second run loop against the same queue — two daemons would
  # break the single-launcher invariant that per-cwd locking relies on.
  local dpid=""
  [[ -f "$STATE/daemon.pid" ]] && dpid=$(cat "$STATE/daemon.pid" 2>/dev/null)
  if [[ -n "$dpid" ]] && kill -0 "$dpid" 2>/dev/null; then
    die "a run loop is already active (pid $dpid). Use 'stop' first, or a different AGENT_QUEUE_ROOT."
  fi
  # Reaching here with a recorded pid means that pid is dead: the file is stale.
  [[ -n "$dpid" ]] && log "clearing stale daemon.pid ($dpid)"
  echo "$$" > "$STATE/daemon.pid"
  trap 'rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
  log "run loop started (max=$MAX_CONCURRENCY, default engine=$DEFAULT_ENGINE). Ctrl-C to stop."
  while true; do
    local running; running=$(active_workers)
    # launch jobs while we have capacity and an eligible inbox file
    while [[ "$running" -lt "$MAX_CONCURRENCY" ]]; do
      # pick the oldest inbox file whose lock key is not currently busy, so two
      # jobs sharing a cwd (or `lock:` key) never run at once, regardless of --max.
      local busy; busy=$(busy_keys)
      local next="" cand cand_key
      while IFS= read -r cand; do
        [[ -n "$cand" ]] || continue
        cand_key=$(lock_key_for "$cand")
        if printf '%s\n' "$busy" | grep -qxF -- "$cand_key"; then continue; fi
        next="$cand"; break
      done < <(ls -1 "$INBOX"/*.md 2>/dev/null | sort)
      [[ -z "$next" ]] && break
      local job; job=$(basename "$next"); job=${job%.md}
      local doing_file="$DOING/$(basename "$next")"
      # If the move fails (file vanished, permissions) there is no job file to
      # run: launching anyway would start an agent with an empty prompt. Leave
      # this round and retry on the next poll.
      if ! mv "$next" "$doing_file"; then
        err "could not move $(basename "$next") to doing/ — will retry"
        break
      fi
      local w_eng w_cwd w_yolo w_key
      w_eng=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
      w_cwd=$(fm_get "$doing_file" cwd "$PWD")
      w_yolo=$(fm_get "$doing_file" yolo "true")
      w_key=$(lock_key_for "$doing_file")
      # write meta BEFORE launch (no pid yet), then append the worker pid from $!
      {
        echo "job=$job"
        echo "engine=$w_eng"
        echo "cwd=$w_cwd"
        echo "yolo=$w_yolo"
        echo "lock=$w_key"
        echo "started=$(date +%s)"
      } > "$STATE/$job.meta"
      run_worker "$doing_file" &
      echo "pid=$!" >> "$STATE/$job.meta"
      log "▶ launching $C_BOLD$job$C_RESET (engine=$w_eng, lock=$w_key)"
      sleep 1
      running=$(active_workers)
    done
    if $once; then
      [[ "$(active_workers)" -eq 0 && -z "$(ls -1 "$INBOX"/*.md 2>/dev/null)" ]] && {
        log "drain complete — inbox empty, no workers running"; rm -f "$STATE/daemon.pid"; exit 0; }
    fi
    sleep "$POLL_SECONDS"
  done
}
# _count <dir> -> prints how many lines `ls` reports for <dir>/*.md (0 when
# nothing matches), with the padding some `wc` implementations add removed.
_count() {
  local total
  total=$(ls -1 "$1"/*.md 2>/dev/null | wc -l)
  printf '%s\n' "${total// /}"
}
# cmd_status -> prints the kanban summary (file counts per column plus
# running/max slots) followed by one row per live worker: job, engine, elapsed
# time, pid and the last log line. A worker whose log has not changed for more
# than STALL_MIN minutes is marked as stalled.
cmd_status() {
  ensure_dirs
  local ib dg dn fl
  ib=$(_count "$INBOX"); dg=$(_count "$DOING"); dn=$(_count "$DONE"); fl=$(_count "$FAILED")
  # Slot count comes from active_workers, the same helper the run loop uses.
  local running; running=$(active_workers)
  echo
  printf '%s AGENT QUEUE %s %s\n' "$C_BOLD" "$C_DIM$QUEUE_ROOT$C_RESET" ""
  printf ' %sinbox%s %-3s %sdoing%s %-3s %sdone%s %-3s %sfailed%s %-3s %srunning%s %s/%s\n\n' \
    "$C_BLUE" "$C_RESET" "$ib" "$C_YEL" "$C_RESET" "$dg" \
    "$C_GREEN" "$C_RESET" "$dn" "$C_RED" "$C_RESET" "$fl" \
    "$C_BOLD" "$C_RESET" "$running" "$MAX_CONCURRENCY"
  # running table
  local f
  local printed=false
  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    # Finished jobs and jobs whose worker pid is gone are not listed.
    grep -q '^ended=' "$f" && continue
    local pid; pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
    [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null || continue
    if ! $printed; then printf ' %sRUNNING%s\n' "$C_BOLD" "$C_RESET"; printed=true; fi
    local job eng start now el last lmt age stall=""
    job=$(grep '^job=' "$f" | cut -d= -f2)
    eng=$(grep '^engine=' "$f" | cut -d= -f2)
    start=$(grep '^started=' "$f" | cut -d= -f2)
    # A missing start time or log mtime falls back to "now", i.e. 0 elapsed.
    now=$(date +%s); el=$(( now - ${start:-$now} ))
    last=$(tail -n 1 "$LOGS/$job.log" 2>/dev/null | cut -c1-60)
    lmt=$(_mtime "$LOGS/$job.log"); age=$(( now - ${lmt:-$now} ))
    [[ "$age" -gt $(( STALL_MIN * 60 )) ]] && stall="${C_RED}⚠ stalled${C_RESET} "
    printf ' %s%-26s%s %-7s %3dm%02ds pid %-6s %s%s%s%s\n' \
      "$C_BOLD" "$job" "$C_RESET" "$eng" $((el/60)) $((el%60)) "$pid" "$stall" "$C_DIM" "$last" "$C_RESET"
  done
  $printed || printf ' %sno workers running%s\n' "$C_DIM" "$C_RESET"
  echo
}
# cmd_watch [interval] -> clear the screen and redraw cmd_status every
# <interval> seconds (default 2) until interrupted.
cmd_watch() {
  local interval=${1:-2}
  while :; do
    clear
    cmd_status
    sleep "$interval"
  done
}
# cmd_dash [args...] -> replace this shell with the Node dashboard
# (dashboard.mjs next to this script), passing the queue root through
# AGENT_QUEUE_ROOT and forwarding all arguments. Dies when node is not on PATH.
cmd_dash() {
  if ! command -v node >/dev/null 2>&1; then
    die "node not found — use 'watch' for the bash status view"
  fi
  AGENT_QUEUE_ROOT="$QUEUE_ROOT" exec node "$SCRIPT_DIR/dashboard.mjs" "$@"
}
# cmd_stop -> send the default kill signal to every worker whose meta has no
# `ended=` line and whose pid is still alive, then to the run loop recorded in
# daemon.pid, remove daemon.pid, and report how many workers were signalled.
cmd_stop() {
  ensure_dirs
  local killed=0 meta wpid
  for meta in "$STATE"/*.meta; do
    [[ -e "$meta" ]] || continue
    if grep -q '^ended=' "$meta"; then
      continue
    fi
    wpid=$(grep '^pid=' "$meta" | cut -d= -f2)
    [[ -n "$wpid" ]] || continue
    kill -0 "$wpid" 2>/dev/null || continue
    if kill "$wpid" 2>/dev/null; then
      killed=$((killed + 1))
    fi
  done
  if [[ -f "$STATE/daemon.pid" ]]; then
    kill "$(cat "$STATE/daemon.pid")" 2>/dev/null
  fi
  rm -f "$STATE/daemon.pid"
  log "stopped $killed running worker(s) + run loop"
}
# cmd_logs <job> [-f]   (the -f may also come first)
#   Prints a job's log, or follows it with `tail -f` when -f is given. <job> is
#   tried as an exact log name first; failing that, the most recently modified
#   log whose name contains <job> is used. Dies when no job is named or no log
#   matches.
cmd_logs() {
  local job=${1:-} follow=false
  if [[ "$job" == "-f" ]]; then
    # `logs -f <job>`: the job name is the second argument.
    follow=true
    job=${2:-}
  elif [[ "${2:-}" == "-f" ]]; then
    follow=true
  fi
  if [[ -z "$job" ]]; then
    die "usage: logs <job> [-f]"
  fi
  local logfile="$LOGS/$job.log"
  if [[ ! -f "$logfile" ]]; then
    logfile=$(ls -1t "$LOGS"/*"$job"*.log 2>/dev/null | head -1)
  fi
  if [[ ! -f "$logfile" ]]; then
    die "no log found for '$job'"
  fi
  if $follow; then
    tail -f "$logfile"
  else
    cat "$logfile"
  fi
}
# usage -> prints the help text on stdout.
# The heredoc delimiter is unquoted on purpose: ${C_BOLD}/${C_RESET} and the
# (=$VAR) placeholders in the ENV section expand to the values in effect, so
# the help shows the current queue root, concurrency limit and default engine.
usage() {
cat <<EOF
${C_BOLD}agent-queue${C_RESET} — folder kanban runner for devin | claude | codex
${C_BOLD}USAGE${C_RESET}
agent-queue.sh <command> [args]
${C_BOLD}COMMANDS${C_RESET}
init create the queue/ folders
add <file.md> [opts] queue a prompt file into inbox/
--engine devin|claude|codex --cwd PATH --yolo | --no-yolo
run [--max N] [--engine E] [--once]
process inbox/ (foreground loop; Ctrl-C to stop)
status show kanban counts + running workers
watch [interval] live status (default 2s, bash)
dash [--interval N] richer live Node dashboard (recent done/failed too)
stop kill running workers + the run loop
logs <job> [-f] print (or follow) a job's log
help this message
${C_BOLD}KANBAN${C_RESET} inbox → doing → done / failed (logs/ + .state/ alongside)
${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md)
---
engine: devin
cwd: /Users/you/code/repo
yolo: true
lock: my-repo # optional; defaults to cwd. Jobs sharing a key run serially
timeout: 45m # optional; 90s|45m|2h|1d. On expiry -> failed (result=timeout)
---
${C_BOLD}ENV${C_RESET}
AGENT_QUEUE_ROOT (=$QUEUE_ROOT) AGENT_QUEUE_MAX (=$MAX_CONCURRENCY)
AGENT_QUEUE_ENGINE (=$DEFAULT_ENGINE) DEVIN_BIN / CLAUDE_BIN / CODEX_BIN
EOF
}
# main [command [args...]] -> dispatch to the matching cmd_* handler, passing
# the remaining arguments through. No command (or help/-h/--help) prints the
# usage text; an unknown command reports the error, prints usage and exits 1.
main() {
  local cmd=${1:-help}
  if [[ $# -gt 0 ]]; then
    shift
  fi
  local handler=""
  case "$cmd" in
    init|add|run|status|watch|stop|logs)
      handler="cmd_${cmd}"
      ;;
    dash|dashboard)
      handler="cmd_dash"
      ;;
    help|-h|--help)
      usage
      return
      ;;
    *)
      err "unknown command: $cmd"
      echo
      usage
      exit 1
      ;;
  esac
  "$handler" "$@"
}
# Script entry point: hand every command-line argument to the dispatcher.
main "$@"