#!/usr/bin/env bash
#
# agent-queue — a folder-based "kanban" runner for headless coding-agent CLIs.
#
# Drop a prompt .md file into queue/inbox/, and `agent-queue run` will:
#   1. pick the oldest file (respecting --max concurrency),
#   2. move it inbox/ -> doing/,
#   3. launch the chosen agent CLI (devin | claude | codex) in --yolo mode,
#   4. on success move doing/ -> done/, on failure -> failed/,
#   5. write a per-job log + live state so `status`/`watch` can show progress.
#
# Per-task config travels in YAML-ish frontmatter at the top of the .md:
#   ---
#   engine: devin          # devin | claude | codex   (default: $DEFAULT_ENGINE)
#   cwd: /abs/path/repo    # where the agent runs     (default: $PWD when added)
#   yolo: true             # auto-approve all tools   (default: true)
#   ---
#
# Subcommands: init | add | run | status | watch | dash | stop | logs | help
#
# Change note: the worker's start time (ps lstart) is recorded as `pidstart` at
# launch and verified in all liveness checks (_meta_active, status, stop) via
# _pid_alive, so a recycled pid can never be mistaken for our worker. Falls
# back to plain liveness when no start time was recorded.
#
# Unset variables are errors and a pipeline fails if any stage fails. `-e` is
# not enabled: functions below read `$?` after commands that may fail.
set -uo pipefail

# ── Resolve paths ───────────────────────────────────────────────────
# Everything lives under QUEUE_ROOT (default: a queue/ folder next to this script).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
QUEUE_ROOT="${AGENT_QUEUE_ROOT:-$SCRIPT_DIR/queue}"
INBOX="$QUEUE_ROOT/inbox"
DOING="$QUEUE_ROOT/doing"
DONE="$QUEUE_ROOT/done"
FAILED="$QUEUE_ROOT/failed"
LOGS="$QUEUE_ROOT/logs"
STATE="$QUEUE_ROOT/.state"
LOCKS="$QUEUE_ROOT/locks"

# ── Config (env-overridable) ────────────────────────────────────────
MAX_CONCURRENCY="${AGENT_QUEUE_MAX:-2}"
DEFAULT_ENGINE="${AGENT_QUEUE_ENGINE:-devin}"
POLL_SECONDS="${AGENT_QUEUE_POLL:-3}"
# A running worker is flagged "stalled" if its log has not changed in this many
# minutes (no new agent output) — surfaced in status + dash.
STALL_MIN="${AGENT_QUEUE_STALL_MIN:-10}"

# flock is used for cross-process lock hardening when available (Linux). macOS
# has no flock; mutual exclusion there relies on the single run-loop (see cmd_run).
FLOCK_BIN="${FLOCK_BIN:-$(command -v flock || true)}"
# timeout/gtimeout give hard process-tree kills for per-job timeouts; if absent
# (stock macOS) a pure-bash watchdog is used as a best-effort fallback.
TIMEOUT_BIN="${TIMEOUT_BIN:-$(command -v timeout || command -v gtimeout || true)}"

# Agent binaries: explicit env override, else whatever is on PATH, else a
# fallback name/path. ${HOME:-} keeps `set -u` from aborting the substitution
# (and leaving DEVIN_BIN empty) when HOME is unset.
DEVIN_BIN="${DEVIN_BIN:-$(command -v devin || echo "${HOME:-}/.local/bin/devin")}"
CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude || echo claude)}"
CODEX_BIN="${CODEX_BIN:-$(command -v codex || echo codex)}"

# ── Colors ──────────────────────────────────────────────────────────
# ANSI escapes are defined only when stdout is a terminal; otherwise every
# C_* variable is an empty string so the same printf formats work when piped.
if [[ -t 1 ]]; then
  C_RESET=$'\033[0m'
  C_DIM=$'\033[2m'
  C_BOLD=$'\033[1m'
  C_BLUE=$'\033[34m'
  C_GREEN=$'\033[32m'
  C_RED=$'\033[31m'
  C_YEL=$'\033[33m'
  C_CYAN=$'\033[36m'
else
  C_RESET=""
  C_DIM=""
  C_BOLD=""
  C_BLUE=""
  C_GREEN=""
  C_RED=""
  C_YEL=""
  C_CYAN=""
fi

# log <msg...> -> tagged informational line on stdout.
log() {
  printf '%s[agent-queue]%s %s\n' "$C_CYAN" "$C_RESET" "$*"
}

# err <msg...> -> tagged error line on stderr.
err() {
  printf '%s[agent-queue]%s %s\n' "$C_RED" "$C_RESET" "$*" >&2
}

# die <msg...> -> print the error and terminate the current shell with status 1.
die() {
  err "$*"
  exit 1
}

# ── Init ────────────────────────────────────────────────────────────
# ensure_dirs -> create every queue directory (idempotent thanks to mkdir -p).
ensure_dirs() {
  mkdir -p "$INBOX" "$DOING" "$DONE" "$FAILED" "$LOGS" "$STATE" "$LOCKS"
}

# ── Frontmatter parsing ─────────────────────────────────────────────
# fm_get <file> <key> <default>
#
# Prints the value of <key> from a leading `--- ... ---` block of <file>, or
# <default> when the file has no frontmatter, the key is absent, or the value
# is empty. Only the first matching key is used; nothing after the closing
# `---` is ever scanned.
#
# Value handling:
#   * a value that starts with a quote is taken up to its matching closing
#     quote, so `cwd: "/my repo"  # note` yields `/my repo`;
#   * otherwise a trailing ` # comment` (a `#` at the start of the value or
#     preceded by whitespace) is dropped, so the documented form
#     `timeout: 45m   # optional` yields `45m`. A `#` with no whitespace
#     before it (e.g. `/a/b#c`) is kept;
#   * surrounding quotes/whitespace are trimmed.
fm_get() {
  local file=$1 key=$2 def=${3:-}
  local val
  val=$(awk -v k="$key" '
    NR == 1 && $0 != "---" { exit }      # no frontmatter at all
    NR == 1 { infm = 1; next }
    infm && $0 == "---" { exit }         # end of the block: key not found
    infm {
      line = $0
      sub(/^[ \t]*/, "", line)
      if (line ~ "^" k "[ \t]*:") {
        sub("^" k "[ \t]*:[ \t]*", "", line)
        q = substr(line, 1, 1)
        quoted = (q == "\"" || q == "'\''")
        close_at = quoted ? index(substr(line, 2), q) : 0
        if (close_at > 0) {
          line = substr(line, 2, close_at - 1)
        } else if (!quoted) {
          sub(/(^|[ \t]+)#.*$/, "", line)
        }
        gsub(/^["'\''[:space:]]+|["'\''[:space:]]+$/, "", line)
        print line; exit
      }
    }' "$file" 2>/dev/null)
  if [[ -n "$val" ]]; then
    printf '%s' "$val"
  else
    printf '%s' "$def"
  fi
}

# strip_frontmatter <file> -> prints the body (everything after a leading ---..--- block).
# A file whose first line is not `---` is printed unchanged. If the block is
# never closed, nothing is printed. Later `---` lines in the body are kept.
strip_frontmatter() {
  awk '
    NR == 1 && $0 == "---" { in_fm = 1; next }
    in_fm && $0 == "---"   { in_fm = 0; next }
    !in_fm
  ' "$1"
}

# lock_key_for <file> -> the mutual-exclusion key for a job: frontmatter `lock:`
# if set, otherwise the cwd (frontmatter `cwd:`, defaulting to $PWD). Jobs
# sharing a key never run concurrently.
lock_key_for() {
  local f=$1 key
  key=$(fm_get "$f" lock "")
  if [[ -n "$key" ]]; then
    printf '%s' "$key"
    return
  fi
  fm_get "$f" cwd "$PWD"
}

# _keyhash <key> -> stable filename-safe token for a lock key
# (the checksum field printed by `cksum` for the key's bytes).
_keyhash() {
  printf '%s' "$1" | cksum | awk '{ print $1 }'
}

# _mtime <file> -> file modification time in epoch seconds; empty if the file is
# missing or no usable stat is available.
#
# GNU `stat -c %Y` is tried first. The BSD spelling `stat -f %m FILE` must NOT
# go first: on GNU coreutils `-f` means "file-system status", so it prints a
# multi-line report for FILE (and fails on the bogus operand `%m`), which would
# then be glued in front of the real answer. On BSD/macOS `-c` is rejected
# without writing to stdout, so the fallback stays clean.
_mtime() {
  local t=""
  [[ -e "$1" ]] || { echo ""; return; }
  t=$(stat -c %Y -- "$1" 2>/dev/null) || t=$(stat -f %m -- "$1" 2>/dev/null) || t=""
  # Callers do arithmetic on the result; never hand back anything non-numeric.
  [[ "$t" =~ ^[0-9]+$ ]] || t=""
  echo "$t"
}

# _pidstart <pid> -> the process start time as reported by ps (whitespace-normalized);
# empty when the pid does not exist. Used as an identity token so a recycled pid
# is never mistaken for our worker.
#
# ps renders lstart through the process locale and timezone, so both are pinned
# here: the token recorded by the run loop must compare equal when `status` or
# `stop` is later run from a shell with a different LANG/TZ.
_pidstart() {
  LC_ALL=C TZ=UTC0 ps -o lstart= -p "$1" 2>/dev/null | awk '{ $1 = $1; print }'
}

# _pid_alive <pid> [<pidstart>] -> 0 if the pid is live AND (when a start time was
# recorded) its current start time still matches — defeating pid reuse.
# Returns 1 for an empty or non-numeric pid. Only positive integers are
# accepted: `kill -0 0` / `kill -0 -1` address process groups rather than a
# single process, so a corrupt meta must never reach kill with such a value.
_pid_alive() {
  local pid=${1:-} want=${2:-} cur
  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1
  kill -0 "$pid" 2>/dev/null || return 1
  # No recorded start time: fall back to plain liveness.
  [[ -z "$want" ]] && return 0
  cur=$(_pidstart "$pid")
  [[ "$cur" == "$want" ]]
}

# _dur_to_secs <dur> -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0.
# The number is always read as base 10, so zero-padded values such as `08m` or
# `010` work instead of being treated as (possibly invalid) octal.
_dur_to_secs() {
  local d=${1:-} n
  if [[ "$d" =~ ^([0-9]+)([smhd]?)$ ]]; then
    n=$(( 10#${BASH_REMATCH[1]} ))
    case "${BASH_REMATCH[2]}" in
      "" | s) echo "$n" ;;
      m) echo $(( n * 60 )) ;;
      h) echo $(( n * 3600 )) ;;
      d) echo $(( n * 86400 )) ;;
    esac
  else
    echo 0
  fi
}

# _meta_active <metafile> -> 0 if the job is occupying a concurrency slot.
# Active = no `ended=` AND (pid is live, OR pid not yet written but the meta was
# created moments ago — the reserved-slot window between meta-write and launch).
# The <30s guard prevents a meta orphaned mid-launch (daemon killed in the gap)
# from pinning a slot forever. A meta file that has vanished, or whose mtime
# cannot be read as a number, counts as inactive.
_meta_active() {
  local f=$1 pid pidstart mt age
  [[ -f "$f" ]] || return 1
  grep -q '^ended=' "$f" && return 1
  pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
  if [[ -n "$pid" ]]; then
    # -f2- keeps the whole start-time string (everything after the first `=`).
    pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
    _pid_alive "$pid" "$pidstart"
    return
  fi
  mt=$(_mtime "$f")
  [[ "$mt" =~ ^[0-9]+$ ]] || mt=0
  age=$(( $(date +%s) - mt ))
  [[ "$age" -lt 30 ]]
}

# active_workers -> count of jobs occupying a concurrency slot (reservation-aware):
# the number of $STATE/*.meta files for which _meta_active succeeds.
active_workers() {
  local count=0 meta
  for meta in "$STATE"/*.meta; do
    [[ -e "$meta" ]] || continue   # unmatched glob stays literal
    if _meta_active "$meta"; then
      count=$((count + 1))
    fi
  done
  echo "$count"
}

# busy_keys -> newline list of lock keys currently held by active workers
# (the first `lock=` line of every meta for which _meta_active succeeds).
busy_keys() {
  local meta
  for meta in "$STATE"/*.meta; do
    [[ -e "$meta" ]] || continue
    if _meta_active "$meta"; then
      grep '^lock=' "$meta" | head -1 | cut -d= -f2-
    fi
  done
}

# ── Engine driver: builds argv into AGENT_CMD[]; sets AGENT_STDIN if the ──
# prompt should be fed on stdin (claude/codex) rather than a flag. $pf is the
# frontmatter-STRIPPED body file, so a body starting with '--' is never
# misparsed as a CLI option.
#
# build_agent_cmd <engine> <prompt-file> <yolo>
#   Sets the globals AGENT_CMD (array) and AGENT_STDIN (path or empty).
#   The auto-approve flag is added only when <yolo> is exactly "true".
#   An unknown engine is fatal (die).
build_agent_cmd() {
  local engine=$1 pf=$2 yolo=$3
  AGENT_CMD=()
  AGENT_STDIN=""
  case "$engine" in
    devin)
      AGENT_CMD=("$DEVIN_BIN" -p --prompt-file "$pf")
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=(--permission-mode dangerous)
      fi
      ;;
    claude)
      AGENT_CMD=("$CLAUDE_BIN" -p)
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=(--dangerously-skip-permissions)
      fi
      AGENT_STDIN="$pf"
      ;;
    codex)
      AGENT_CMD=("$CODEX_BIN" exec)
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=(--dangerously-bypass-approvals-and-sandbox)
      fi
      AGENT_STDIN="$pf"
      ;;
    *)
      die "unknown engine '$engine' (use: devin | claude | codex)"
      ;;
  esac
}

# ── Worker: runs one job to completion (invoked in background) ───────
# run_worker <doing-file>
#
# Reads engine/cwd/yolo/lock/timeout from the job's frontmatter, runs the agent
# with its output appended to $LOGS/<job>.log, then moves the job file to
# done/ or failed/ (or back to inbox/ when the flock is busy) and appends
# ended/exit/result to $STATE/<job>.meta.
#
# Execution strategy, first match wins:
#   1. timeout set and a timeout binary exists -> run under timeout/gtimeout
#   2. timeout set, no binary                  -> background run + bash watchdog
#   3. flock available                         -> run under a non-blocking lock
#   4. otherwise                               -> plain run
#
# Returns 1 for a job that cannot start (missing cwd, unknown engine), 0 when
# requeued; otherwise the status of the final bookkeeping command.
run_worker() {
  local doing_file=$1
  local job
  job=$(basename "$doing_file")
  job=${job%.md}
  local engine cwd yolo logf metaf
  engine=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
  cwd=$(fm_get "$doing_file" cwd "$PWD")
  yolo=$(fm_get "$doing_file" yolo "true")
  logf="$LOGS/$job.log"
  metaf="$STATE/$job.meta"
  # NOTE: the parent (cmd_run) creates $metaf with job/engine/cwd/started/pid.
  # The worker only ever APPENDS (ended/exit/result) to avoid a truncation race.

  {
    echo "===== agent-queue job: $job ====="
    echo "engine=$engine cwd=$cwd yolo=$yolo"
    echo "started: $(date)"
    echo "================================="
  } >> "$logf"

  # Pre-flight. An unknown engine is handled here rather than left to
  # build_agent_cmd's die(): that would exit this background worker with the
  # job still sitting in doing/ and no ended=/result= in its meta.
  local fatal=""
  if [[ ! -d "$cwd" ]]; then
    fatal="cwd does not exist: $cwd"
  else
    case "$engine" in
      devin | claude | codex) ;;
      *) fatal="unknown engine '$engine' (use: devin | claude | codex)" ;;
    esac
  fi
  if [[ -n "$fatal" ]]; then
    echo "FATAL: $fatal" >> "$logf"
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"
    echo "ended=$(date +%s)" >> "$metaf"
    return 1
  fi

  # Strip our frontmatter so the agent only sees the task body.
  local bodyf="$STATE/$job.body.md"
  strip_frontmatter "$doing_file" > "$bodyf"
  build_agent_cmd "$engine" "$bodyf" "$yolo"

  # Runs the agent inside $cwd (in a subshell, so the worker's own cwd is
  # untouched), feeding the prompt on stdin when the engine wants it.
  _run_agent() {
    if [[ -n "$AGENT_STDIN" ]]; then
      ( cd "$cwd" && "${AGENT_CMD[@]}" < "$AGENT_STDIN" )
    else
      ( cd "$cwd" && "${AGENT_CMD[@]}" )
    fi
  }

  local rc=0 lockkey tmo timed_out=false
  lockkey=$(lock_key_for "$doing_file")
  tmo=$(_dur_to_secs "$(fm_get "$doing_file" timeout "0")")
  local tmo_flag="$STATE/$job.timedout"
  rm -f "$tmo_flag"
  local lf
  lf="$LOCKS/$(_keyhash "$lockkey").lock"

  if [[ "$tmo" -gt 0 && -n "$TIMEOUT_BIN" ]]; then
    # Hard timeout via timeout/gtimeout (kills the whole process tree).
    # Exit 97 = could not cd into cwd; 124 = the timeout fired.
    AQ_STDIN="$AGENT_STDIN" "$TIMEOUT_BIN" -k 5 "${tmo}s" bash -c '
      cd "$1" || exit 97; shift
      if [ -n "${AQ_STDIN:-}" ]; then exec "$@" < "$AQ_STDIN"; else exec "$@"; fi
    ' _ "$cwd" "${AGENT_CMD[@]}" >> "$logf" 2>&1
    rc=$?
    [[ $rc -eq 124 ]] && timed_out=true
  elif [[ "$tmo" -gt 0 ]]; then
    # Portable watchdog fallback (no timeout binary). Flags the timeout and
    # signals the worker; install coreutils (gtimeout) for hard tree kills.
    _run_agent >> "$logf" 2>&1 &
    local apid=$!
    (
      sleep "$tmo"
      : > "$tmo_flag"
      pkill -TERM -P "$apid" 2>/dev/null
      kill -TERM "$apid" 2>/dev/null
      sleep 5
      pkill -KILL -P "$apid" 2>/dev/null
      kill -KILL "$apid" 2>/dev/null
    ) &
    local wpid=$!
    wait "$apid" 2>/dev/null
    rc=$?
    # The agent finished (or was killed): retire the watchdog.
    kill "$wpid" 2>/dev/null
    wait "$wpid" 2>/dev/null
    [[ -f "$tmo_flag" ]] && timed_out=true
  elif [[ -n "$FLOCK_BIN" ]]; then
    # Cross-process hardening where flock exists (Linux CI). The single run-loop
    # already serializes by lock key; this guards against a stray second launcher.
    # Exit 75 is reserved for "lock busy".
    ( "$FLOCK_BIN" -n 9 || exit 75; _run_agent ) 9> "$lf" >> "$logf" 2>&1
    rc=$?
    if [[ $rc -eq 75 ]]; then
      echo "lock busy (key=$lockkey) — requeued to inbox" >> "$logf"
      mv "$doing_file" "$INBOX/" 2>/dev/null
      {
        echo "ended=$(date +%s)"
        echo "result=requeued"
      } >> "$metaf"
      return 0
    fi
  else
    _run_agent >> "$logf" 2>&1
    rc=$?
  fi
  rm -f "$tmo_flag"

  echo "ended=$(date +%s)" >> "$metaf"
  echo "exit=$rc" >> "$metaf"
  if $timed_out; then
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=timeout" >> "$metaf"
    echo "TIMED OUT after ${tmo}s (rc=$rc): $(date)" >> "$logf"
  elif [[ $rc -eq 0 ]]; then
    mv "$doing_file" "$DONE/" 2>/dev/null
    echo "result=done" >> "$metaf"
    echo "completed OK (rc=0): $(date)" >> "$logf"
  else
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"
    echo "FAILED (rc=$rc): $(date)" >> "$logf"
  fi
}

# ── Commands ────────────────────────────────────────────────────────
# cmd_init -> create the queue directories and report where the queue lives.
cmd_init() {
  ensure_dirs
  log "queue initialized at $C_BOLD$QUEUE_ROOT$C_RESET"
}

# cmd_add <file.md> [--engine E] [--cwd PATH] [--yolo|--no-yolo]
#
# Copies a prompt file into inbox/ as <YYYYmmdd-HHMMSS>__<basename>.
#
# Frontmatter handling:
#   * file has no frontmatter, options given  -> inject engine/cwd/yolo
#     (unspecified ones take their defaults);
#   * file has no frontmatter, no options     -> inject only `cwd: $PWD`, so the
#     job runs where it was ADDED (the documented default) rather than wherever
#     the run loop happens to have been started. The engine is left unset so
#     `run --engine` can still choose it;
#   * file already has frontmatter            -> copied verbatim; any options
#     given on the command line are ignored, with a warning.
#
# An option that is missing its value is reported via die instead of tripping
# `set -u` with an unbound-variable error.
cmd_add() {
  ensure_dirs
  local usage_msg="usage: add <file.md> [--engine devin|claude|codex] [--cwd PATH] [--yolo|--no-yolo]"
  local file="" engine="" cwd="" yolo=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --engine | --cwd)
        [[ $# -ge 2 ]] || die "add: $1 needs a value. $usage_msg"
        if [[ "$1" == "--engine" ]]; then engine=$2; else cwd=$2; fi
        shift 2
        ;;
      --yolo)
        yolo=true
        shift
        ;;
      --no-yolo)
        yolo=false
        shift
        ;;
      *)
        file=$1
        shift
        ;;
    esac
  done
  [[ -n "$file" && -f "$file" ]] || die "$usage_msg"

  local base stamp dest
  base=$(basename "$file")
  stamp=$(date +%Y%m%d-%H%M%S)
  dest="$INBOX/${stamp}__${base}"

  if [[ "$(head -1 "$file")" != "---" ]]; then
    {
      echo "---"
      if [[ -n "$engine$cwd$yolo" ]]; then
        echo "engine: ${engine:-$DEFAULT_ENGINE}"
        echo "cwd: ${cwd:-$PWD}"
        echo "yolo: ${yolo:-true}"
      else
        echo "cwd: $PWD"
      fi
      echo "---"
      echo
      cat "$file"
    } > "$dest"
  else
    if [[ -n "$engine$cwd$yolo" ]]; then
      err "add: $base already has frontmatter — command-line options ignored"
    fi
    cp "$file" "$dest"
  fi
  log "queued $C_BOLD$(basename "$dest")$C_RESET (engine=$(fm_get "$dest" engine "$DEFAULT_ENGINE"), cwd=$(fm_get "$dest" cwd "$PWD"))"
}

# cmd_run [--max N] [--engine E] [--once|--drain]
#
# The run loop ("daemon"). Repeatedly launches inbox jobs, oldest filename
# first, while fewer than MAX_CONCURRENCY jobs are active, skipping any job
# whose lock key is held by an active worker. With --once/--drain it exits once
# the inbox is empty and no worker is active; otherwise it runs until INT/TERM.
#
# Only one run loop may own a queue: its pid is kept in $STATE/daemon.pid and
# its start time in $STATE/daemon.pidstart, checked through _pid_alive so a
# stale pid file whose number was recycled by another process does not block
# startup. A missing daemon.pidstart falls back to plain liveness.
cmd_run() {
  ensure_dirs
  local once=false
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --max | --engine)
        # Explicit check: under `set -u` a bare `$2` would abort with an
        # unbound-variable error instead of a usable message.
        [[ $# -ge 2 ]] || die "run: $1 needs a value"
        if [[ "$1" == "--max" ]]; then MAX_CONCURRENCY=$2; else DEFAULT_ENGINE=$2; fi
        shift 2
        ;;
      --once | --drain)
        once=true
        shift
        ;;
      *) die "run: unknown arg '$1'" ;;
    esac
  done
  # A non-numeric limit would make every `-lt` test below fail, silently
  # launching nothing. 10# also keeps a zero-padded value from being read as octal.
  [[ "$MAX_CONCURRENCY" =~ ^[0-9]+$ ]] \
    || die "run: max concurrency must be a non-negative integer (got '$MAX_CONCURRENCY')"
  MAX_CONCURRENCY=$((10#$MAX_CONCURRENCY))

  # Refuse to start a second run loop against the same queue — two daemons would
  # break the single-launcher invariant that per-cwd locking relies on.
  local dpid="" dstart=""
  [[ -f "$STATE/daemon.pid" ]] && dpid=$(cat "$STATE/daemon.pid" 2>/dev/null)
  [[ -f "$STATE/daemon.pidstart" ]] && dstart=$(cat "$STATE/daemon.pidstart" 2>/dev/null)
  if _pid_alive "$dpid" "$dstart"; then
    die "a run loop is already active (pid $dpid). Use 'stop' first, or a different AGENT_QUEUE_ROOT."
  fi
  [[ -n "$dpid" ]] && log "clearing stale daemon.pid ($dpid)"
  echo "$$" > "$STATE/daemon.pid"
  _pidstart "$$" > "$STATE/daemon.pidstart"
  trap 'rm -f "$STATE/daemon.pid" "$STATE/daemon.pidstart"; log "run loop stopped"; exit 0' INT TERM
  log "run loop started (max=$MAX_CONCURRENCY, default engine=$DEFAULT_ENGINE). Ctrl-C to stop."

  while true; do
    local running
    running=$(active_workers)
    # launch jobs while we have capacity and an eligible inbox file
    while [[ "$running" -lt "$MAX_CONCURRENCY" ]]; do
      # pick the oldest inbox file whose lock key is not currently busy, so two
      # jobs sharing a cwd (or `lock:` key) never run at once, regardless of --max.
      # Glob expansion is already sorted, and unlike parsing `ls` it is safe for
      # any filename.
      local busy
      busy=$(busy_keys)
      local next="" cand cand_key
      for cand in "$INBOX"/*.md; do
        [[ -e "$cand" ]] || continue
        cand_key=$(lock_key_for "$cand")
        if grep -qxF -- "$cand_key" <<< "$busy"; then continue; fi
        next=$cand
        break
      done
      [[ -z "$next" ]] && break

      local job
      job=$(basename "$next")
      job=${job%.md}
      local doing_file
      doing_file="$DOING/$(basename "$next")"
      # If the claim fails (file vanished, permissions) do not launch a worker
      # on a file that is not there; leave the launch loop and retry next poll.
      if ! mv "$next" "$doing_file"; then
        err "could not move $(basename "$next") to doing/ — will retry"
        break
      fi
      local w_eng w_cwd w_yolo w_key
      w_eng=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
      w_cwd=$(fm_get "$doing_file" cwd "$PWD")
      w_yolo=$(fm_get "$doing_file" yolo "true")
      w_key=$(lock_key_for "$doing_file")
      # write meta BEFORE launch (no pid yet), then append the worker pid from $!
      {
        echo "job=$job"
        echo "engine=$w_eng"
        echo "cwd=$w_cwd"
        echo "yolo=$w_yolo"
        echo "lock=$w_key"
        echo "started=$(date +%s)"
      } > "$STATE/$job.meta"
      run_worker "$doing_file" &
      {
        echo "pid=$!"
        echo "pidstart=$(_pidstart "$!")"
      } >> "$STATE/$job.meta"
      log "▶ launching $C_BOLD$job$C_RESET (engine=$w_eng, lock=$w_key)"
      sleep 1
      running=$(active_workers)
    done

    if $once; then
      local pending=false p
      for p in "$INBOX"/*.md; do
        if [[ -e "$p" ]]; then
          pending=true
          break
        fi
      done
      if [[ "$(active_workers)" -eq 0 ]] && ! $pending; then
        log "drain complete — inbox empty, no workers running"
        rm -f "$STATE/daemon.pid" "$STATE/daemon.pidstart"
        exit 0
      fi
    fi
    sleep "$POLL_SECONDS"
  done
}

# _count <dir> -> number of *.md entries directly inside <dir>.
# Counts glob matches instead of `ls | wc -l`, so a filename containing a
# newline is still counted once.
_count() {
  local n=0 f
  for f in "$1"/*.md; do
    # An unmatched glob stays literal; -L keeps dangling symlinks countable.
    if [[ -e "$f" || -L "$f" ]]; then
      n=$((n + 1))
    fi
  done
  echo "$n"
}

# cmd_status -> print the kanban counts plus one line per live worker.
#
# The "running N/MAX" figure comes from active_workers. (It previously called
# `live_workers`, which is not defined anywhere in this script, so the count
# was always blank and an error was printed.)
# A worker is listed when its meta has no `ended=` and _pid_alive confirms the
# recorded pid/pidstart. It is flagged stalled when its log file has not been
# modified for more than STALL_MIN minutes.
cmd_status() {
  ensure_dirs
  local ib dg dn fl
  ib=$(_count "$INBOX")
  dg=$(_count "$DOING")
  dn=$(_count "$DONE")
  fl=$(_count "$FAILED")
  local running
  running=$(active_workers)
  echo
  printf '%s AGENT QUEUE %s %s\n' "$C_BOLD" "$C_DIM$QUEUE_ROOT$C_RESET" ""
  printf ' %sinbox%s %-3s %sdoing%s %-3s %sdone%s %-3s %sfailed%s %-3s %srunning%s %s/%s\n\n' \
    "$C_BLUE" "$C_RESET" "$ib" "$C_YEL" "$C_RESET" "$dg" \
    "$C_GREEN" "$C_RESET" "$dn" "$C_RED" "$C_RESET" "$fl" \
    "$C_BOLD" "$C_RESET" "$running" "$MAX_CONCURRENCY"

  # running table
  local f
  local printed=false
  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    grep -q '^ended=' "$f" && continue
    local pid pidstart
    # head -1 mirrors _meta_active: only the first recorded value counts.
    pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
    pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
    _pid_alive "$pid" "$pidstart" || continue
    if ! $printed; then
      printf ' %sRUNNING%s\n' "$C_BOLD" "$C_RESET"
      printed=true
    fi
    local job eng start now el last lmt age stall=""
    job=$(grep '^job=' "$f" | head -1 | cut -d= -f2)
    eng=$(grep '^engine=' "$f" | head -1 | cut -d= -f2)
    start=$(grep '^started=' "$f" | head -1 | cut -d= -f2)
    now=$(date +%s)
    el=$(( now - ${start:-$now} ))
    last=$(tail -n 1 "$LOGS/$job.log" 2>/dev/null | cut -c1-60)
    # No log yet -> age 0, i.e. never reported as stalled.
    lmt=$(_mtime "$LOGS/$job.log")
    age=$(( now - ${lmt:-$now} ))
    [[ "$age" -gt $(( STALL_MIN * 60 )) ]] && stall="${C_RED}⚠ stalled${C_RESET} "
    printf ' %s%-26s%s %-7s %3dm%02ds pid %-6s %s%s%s%s\n' \
      "$C_BOLD" "$job" "$C_RESET" "$eng" $((el / 60)) $((el % 60)) "$pid" "$stall" "$C_DIM" "$last" "$C_RESET"
  done
  $printed || printf ' %sno workers running%s\n' "$C_DIM" "$C_RESET"
  echo
}

# cmd_watch [interval] -> redraw cmd_status every <interval> seconds (default 2)
# until interrupted.
cmd_watch() {
  local interval="${1:-2}"
  while true; do
    clear
    cmd_status
    sleep "$interval"
  done
}

# cmd_dash [args...] -> replace this process with the Node dashboard
# ($SCRIPT_DIR/dashboard.mjs), passing the queue root through AGENT_QUEUE_ROOT.
# Dies when node is not on PATH.
cmd_dash() {
  if ! command -v node > /dev/null 2>&1; then
    die "node not found — use 'watch' for the bash status view"
  fi
  AGENT_QUEUE_ROOT="$QUEUE_ROOT" exec node "$SCRIPT_DIR/dashboard.mjs" "$@"
}

# _descendants <pid> -> every descendant pid of <pid>, one per line (parents
# before their children). Prints nothing when pgrep is unavailable.
_descendants() {
  local child
  for child in $(pgrep -P "$1" 2>/dev/null); do
    echo "$child"
    _descendants "$child"
  done
}

# cmd_stop -> stop the run loop and every live worker.
#
# Order matters: the run loop is signalled FIRST so that it cannot notice a
# slot freed by a killed worker and launch the next inbox job while we are
# still working through the list.
#
# For each live worker (no `ended=`, identity confirmed by _pid_alive) the
# worker shell is killed together with its descendants. The descendants are
# collected before anything is killed — once the worker shell is gone its
# children are re-parented and can no longer be found through it, which would
# leave the agent itself running after `stop`.
# Job files and meta are left as they are (the job stays in doing/).
cmd_stop() {
  ensure_dirs
  local killed=0 f pid pidstart dpid="" kids k

  [[ -f "$STATE/daemon.pid" ]] && dpid=$(head -1 "$STATE/daemon.pid" 2>/dev/null)
  # Only ever signal a plain positive pid (an empty or corrupt file is ignored).
  if [[ "$dpid" =~ ^[1-9][0-9]*$ ]]; then
    kill "$dpid" 2>/dev/null
  fi
  rm -f "$STATE/daemon.pid"

  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    grep -q '^ended=' "$f" && continue
    pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
    pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
    _pid_alive "$pid" "$pidstart" || continue
    kids=$(_descendants "$pid")
    if kill "$pid" 2>/dev/null; then
      killed=$((killed + 1))
    fi
    for k in $kids; do
      kill "$k" 2>/dev/null
    done
  done
  log "stopped $killed running worker(s) + run loop"
}

# cmd_logs <job> [-f]   (the -f may also come first: `logs -f <job>`)
#
# Prints, or with -f follows, $LOGS/<job>.log. When no log has exactly that
# name, the most recently modified log whose filename contains <job> is used,
# so a fragment of the job name is enough. Dies when nothing matches.
cmd_logs() {
  local job="${1:-}" follow=""
  [[ "${2:-}" == "-f" || "$job" == "-f" ]] && follow="-f"
  [[ "$job" == "-f" ]] && job="${2:-}"
  [[ -n "$job" ]] || die "usage: logs <job> [-f]"

  local lf="$LOGS/$job.log"
  if [[ ! -f "$lf" ]]; then
    # Newest-first listing; head -1 keeps the most recent match.
    lf=$(ls -1t "$LOGS"/*"$job"*.log 2>/dev/null | head -1)
  fi
  [[ -f "$lf" ]] || die "no log found for '$job'"

  if [[ -n "$follow" ]]; then
    tail -f "$lf"
  else
    cat "$lf"
  fi
}

# usage -> print the help text (commands, frontmatter keys, environment).
# The heredoc is unquoted on purpose: color variables and the current
# configuration values are expanded into the text.
usage() {
  cat <<EOF
${C_BOLD}agent-queue${C_RESET} — folder kanban runner for devin | claude | codex

${C_BOLD}USAGE${C_RESET}
  agent-queue.sh <command> [args]

${C_BOLD}COMMANDS${C_RESET}
  init                       create the queue/ folders
  add <file.md> [opts]       queue a prompt file into inbox/
                             --engine devin|claude|codex --cwd PATH --yolo | --no-yolo
  run [--max N] [--engine E] [--once]
                             process inbox/ (foreground loop; Ctrl-C to stop)
  status                     show kanban counts + running workers
  watch [interval]           live status (default 2s, bash)
  dash [--interval N]        richer live Node dashboard (recent done/failed too)
  stop                       kill running workers + the run loop
  logs <job> [-f]            print (or follow) a job's log
  help                       this message

${C_BOLD}KANBAN${C_RESET} inbox → doing → done / failed (logs/ + .state/ alongside)

${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md)
  ---
  engine: devin
  cwd: /Users/you/code/repo
  yolo: true
  lock: my-repo # optional; defaults to cwd. Jobs sharing a key run serially
  timeout: 45m # optional; 90s|45m|2h|1d. On expiry -> failed (result=timeout)
  ---

${C_BOLD}ENV${C_RESET}
  AGENT_QUEUE_ROOT (=$QUEUE_ROOT) AGENT_QUEUE_MAX (=$MAX_CONCURRENCY)
  AGENT_QUEUE_ENGINE (=$DEFAULT_ENGINE) DEVIN_BIN / CLAUDE_BIN / CODEX_BIN
EOF
}

# main [command [args...]] -> dispatch to the matching cmd_* function.
# No command means `help`. An unknown command prints an error and the usage
# text, then exits 1.
main() {
  local cmd="${1:-help}"
  shift || true   # nothing to shift when called without arguments
  case "$cmd" in
    init) cmd_init "$@" ;;
    add) cmd_add "$@" ;;
    run) cmd_run "$@" ;;
    status) cmd_status "$@" ;;
    watch) cmd_watch "$@" ;;
    dash | dashboard) cmd_dash "$@" ;;
    stop) cmd_stop "$@" ;;
    logs) cmd_logs "$@" ;;
    help | -h | --help) usage ;;
    *)
      err "unknown command: $cmd"
      echo
      usage
      exit 1
      ;;
  esac
}

# Entry point: hand the command line to the dispatcher.
main "$@"