bytelyst-devops-tools/agent-queue/agent-queue.sh
saravanakumardb1 f14e6c2336 feat(agent-queue): per-cwd locking so two agents never share a repo
Serialize jobs by lock key (frontmatter 'lock:' override, default cwd) via the
single run-loop's pre-launch eligibility check; the oldest non-busy job is picked
regardless of --max. Adds a flock-based worker guard where flock exists (Linux);
macOS relies on the single-daemon model. Records lock= in job meta.
2026-05-28 22:10:30 -07:00

453 lines
16 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# agent-queue — a folder-based "kanban" runner for headless coding-agent CLIs.
#
# Drop a prompt .md file into queue/inbox/, and `agent-queue run` will:
# 1. pick the oldest file (respecting --max concurrency),
# 2. move it inbox/ -> doing/,
# 3. launch the chosen agent CLI (devin | claude | codex) in --yolo mode,
# 4. on success move doing/ -> done/, on failure -> failed/,
# 5. write a per-job log + live state so `status`/`watch` can show progress.
#
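# Per-task config travels in YAML-ish frontmatter at the top of the .md:
# ---
# engine: devin # devin | claude | codex (default: $DEFAULT_ENGINE)
# cwd: /abs/path/repo # where the agent runs (default: the run loop's $PWD; `add` pins it when it injects frontmatter)
# yolo: true # auto-approve all tools (default: true)
# lock: my-repo # optional mutual-exclusion key (default: cwd); jobs sharing a key run serially
# ---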
#
# Subcommands: init | add | run | status | watch | dash | stop | logs | help
#
# Unset variables are errors and a pipeline fails if any stage fails.
# `-e` is not enabled; failures are handled explicitly where they matter.
set -uo pipefail
# ── Resolve paths ───────────────────────────────────────────────────
# The queue lives next to this script unless AGENT_QUEUE_ROOT points elsewhere.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
QUEUE_ROOT=${AGENT_QUEUE_ROOT:-$SCRIPT_DIR/queue}
INBOX=$QUEUE_ROOT/inbox
DOING=$QUEUE_ROOT/doing
DONE=$QUEUE_ROOT/done
FAILED=$QUEUE_ROOT/failed
LOGS=$QUEUE_ROOT/logs
STATE=$QUEUE_ROOT/.state
LOCKS=$QUEUE_ROOT/locks
# ── Config (env-overridable) ────────────────────────────────────────
MAX_CONCURRENCY=${AGENT_QUEUE_MAX:-2}
DEFAULT_ENGINE=${AGENT_QUEUE_ENGINE:-devin}
POLL_SECONDS=${AGENT_QUEUE_POLL:-3}
# flock hardens locking across processes where it is installed (Linux). macOS
# ships no flock, so FLOCK_BIN ends up empty there and mutual exclusion rests on
# the single run-loop (see cmd_run).
: "${FLOCK_BIN:=$(command -v flock || true)}"
# Engine binaries: honour an explicit override, else whatever is on PATH, else
# a fallback name/path.
: "${DEVIN_BIN:=$(command -v devin || echo "$HOME/.local/bin/devin")}"
: "${CLAUDE_BIN:=$(command -v claude || echo claude)}"
: "${CODEX_BIN:=$(command -v codex || echo codex)}"
# ── Colors ──────────────────────────────────────────────────────────
# ANSI escape sequences used by the log/status output. They start out empty and
# are only filled in when stdout is a terminal, so piped output stays plain.
C_RESET=""; C_DIM=""; C_BOLD=""; C_BLUE=""; C_GREEN=""; C_RED=""; C_YEL=""; C_CYAN=""
if [[ -t 1 ]]; then
  C_RESET=$'\033[0m'
  C_DIM=$'\033[2m'
  C_BOLD=$'\033[1m'
  C_BLUE=$'\033[34m'
  C_GREEN=$'\033[32m'
  C_RED=$'\033[31m'
  C_YEL=$'\033[33m'
  C_CYAN=$'\033[36m'
fi
# log <words...> -> one "[agent-queue] ..." line on stdout (cyan tag).
log() {
  local msg="$*"
  printf '%s[agent-queue]%s %s\n' "$C_CYAN" "$C_RESET" "$msg"
}
# err <words...> -> one "[agent-queue] ..." line on stderr (red tag).
err() {
  local msg="$*"
  printf '%s[agent-queue]%s %s\n' "$C_RED" "$C_RESET" "$msg" 1>&2
}
# die <words...> -> report via err, then exit the current shell with status 1.
die() {
  err "$@"
  exit 1
}
# ── Init ────────────────────────────────────────────────────────────
# ensure_dirs -> create every queue directory that does not exist yet.
# Safe to call repeatedly (mkdir -p).
ensure_dirs() {
  local -a wanted=( "$INBOX" "$DOING" "$DONE" "$FAILED" "$LOGS" "$STATE" "$LOCKS" )
  mkdir -p "${wanted[@]}"
}
# ── Frontmatter parsing ─────────────────────────────────────────────
# fm_get <file> <key> <default>
#   Print the value of <key> from the leading `---` ... `---` block of <file>,
#   or <default> (empty when omitted) if the file has no such block, the key is
#   absent, or its value is empty. Nothing after the closing `---` is scanned.
#   No trailing newline is printed.
#
#   Value rules:
#     * a value wrapped in matching single or double quotes is taken verbatim
#       between the quotes (so a quoted path may contain " #");
#     * otherwise a trailing comment (whitespace followed by `#`, or a value
#       that is only `#...`) is dropped, so the documented form
#       `lock: my-repo # optional` yields `my-repo`;
#     * surrounding quotes and whitespace are trimmed.
fm_get() {
  local file=$1 key=$2 def=${3:-}
  local val
  # only scan a leading --- ... --- block
  val=$(awk -v k="$key" '
    NR==1 && $0!="---" { exit }
    NR==1 { infm=1; next }
    infm && $0=="---" { exit }
    infm {
      line=$0
      sub(/^[ \t]*/,"",line)
      if (line ~ "^" k "[ \t]*:") {
        sub("^" k "[ \t]*:[ \t]*","",line)
        # quoted value: keep exactly what sits between the matching quotes
        q=substr(line,1,1)
        closing=0
        if (q=="\"" || q=="'\''") closing=index(substr(line,2),q)
        if (closing>0) {
          line=substr(line,2,closing-1)
        } else {
          # bare value: strip an inline comment
          sub(/[ \t]+#.*$/,"",line)
          if (line ~ /^#/) line=""
        }
        gsub(/^["'\''[:space:]]+|["'\''[:space:]]+$/,"",line)
        print line; exit
      }
    }' "$file" 2>/dev/null)
  if [[ -n "$val" ]]; then
    printf '%s' "$val"
  else
    printf '%s' "$def"
  fi
}
# strip_frontmatter <file>
#   Print the body of <file>: everything after a leading `---` ... `---` block.
#   A file whose first line is not `---` is printed unchanged; later `---` lines
#   in the body are kept. If the block is never closed, nothing is printed.
strip_frontmatter() {
  awk '
    NR == 1 && $0 == "---" { in_header = 1; next }
    in_header {
      if ($0 == "---") in_header = 0
      next
    }
    { print }
  ' "$1"
}
# lock_key_for <file>
#   Print the mutual-exclusion key for a job: the frontmatter `lock:` value if
#   set (used verbatim), otherwise the job's cwd (frontmatter `cwd:`, falling
#   back to $PWD). Jobs sharing a key never run concurrently.
#
#   A cwd-derived key has trailing slashes removed so that `/repo` and `/repo/`
#   name the same lock; the root directory `/` is left as is.
lock_key_for() {
  local f=$1 k
  k=$(fm_get "$f" lock "")
  if [[ -n "$k" ]]; then
    printf '%s' "$k"
    return
  fi
  k=$(fm_get "$f" cwd "$PWD")
  while [[ "$k" == */ && "$k" != "/" ]]; do
    k=${k%/}
  done
  printf '%s' "$k"
}
# _keyhash <key>
#   Print a stable, filename-safe token for a lock key: the checksum field that
#   `cksum` reports for the key's bytes.
_keyhash() {
  local crc="" _rest=""
  read -r crc _rest < <(printf '%s' "$1" | cksum)
  printf '%s\n' "$crc"
}
# busy_keys
#   Print, one per line, the lock keys held by active workers.
#   A job's meta file counts as active when it has no `ended=` line and either
#   its recorded pid is alive, or no pid has been recorded yet (the job was just
#   launched and its slot is reserved). The key is the first `lock=` line,
#   everything after the first `=`.
busy_keys() {
  local meta worker_pid
  for meta in "$STATE"/*.meta; do
    [[ -e "$meta" ]] || continue
    # Finished jobs hold nothing.
    if grep -q '^ended=' "$meta"; then
      continue
    fi
    worker_pid=$(awk 'index($0, "pid=") == 1 { split($0, parts, "="); print parts[2]; exit }' "$meta")
    # A recorded pid that is no longer alive means the worker is gone.
    if [[ -n "$worker_pid" ]] && ! kill -0 "$worker_pid" 2>/dev/null; then
      continue
    fi
    awk 'index($0, "lock=") == 1 { print substr($0, 6); exit }' "$meta"
  done
}
# ── Engine driver ───────────────────────────────────────────────────
# build_agent_cmd <engine> <prompt-file> <yolo>
#   Build the argv for the chosen engine into the global array AGENT_CMD and set
#   the global AGENT_STDIN to the prompt file when the prompt should be fed on
#   stdin (claude/codex) rather than passed by flag (devin). Both globals are
#   reset first. <yolo> equal to "true" adds the engine's auto-approve flag.
#   <prompt-file> is the frontmatter-STRIPPED body file, so a body starting with
#   '--' is never misparsed as a CLI option.
#   Returns 0 for every known engine; an unknown engine is fatal (die).
build_agent_cmd() {
  local engine=$1 pf=$2 yolo=$3
  AGENT_CMD=(); AGENT_STDIN=""
  case "$engine" in
    devin)
      AGENT_CMD=( "$DEVIN_BIN" -p --prompt-file "$pf" )
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=( --permission-mode dangerous )
      fi
      ;;
    claude)
      AGENT_CMD=( "$CLAUDE_BIN" -p )
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=( --dangerously-skip-permissions )
      fi
      AGENT_STDIN="$pf"
      ;;
    codex)
      AGENT_CMD=( "$CODEX_BIN" exec )
      if [[ "$yolo" == "true" ]]; then
        AGENT_CMD+=( --dangerously-bypass-approvals-and-sandbox )
      fi
      AGENT_STDIN="$pf"
      ;;
    *) die "unknown engine '$engine' (use: devin | claude | codex)" ;;
  esac
  # Explicit success: a skipped yolo flag must not leak a non-zero status.
  return 0
}
# ── Worker: runs one job to completion (invoked in background) ───────
# run_worker <doing-file>
#   Reads engine/cwd/yolo from the job's frontmatter, appends a header to the
#   job log ($LOGS/<job>.log), runs the agent inside cwd with all output
#   appended to that log, then moves the job file to done/ (exit 0) or failed/
#   (anything else) and appends ended/exit/result lines to $STATE/<job>.meta.
#
#   A job is also failed, without running an agent, when its cwd does not exist
#   or no command can be built for its engine. When flock is available and the
#   job's lock is already held, the job is moved back to inbox/ instead
#   (result=requeued) and the function returns 0.
run_worker() {
  local doing_file=$1
  local job; job=$(basename "$doing_file")
  job=${job%.md}
  local engine cwd yolo logf metaf
  engine=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
  cwd=$(fm_get "$doing_file" cwd "$PWD")
  yolo=$(fm_get "$doing_file" yolo "true")
  logf="$LOGS/$job.log"
  metaf="$STATE/$job.meta"
  # NOTE: the parent (cmd_run) creates $metaf with job/engine/cwd/started/pid.
  # The worker only ever APPENDS (ended/exit/result) to avoid a truncation race.
  {
    echo "===== agent-queue job: $job ====="
    echo "engine=$engine cwd=$cwd yolo=$yolo"
    echo "started: $(date)"
    echo "================================="
  } >> "$logf"
  if [[ ! -d "$cwd" ]]; then
    echo "FATAL: cwd does not exist: $cwd" >> "$logf"
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"; echo "ended=$(date +%s)" >> "$metaf"
    return 1
  fi
  # Strip our frontmatter so the agent only sees the task body.
  local bodyf="$STATE/$job.body.md"
  strip_frontmatter "$doing_file" > "$bodyf"
  # build_agent_cmd exits the shell (die) on an unknown engine. Probe it in a
  # subshell first so that case becomes an ordinary job failure instead of
  # killing this worker and leaving the job stranded in doing/ with no `ended=`.
  # The trailing `exit 0` makes the probe depend only on whether die fired.
  if ! ( build_agent_cmd "$engine" "$bodyf" "$yolo"; exit 0 ) >> "$logf" 2>&1; then
    echo "FATAL: could not build agent command (engine=$engine)" >> "$logf"
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"; echo "ended=$(date +%s)" >> "$metaf"
    return 1
  fi
  build_agent_cmd "$engine" "$bodyf" "$yolo"
  # Runs the agent inside cwd; the subshell keeps the cd local.
  _run_agent() {
    if [[ -n "$AGENT_STDIN" ]]; then
      ( cd "$cwd" && "${AGENT_CMD[@]}" < "$AGENT_STDIN" )
    else
      ( cd "$cwd" && "${AGENT_CMD[@]}" )
    fi
  }
  # rc defaults to failure so a lock file that cannot be opened fails the job.
  local rc=1 lockkey lock_busy=false
  lockkey=$(lock_key_for "$doing_file")
  if [[ -n "$FLOCK_BIN" ]]; then
    # Cross-process hardening where flock exists (Linux CI). The single run-loop
    # already serializes by lock key; this guards against a stray second launcher.
    local lf
    lf="$LOCKS/$(_keyhash "$lockkey").lock"
    # "Lock busy" is tracked in a variable rather than a magic exit status, so
    # an agent that happens to exit 75 is reported as failed, not requeued.
    # fd 9 (and with it the lock) is released when the group ends.
    {
      if "$FLOCK_BIN" -n 9; then
        _run_agent
        rc=$?
      else
        lock_busy=true
      fi
    } 9>"$lf" >> "$logf" 2>&1
    if $lock_busy; then
      echo "lock busy (key=$lockkey) — requeued to inbox" >> "$logf"
      mv "$doing_file" "$INBOX/" 2>/dev/null
      { echo "ended=$(date +%s)"; echo "result=requeued"; } >> "$metaf"
      return 0
    fi
  else
    _run_agent >> "$logf" 2>&1
    rc=$?
  fi
  echo "ended=$(date +%s)" >> "$metaf"
  echo "exit=$rc" >> "$metaf"
  if [[ $rc -eq 0 ]]; then
    mv "$doing_file" "$DONE/" 2>/dev/null
    echo "result=done" >> "$metaf"
    echo "completed OK (rc=0): $(date)" >> "$logf"
  else
    mv "$doing_file" "$FAILED/" 2>/dev/null
    echo "result=failed" >> "$metaf"
    echo "FAILED (rc=$rc): $(date)" >> "$logf"
  fi
}
# live_workers
#   Print how many workers are running: meta files with no `ended=` line whose
#   recorded pid (first `pid=` line) answers `kill -0`. Jobs with no pid
#   recorded yet are not counted.
live_workers() {
  local meta worker_pid count=0
  for meta in "$STATE"/*.meta; do
    [[ -e "$meta" ]] || continue
    if grep -q '^ended=' "$meta"; then
      continue
    fi
    worker_pid=$(awk 'index($0, "pid=") == 1 { split($0, parts, "="); print parts[2]; exit }' "$meta")
    if [[ -z "$worker_pid" ]]; then
      continue
    fi
    if kill -0 "$worker_pid" 2>/dev/null; then
      count=$((count + 1))
    fi
  done
  printf '%s\n' "$count"
}
# ── Commands ────────────────────────────────────────────────────────
# init: create the queue directories and report where the queue lives.
cmd_init() {
  ensure_dirs
  log "queue initialized at ${C_BOLD}${QUEUE_ROOT}${C_RESET}"
}
# add <file.md> [--engine devin|claude|codex] [--cwd PATH] [--yolo|--no-yolo]
#   Copy a prompt file into inbox/ as "<YYYYmmdd-HHMMSS>__<basename>".
#   When at least one flag is given and the file has no frontmatter (first line
#   is not `---`), a frontmatter block is prepended with engine/cwd/yolo, using
#   $DEFAULT_ENGINE, the current $PWD and "true" for anything not specified.
#   When the file already has frontmatter it is copied verbatim; any flags are
#   ignored and a warning says so.
#   Fatal (die) on a missing/non-regular file or an option without its value.
cmd_add() {
  ensure_dirs
  local file="" engine="" cwd="" yolo=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --engine|--cwd)
        # An option given as the last word has no value: report it instead of
        # tripping `set -u` on an unbound $2.
        [[ $# -ge 2 ]] || die "add: $1 requires a value"
        if [[ "$1" == "--engine" ]]; then engine=$2; else cwd=$2; fi
        shift 2
        ;;
      --yolo)    yolo=true;  shift ;;
      --no-yolo) yolo=false; shift ;;
      *)         file=$1;    shift ;;
    esac
  done
  [[ -n "$file" && -f "$file" ]] || die "usage: add <file.md> [--engine devin|claude|codex] [--cwd PATH] [--yolo|--no-yolo]"
  local base stamp dest
  base=$(basename "$file")
  stamp=$(date +%Y%m%d-%H%M%S)
  dest="$INBOX/${stamp}__${base}"
  # If user passed flags AND the file has no frontmatter, inject one.
  if [[ -n "$engine$cwd$yolo" ]] && [[ "$(head -1 "$file")" != "---" ]]; then
    {
      echo "---"
      echo "engine: ${engine:-$DEFAULT_ENGINE}"
      echo "cwd: ${cwd:-$PWD}"
      echo "yolo: ${yolo:-true}"
      echo "---"
      echo
      cat "$file"
    } > "$dest"
  else
    # Existing frontmatter wins; tell the user their flags had no effect.
    if [[ -n "$engine$cwd$yolo" ]]; then
      err "add: $base already has frontmatter — --engine/--cwd/--yolo flags ignored"
    fi
    cp "$file" "$dest"
  fi
  log "queued $C_BOLD$(basename "$dest")$C_RESET (engine=$(fm_get "$dest" engine "$DEFAULT_ENGINE"), cwd=$(fm_get "$dest" cwd "$PWD"))"
}
# run [--max N] [--engine E] [--once|--drain]
#   Foreground scheduler loop. Each pass launches background workers while
#   fewer than MAX_CONCURRENCY are alive, always picking the first inbox file
#   (in glob order, i.e. oldest timestamp prefix) whose lock key is not held by
#   an active worker; then sleeps POLL_SECONDS. With --once/--drain it exits 0
#   once the inbox is empty and no worker is alive.
#
#   The loop records its pid in $STATE/daemon.pid and removes it on INT/TERM or
#   drain. Per-key serialization depends on there being ONE such loop, so a
#   second `run` is refused while the recorded pid is still alive.
#   Fatal (die) on unknown arguments, an option without a value, or a
#   non-numeric concurrency limit.
cmd_run() {
  ensure_dirs
  local once=false
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --max|--engine)
        # Report a missing value instead of tripping `set -u` on $2.
        [[ $# -ge 2 ]] || die "run: $1 requires a value"
        if [[ "$1" == "--max" ]]; then MAX_CONCURRENCY=$2; else DEFAULT_ENGINE=$2; fi
        shift 2
        ;;
      --once|--drain) once=true; shift ;;
      *) die "run: unknown arg '$1'" ;;
    esac
  done
  # A non-numeric limit would be evaluated as an arithmetic expression by the
  # -lt test below (typically 0), silently launching nothing.
  [[ "$MAX_CONCURRENCY" =~ ^[0-9]+$ ]] \
    || die "run: max concurrency must be a non-negative integer (got '$MAX_CONCURRENCY')"
  MAX_CONCURRENCY=$((10#$MAX_CONCURRENCY))   # force base 10 ("08" is not octal)
  # Refuse to start next to a live run loop.
  local prev_pid=""
  if [[ -f "$STATE/daemon.pid" ]]; then
    prev_pid=$(cat "$STATE/daemon.pid" 2>/dev/null)
  fi
  if [[ "$prev_pid" =~ ^[0-9]+$ && "$prev_pid" != "$$" ]] && kill -0 "$prev_pid" 2>/dev/null; then
    die "run loop already active (pid $prev_pid) — use 'stop' first, or delete $STATE/daemon.pid if it is stale"
  fi
  echo "$$" > "$STATE/daemon.pid"
  trap 'rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
  log "run loop started (max=$MAX_CONCURRENCY, default engine=$DEFAULT_ENGINE). Ctrl-C to stop."
  local running busy next cand cand_key job doing_file pending
  local w_eng w_cwd w_yolo w_key
  while true; do
    running=$(live_workers)
    # launch jobs while we have capacity and an eligible inbox file
    while [[ "$running" -lt "$MAX_CONCURRENCY" ]]; do
      # pick the oldest inbox file whose lock key is not currently busy, so two
      # jobs sharing a cwd (or `lock:` key) never run at once, regardless of --max.
      busy=$(busy_keys)
      next=""
      for cand in "$INBOX"/*.md; do
        [[ -e "$cand" ]] || continue
        cand_key=$(lock_key_for "$cand")
        # Here-string instead of a pipe: with pipefail, `printf | grep -q` can
        # report failure on a match if grep exits before printf finishes.
        if grep -qxF -- "$cand_key" <<<"$busy"; then continue; fi
        next=$cand
        break
      done
      [[ -z "$next" ]] && break
      job=$(basename "$next"); job=${job%.md}
      doing_file="$DOING/$(basename "$next")"
      # Claim the job. If the move fails, do not launch a worker on a file we
      # never got; leave the launch loop and try again after the poll sleep.
      if ! mv "$next" "$doing_file"; then
        err "could not move $(basename "$next") to doing/ — will retry"
        break
      fi
      w_eng=$(fm_get "$doing_file" engine "$DEFAULT_ENGINE")
      w_cwd=$(fm_get "$doing_file" cwd "$PWD")
      w_yolo=$(fm_get "$doing_file" yolo "true")
      w_key=$(lock_key_for "$doing_file")
      # write meta BEFORE launch (no pid yet), then append the worker pid from $!
      {
        echo "job=$job"
        echo "engine=$w_eng"
        echo "cwd=$w_cwd"
        echo "yolo=$w_yolo"
        echo "lock=$w_key"
        echo "started=$(date +%s)"
      } > "$STATE/$job.meta"
      run_worker "$doing_file" &
      echo "pid=$!" >> "$STATE/$job.meta"
      log "▶ launching $C_BOLD$job$C_RESET (engine=$w_eng, lock=$w_key)"
      sleep 1
      running=$(live_workers)
    done
    if $once; then
      pending=false
      for cand in "$INBOX"/*.md; do
        if [[ -e "$cand" ]]; then pending=true; break; fi
      done
      if [[ "$(live_workers)" -eq 0 ]] && ! $pending; then
        log "drain complete — inbox empty, no workers running"
        rm -f "$STATE/daemon.pid"
        exit 0
      fi
    fi
    sleep "$POLL_SECONDS"
  done
}
# _count <dir> -> print the number of *.md entries directly inside <dir>.
# Counts glob matches instead of parsing `ls`, so a file name containing a
# newline counts once and a directory named *.md is not expanded.
_count() {
  local entry n=0
  for entry in "$1"/*.md; do
    # An unmatched glob stays literal; skip it (dangling symlinks still count).
    if [[ -e "$entry" || -L "$entry" ]]; then
      n=$((n + 1))
    fi
  done
  printf '%s\n' "$n"
}
# status: print the kanban counters (inbox/doing/done/failed, running/max) and
# one row per live worker: job, engine, elapsed time since `started=`, pid and
# the first 60 characters of the last line of the job log.
cmd_status() {
  ensure_dirs
  local n_inbox n_doing n_done n_failed n_live
  n_inbox=$(_count "$INBOX")
  n_doing=$(_count "$DOING")
  n_done=$(_count "$DONE")
  n_failed=$(_count "$FAILED")
  n_live=$(live_workers)
  echo
  printf '%s AGENT QUEUE %s %s\n' "$C_BOLD" "$C_DIM$QUEUE_ROOT$C_RESET" ""
  printf ' %sinbox%s %-3s %sdoing%s %-3s %sdone%s %-3s %sfailed%s %-3s %srunning%s %s/%s\n\n' \
    "$C_BLUE" "$C_RESET" "$n_inbox" \
    "$C_YEL" "$C_RESET" "$n_doing" \
    "$C_GREEN" "$C_RESET" "$n_done" \
    "$C_RED" "$C_RESET" "$n_failed" \
    "$C_BOLD" "$C_RESET" "$n_live" "$MAX_CONCURRENCY"
  # running table
  local meta wpid wjob weng wstart tnow elapsed tail_line
  local any_running=false
  for meta in "$STATE"/*.meta; do
    [[ -e "$meta" ]] || continue
    if grep -q '^ended=' "$meta"; then
      continue
    fi
    wpid=$(grep '^pid=' "$meta" | cut -d= -f2)
    # Only rows whose recorded pid is still alive are shown.
    if [[ -z "$wpid" ]] || ! kill -0 "$wpid" 2>/dev/null; then
      continue
    fi
    if [[ "$any_running" == false ]]; then
      printf ' %sRUNNING%s\n' "$C_BOLD" "$C_RESET"
      any_running=true
    fi
    wjob=$(grep '^job=' "$meta" | cut -d= -f2)
    weng=$(grep '^engine=' "$meta" | cut -d= -f2)
    wstart=$(grep '^started=' "$meta" | cut -d= -f2)
    tnow=$(date +%s)
    # A missing start time counts as "just now" (elapsed 0).
    elapsed=$(( tnow - ${wstart:-$tnow} ))
    tail_line=$(tail -n 1 "$LOGS/$wjob.log" 2>/dev/null | cut -c1-60)
    printf ' %s%-26s%s %-7s %3dm%02ds pid %-6s %s%s%s\n' \
      "$C_BOLD" "$wjob" "$C_RESET" "$weng" $((elapsed/60)) $((elapsed%60)) "$wpid" "$C_DIM" "$tail_line" "$C_RESET"
  done
  if [[ "$any_running" == false ]]; then
    printf ' %sno workers running%s\n' "$C_DIM" "$C_RESET"
  fi
  echo
}
# watch [interval]: clear the screen and redraw `status` every <interval>
# seconds (default 2) until interrupted.
cmd_watch() {
  local interval="${1:-2}"
  until false; do
    clear
    cmd_status
    sleep "$interval"
  done
}
# dash [args...]: replace this process with the Node dashboard that sits next
# to the script (dashboard.mjs), passing the queue root through the environment
# and forwarding all arguments. Fatal (die) when node is not on PATH.
cmd_dash() {
  if ! command -v node >/dev/null 2>&1; then
    die "node not found — use 'watch' for the bash status view"
  fi
  export AGENT_QUEUE_ROOT="$QUEUE_ROOT"
  exec node "$SCRIPT_DIR/dashboard.mjs" "$@"
}
# stop: signal the run loop recorded in $STATE/daemon.pid, then every worker
# whose meta file has no `ended=` line and whose pid is alive; finally report
# how many workers were signalled. The pid file is always removed.
#
# The run loop is stopped FIRST so it cannot notice the freed capacity and
# launch a new worker while the old ones are being killed.
cmd_stop() {
  ensure_dirs
  local killed=0 f pid daemon_pid=""
  if [[ -f "$STATE/daemon.pid" ]]; then
    daemon_pid=$(cat "$STATE/daemon.pid" 2>/dev/null)
    if [[ -n "$daemon_pid" ]]; then
      # Best effort: the recorded pid may already be gone.
      kill "$daemon_pid" 2>/dev/null
    fi
  fi
  rm -f "$STATE/daemon.pid"
  for f in "$STATE"/*.meta; do
    [[ -e "$f" ]] || continue
    grep -q '^ended=' "$f" && continue
    pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
    if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null && kill "$pid" 2>/dev/null; then
      killed=$((killed + 1))
    fi
  done
  log "stopped $killed running worker(s) + run loop"
}
# logs <job> [-f]   (the -f may also come first)
#   Print a job's log, or follow it with `tail -f`. <job> is first tried as an
#   exact name ($LOGS/<job>.log); otherwise the most recently modified log whose
#   file name contains <job> is used. Fatal (die) when no job is given or no log
#   matches.
cmd_logs() {
  local job follow=""
  if [[ "${1:-}" == "-f" ]]; then
    follow="-f"
    job="${2:-}"
  else
    job="${1:-}"
    if [[ "${2:-}" == "-f" ]]; then
      follow="-f"
    fi
  fi
  if [[ -z "$job" ]]; then
    die "usage: logs <job> [-f]"
  fi
  local lf="$LOGS/$job.log"
  if [[ ! -f "$lf" ]]; then
    # No exact match: fall back to the newest log containing the given text.
    lf=$(ls -1t "$LOGS"/*"$job"*.log 2>/dev/null | head -1)
    [[ -f "$lf" ]] || die "no log found for '$job'"
  fi
  case "$follow" in
    -f) tail -f "$lf" ;;
    *)  cat "$lf" ;;
  esac
}
# usage: print the help text (commands, kanban flow, task frontmatter and the
# environment variables the script reads, with their current values) to stdout.
usage() {
cat <<EOF
${C_BOLD}agent-queue${C_RESET} — folder kanban runner for devin | claude | codex
${C_BOLD}USAGE${C_RESET}
agent-queue.sh <command> [args]
${C_BOLD}COMMANDS${C_RESET}
init create the queue/ folders
add <file.md> [opts] queue a prompt file into inbox/
--engine devin|claude|codex --cwd PATH --yolo | --no-yolo
run [--max N] [--engine E] [--once]
process inbox/ (foreground loop; Ctrl-C to stop)
status show kanban counts + running workers
watch [interval] live status (default 2s, bash)
dash [--interval N] richer live Node dashboard (recent done/failed too)
stop kill running workers + the run loop
logs <job> [-f] print (or follow) a job's log
help this message
${C_BOLD}KANBAN${C_RESET} inbox → doing → done / failed (logs/ + .state/ alongside)
${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md)
---
engine: devin
cwd: /Users/you/code/repo
yolo: true
lock: my-repo # optional; defaults to cwd. Jobs sharing a key run serially
---
${C_BOLD}ENV${C_RESET}
AGENT_QUEUE_ROOT (=$QUEUE_ROOT) AGENT_QUEUE_MAX (=$MAX_CONCURRENCY)
AGENT_QUEUE_ENGINE (=$DEFAULT_ENGINE) DEVIN_BIN / CLAUDE_BIN / CODEX_BIN
AGENT_QUEUE_POLL (=$POLL_SECONDS) FLOCK_BIN
EOF
}
# main [command [args...]]
#   Dispatch to the matching cmd_* handler, forwarding the remaining arguments.
#   No command (or help/-h/--help) prints the usage text; an unknown command
#   reports the error, prints the usage text and exits 1.
main() {
  local cmd="${1:-help}"
  # With no arguments there is nothing to shift; ignore that failure.
  shift || true
  case "$cmd" in
    help|-h|--help)
      usage
      ;;
    dash|dashboard)
      cmd_dash "$@"
      ;;
    init|add|run|status|watch|stop|logs)
      "cmd_$cmd" "$@"
      ;;
    *)
      err "unknown command: $cmd"
      echo
      usage
      exit 1
      ;;
  esac
}
main "$@"