feat(agent-queue): single-host crash recovery, WIP checkpoint/resume, retry + insights (P1-S3)
Implements the single-host bash equivalents of roadmap §25 (durability/crash
recovery) and §26 (execution insights), plus §11 retry/dead-letter stand-in.
Resilience (A1-A4):
- recover_orphans + `recover` command: building/ jobs with a dead worker (dead
pid, pidstart reuse-guard) are moved back to inbox/ with attempts incremented,
on `run` startup and each loop. Idempotent (folder location is the guard).
- WIP checkpointing: for a git cwd, _wip_start creates/checks out aq/wip/<job>
and _wip_checkpoint commits changes on every exit path via an EXIT/INT/TERM
trap; never commits to main/current branch; non-git cwd skipped. RESUME: a
relaunch whose aq/wip/<job> exists checks it out first (continue from
checkpoint). wip_base persisted in a write-once sidecar.
- retry policy (now functional): retry { max, backoff, on } requeues failures
whose class (timeout|verify_failed|crash) is in `on`, honoring backoff via
next_eligible (selection skips until eligible), up to max attempts; exhaustion
-> failed/ result=retries_exhausted with the WIP branch + full log preserved.
- state integrity: all meta writes stay append-only; attempts/next_eligible/wip_*
are re-derivable; recovery is crash-safe.
Insights (B1-B6):
- per-run metrics into meta: duration_s, exit, result, attempts, and (git cwd)
files_changed/lines_added/lines_deleted from numstat wip_base..HEAD.
- parse_usage(engine, log) adapter: generic AQ_USAGE line + Claude/Codex token
heuristics; Devin/Copilot TODO; usage_estimated flag; never fabricates numbers.
- status insights sub-line; new `insights [job]` command (per-job metrics or a
recent table + per-engine token/cost/success/duration rollup).
- privacy: only metrics are recorded, never prompt content or secrets.
Backward-compatible: legacy .md and non-git cwd behave exactly as before.
This commit is contained in:
parent
bc0c0e263c
commit
1758bc1ab1
@ -469,11 +469,26 @@ run_worker() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
local started; started=$(grep '^started=' "$metaf" 2>/dev/null | tail -1 | cut -d= -f2)
|
||||
|
||||
# Strip our frontmatter so the agent only sees the task body.
|
||||
local bodyf="$STATE/$job.body.md"
|
||||
strip_frontmatter "$doing_file" > "$bodyf"
|
||||
build_agent_cmd "$engine" "$bodyf" "$yolo"
|
||||
|
||||
# ── WIP checkpoint setup (§25.2): on a git cwd, create/checkout aq/wip/<job>
|
||||
# so partial work survives a crash; a trap guarantees a checkpoint on EVERY
|
||||
# exit path (success, failure, timeout, SIGTERM/SIGINT). Non-git cwd: no-op. ──
|
||||
WIP_ACTIVE=0; WIP_BASE=""; WIP_DONE=0
|
||||
_worker_trap() {
  # Handler installed on EXIT and on INT/TERM. WIP_DONE makes it one-shot, so
  # a second invocation (e.g. EXIT firing after a signal trap, or after the
  # agent-exit checkpoint already set WIP_DONE=1) does nothing.
  [[ "$WIP_DONE" == 1 ]] && return
  WIP_DONE=1
  # job/cwd/metaf/logf are not parameters; they are resolved from the
  # caller's scope at the time the trap fires.
  [[ "$WIP_ACTIVE" == 1 ]] && _wip_checkpoint "$job" "$cwd" "$metaf" "$logf" "trap-exit"
}
|
||||
trap '_worker_trap' EXIT
|
||||
trap '_worker_trap; exit 143' INT TERM
|
||||
_wip_start "$job" "$cwd" "$metaf" "$logf" || true
|
||||
|
||||
_run_agent() {
|
||||
if [[ -n "$AGENT_STDIN" ]]; then
|
||||
( cd "$cwd" && "${AGENT_CMD[@]}" < "$AGENT_STDIN" )
|
||||
@ -525,23 +540,27 @@ run_worker() {
|
||||
fi
|
||||
rm -f "$tmo_flag"
|
||||
|
||||
echo "exit=$rc" >> "$metaf"
|
||||
# ── Preserve work + capture run metrics on EVERY path (§25.2/§26.1/§26.2) ──
|
||||
if [[ "$WIP_ACTIVE" == 1 ]]; then
|
||||
_wip_checkpoint "$job" "$cwd" "$metaf" "$logf" "agent-exit"; WIP_DONE=1
|
||||
fi
|
||||
_numstat_into_meta "$cwd" "$WIP_BASE" "$metaf"
|
||||
parse_usage "$engine" "$logf" >> "$metaf"
|
||||
|
||||
if $timed_out; then
|
||||
mv "$doing_file" "$FAILED/" 2>/dev/null
|
||||
echo "result=timeout" >> "$metaf"
|
||||
echo "ended=$(date +%s)" >> "$metaf"
|
||||
echo "TIMED OUT after ${tmo}s (rc=$rc): $(date)" >> "$logf"
|
||||
_finish_failure "$job" "$doing_file" "$metaf" "$logf" "timeout" "$rc" "$started"
|
||||
elif [[ $rc -eq 0 ]]; then
|
||||
# Agent succeeded: land in review/, then run the auto-QA verify gate. The
|
||||
# worker is still alive here so the concurrency slot stays held through
|
||||
# verification — `ended=` is written only once we reach a resting stage.
|
||||
mv "$doing_file" "$REVIEW/" 2>/dev/null
|
||||
local review_file="$REVIEW/$job.md"
|
||||
echo "exit=$rc" >> "$metaf"
|
||||
echo "completed OK (rc=0): landed in review — $(date)" >> "$logf"
|
||||
local verify; verify=$(fm_get "$review_file" verify "$DEFAULT_VERIFY")
|
||||
if [[ -z "$verify" ]]; then
|
||||
echo "result=review" >> "$metaf"
|
||||
echo "ended=$(date +%s)" >> "$metaf"
|
||||
_meta_end "$metaf" "review" "$started"
|
||||
echo "no verify command — parked in review for manual promote: $(date)" >> "$logf"
|
||||
else
|
||||
echo "----- verify: $verify -----" >> "$logf"
|
||||
@ -550,24 +569,273 @@ run_worker() {
|
||||
echo "verify_exit=$vrc" >> "$metaf"
|
||||
if [[ $vrc -eq 0 ]]; then
|
||||
mv "$review_file" "$TESTING/" 2>/dev/null
|
||||
echo "result=testing" >> "$metaf"
|
||||
echo "ended=$(date +%s)" >> "$metaf"
|
||||
_meta_end "$metaf" "testing" "$started"
|
||||
echo "VERIFY PASSED — promoted to testing (QA): $(date)" >> "$logf"
|
||||
else
|
||||
mv "$review_file" "$FAILED/" 2>/dev/null
|
||||
echo "result=verify_failed" >> "$metaf"
|
||||
echo "ended=$(date +%s)" >> "$metaf"
|
||||
echo "VERIFY FAILED (rc=$vrc): $(date)" >> "$logf"
|
||||
# verify ran on the review_file; retry policy may requeue it.
|
||||
_finish_failure "$job" "$review_file" "$metaf" "$logf" "verify_failed" "$rc" "$started"
|
||||
fi
|
||||
fi
|
||||
else
|
||||
mv "$doing_file" "$FAILED/" 2>/dev/null
|
||||
echo "result=failed" >> "$metaf"
|
||||
echo "ended=$(date +%s)" >> "$metaf"
|
||||
echo "FAILED (rc=$rc): $(date)" >> "$logf"
|
||||
_finish_failure "$job" "$doing_file" "$metaf" "$logf" "crash" "$rc" "$started"
|
||||
fi
|
||||
}
|
||||
|
||||
# ── Resilience & insights helpers (Phase 1 — single-host §25/§26) ────
|
||||
#
|
||||
# _in_list <item> <space-separated-list> -> 0 if item is present.
|
||||
_in_list() {
  # Whole-word membership test: both the list and the needle are padded with
  # spaces so "a" does not match inside "ab".
  local needle=$1 haystack=" ${2:-} "
  if [[ "$haystack" == *" $needle "* ]]; then
    return 0
  fi
  return 1
}
|
||||
|
||||
# _meta_end <metafile> <result> <started-epoch> -> append result/ended/duration
|
||||
# (append-only; never truncates a live meta — §25 state integrity).
|
||||
_meta_end() {
  # $1 = meta file, $2 = result label, $3 = start epoch (optional).
  local mf=$1 res=$2 now
  now=$(date +%s)
  local st=${3:-$now}
  # A missing or non-numeric start collapses to "now", i.e. duration_s=0.
  [[ "$st" =~ ^[0-9]+$ ]] || st=$now
  # Append only (>>): the existing meta content is never rewritten.
  {
    echo "result=$res"
    echo "ended=$now"
    echo "duration_s=$(( now - st ))"
  } >> "$mf"
}
|
||||
|
||||
# ── git WIP checkpointing (§25.2) ──
|
||||
_is_git_repo() {
  # Succeeds when `git rev-parse --is-inside-work-tree` exits 0 for <dir>;
  # all git output (stdout and stderr) is discarded.
  local dir=$1
  git -C "$dir" rev-parse --is-inside-work-tree >/dev/null 2>&1
}
|
||||
_wip_branch() {
  # Print the checkpoint branch name for a job (no trailing newline).
  local job_id=$1
  printf '%s' "aq/wip/${job_id}"
}
|
||||
|
||||
# _wip_start <job> <cwd> <metaf> <logf> -> ensure/checkout the WIP branch.
|
||||
# Sets globals WIP_ACTIVE (1 when a git WIP branch is in play) and WIP_BASE.
|
||||
# RESUME: if aq/wip/<job> already exists (orphan/retry relaunch), check it out so
|
||||
# the agent continues from the checkpoint instead of from zero. The base commit is
|
||||
# persisted in a write-once sidecar so it survives meta re-creation across attempts.
|
||||
_wip_start() {
  # $1 = job, $2 = cwd, $3 = meta file, $4 = log file.
  # Sets globals WIP_ACTIVE / WIP_BASE. Returns 1 (WIP_ACTIVE=0) when cwd is
  # not a git repo or the WIP branch cannot be created; 0 otherwise.
  local job=$1 cwd=$2 metaf=$3 logf=$4
  WIP_ACTIVE=0; WIP_BASE=""
  if ! _is_git_repo "$cwd"; then
    echo "wip: cwd not a git repo — skipping checkpoint" >> "$logf"
    return 1
  fi

  local br basefile base
  br=$(_wip_branch "$job"); basefile="$STATE/$job.wipbase"
  if git -C "$cwd" show-ref --verify --quiet "refs/heads/$br"; then
    # Branch already exists: resume on it. Only claim "resuming" when the
    # checkout actually succeeded; on failure WIP stays active so that
    # _wip_checkpoint can retry the checkout (it refuses to commit elsewhere).
    if git -C "$cwd" checkout "$br" >/dev/null 2>&1; then
      echo "wip: resuming on $br ($(date))" >> "$logf"
    else
      echo "wip: could not checkout $br (resume) — staying on current branch" >> "$logf"
    fi
  else
    # Capture the starting commit BEFORE branching so it can serve as the
    # diff base for run metrics.
    base=$(git -C "$cwd" rev-parse --short HEAD 2>/dev/null || echo "")
    if ! git -C "$cwd" checkout -b "$br" >/dev/null 2>&1; then
      echo "wip: could not create $br — skipping checkpoint" >> "$logf"
      return 1
    fi
    [[ -n "$base" ]] && printf '%s\n' "$base" > "$basefile"
    echo "wip: created $br from ${base:-<root>} ($(date))" >> "$logf"
  fi

  # The sidecar is the source of truth for the base on both paths.
  base=$(cat "$basefile" 2>/dev/null || echo "")
  { echo "wip_branch=$br"; echo "wip_base=$base"; } >> "$metaf"
  WIP_ACTIVE=1; WIP_BASE="$base"
  return 0
}
|
||||
|
||||
# _wip_checkpoint <job> <cwd> <metaf> <logf> <stage> -> commit any changes in cwd
|
||||
# onto aq/wip/<job>. Idempotent (no commit when the tree is clean). NEVER commits
|
||||
# to a non-WIP branch — protects main/protected branches (§12).
|
||||
_wip_checkpoint() {
  # $1 = job, $2 = cwd, $3 = meta file, $4 = log file, $5 = stage label.
  # Always returns 0: a checkpoint is best-effort and must not fail the caller.
  local job=$1 cwd=$2 metaf=$3 logf=$4 stage=$5
  _is_git_repo "$cwd" || return 0

  local br cur
  br=$(_wip_branch "$job")
  # Empty when HEAD is detached (symbolic-ref fails).
  cur=$(git -C "$cwd" symbolic-ref --short HEAD 2>/dev/null || echo "")
  if [[ "$cur" != "$br" ]]; then
    # Never commit onto whatever branch we happen to be on: either get onto
    # the WIP branch or skip the checkpoint entirely.
    git -C "$cwd" checkout "$br" >/dev/null 2>&1 \
      || { echo "wip: not on $br — skipping checkpoint to protect '$cur'" >> "$logf"; return 0; }
  fi

  git -C "$cwd" add -A >/dev/null 2>&1
  if git -C "$cwd" diff --cached --quiet 2>/dev/null; then
    return 0  # nothing to preserve
  fi

  # A commit can fail (e.g. a rejecting hook or an unwritable repo). Do not
  # record HEAD as a checkpoint in that case — it would be the previous commit.
  if ! git -C "$cwd" -c user.email=agent-queue@local -c user.name=agent-queue \
      commit -q -m "aq wip: $job ($stage)" >/dev/null 2>&1; then
    echo "wip: checkpoint ($stage) commit FAILED — changes left staged on $br ($(date))" >> "$logf"
    return 0
  fi

  local c
  c=$(git -C "$cwd" rev-parse --short HEAD 2>/dev/null || echo "")
  [[ -n "$c" ]] && echo "wip_commit=$c" >> "$metaf"
  echo "wip: checkpoint ($stage) -> ${c:-?} ($(date))" >> "$logf"
}
|
||||
|
||||
# _numstat_into_meta <cwd> <base> <metaf> -> record files_changed/lines_added/
|
||||
# lines_deleted for the run (base..HEAD on the WIP branch; binary files count 0).
|
||||
_numstat_into_meta() {
  # $1 = cwd, $2 = base commit (may be empty), $3 = meta file.
  # Appends files_changed / lines_added / lines_deleted; writes nothing when
  # cwd is not a git repo or the diff is empty.
  local cwd=$1 base=$2 metaf=$3 out
  _is_git_repo "$cwd" || return 0
  if [[ -n "$base" ]]; then
    out=$(git -C "$cwd" diff --numstat "$base" HEAD 2>/dev/null)
  else
    # No recorded base: fall back to the diff against HEAD.
    out=$(git -C "$cwd" diff --numstat HEAD 2>/dev/null)
  fi
  [[ -n "$out" ]] || return 0
  # Every numstat row counts as one file; rows whose counts are not plain
  # integers (git prints "-" for binary files) add 0 lines.
  printf '%s\n' "$out" | awk '
    { if ($1 ~ /^[0-9]+$/) a+=$1; if ($2 ~ /^[0-9]+$/) d+=$2; f++ }
    END { printf "files_changed=%d\nlines_added=%d\nlines_deleted=%d\n", f+0, a+0, d+0 }
  ' >> "$metaf"
}
|
||||
|
||||
# ── retry policy (§5/§11/§25.3) ──
|
||||
# Parse `retry: { max: N, backoff: 5m, on: [classes] }`. Absent/empty -> no retry.
|
||||
_retry_max() {
  # Print the integer after `max:` in a retry spec, or 0 when there is none.
  local spec=${1:-} hit
  hit=$(printf '%s' "$spec" | grep -oE 'max:[[:space:]]*[0-9]+' | head -1)
  # Drop everything up to the last non-digit, leaving only the number.
  hit=${hit##*[!0-9]}
  echo "${hit:-0}"
}
|
||||
_retry_backoff_s() {
  # Extract the `backoff:` value (digits plus optional s/m/h/d suffix) from a
  # retry spec and hand it to _dur_to_secs; "0" is passed when absent.
  local spec=${1:-} dur
  dur=$(
    printf '%s' "$spec" \
      | grep -oE 'backoff:[[:space:]]*[0-9]+[smhd]?' \
      | sed -E 's/^backoff:[[:space:]]*//' \
      | head -1
  )
  _dur_to_secs "${dur:-0}"
}
|
||||
_retry_on() {
  # Print the failure classes named by `on:` in a retry spec as one
  # space-separated line (each item followed by a space).
  local raw=${1:-} inside
  # Preferred form: a bracketed list, on: [a, b, c].
  inside=$(printf '%s' "$raw" | sed -nE 's/.*on:[[:space:]]*\[([^]]*)\].*/\1/p')
  # Fallback: an unbracketed run of letters/underscores/commas/spaces/dashes.
  [[ -n "$inside" ]] || inside=$(printf '%s' "$raw" | sed -nE 's/.*on:[[:space:]]*([A-Za-z_,[:space:]-]+).*/\1/p')
  # parse_list is defined elsewhere; its one-item-per-line output is joined here.
  parse_list "$inside" | tr '\n' ' '
}
|
||||
# _class_retryable <class> <on-list> -> 0 if this failure class should retry.
|
||||
# `crash` and `agent_error` are synonyms for a non-zero agent exit.
|
||||
_class_retryable() {
  # $1 = failure class, $2 = space-separated `on` list.
  # Returns 0 when the class is listed. A `crash` is also accepted when the
  # list names `agent_error` instead.
  local class=$1 on=$2
  case "$class" in
    crash) _in_list crash "$on" || _in_list agent_error "$on" ;;
    *)     _in_list "$class" "$on" ;;
  esac
}
|
||||
|
||||
# _finish_failure <job> <file> <metaf> <logf> <class> <rc> <started> -> apply the
|
||||
# retry policy to a failed run. Retries (requeue to inbox with backoff via
|
||||
# next_eligible) while the class is in `retry.on` and attempts <= max; on
|
||||
# exhaustion -> failed/ result=retries_exhausted; with no policy -> failed/ with
|
||||
# the natural result (failed|timeout|verify_failed). The WIP branch is preserved
|
||||
# either way so a retry resumes from the checkpoint.
|
||||
_finish_failure() {
  # $1 = job, $2 = task file, $3 = meta file, $4 = log file,
  # $5 = failure class, $6 = agent exit code, $7 = start epoch.
  # Either requeues the task file into $INBOX (retry) or parks it in $FAILED.
  local job=$1 file=$2 metaf=$3 logf=$4 class=$5 rc=$6 started=${7:-}
  local raw max on attempts
  raw=$(fm_get "$file" retry "")
  attempts=$(grep '^attempts=' "$metaf" 2>/dev/null | tail -1 | cut -d= -f2)
  # Same guard the launcher applies: anything but a positive integer means
  # "first attempt". Keeps the numeric tests below from choking on bad meta.
  [[ "$attempts" =~ ^[0-9]+$ ]] && [[ "$attempts" -gt 0 ]] || attempts=1
  max=$(_retry_max "$raw"); on=$(_retry_on "$raw")

  # A policy applies only when max > 0 AND this class is listed in `on`.
  local retryable=0
  if [[ "$max" -gt 0 ]] && _class_retryable "$class" "$on"; then
    retryable=1
  fi

  if [[ "$retryable" == 1 ]] && [[ "$attempts" -le "$max" ]]; then
    local backoff now next na st
    backoff=$(_retry_backoff_s "$raw")
    now=$(date +%s); next=$(( now + backoff )); na=$(( attempts + 1 ))
    # Validate the start time like _meta_end does; a malformed value must not
    # abort the arithmetic after the file has already been requeued.
    st=$started
    [[ "$st" =~ ^[0-9]+$ ]] || st=$now
    mv "$file" "$INBOX/" 2>/dev/null
    {
      echo "exit=$rc"; echo "attempts=$na"; echo "next_eligible=$next"
      echo "retry_class=$class"; echo "result=retry_scheduled"
      echo "ended=$now"; echo "duration_s=$(( now - st ))"
    } >> "$metaf"
    echo "RETRY scheduled: class=$class, attempt $attempts/$max, backoff ${backoff}s -> inbox ($(date))" >> "$logf"
    return 0
  fi

  local result
  if [[ "$retryable" == 1 ]]; then
    # Policy applied but the attempt budget is spent.
    result="retries_exhausted"
    echo "RETRIES EXHAUSTED after $attempts attempt(s) (class=$class) -> failed/ ($(date))" >> "$logf"
  else
    # No applicable policy: keep the natural result for this class.
    case "$class" in
      timeout)       result="timeout" ;;
      verify_failed) result="verify_failed" ;;
      *)             result="failed" ;;
    esac
  fi
  echo "exit=$rc" >> "$metaf"
  mv "$file" "$FAILED/" 2>/dev/null
  _meta_end "$metaf" "$result" "$started"
}
|
||||
|
||||
# ── orphan recovery (§25.3) ──
|
||||
# recover_orphans -> move building/ jobs whose worker is dead back to inbox/ for
|
||||
# re-selection (resume-aware via the WIP branch), incrementing attempts. Idempotent:
|
||||
# once moved out of building/ a job is never recovered twice. A retry-capped job
|
||||
# that has exhausted crash retries goes to failed/ result=retries_exhausted instead
|
||||
# of looping forever; otherwise recovery never strands work.
|
||||
recover_orphans() {
  # Sweep $BUILDING/*.md. A job is left alone while its meta has no `ended=`
  # line and _pid_alive accepts the recorded pid/pidstart; otherwise it is
  # treated as orphaned and moved to $INBOX (or $FAILED when a crash-retry
  # policy is exhausted).
  local f job metaf pid pidstart prev na raw max now next
  for f in "$BUILDING"/*.md; do
    [[ -e "$f" ]] || continue   # unmatched glob stays literal
    job=$(basename "$f"); job=${job%.md}; metaf="$STATE/$job.meta"
    if [[ -f "$metaf" ]] && ! grep -q '^ended=' "$metaf"; then
      pid=$(grep '^pid=' "$metaf" 2>/dev/null | tail -1 | cut -d= -f2)
      pidstart=$(grep '^pidstart=' "$metaf" 2>/dev/null | tail -1 | cut -d= -f2-)
      _pid_alive "$pid" "$pidstart" && continue   # a live worker still owns it
    fi

    prev=$(grep '^attempts=' "$metaf" 2>/dev/null | tail -1 | cut -d= -f2)
    # Missing or corrupt counter: treat as the first attempt.
    [[ "$prev" =~ ^[0-9]+$ ]] || prev=1
    raw=$(fm_get "$f" retry ""); max=$(_retry_max "$raw"); now=$(date +%s)

    if [[ "$max" -gt 0 ]] && _class_retryable crash "$(_retry_on "$raw")" && [[ "$prev" -gt "$max" ]]; then
      # The move out of building/ is the idempotency guard. If it fails
      # (another sweeper already took the file, or the target is unusable)
      # the meta must not be touched.
      mv "$f" "$FAILED/" 2>/dev/null || continue
      { echo "attempts=$prev"; echo "recovered=$now"; } >> "$metaf"
      _meta_end "$metaf" "retries_exhausted" \
        "$(grep '^started=' "$metaf" 2>/dev/null | tail -1 | cut -d= -f2)"
      echo "ORPHAN: $job exhausted crash retries -> failed/ ($(date))" >> "$LOGS/$job.log"
      log "↻ orphan $C_BOLD$job$C_RESET exhausted retries -> failed"
      continue
    fi

    na=$(( prev + 1 ))
    # Backoff is applied only when a retry policy (max > 0) is present.
    next=""
    if [[ "$max" -gt 0 ]]; then
      next=$(( now + $(_retry_backoff_s "$raw") ))
    fi
    # Same guard as above: no successful move, no meta update.
    mv "$f" "$INBOX/" 2>/dev/null || continue
    {
      echo "attempts=$na"; echo "recovered=$now"
      if [[ -n "$next" ]]; then echo "next_eligible=$next"; fi
      echo "result=recovered"; echo "ended=$now"
    } >> "$metaf"
    echo "ORPHAN RECOVERED: $job (worker dead) -> inbox, attempt now $na ($(date))" >> "$LOGS/$job.log"
    log "↻ recovered orphan $C_BOLD$job$C_RESET (attempt $na)"
  done
}
|
||||
|
||||
# ── token / cost capture (§26.2) ──
|
||||
# parse_usage <engine> <logfile> -> emit `key=value` usage lines (model, tokens_in,
|
||||
# tokens_out, tokens_cached, cost_usd, turns, tool_calls, usage_estimated) when the
|
||||
# engine's output exposes them. This is the SINGLE place per-engine extraction lives.
|
||||
# A wrapper of any engine may emit a machine-readable `AQ_USAGE k=v ...` line, which
|
||||
# is always honored; engine-specific heuristics are best-effort (real where known,
|
||||
# TODO otherwise). Never fabricate precise numbers — omit or mark usage_estimated.
|
||||
parse_usage() {
  # $1 = engine name, $2 = log file. Prints zero or more `key=value` usage
  # lines on stdout; always returns 0. A missing log prints nothing.
  local engine=$1 log=$2
  [[ -f "$log" ]] || return 0

  # 1) Generic, explicit usage line (preferred): the LAST `AQ_USAGE k=v ...`
  #    line wins and suppresses the per-engine heuristics entirely.
  local line
  line=$(grep -E '^AQ_USAGE ' "$log" 2>/dev/null | tail -1)
  if [[ -n "$line" ]]; then
    local kv
    local -a tokens=()
    # Split with `read -a` rather than an unquoted expansion: the log is
    # agent output, and a token such as `mod*` must not be glob-expanded
    # against the current directory into something that looks like a key.
    read -r -a tokens <<< "${line#AQ_USAGE }"
    for kv in "${tokens[@]}"; do
      case "$kv" in
        # Only whitelisted keys are forwarded; everything else is dropped.
        model=*|tokens_in=*|tokens_out=*|tokens_cached=*|cost_usd=*|turns=*|tool_calls=*|usage_estimated=*)
          echo "$kv" ;;
      esac
    done
    return 0
  fi

  # 2) Engine-specific best-effort heuristics. Each takes the last numeric
  #    occurrence of the relevant JSON-ish field in the log.
  local ti to
  case "$engine" in
    claude)
      ti=$(grep -oE '"input_tokens"[": ]+[0-9]+' "$log" 2>/dev/null | grep -oE '[0-9]+' | tail -1)
      to=$(grep -oE '"output_tokens"[": ]+[0-9]+' "$log" 2>/dev/null | grep -oE '[0-9]+' | tail -1)
      [[ -n "$ti" ]] && echo "tokens_in=$ti"
      [[ -n "$to" ]] && echo "tokens_out=$to"
      ;;
    codex)
      ti=$(grep -oE '"prompt_tokens"[": ]+[0-9]+' "$log" 2>/dev/null | grep -oE '[0-9]+' | tail -1)
      to=$(grep -oE '"completion_tokens"[": ]+[0-9]+' "$log" 2>/dev/null | grep -oE '[0-9]+' | tail -1)
      [[ -n "$ti" ]] && echo "tokens_in=$ti"
      [[ -n "$to" ]] && echo "tokens_out=$to"
      ;;
    devin) : ;;    # TODO: Devin session metrics are exposed via API, not the local log.
    copilot) : ;;  # TODO: GitHub Copilot CLI usage format not yet documented here.
  esac
  return 0
}
|
||||
|
||||
# ── insights helpers (§26) ──
|
||||
# _meta_val <metafile> <key> -> last value for key (append-only safe), else empty.
|
||||
_meta_val() {
  # Print the value of the LAST `<key>=` line in the meta file (everything
  # after the first '='); prints nothing when the key or file is missing.
  local meta_path=$1 key=$2
  grep "^${key}=" "$meta_path" 2>/dev/null \
    | tail -1 \
    | cut -d= -f2-
}
|
||||
# _result_is_success <result> -> 0 if the agent run succeeded (reached a good stage).
|
||||
_result_is_success() {
  # Returns 0 for the results review, testing and shipped; 1 for anything else.
  local outcome=$1
  [[ "$outcome" == review || "$outcome" == testing || "$outcome" == shipped ]]
}
|
||||
|
||||
# _insights_line <metafile> -> compact one-line metrics summary for status/dash.
|
||||
_insights_line() {
  # $1 = meta file. Prints `insights: ` followed by the metrics present in
  # the meta (each followed by a space), or `(pending)` when none are
  # recorded. No trailing newline is printed.
  local f=$1 s="" v ti to la ld
  v=$(_meta_val "$f" attempts);   [[ -n "$v" ]] && s+="attempt $v "
  v=$(_meta_val "$f" duration_s); [[ -n "$v" ]] && s+="${v}s "
  # Tokens: shown when either side is known; the missing side prints as 0.
  ti=$(_meta_val "$f" tokens_in); to=$(_meta_val "$f" tokens_out)
  [[ -n "$ti$to" ]] && s+="tok ${ti:-0}/${to:-0} "
  v=$(_meta_val "$f" cost_usd);   [[ -n "$v" ]] && s+="usd=$v "
  # Line churn follows the same "either side" rule as tokens.
  la=$(_meta_val "$f" lines_added); ld=$(_meta_val "$f" lines_deleted)
  [[ -n "$la$ld" ]] && s+="+${la:-0}/-${ld:-0} "
  # Any non-empty usage_estimated value flags the numbers as estimates.
  [[ -n "$(_meta_val "$f" usage_estimated)" ]] && s+="(est) "
  printf 'insights: %s' "${s:-(pending)}"
}
|
||||
|
||||
# ── Commands ────────────────────────────────────────────────────────
|
||||
cmd_init() {
  # `init` command: run ensure_dirs (defined elsewhere), then report the
  # queue root through log.
  ensure_dirs
  log "queue initialized at ${C_BOLD}${QUEUE_ROOT}${C_RESET}"
}
|
||||
|
||||
@ -671,20 +939,30 @@ cmd_run() {
|
||||
echo "$$" > "$STATE/daemon.pid"
|
||||
trap 'rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
|
||||
log "run loop started (max=$MAX_CONCURRENCY, default engine=$DEFAULT_ENGINE). Ctrl-C to stop."
|
||||
# Crash recovery (§25.3): reclaim jobs orphaned in building/ by a previous
|
||||
# crash/power-off before launching anything new.
|
||||
recover_orphans
|
||||
|
||||
while true; do
|
||||
# continuously sweep for orphans (a worker that died mid-loop)
|
||||
recover_orphans
|
||||
local running; running=$(active_workers)
|
||||
# launch jobs while we have capacity and an eligible inbox file
|
||||
while [[ "$running" -lt "$MAX_CONCURRENCY" ]]; do
|
||||
# pick by priority (critical→low) then age, skipping files whose lock key
|
||||
# is currently busy, so two jobs sharing a cwd (or `lock:` key) never run
|
||||
# at once regardless of --max. inbox_sorted replaces the old pure-FIFO sort.
|
||||
# Also skip jobs still inside their retry/recovery backoff (next_eligible).
|
||||
local busy; busy=$(busy_keys)
|
||||
local next="" cand cand_key
|
||||
local next="" cand cand_key cand_job cand_ne now_s
|
||||
now_s=$(date +%s)
|
||||
while IFS= read -r cand; do
|
||||
[[ -n "$cand" ]] || continue
|
||||
cand_key=$(lock_key_for "$cand")
|
||||
if printf '%s\n' "$busy" | grep -qxF -- "$cand_key"; then continue; fi
|
||||
cand_job=$(basename "$cand"); cand_job=${cand_job%.md}
|
||||
cand_ne=$(grep '^next_eligible=' "$STATE/$cand_job.meta" 2>/dev/null | tail -1 | cut -d= -f2)
|
||||
if [[ "$cand_ne" =~ ^[0-9]+$ ]] && [[ "$cand_ne" -gt "$now_s" ]]; then continue; fi
|
||||
next="$cand"; break
|
||||
done < <(inbox_sorted)
|
||||
[[ -z "$next" ]] && break
|
||||
@ -700,6 +978,11 @@ cmd_run() {
|
||||
w_cwd=$(fm_get "$doing_file" cwd "$PWD")
|
||||
w_yolo=$(fm_get "$doing_file" yolo "true")
|
||||
w_key=$(lock_key_for "$doing_file")
|
||||
# Preserve the attempt counter across requeues (retry/orphan recovery set
|
||||
# it before re-queuing); a fresh job starts at 1. Read BEFORE truncating
|
||||
# the meta below so the count survives re-creation (§25.4 crash-safe).
|
||||
local w_attempts; w_attempts=$(grep '^attempts=' "$STATE/$job.meta" 2>/dev/null | tail -1 | cut -d= -f2)
|
||||
[[ "$w_attempts" =~ ^[0-9]+$ ]] && [[ "$w_attempts" -gt 0 ]] || w_attempts=1
|
||||
# write meta BEFORE launch (no pid yet), then append the worker pid from $!.
|
||||
# The new manifest fields (§5) are recorded here; only priority,
|
||||
# capabilities, engine-class and idempotency-key are functional this
|
||||
@ -711,6 +994,7 @@ cmd_run() {
|
||||
echo "yolo=$w_yolo"
|
||||
echo "lock=$w_key"
|
||||
echo "started=$(date +%s)"
|
||||
echo "attempts=$w_attempts"
|
||||
echo "priority=$(fm_get "$doing_file" priority medium)"
|
||||
echo "profile=$(fm_get "$doing_file" profile "")"
|
||||
echo "engine_class=$(fm_get "$doing_file" engine-class "")"
|
||||
@ -785,6 +1069,7 @@ cmd_status() {
|
||||
[[ -n "$m_caps" ]] && extra+="caps=$m_caps "
|
||||
[[ -n "$m_trk" ]] && extra+="tracker=$m_trk "
|
||||
[[ -n "$extra" ]] && printf ' %s%s%s\n' "$C_DIM" "$extra" "$C_RESET"
|
||||
printf ' %s%s%s\n' "$C_DIM" "$(_insights_line "$f")" "$C_RESET"
|
||||
done
|
||||
$printed || printf ' %sno workers running%s\n' "$C_DIM" "$C_RESET"
|
||||
echo
|
||||
@ -795,6 +1080,68 @@ cmd_watch() {
|
||||
while true; do clear; cmd_status; sleep "$interval"; done
|
||||
}
|
||||
|
||||
# recover — reclaim orphaned building/ jobs (dead worker) back to inbox/. Runs
|
||||
# automatically inside `run`; exposed for operators + crash-recovery testing.
|
||||
cmd_recover() {
  # `recover` command: one manual orphan sweep, then a confirmation line.
  ensure_dirs
  recover_orphans
  log "orphan sweep complete"
}
|
||||
|
||||
# insights [job] — per-job metrics, or a recent-jobs table + per-engine rollup.
|
||||
cmd_insights() {
  # insights [job]
  #   with <job>: print that job's recorded metrics (exact meta name first,
  #               else the most recently modified meta whose name contains it)
  #   without:    print a table of finished jobs (first 15 rows, newest meta
  #               first) followed by a per-engine rollup over ALL finished jobs
  ensure_dirs
  local job="${1:-}"
  if [[ -n "$job" ]]; then
    local f="$STATE/$job.meta"
    # `ls -t` is used deliberately here: the fallback wants newest-by-mtime.
    [[ -f "$f" ]] || f=$(ls -1t "$STATE"/*"$job"*.meta 2>/dev/null | head -1)
    [[ -f "$f" ]] || die "no job meta matching '$job'"
    local jn; jn=$(basename "$f"); jn=${jn%.meta}
    printf '\n%s INSIGHTS %s%s%s\n' "$C_BOLD" "$C_CYAN" "$jn" "$C_RESET"
    local k val
    # Fixed display order; keys absent from the meta are skipped.
    for k in engine result attempts started ended duration_s exit verify_exit \
      model tokens_in tokens_out tokens_cached cost_usd turns tool_calls usage_estimated \
      files_changed lines_added lines_deleted wip_branch wip_base wip_commit \
      next_eligible retry_class recovered; do
      val=$(_meta_val "$f" "$k")
      [[ -n "$val" ]] && printf ' %-15s %s\n' "$k" "$val"
    done
    echo
    return 0
  fi

  printf '\n%s INSIGHTS — recent finished jobs%s %s%s%s\n' \
    "$C_BOLD" "$C_RESET" "$C_DIM" "$QUEUE_ROOT" "$C_RESET"
  printf ' %-26s %-8s %-16s %6s %10s %9s\n' "job" "engine" "result" "dur" "tok(i/o)" "cost"

  # One `|`-separated row per finished job is staged in a temp file so the
  # rollup can aggregate over every job, not just the 15 printed ones.
  local f rows=0 agg
  local jn eng res dur ti to cost est succ
  agg=$(mktemp "${TMPDIR:-/tmp}/aq-insights.XXXXXX")
  while IFS= read -r f; do
    [[ -n "$f" ]] || continue
    grep -q '^ended=' "$f" || continue   # only jobs that reached an end state
    jn=$(basename "$f"); jn=${jn%.meta}
    eng=$(_meta_val "$f" engine); res=$(_meta_val "$f" result); dur=$(_meta_val "$f" duration_s)
    ti=$(_meta_val "$f" tokens_in); to=$(_meta_val "$f" tokens_out); cost=$(_meta_val "$f" cost_usd)
    est=$(_meta_val "$f" usage_estimated)
    rows=$((rows+1))
    [[ $rows -le 15 ]] && printf ' %-26.26s %-8.8s %-16.16s %5ss %10s %9s\n' \
      "$jn" "${eng:-?}" "${res:-?}" "${dur:-0}" "${ti:-0}/${to:-0}" "${cost:-–}"
    succ=0; _result_is_success "$res" && succ=1
    printf '%s|%s|%s|%s|%s|%s|%s\n' "${eng:-?}" "${ti:-0}" "${to:-0}" "${cost:-0}" "${dur:-0}" "$succ" "$est" >> "$agg"
  done < <(ls -1t "$STATE"/*.meta 2>/dev/null)

  if [[ $rows -eq 0 ]]; then
    printf ' %sno finished jobs yet%s\n\n' "$C_DIM" "$C_RESET"; rm -f "$agg"; return 0
  fi

  printf '\n%s ROLLUP BY ENGINE%s\n' "$C_BOLD" "$C_RESET"
  printf ' %-8s %5s %10s %10s %10s %8s\n' "engine" "jobs" "tok_in" "tok_out" "cost" "success"
  local e
  # Read engine names line by line: an unquoted $(...) in a `for` would
  # word-split and glob an engine value such as "my engine".
  while IFS= read -r e; do
    awk -F'|' -v eng="$e" '
      $1==eng { jobs++; ti+=$2; to+=$3; cost+=$4; succ+=$6; if ($7!="") est=1 }
      END {
        rate = jobs>0 ? (succ*100.0/jobs) : 0
        printf " %-8s %5d %10d %10d %9.4f%s %6.0f%%\n", eng, jobs, ti, to, cost, (est?"*":" "), rate
      }' "$agg"
  done < <(cut -d'|' -f1 "$agg" | sort -u)
  printf ' %s* total includes estimated token/cost values%s\n\n' "$C_DIM" "$C_RESET"
  rm -f "$agg"
}
|
||||
|
||||
cmd_dash() {
|
||||
command -v node >/dev/null 2>&1 || die "node not found — use 'watch' for the bash status view"
|
||||
AGENT_QUEUE_ROOT="$QUEUE_ROOT" exec node "$SCRIPT_DIR/dashboard.mjs" "$@"
|
||||
@ -948,8 +1295,10 @@ ${C_BOLD}COMMANDS${C_RESET}
|
||||
--engine devin|claude|codex --cwd PATH --yolo | --no-yolo
|
||||
run [--max N] [--engine E] [--once]
|
||||
process inbox/ (foreground loop; Ctrl-C to stop)
|
||||
status show kanban counts + running workers
|
||||
status show kanban counts + running workers (+ insights)
|
||||
watch [interval] live status (default 2s, bash)
|
||||
insights [job] per-job metrics, or recent table + per-engine rollup
|
||||
recover reclaim orphaned building/ jobs (dead worker) -> inbox
|
||||
dash [--interval N] richer live Node dashboard (recent shipped/failed too)
|
||||
stop kill running workers + the run loop
|
||||
logs <job> [-f] print (or follow) a job's log
|
||||
@ -978,10 +1327,15 @@ ${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md)
|
||||
prefers-engine: [claude] # optional order hint for engine-class resolution
|
||||
capabilities: [os:any, node>=20, has:git] # hard host requirements; unmet -> failed (capability_mismatch)
|
||||
idempotency-key: my-task-1 # re-adding same key+body = no-op; same key+different body = reject/supersede
|
||||
retry: { max: 2, backoff: 5m, on: [timeout, verify_failed, crash] } # requeue on these classes up to max, then retries_exhausted
|
||||
# --- reserved (parsed + shown in status, but no-op until a later phase) ---
|
||||
profile: prefers: budget: deps: deps-mode: retry: review-policy: artifacts: tracker-item:
|
||||
profile: prefers: budget: deps: deps-mode: review-policy: artifacts: tracker-item:
|
||||
---
|
||||
|
||||
${C_BOLD}RESILIENCE${C_RESET} crash-safe: orphaned building/ jobs (dead worker) are recovered to inbox/ on
|
||||
'run' startup; git-repo cwd work is checkpointed to branch aq/wip/<job> on every
|
||||
exit (resumed on retry); 'retry' requeues failures with backoff. See 'insights'.
|
||||
|
||||
${C_BOLD}ENV${C_RESET}
|
||||
AGENT_QUEUE_ROOT (=$QUEUE_ROOT) AGENT_QUEUE_MAX (=$MAX_CONCURRENCY)
|
||||
AGENT_QUEUE_ENGINE (=$DEFAULT_ENGINE) AGENT_QUEUE_VERIFY (default verify cmd)
|
||||
@ -997,6 +1351,8 @@ main() {
|
||||
run) cmd_run "$@";;
|
||||
status) cmd_status "$@";;
|
||||
watch) cmd_watch "$@";;
|
||||
insights) cmd_insights "$@";;
|
||||
recover) cmd_recover "$@";;
|
||||
dash|dashboard) cmd_dash "$@";;
|
||||
stop) cmd_stop "$@";;
|
||||
logs) cmd_logs "$@";;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user