From 3b71f0117a3946108a1b9d4e1e2434d7b139ff2b Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Thu, 28 May 2026 22:13:50 -0700 Subject: [PATCH] feat(agent-queue): per-job timeout via frontmatter timeout: Honor 'timeout: 45m' (90s|45m|2h|1d) by wrapping the agent in timeout/gtimeout when available (hard process-tree kill), else a portable bash watchdog. On expiry the job moves doing->failed with result=timeout and a TIMED OUT log line. --- agent-queue/agent-queue.sh | 57 +++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/agent-queue/agent-queue.sh b/agent-queue/agent-queue.sh index 20d8de3..b8983f3 100755 --- a/agent-queue/agent-queue.sh +++ b/agent-queue/agent-queue.sh @@ -39,6 +39,9 @@ POLL_SECONDS="${AGENT_QUEUE_POLL:-3}" # flock is used for cross-process lock hardening when available (Linux). macOS # has no flock; mutual exclusion there relies on the single run-loop (see cmd_run). FLOCK_BIN="${FLOCK_BIN:-$(command -v flock || true)}" +# timeout/gtimeout give hard process-tree kills for per-job timeouts; if absent +# (stock macOS) a pure-bash watchdog is used as a best-effort fallback. +TIMEOUT_BIN="${TIMEOUT_BIN:-$(command -v timeout || command -v gtimeout || true)}" DEVIN_BIN="${DEVIN_BIN:-$(command -v devin || echo "$HOME/.local/bin/devin")}" CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude || echo claude)}" @@ -100,6 +103,23 @@ lock_key_for() { # _keyhash -> stable filename-safe token for a lock key _keyhash() { printf '%s' "$1" | cksum | awk '{print $1}'; } +# _dur_to_secs -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0. 
# _dur_to_secs <dur> -> prints the duration as whole seconds on stdout.
#   <dur> is a non-negative decimal integer with an optional unit suffix:
#     (none) or s = seconds, m = minutes, h = hours, d = days
#     e.g. 90, 90s, 45m, 2h, 1d
#   Empty, zero, malformed, or absurdly large input prints 0 ("no timeout").
#   Always returns status 0, so it is safe inside $(...) under `set -e`.
_dur_to_secs() {
  local dur=${1-}                     # ${1-}: tolerate a missing arg under `set -u`
  local re='^([0-9]+)([smhd]?)$'      # kept in a variable for bash 3.2 (macOS) regex quoting rules
  local digits unit mult

  if [[ ! "$dur" =~ $re ]]; then
    printf '0\n'
    return 0
  fi
  digits=${BASH_REMATCH[1]}
  unit=${BASH_REMATCH[2]}

  # Drop leading zeros. Bash arithmetic reads a leading 0 as octal, so "08"
  # would abort with "value too great for base" and "010" would become 8.
  digits=${digits#"${digits%%[!0]*}"}

  # Bash integers are 64-bit and wrap silently on overflow. 12 significant
  # digits times 86400 still fits, anything longer is treated as invalid.
  if (( ${#digits} > 12 )); then
    printf '0\n'
    return 0
  fi

  case "$unit" in
    m) mult=60 ;;
    h) mult=3600 ;;
    d) mult=86400 ;;
    *) mult=1 ;;                      # "" or "s": already seconds
  esac

  # 10# forces base 10; ${digits:-0} covers an all-zero input such as "000".
  printf '%s\n' "$(( 10#${digits:-0} * mult ))"
  return 0
}
The single run-loop # already serializes by lock key; this guards against a stray second launcher. - local lf="$LOCKS/$(_keyhash "$lockkey").lock" ( "$FLOCK_BIN" -n 9 || exit 75; _run_agent ) 9>"$lf" >> "$logf" 2>&1 rc=$? if [[ $rc -eq 75 ]]; then @@ -200,10 +243,15 @@ run_worker() { _run_agent >> "$logf" 2>&1 rc=$? fi + rm -f "$tmo_flag" echo "ended=$(date +%s)" >> "$metaf" echo "exit=$rc" >> "$metaf" - if [[ $rc -eq 0 ]]; then + if $timed_out; then + mv "$doing_file" "$FAILED/" 2>/dev/null + echo "result=timeout" >> "$metaf" + echo "TIMED OUT after ${tmo}s (rc=$rc): $(date)" >> "$logf" + elif [[ $rc -eq 0 ]]; then mv "$doing_file" "$DONE/" 2>/dev/null echo "result=done" >> "$metaf" echo "completed OK (rc=0): $(date)" >> "$logf" @@ -425,6 +473,7 @@ ${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md) cwd: /Users/you/code/repo yolo: true lock: my-repo # optional; defaults to cwd. Jobs sharing a key run serially + timeout: 45m # optional; 90s|45m|2h|1d. On expiry -> failed (result=timeout) --- ${C_BOLD}ENV${C_RESET}