feat(agent-queue): per-job timeout via the `timeout:` frontmatter key

Honor 'timeout: 45m' (90s|45m|2h|1d) by wrapping the agent in timeout/gtimeout
when available (hard process-tree kill), else a portable bash watchdog. On expiry
the job moves doing->failed with result=timeout and a TIMED OUT log line.
This commit is contained in:
saravanakumardb1 2026-05-28 22:13:50 -07:00
parent f14e6c2336
commit 3b71f0117a

View File

@ -39,6 +39,9 @@ POLL_SECONDS="${AGENT_QUEUE_POLL:-3}"
# Tool discovery. Each variable keeps a non-empty value already present in the
# environment; otherwise it is filled from a PATH lookup via `command -v`.
#
# flock is used for cross-process lock hardening when available (Linux). macOS
# has no flock; mutual exclusion there relies on the single run-loop (see cmd_run).
# Left empty when flock is not found (`|| true` swallows the failed lookup).
FLOCK_BIN="${FLOCK_BIN:-$(command -v flock || true)}"
# timeout/gtimeout give hard process-tree kills for per-job timeouts; if absent
# (stock macOS) a pure-bash watchdog is used as a best-effort fallback.
# `timeout` is preferred over `gtimeout`; empty when neither is found.
TIMEOUT_BIN="${TIMEOUT_BIN:-$(command -v timeout || command -v gtimeout || true)}"
# Agent CLIs: unlike the optional tools above these never end up empty; when the
# PATH lookup fails they fall back to a fixed path (devin) or the bare name (claude).
DEVIN_BIN="${DEVIN_BIN:-$(command -v devin || echo "$HOME/.local/bin/devin")}"
CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude || echo claude)}"
@ -100,6 +103,23 @@ lock_key_for() {
# _keyhash <key> -> stable filename-safe token for a lock key.
# The token is the first field cksum reports for the key's bytes (no trailing
# newline is fed to cksum), printed on its own line.
_keyhash() {
  local crc _rest
  # cksum on stdin prints "<crc> <size>"; keep only the leading field.
  read -r crc _rest < <(printf '%s' "$1" | cksum) || true
  printf '%s\n' "$crc"
}
# _dur_to_secs <dur> -> seconds, printed on stdout.
# Accepts a non-negative integer with an optional unit suffix:
#   90 or 90s (seconds), 45m (minutes), 2h (hours), 1d (days).
# Invalid or empty input -> 0.
# Digits are always interpreted as base 10, so zero-padded values such as
# "08m" or "090" convert cleanly instead of being parsed as (invalid) octal.
_dur_to_secs() {
  local d=${1:-}
  local n mult
  # Anything that is not <digits>[s|m|h|d] (including the empty string) is
  # reported as 0 rather than as an error.
  [[ "$d" =~ ^([0-9]+)([smhd]?)$ ]] || { echo 0; return; }
  case "${BASH_REMATCH[2]}" in
    m) mult=60 ;;
    h) mult=3600 ;;
    d) mult=86400 ;;
    *) mult=1 ;;  # bare number or explicit "s"
  esac
  # 10# forces decimal: a leading zero would otherwise select octal in bash
  # arithmetic and make values containing 8 or 9 a hard error.
  n=$((10#${BASH_REMATCH[1]}))
  echo $((n * mult))
}
# busy_keys -> newline list of lock keys currently held by active workers.
# A worker is active if its meta has no `ended=` and its pid is live (or the pid
# has not been written yet, i.e. it was just launched and the slot is reserved).
@ -182,12 +202,35 @@ run_worker() {
fi
}
local rc lockkey
local rc=0 lockkey tmo timed_out=false
lockkey=$(lock_key_for "$doing_file")
if [[ -n "$FLOCK_BIN" ]]; then
tmo=$(_dur_to_secs "$(fm_get "$doing_file" timeout "0")")
local tmo_flag="$STATE/$job.timedout"; rm -f "$tmo_flag"
local lf="$LOCKS/$(_keyhash "$lockkey").lock"
if [[ "$tmo" -gt 0 && -n "$TIMEOUT_BIN" ]]; then
# Hard timeout via timeout/gtimeout (kills the whole process tree).
AQ_STDIN="$AGENT_STDIN" "$TIMEOUT_BIN" -k 5 "${tmo}s" bash -c '
cd "$1" || exit 97; shift
if [ -n "${AQ_STDIN:-}" ]; then exec "$@" < "$AQ_STDIN"; else exec "$@"; fi
' _ "$cwd" "${AGENT_CMD[@]}" >> "$logf" 2>&1
rc=$?
[[ $rc -eq 124 ]] && timed_out=true
elif [[ "$tmo" -gt 0 ]]; then
# Portable watchdog fallback (no timeout binary). Flags the timeout and
# signals the worker; install coreutils (gtimeout) for hard tree kills.
_run_agent >> "$logf" 2>&1 &
local apid=$!
( sleep "$tmo"; : > "$tmo_flag"
pkill -TERM -P "$apid" 2>/dev/null; kill -TERM "$apid" 2>/dev/null
sleep 5; pkill -KILL -P "$apid" 2>/dev/null; kill -KILL "$apid" 2>/dev/null ) &
local wpid=$!
wait "$apid" 2>/dev/null; rc=$?
kill "$wpid" 2>/dev/null; wait "$wpid" 2>/dev/null
[[ -f "$tmo_flag" ]] && timed_out=true
elif [[ -n "$FLOCK_BIN" ]]; then
# Cross-process hardening where flock exists (Linux CI). The single run-loop
# already serializes by lock key; this guards against a stray second launcher.
local lf="$LOCKS/$(_keyhash "$lockkey").lock"
( "$FLOCK_BIN" -n 9 || exit 75; _run_agent ) 9>"$lf" >> "$logf" 2>&1
rc=$?
if [[ $rc -eq 75 ]]; then
@ -200,10 +243,15 @@ run_worker() {
_run_agent >> "$logf" 2>&1
rc=$?
fi
rm -f "$tmo_flag"
echo "ended=$(date +%s)" >> "$metaf"
echo "exit=$rc" >> "$metaf"
if [[ $rc -eq 0 ]]; then
if $timed_out; then
mv "$doing_file" "$FAILED/" 2>/dev/null
echo "result=timeout" >> "$metaf"
echo "TIMED OUT after ${tmo}s (rc=$rc): $(date)" >> "$logf"
elif [[ $rc -eq 0 ]]; then
mv "$doing_file" "$DONE/" 2>/dev/null
echo "result=done" >> "$metaf"
echo "completed OK (rc=0): $(date)" >> "$logf"
@ -425,6 +473,7 @@ ${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md)
cwd: /Users/you/code/repo
yolo: true
lock: my-repo # optional; defaults to cwd. Jobs sharing a key run serially
timeout: 45m # optional; 90s|45m|2h|1d. On expiry -> failed (result=timeout)
---
${C_BOLD}ENV${C_RESET}