feat(agent-queue): per-job timeout via the frontmatter `timeout:` key
Honor 'timeout: 45m' (90s|45m|2h|1d) by wrapping the agent in timeout/gtimeout when available (hard process-tree kill), else a portable bash watchdog. On expiry the job moves doing->failed with result=timeout and a TIMED OUT log line.
This commit is contained in:
parent
f14e6c2336
commit
3b71f0117a
@ -39,6 +39,9 @@ POLL_SECONDS="${AGENT_QUEUE_POLL:-3}"
|
|||||||
# flock is used for cross-process lock hardening when available (Linux). macOS
|
# flock is used for cross-process lock hardening when available (Linux). macOS
|
||||||
# has no flock; mutual exclusion there relies on the single run-loop (see cmd_run).
|
# has no flock; mutual exclusion there relies on the single run-loop (see cmd_run).
|
||||||
FLOCK_BIN="${FLOCK_BIN:-$(command -v flock || true)}"
|
FLOCK_BIN="${FLOCK_BIN:-$(command -v flock || true)}"
|
||||||
|
# timeout/gtimeout give hard process-tree kills for per-job timeouts; if absent
|
||||||
|
# (stock macOS) a pure-bash watchdog is used as a best-effort fallback.
|
||||||
|
TIMEOUT_BIN="${TIMEOUT_BIN:-$(command -v timeout || command -v gtimeout || true)}"
|
||||||
|
|
||||||
DEVIN_BIN="${DEVIN_BIN:-$(command -v devin || echo "$HOME/.local/bin/devin")}"
|
DEVIN_BIN="${DEVIN_BIN:-$(command -v devin || echo "$HOME/.local/bin/devin")}"
|
||||||
CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude || echo claude)}"
|
CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude || echo claude)}"
|
||||||
@ -100,6 +103,23 @@ lock_key_for() {
|
|||||||
# _keyhash <key> -> stable filename-safe token for a lock key
|
# _keyhash <key> -> stable filename-safe token for a lock key
|
||||||
_keyhash() { printf '%s' "$1" | cksum | awk '{print $1}'; }
|
_keyhash() { printf '%s' "$1" | cksum | awk '{print $1}'; }
|
||||||
|
|
||||||
|
# _dur_to_secs <dur> -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0.
#
# Arguments: $1 - duration string: decimal digits with an optional unit suffix
#                 (s = seconds, m = minutes, h = hours, d = days; no suffix
#                 means seconds).
# Outputs:   the duration in whole seconds on stdout; 0 when the argument is
#            missing, empty, "0", or does not match the accepted format.
# Digits are always read as base 10, so zero-padded values such as "08m" or
# "010" are neither rejected as invalid octal nor printed with leading zeros.
_dur_to_secs() {
  local d=${1:-}
  [[ -z "$d" || "$d" == "0" ]] && { echo 0; return; }
  if [[ "$d" =~ ^([0-9]+)([smhd]?)$ ]]; then
    # 10# forces decimal: bash arithmetic treats a leading 0 as octal, which
    # makes "08"/"09" a hard error and turns "010" into 8 in numeric tests.
    local n=$((10#${BASH_REMATCH[1]})) u=${BASH_REMATCH[2]}
    case "$u" in
      ""|s) echo "$n" ;;
      m) echo $((n * 60)) ;;
      h) echo $((n * 3600)) ;;
      d) echo $((n * 86400)) ;;
    esac
  else
    echo 0
  fi
}
|
||||||
|
|
||||||
# busy_keys -> newline list of lock keys currently held by active workers.
|
# busy_keys -> newline list of lock keys currently held by active workers.
|
||||||
# A worker is active if its meta has no `ended=` and its pid is live (or the pid
|
# A worker is active if its meta has no `ended=` and its pid is live (or the pid
|
||||||
# has not been written yet, i.e. it was just launched and the slot is reserved).
|
# has not been written yet, i.e. it was just launched and the slot is reserved).
|
||||||
@ -182,12 +202,35 @@ run_worker() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
local rc lockkey
|
local rc=0 lockkey tmo timed_out=false
|
||||||
lockkey=$(lock_key_for "$doing_file")
|
lockkey=$(lock_key_for "$doing_file")
|
||||||
if [[ -n "$FLOCK_BIN" ]]; then
|
tmo=$(_dur_to_secs "$(fm_get "$doing_file" timeout "0")")
|
||||||
|
local tmo_flag="$STATE/$job.timedout"; rm -f "$tmo_flag"
|
||||||
|
local lf="$LOCKS/$(_keyhash "$lockkey").lock"
|
||||||
|
|
||||||
|
if [[ "$tmo" -gt 0 && -n "$TIMEOUT_BIN" ]]; then
|
||||||
|
# Hard timeout via timeout/gtimeout (kills the whole process tree).
|
||||||
|
AQ_STDIN="$AGENT_STDIN" "$TIMEOUT_BIN" -k 5 "${tmo}s" bash -c '
|
||||||
|
cd "$1" || exit 97; shift
|
||||||
|
if [ -n "${AQ_STDIN:-}" ]; then exec "$@" < "$AQ_STDIN"; else exec "$@"; fi
|
||||||
|
' _ "$cwd" "${AGENT_CMD[@]}" >> "$logf" 2>&1
|
||||||
|
rc=$?
|
||||||
|
[[ $rc -eq 124 ]] && timed_out=true
|
||||||
|
elif [[ "$tmo" -gt 0 ]]; then
|
||||||
|
# Portable watchdog fallback (no timeout binary). Flags the timeout and
|
||||||
|
# signals the worker; install coreutils (gtimeout) for hard tree kills.
|
||||||
|
_run_agent >> "$logf" 2>&1 &
|
||||||
|
local apid=$!
|
||||||
|
( sleep "$tmo"; : > "$tmo_flag"
|
||||||
|
pkill -TERM -P "$apid" 2>/dev/null; kill -TERM "$apid" 2>/dev/null
|
||||||
|
sleep 5; pkill -KILL -P "$apid" 2>/dev/null; kill -KILL "$apid" 2>/dev/null ) &
|
||||||
|
local wpid=$!
|
||||||
|
wait "$apid" 2>/dev/null; rc=$?
|
||||||
|
kill "$wpid" 2>/dev/null; wait "$wpid" 2>/dev/null
|
||||||
|
[[ -f "$tmo_flag" ]] && timed_out=true
|
||||||
|
elif [[ -n "$FLOCK_BIN" ]]; then
|
||||||
# Cross-process hardening where flock exists (Linux CI). The single run-loop
|
# Cross-process hardening where flock exists (Linux CI). The single run-loop
|
||||||
# already serializes by lock key; this guards against a stray second launcher.
|
# already serializes by lock key; this guards against a stray second launcher.
|
||||||
local lf="$LOCKS/$(_keyhash "$lockkey").lock"
|
|
||||||
( "$FLOCK_BIN" -n 9 || exit 75; _run_agent ) 9>"$lf" >> "$logf" 2>&1
|
( "$FLOCK_BIN" -n 9 || exit 75; _run_agent ) 9>"$lf" >> "$logf" 2>&1
|
||||||
rc=$?
|
rc=$?
|
||||||
if [[ $rc -eq 75 ]]; then
|
if [[ $rc -eq 75 ]]; then
|
||||||
@ -200,10 +243,15 @@ run_worker() {
|
|||||||
_run_agent >> "$logf" 2>&1
|
_run_agent >> "$logf" 2>&1
|
||||||
rc=$?
|
rc=$?
|
||||||
fi
|
fi
|
||||||
|
rm -f "$tmo_flag"
|
||||||
|
|
||||||
echo "ended=$(date +%s)" >> "$metaf"
|
echo "ended=$(date +%s)" >> "$metaf"
|
||||||
echo "exit=$rc" >> "$metaf"
|
echo "exit=$rc" >> "$metaf"
|
||||||
if [[ $rc -eq 0 ]]; then
|
if $timed_out; then
|
||||||
|
mv "$doing_file" "$FAILED/" 2>/dev/null
|
||||||
|
echo "result=timeout" >> "$metaf"
|
||||||
|
echo "TIMED OUT after ${tmo}s (rc=$rc): $(date)" >> "$logf"
|
||||||
|
elif [[ $rc -eq 0 ]]; then
|
||||||
mv "$doing_file" "$DONE/" 2>/dev/null
|
mv "$doing_file" "$DONE/" 2>/dev/null
|
||||||
echo "result=done" >> "$metaf"
|
echo "result=done" >> "$metaf"
|
||||||
echo "completed OK (rc=0): $(date)" >> "$logf"
|
echo "completed OK (rc=0): $(date)" >> "$logf"
|
||||||
@ -425,6 +473,7 @@ ${C_BOLD}TASK FRONTMATTER${C_RESET} (top of each .md)
|
|||||||
cwd: /Users/you/code/repo
|
cwd: /Users/you/code/repo
|
||||||
yolo: true
|
yolo: true
|
||||||
lock: my-repo # optional; defaults to cwd. Jobs sharing a key run serially
|
lock: my-repo # optional; defaults to cwd. Jobs sharing a key run serially
|
||||||
|
timeout: 45m # optional; 90s|45m|2h|1d. On expiry -> failed (result=timeout)
|
||||||
---
|
---
|
||||||
|
|
||||||
${C_BOLD}ENV${C_RESET}
|
${C_BOLD}ENV${C_RESET}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user