feat(agent-queue): resilient lease renewal + graceful drain

Two runner-side reliability improvements (additive, opt-out via env where relevant): - Lease-renewal retry: a renewal lost to a transient blip (timeout/5xx/proxy) previously let the lease expire and the coordinator reclaim a still-running job, wasting the work. fleet_lease_renew now retries a TRANSIENT failure a few times with a short backoff (AQ_FLEET_RENEW_RETRIES=2, AQ_FLEET_RENEW_BACKOFF_SEC=2); a 409/412 FENCE is terminal and never retried. - Graceful drain: a new `drain` command signals a running loop (via a $STATE/draining flag) to stop taking NEW work (coordinator claim + local inbox), let in-flight jobs finish, release their leases, and exit cleanly — ideal before a deploy. Distinct from `stop` (kills workers immediately) and the `--drain`/--once startup flag (drains the queue to empty). The flag is cleared at run-loop start and on exit. bash -n clean on both files; ./selftest.sh PASS; `drain` smoke-tested. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
2026-06-01 12:24:45 -07:00 · 2026-06-01 12:24:45 -07:00 · 2ae1af1930
commit 2ae1af1930
parent 14308fc382
2 changed files with 57 additions and 9 deletions
--- a/agent-queue/agent-queue.sh
+++ b/agent-queue/agent-queue.sh
@ -1691,10 +1691,17 @@ cmd_run() {
  # Fleet (§8): register with the coordinator (registration == first heartbeat).
  fleet_enabled && fleet_heartbeat
  fleet_flags_warn_once   # §16: warn once if ROUTE=1 + SHADOW=1 (ROUTE wins, shadow off)
+  rm -f "$STATE/draining"  # clear any stale drain request from a prior run

  while true; do
    # continuously sweep for orphans (a worker that died mid-loop)
    recover_orphans
+    # Graceful drain (deploy-friendly): `agent-queue drain` drops this flag; we then
+    # stop taking NEW work (coordinator claim + local inbox), let in-flight jobs
+    # finish, release their leases, and exit cleanly. Distinct from `--drain`/--once,
+    # which drains the queue to empty at startup.
+    local draining=false
+    [[ -f "$STATE/draining" ]] && draining=true
    # Fleet (§7/§8): heartbeat on cadence, renew leases for in-flight fleet jobs,
    # and — if we have capacity — claim one coordinator job into inbox/ so the
    # normal selection loop below executes it interleaved with local .md files.
@ -1707,7 +1714,8 @@ cmd_run() {
      # §M0 RU gate: when AQ_FLEET_GATE=1, skip the (expensive) claim while the
      # cheap per-product queue version is unchanged and we are not mid-drain.
      # Gate off (default) -> fleet_gate_should_claim always true == prior behavior.
-      if fleet_route_enabled && [[ "$(active_workers)" -lt "$MAX_CONCURRENCY" ]] \
+      if [[ "$draining" != true ]] && fleet_route_enabled \
+         && [[ "$(active_workers)" -lt "$MAX_CONCURRENCY" ]] \
         && fleet_gate_should_claim; then
        local _crc=0
        fleet_claim >/dev/null 2>&1 || _crc=$?
@ -1716,8 +1724,9 @@ cmd_run() {
    fi
    local shadow_local=""   # the local (authoritative) job picked this iteration (for shadow compare)
    local running; running=$(active_workers)
-    # launch jobs while we have capacity and an eligible inbox file
-    while [[ "$running" -lt "$MAX_CONCURRENCY" ]]; do
+    # launch jobs while we have capacity and an eligible inbox file — but NOT while
+    # draining (we let in-flight finish and take nothing new).
+    while [[ "$draining" != true ]] && [[ "$running" -lt "$MAX_CONCURRENCY" ]]; do
      # pick by priority (critical→low) then age, skipping files whose lock key
      # is currently busy, so two jobs sharing a cwd (or `lock:` key) never run
      # at once regardless of --max. inbox_sorted replaces the old pure-FIFO sort.
@ -1816,6 +1825,14 @@ cmd_run() {
        log "drain complete — no runnable work, no workers running"; rm -f "$STATE/daemon.pid"; exit 0
      fi
    fi
+    # Graceful drain requested: once every in-flight worker has finished, release any
+    # leftover leases and exit cleanly (a deploy can now restart us safely).
+    if [[ "$draining" == true ]] && [[ "$(active_workers)" -eq 0 ]]; then
+      fleet_release_all_active || true
+      rm -f "$STATE/draining" "$STATE/daemon.pid"
+      log "drain complete (graceful) — in-flight work finished, leases released"
+      exit 0
+    fi
    sleep "$POLL_SECONDS"
  done
 }
@ -1979,6 +1996,23 @@ cmd_stop() {
  log "stopped $killed running worker(s) + run loop"
 }

+# cmd_drain — request a GRACEFUL drain of a running loop: stop taking new work
+# (coordinator claim + local inbox), let in-flight jobs finish, release their
+# leases, then exit. Unlike `stop` (which kills workers immediately), drain never
+# interrupts running work — ideal before a deploy/restart. No-op-safe if no loop
+# is running (the flag is cleared on the next `run`).
+cmd_drain() {
+  ensure_dirs
+  : > "$STATE/draining"
+  local dpid=""
+  [[ -f "$STATE/daemon.pid" ]] && dpid=$(cat "$STATE/daemon.pid" 2>/dev/null)
+  if [[ -n "$dpid" ]] && kill -0 "$dpid" 2>/dev/null; then
+    log "drain requested — run loop (pid $dpid) will stop claiming, finish in-flight work, and exit"
+  else
+    log "drain flag set — no run loop is currently active (it will drain on next start until cleared)"
+  fi
+}
+
 cmd_logs() {
  local job="${1:-}" follow=""
  [[ "${2:-}" == "-f" || "$job" == "-f" ]] && follow="-f"
@ -2138,6 +2172,7 @@ ${C_BOLD}COMMANDS${C_RESET}
  fleet-shadow-report [N]       summarize the shadow/dual-run divergence log (counts, agreement, last N)
  dash [--interval N]          richer live Node dashboard (recent shipped/failed too)
  stop                         kill running workers + the run loop
+  drain                        graceful stop: finish in-flight work, release leases, exit
  logs <job> [-f]              print (or follow) a job's log
  promote <job>                advance one stage (review → testing → shipped)
  ship <job>                   manual gate: testing (QA) → shipped
@ -2229,6 +2264,7 @@ main() {
    to-tracker)   cmd_to_tracker "$@";;
    dash|dashboard) cmd_dash "$@";;
    stop)   cmd_stop "$@";;
+    drain)  cmd_drain "$@";;
    logs)   cmd_logs "$@";;
    promote) cmd_promote "$@";;
    ship)   cmd_ship "$@";;
--- a/agent-queue/lib/fleet-client.sh
+++ b/agent-queue/lib/fleet-client.sh
@ -316,18 +316,30 @@ fleet_report() {
 }

 # fleet_lease_renew <job> -> extend the lease; 0 ok, 2 fenced, 1 degraded.
+# A renewal lost to a transient blip (timeout / 5xx / proxy) would let the lease
+# expire and the coordinator reclaim a job that is still running, wasting the work.
+# So retry a TRANSIENT failure a few times with a short backoff (well within the
+# lease window); a 409/412 FENCE is terminal and never retried. Tunables:
+#   AQ_FLEET_RENEW_RETRIES (default 2 extra attempts), AQ_FLEET_RENEW_BACKOFF_SEC (2).
 fleet_lease_renew() {
  fleet_enabled || return 0
  local job=$1 metaf jid epoch
  metaf="$STATE/$job.meta"
  jid=$(_meta_val "$metaf" fleet_job_id); epoch=$(_meta_val "$metaf" fleet_lease_epoch)
  [[ -n "$jid" ]] || return 0
-  _fleet_call POST "/fleet/jobs/$jid/lease/renew" "{\"leaseEpoch\":${epoch:-0},\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900}}"
-  case "$FLEET_CODE" in
-    2*) return 0;;
-    409|412) echo "fleet_fenced=1" >> "$metaf"; return 2;;
-    *) return 1;;
-  esac
+  local retries="${AQ_FLEET_RENEW_RETRIES:-2}" backoff="${AQ_FLEET_RENEW_BACKOFF_SEC:-2}" i=0
+  [[ "$retries" =~ ^[0-9]+$ ]] || retries=2
+  while :; do
+    _fleet_call POST "/fleet/jobs/$jid/lease/renew" "{\"leaseEpoch\":${epoch:-0},\"leaseSeconds\":${AQ_FLEET_LEASE_SECONDS:-900}}"
+    case "$FLEET_CODE" in
+      2*) return 0;;
+      409|412) echo "fleet_fenced=1" >> "$metaf"; return 2;;
+    esac
+    # transient: retry up to $retries extra times before giving up degraded
+    [[ "$i" -ge "$retries" ]] && return 1
+    i=$((i + 1))
+    sleep "$backoff"
+  done
 }

 # fleet_lease_release <job> [stage] -> best-effort release on a terminal stage.