fix(agent-queue): explicit-engine availability check, shutdown lease release, cache GC

Three runner-side robustness fixes (behavior-preserving, opt-out where relevant):

- resolve_engine now availability-checks an EXPLICIT engine (mirroring the
  engine-class path): if the requested engine's binary isn't installed it emits
  the no-engine signal so the job is marked no_engine, instead of invoking a
  missing binary and surfacing a generic crash.
- The run-loop INT/TERM trap now best-effort releases leases for in-flight
  building/ jobs (new fleet_release_all_active) so a stopped factory's jobs are
  reclaimable immediately rather than waiting out the ~900s lease TTL.
- _cache_prune GCs cached repo checkouts under $STATE/repos not accessed in
  AQ_FLEET_CACHE_TTL_DAYS days (default 14; 0 disables), run once at run-loop
  startup, to stop unbounded disk growth. Guards against rm on an empty base path.

bash -n passes on both files; ./selftest.sh PASS.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
saravanakumardb1 2026-06-01 11:51:56 -07:00
parent 79e6a8db00
commit 14308fc382
2 changed files with 49 additions and 2 deletions

View File

@ -448,7 +448,13 @@ caps_match() {
resolve_engine() {
local f=$1 eng cls prefers
eng=$(fm_get "$f" engine "")
if [[ -n "$eng" ]]; then printf '%s' "$eng"; return 0; fi
if [[ -n "$eng" ]]; then
# Mirror the engine-class path: only honor an explicit engine when its binary
# is actually available; otherwise emit "" (the no-engine signal) so the caller
# marks the job no_engine instead of trying to run a missing binary and crashing.
if engine_available "$eng"; then printf '%s' "$eng"; else printf '%s' ""; fi
return 0
fi
cls=$(fm_eff "$f" engine-class "") # inherit engine-class from the job's profile
if [[ -z "$cls" ]]; then printf '%s' "$DEFAULT_ENGINE"; return 0; fi
local class_engines; class_engines=$(engine_class_engines "$cls")
@ -686,6 +692,28 @@ _fleet_pr_prepare() {
echo "$dir"
}
# _cache_prune — conservatively GC cached repo checkouts under the clone cache.
#
# Removes each immediate sub-directory of the cache base whose access time is
# older than the configured TTL. Best-effort: always returns 0 so it can never
# fail the run loop.
#
# Globals (read):
#   AQ_FLEET_CACHE_TTL_DAYS  TTL in whole days (default 14). 0, or any value that
#                            is not a plain non-negative integer, disables pruning.
#   AQ_FLEET_REPOS_DIR       cache base; defaults to $STATE/repos.
#   STATE                    state root; pruning is skipped when unset/empty.
# Calls:   log (only when at least one checkout was removed).
# Returns: 0 always.
_cache_prune() {
  local ttl="${AQ_FLEET_CACHE_TTL_DAYS:-14}"
  [[ "$ttl" =~ ^[0-9]+$ ]] || return 0
  # Force base 10: a zero-padded value such as "08" would otherwise be parsed
  # as (invalid) octal by arithmetic evaluation and emit a bash error.
  ttl=$((10#$ttl))
  (( ttl > 0 )) || return 0

  # ${STATE:-} keeps this safe under `set -u` when STATE was never exported.
  local state="${STATE:-}"
  local reposdir="${AQ_FLEET_REPOS_DIR:-$state/repos}"
  # Guard: refuse to operate unless both the state root and the repos dir are
  # non-empty and the repos dir actually exists (never rm an unset/empty path).
  [[ -n "$state" && -n "$reposdir" && -d "$reposdir" ]] || return 0
  # Guard: a misconfigured base of "/" would make every top-level directory of
  # the filesystem a deletion candidate; never prune there.
  if [[ "$reposdir" =~ ^/+$ ]]; then return 0; fi

  local before after pruned
  before=$(find "$reposdir" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')
  # find truncates the age to whole days, so "+N" matches entries last accessed
  # more than N full days ago. Errors are deliberately ignored (best-effort GC).
  find "$reposdir" -mindepth 1 -maxdepth 1 -type d -atime +"$ttl" -exec rm -rf -- {} + 2>/dev/null || true
  after=$(find "$reposdir" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')
  # The count is a before/after difference, so directories created concurrently
  # can make it under-report; it is informational only.
  pruned=$(( before - after ))
  if (( pruned > 0 )); then
    log "cache: pruned $pruned stale checkout(s)"
  fi
  return 0
}
# Commit the agent's work in <dir>, push branch aq/job/<jid>, open a PR against
# <base>, and echo the PR URL. Returns 1 with no output if there is nothing to
# commit or any git/gh step fails (caller leaves pr_url empty).
@ -1651,11 +1679,15 @@ cmd_run() {
fi
[[ -n "$dpid" ]] && log "clearing stale daemon.pid ($dpid)"
echo "$$" > "$STATE/daemon.pid"
trap 'rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
# On shutdown also release any in-flight leases (best-effort) so the coordinator
# can reassign immediately instead of waiting out the lease TTL.
trap 'fleet_release_all_active || true; rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
log "run loop started (max=$MAX_CONCURRENCY, default engine=$DEFAULT_ENGINE). Ctrl-C to stop."
# Crash recovery (§25.3): reclaim jobs orphaned in building/ by a previous
# crash/power-off before launching anything new.
recover_orphans
# Disk hygiene: prune stale cached repo checkouts (unbounded growth otherwise).
_cache_prune
# Fleet (§8): register with the coordinator (registration == first heartbeat).
fleet_enabled && fleet_heartbeat
fleet_flags_warn_once # §16: warn once if ROUTE=1 + SHADOW=1 (ROUTE wins, shadow off)

View File

@ -398,6 +398,21 @@ fleet_renew_active() {
return 0
}
# fleet_release_all_active — on daemon shutdown (or similar), hand back the
# lease of every in-flight job so it does not have to expire on its own.
#
# Walks "$BUILDING"/*.md, derives the job id from each file name (directory and
# ".md" suffix stripped), and calls fleet_lease_release for every id that
# _fleet_is_job accepts. Release output and failures are discarded on purpose:
# this is best-effort cleanup.
#
# Globals: BUILDING (read)
# Returns: 0 always; does nothing when fleet_enabled reports false.
fleet_release_all_active() {
  if ! fleet_enabled; then
    return 0
  fi

  local path name
  for path in "$BUILDING"/*.md; do
    # An unmatched glob stays literal; skip it rather than act on a fake path.
    if [[ ! -e "$path" ]]; then
      continue
    fi
    name=${path##*/}
    name=${name%.md}
    if _fleet_is_job "$name"; then
      fleet_lease_release "$name" >/dev/null 2>&1 || true
    fi
  done
  return 0
}
# fleet_quarantine <job> <file> <metaf> <logf> — a fenced (reclaimed) worker must
# NOT ship: park the local result in failed/ for human triage (§9 split-brain).
fleet_quarantine() {