diff --git a/agent-queue/agent-queue.sh b/agent-queue/agent-queue.sh
index 62ef308..bd84c4f 100755
--- a/agent-queue/agent-queue.sh
+++ b/agent-queue/agent-queue.sh
@@ -448,7 +448,13 @@ caps_match() {
 resolve_engine() {
   local f=$1 eng cls prefers
   eng=$(fm_get "$f" engine "")
-  if [[ -n "$eng" ]]; then printf '%s' "$eng"; return 0; fi
+  if [[ -n "$eng" ]]; then
+    # Mirror the engine-class path: only honor an explicit engine when its binary
+    # is actually available; otherwise emit "" (the no-engine signal) so the caller
+    # marks the job no_engine instead of trying to run a missing binary and crashing.
+    if engine_available "$eng"; then printf '%s' "$eng"; else printf '%s' ""; fi
+    return 0
+  fi
   cls=$(fm_eff "$f" engine-class "") # inherit engine-class from the job's profile
   if [[ -z "$cls" ]]; then printf '%s' "$DEFAULT_ENGINE"; return 0; fi
   local class_engines; class_engines=$(engine_class_engines "$cls")
@@ -686,6 +692,39 @@ _fleet_pr_prepare() {
   echo "$dir"
 }
 
+# _cache_prune — conservatively GC cached repo checkouts under the clone cache
+# ($STATE/repos by default — the same base _fleet_pr_prepare uses — or
+# $AQ_FLEET_REPOS_DIR when set) that have not been ACCESSED in more than
+# AQ_FLEET_CACHE_TTL_DAYS days (default 14). Set the var to 0 to disable pruning
+# entirely (prior behavior); a non-numeric value also disables it. Best-effort:
+# always returns 0 so it never fails the run loop, and never runs rm against an
+# empty/unset base path.
+# NOTE(review): relies on directory atime; on a noatime mount that is never
+# refreshed — confirm the cache filesystem records access times.
+_cache_prune() {
+  local ttl="${AQ_FLEET_CACHE_TTL_DAYS:-14}"
+  [[ "$ttl" =~ ^[0-9]+$ ]] || return 0
+  # Force base 10: a zero-padded value such as "08" would otherwise be rejected
+  # as invalid octal by the arithmetic comparison below.
+  ttl=$(( 10#$ttl ))
+  (( ttl == 0 )) && return 0
+  # ${STATE:-} keeps the expansion safe under `set -u` when STATE is unset.
+  local state="${STATE:-}"
+  local reposdir="${AQ_FLEET_REPOS_DIR:-$state/repos}"
+  # Guard: refuse to operate unless both the state root and the repos dir are
+  # non-empty and the repos dir actually exists (never rm an unset/empty path).
+  [[ -n "$state" && -n "$reposdir" && -d "$reposdir" ]] || return 0
+  local dir pruned=0
+  # Delete one checkout at a time so the logged count reflects what was really
+  # removed, even if another process adds or removes cache entries meanwhile.
+  while IFS= read -r -d '' dir; do
+    rm -rf -- "$dir" 2>/dev/null || true
+    [[ -e "$dir" ]] || pruned=$(( pruned + 1 ))
+  done < <(find "$reposdir" -mindepth 1 -maxdepth 1 -type d -atime +"$ttl" -print0 2>/dev/null)
+  (( pruned > 0 )) && log "cache: pruned $pruned stale checkout(s)"
+  return 0
+}
+
 # Commit the agent's work in , push branch aq/job/, open a PR against
 # , and echo the PR URL. Returns 1 with no output if there is nothing to
 # commit or any git/gh step fails (caller leaves pr_url empty).
@@ -1651,11 +1690,15 @@ cmd_run() {
   fi
   [[ -n "$dpid" ]] && log "clearing stale daemon.pid ($dpid)"
   echo "$$" > "$STATE/daemon.pid"
-  trap 'rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
+  # On shutdown also release any in-flight leases (best-effort) so the coordinator
+  # can reassign immediately instead of waiting out the lease TTL.
+  trap 'fleet_release_all_active || true; rm -f "$STATE/daemon.pid"; log "run loop stopped"; exit 0' INT TERM
   log "run loop started (max=$MAX_CONCURRENCY, default engine=$DEFAULT_ENGINE). Ctrl-C to stop."
   # Crash recovery (§25.3): reclaim jobs orphaned in building/ by a previous
   # crash/power-off before launching anything new.
   recover_orphans
+  # Disk hygiene: prune stale cached repo checkouts (unbounded growth otherwise).
+  _cache_prune
   # Fleet (§8): register with the coordinator (registration == first heartbeat).
   fleet_enabled && fleet_heartbeat
   fleet_flags_warn_once # §16: warn once if ROUTE=1 + SHADOW=1 (ROUTE wins, shadow off)
diff --git a/agent-queue/lib/fleet-client.sh b/agent-queue/lib/fleet-client.sh
index 9fdec81..df7d3c4 100644
--- a/agent-queue/lib/fleet-client.sh
+++ b/agent-queue/lib/fleet-client.sh
@@ -398,6 +398,26 @@ fleet_renew_active() {
   return 0
 }
 
+# fleet_release_all_active — best-effort release leases for all in-flight (building/)
+# fleet jobs, e.g. on daemon shutdown, so the coordinator can reclaim them
+# immediately instead of waiting out the lease TTL (~900s). Always returns 0 and
+# ignores individual release failures; no-op when fleet is disabled. Mirrors
+# fleet_renew_active (release vs renew).
+# NOTE(review): releases run one at a time inside the shutdown trap — confirm
+# fleet_lease_release has a bounded timeout so shutdown cannot stall.
+fleet_release_all_active() {
+  fleet_enabled || return 0
+  local f job
+  for f in "$BUILDING"/*.md; do
+    [[ -e "$f" ]] || continue # an unmatched glob stays literal; skip it
+    job=${f##*/}; job=${job%.md}
+    if _fleet_is_job "$job"; then
+      fleet_lease_release "$job" >/dev/null 2>&1 || true
+    fi
+  done
+  return 0
+}
+
 # fleet_quarantine — a fenced (reclaimed) worker must
 # NOT ship: park the local result in failed/ for human triage (§9 split-brain).
 fleet_quarantine() {