From fa1f1d1b300e2fe03eb3cb877a6e698197e93e73 Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Sun, 31 May 2026 23:36:51 -0700 Subject: [PATCH] fix(agent-queue): write ended= after PR/report so --once can't exit early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_worker marked the job ended (testing) right after moving to testing/, BEFORE opening/merging the PR and reporting to the coordinator. Once ended= is written, _meta_active returns false, active_workers drops to 0, and "run --once" could drain-exit (and callers could observe completion) while the background worker was still opening the PR — a real race that made the PR-mode selftest flaky and could free a concurrency slot prematurely in production. Move the ended= write to the end of the success path (after PR open/merge + testing/shipped reports). No behavior change on the autoship/ship path. Full selftest now passes deterministically across repeated runs. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- agent-queue/agent-queue.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/agent-queue/agent-queue.sh b/agent-queue/agent-queue.sh index a583558..f450fea 100755 --- a/agent-queue/agent-queue.sh +++ b/agent-queue/agent-queue.sh @@ -971,7 +971,6 @@ run_worker() { echo "verify_exit=$vrc" >> "$metaf" if [[ $vrc -eq 0 ]]; then mv "$review_file" "$TESTING/" 2>/dev/null - _meta_end "$metaf" "testing" "$started" echo "VERIFY PASSED — promoted to testing (QA): $(date)" >> "$logf" # PR mode (§PR): work passed verify — commit/push the job branch, open a PR, # record the URL in the meta, and push it onto the coordinator run. @@ -995,14 +994,22 @@ run_worker() { # is enabled, the factory's verify gate IS the test phase — advance # testing -> shipped (closing the testing->shipped gap autonomously). Default # off leaves the job resting at `testing` for the human review gate / ship. + local _ship_done=0 if fleet_enabled && _fleet_is_job "$job"; then fleet_report "$job" testing if [[ "${AQ_FLEET_AUTOSHIP:-0}" == 1 ]] && fleet_report "$job" shipped; then mv "$TESTING/$job.md" "$SHIPPED/" 2>/dev/null - _meta_end "$metaf" "shipped" "$started" + _meta_end "$metaf" "shipped" "$started"; _ship_done=1 echo "FLEET AUTOSHIP — testing -> shipped: $(date)" >> "$logf" fi fi + # Mark the concurrency slot done LAST — only after the PR open/merge + the + # coordinator reports above. Writing `ended=` here (not right after the + # testing/ move) keeps the worker counted as active until that work is + # complete, so `run --once` cannot drain-exit and a caller cannot observe + # the job as finished before the PR is actually opened/merged. Fixes a + # flaky race where the PR/report steps ran after the slot was freed. + [[ "$_ship_done" == 1 ]] || _meta_end "$metaf" "testing" "$started" else echo "VERIFY FAILED (rc=$vrc): $(date)" >> "$logf" # verify ran on the review_file; retry policy may requeue it.