diff --git a/agent-queue/agent-queue.sh b/agent-queue/agent-queue.sh
index e1dcd18..f475575 100755
--- a/agent-queue/agent-queue.sh
+++ b/agent-queue/agent-queue.sh
@@ -112,6 +112,31 @@ _mtime() {
   stat -f %m "$1" 2>/dev/null || stat -c %Y "$1" 2>/dev/null || echo ""
 }
 
+# _pidstart PID -> the process start time as reported by ps, with runs of
+# whitespace squeezed to single spaces; prints nothing if ps reports nothing.
+# Used as an identity token so a recycled pid is never mistaken for our worker.
+# Locale and timezone are pinned so the token recorded at launch compares
+# equal no matter which environment later re-reads it.
+_pidstart() {
+  LC_ALL=C TZ=UTC0 ps -o lstart= -p "$1" 2>/dev/null | awk '{$1=$1;print}'
+}
+
+# _pid_alive PID [PIDSTART] -> 0 if the pid is live AND (when a start time
+# was recorded) its current start time still matches — defeating pid reuse.
+# Anything that is not a positive decimal integer is rejected up front: to
+# kill, pid 0 and negative values name process groups, so `kill -0 0` would
+# succeed and a damaged meta file could make cmd_stop signal our own group.
+_pid_alive() {
+  local pid=${1:-} want=${2:-} cur
+  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1
+  kill -0 "$pid" 2>/dev/null || return 1
+  # No start time was recorded for this pid: the plain liveness check above
+  # is all we can go on.
+  [[ -z "$want" ]] && return 0
+  cur=$(_pidstart "$pid")
+  [[ "$cur" == "$want" ]]
+}
+
 # _dur_to_secs -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0.
 _dur_to_secs() {
   local d=$1
@@ -139,7 +164,8 @@ _meta_active() {
   grep -q '^ended=' "$f" && return 1
   pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
   if [[ -n "$pid" ]]; then
-    kill -0 "$pid" 2>/dev/null
+    local pidstart; pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
+    _pid_alive "$pid" "$pidstart"
     return $?
   fi
   mt=$(_mtime "$f"); age=$(( $(date +%s) - ${mt:-0} ))
@@ -386,7 +412,7 @@ cmd_run() {
         echo "started=$(date +%s)"
       } > "$STATE/$job.meta"
       run_worker "$doing_file" &
-      echo "pid=$!" >> "$STATE/$job.meta"
+      { echo "pid=$!"; echo "pidstart=$(_pidstart "$!")"; } >> "$STATE/$job.meta"
       log "▶ launching $C_BOLD$job$C_RESET (engine=$w_eng, lock=$w_key)"
       sleep 1
       running=$(active_workers)
@@ -420,8 +446,8 @@ cmd_status() {
   for f in "$STATE"/*.meta; do
     [[ -e "$f" ]] || continue
     grep -q '^ended=' "$f" && continue
-    local pid; pid=$(grep '^pid=' "$f" | cut -d= -f2)
-    [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null || continue
+    local pid pidstart; pid=$(grep '^pid=' "$f" | cut -d= -f2); pidstart=$(grep '^pidstart=' "$f" | cut -d= -f2-)
+    _pid_alive "$pid" "$pidstart" || continue
     if ! $printed; then printf ' %sRUNNING%s\n' "$C_BOLD" "$C_RESET"; printed=true; fi
     local job eng start now el last lmt age stall=""
     job=$(grep '^job=' "$f" | cut -d= -f2)
@@ -450,12 +476,12 @@ cmd_dash() {
 
 cmd_stop() {
   ensure_dirs
-  local killed=0 f pid
+  local killed=0 f pid pidstart
   for f in "$STATE"/*.meta; do
     [[ -e "$f" ]] || continue
     grep -q '^ended=' "$f" && continue
-    pid=$(grep '^pid=' "$f" | cut -d= -f2)
-    [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null && { kill "$pid" 2>/dev/null && killed=$((killed+1)); }
+    pid=$(grep '^pid=' "$f" | cut -d= -f2); pidstart=$(grep '^pidstart=' "$f" | cut -d= -f2-)
+    _pid_alive "$pid" "$pidstart" && { kill "$pid" 2>/dev/null && killed=$((killed+1)); }
   done
   [[ -f "$STATE/daemon.pid" ]] && kill "$(cat "$STATE/daemon.pid")" 2>/dev/null
   rm -f "$STATE/daemon.pid"