fix(agent-queue): verify pid start time to defeat pid reuse

Record pidstart (ps lstart) at launch and verify it in all liveness checks
(_meta_active, status, stop) via _pid_alive, so a recycled pid can never be
mistaken for our worker. Falls back to a plain liveness check when no start time was recorded.
This commit is contained in:
saravanakumardb1 2026-05-28 22:24:50 -07:00
parent a849a30e11
commit 4239648876

View File

@ -112,6 +112,21 @@ _mtime() {
stat -f %m "$1" 2>/dev/null || stat -c %Y "$1" 2>/dev/null || echo ""
}
# _pidstart <pid> -> prints the start time `ps` reports for <pid> (its `lstart`
# column, header suppressed), with runs of whitespace collapsed to single
# spaces and leading/trailing blanks removed.
# Callers store this string as an identity token: a recycled pid will report a
# different start time, so it is never mistaken for our worker.
# Prints nothing when `ps` produces no output for <pid>; ps diagnostics are
# discarded.
_pidstart() {
  local target=$1
  # Re-assigning a field makes awk rebuild the record with single-space
  # separators; the bare `1` pattern then prints the rebuilt record.
  ps -o lstart= -p "$target" 2>/dev/null \
    | awk '{ $1 = $1 } 1'
}
# _pid_alive <pid> <pidstart> -> 0 if <pid> is live AND (when a start time was
# recorded) its current start time still matches — defeating pid reuse.
#   <pid>       process id to probe. Must be a positive decimal integer; any
#               other value (empty, 0, negative, non-numeric) is reported as
#               not alive without ever being handed to `kill`.
#   <pidstart>  start-time token previously produced by _pidstart. Empty means
#               none was recorded, so only plain liveness is checked.
# Returns 0 when the process is alive (and matches <pidstart> if given),
# 1 otherwise.
_pid_alive() {
  local pid=${1-} want=${2-} cur
  # Reject anything that is not a positive integer. `kill -0 0` addresses the
  # caller's own process group and therefore always succeeds, which would make
  # a corrupt "pid=0" record look like a live worker (and callers that go on to
  # `kill "$pid"` would then signal their own group).
  [[ "$pid" =~ ^0*[1-9][0-9]*$ ]] || return 1
  # Signal 0 delivers nothing; it only tests that the pid exists and is ours
  # to signal.
  kill -0 "$pid" 2>/dev/null || return 1
  # No recorded start time: fall back to plain liveness.
  [[ -z "$want" ]] && return 0
  # Identity check: the live process must have started when ours did.
  cur=$(_pidstart "$pid")
  [[ "$cur" == "$want" ]]
}
# _dur_to_secs <dur> -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0.
_dur_to_secs() {
local d=$1
@ -139,7 +154,8 @@ _meta_active() {
grep -q '^ended=' "$f" && return 1
pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
if [[ -n "$pid" ]]; then
kill -0 "$pid" 2>/dev/null
local pidstart; pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
_pid_alive "$pid" "$pidstart"
return $?
fi
mt=$(_mtime "$f"); age=$(( $(date +%s) - ${mt:-0} ))
@ -386,7 +402,7 @@ cmd_run() {
echo "started=$(date +%s)"
} > "$STATE/$job.meta"
run_worker "$doing_file" &
echo "pid=$!" >> "$STATE/$job.meta"
{ echo "pid=$!"; echo "pidstart=$(_pidstart "$!")"; } >> "$STATE/$job.meta"
log "▶ launching $C_BOLD$job$C_RESET (engine=$w_eng, lock=$w_key)"
sleep 1
running=$(active_workers)
@ -420,8 +436,8 @@ cmd_status() {
for f in "$STATE"/*.meta; do
[[ -e "$f" ]] || continue
grep -q '^ended=' "$f" && continue
local pid; pid=$(grep '^pid=' "$f" | cut -d= -f2)
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null || continue
local pid pidstart; pid=$(grep '^pid=' "$f" | cut -d= -f2); pidstart=$(grep '^pidstart=' "$f" | cut -d= -f2-)
_pid_alive "$pid" "$pidstart" || continue
if ! $printed; then printf ' %sRUNNING%s\n' "$C_BOLD" "$C_RESET"; printed=true; fi
local job eng start now el last lmt age stall=""
job=$(grep '^job=' "$f" | cut -d= -f2)
@ -450,12 +466,12 @@ cmd_dash() {
cmd_stop() {
ensure_dirs
local killed=0 f pid
local killed=0 f pid pidstart
for f in "$STATE"/*.meta; do
[[ -e "$f" ]] || continue
grep -q '^ended=' "$f" && continue
pid=$(grep '^pid=' "$f" | cut -d= -f2)
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null && { kill "$pid" 2>/dev/null && killed=$((killed+1)); }
pid=$(grep '^pid=' "$f" | cut -d= -f2); pidstart=$(grep '^pidstart=' "$f" | cut -d= -f2-)
_pid_alive "$pid" "$pidstart" && { kill "$pid" 2>/dev/null && killed=$((killed+1)); }
done
[[ -f "$STATE/daemon.pid" ]] && kill "$(cat "$STATE/daemon.pid")" 2>/dev/null
rm -f "$STATE/daemon.pid"