fix(agent-queue): verify pid start time to defeat pid reuse
Record pidstart (ps lstart) at launch and verify it in all liveness checks (_meta_active, status, stop) via _pid_alive, so a recycled pid can never be mistaken for our worker. Falls back to a plain liveness check (kill -0) when no start time was recorded.
This commit is contained in:
parent
a849a30e11
commit
4239648876
@ -112,6 +112,21 @@ _mtime() {
|
||||
stat -f %m "$1" 2>/dev/null || stat -c %Y "$1" 2>/dev/null || echo ""
|
||||
}
|
||||
|
||||
# _pidstart <pid> -> the process start time as reported by ps (whitespace-normalized).
|
||||
# Used as an identity token so a recycled pid is never mistaken for our worker.
|
||||
_pidstart() {
  # Prints the start time of pid $1 as reported by `ps -o lstart=`, with runs
  # of whitespace collapsed to single spaces. Prints nothing when ps fails
  # (e.g. no such process).
  #
  # Locale and timezone are pinned for the ps call: lstart is a human-readable
  # timestamp, so without pinning the same process could yield different
  # tokens for callers running under different LC_*/TZ settings, and a live
  # worker would then look like a recycled pid.
  # NOTE(review): some ps builds may derive lstart from uptime and drift by a
  # second between invocations — confirm on the target platforms.
  LC_ALL=C TZ=UTC0 ps -o lstart= -p "$1" 2>/dev/null | awk '{$1=$1;print}'
}
|
||||
|
||||
# _pid_alive <pid> <pidstart> -> 0 if the pid is live AND (when a start time was
|
||||
# recorded) its current start time still matches — defeating pid reuse.
|
||||
_pid_alive() {
  # Args:    $1 - pid read from a job's meta file
  #          $2 - start-time token recorded at launch (may be empty)
  # Returns: 0 if the pid is live and, when a token was recorded, the
  #          process's current start time still matches it; 1 otherwise.
  local pid=$1 want=$2 cur

  # Accept only a strictly positive decimal pid. To kill, 0 and negative
  # values address process groups rather than one process, so an empty or
  # corrupt meta entry must never be reported as a live worker (callers such
  # as the stop command go on to signal whatever pid passes this check).
  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1

  kill -0 "$pid" 2>/dev/null || return 1

  # No start time recorded: plain liveness is all that can be verified.
  [[ -n "$want" ]] || return 0

  # A differing (or unobtainable) start time means the pid was recycled or
  # the process exited in the meantime.
  cur=$(_pidstart "$pid")
  [[ "$cur" == "$want" ]]
}
|
||||
|
||||
# _dur_to_secs <dur> -> seconds. Accepts 90, 90s, 45m, 2h, 1d. Invalid/empty -> 0.
|
||||
_dur_to_secs() {
|
||||
local d=$1
|
||||
@ -139,7 +154,8 @@ _meta_active() {
|
||||
grep -q '^ended=' "$f" && return 1
|
||||
pid=$(grep '^pid=' "$f" | head -1 | cut -d= -f2)
|
||||
if [[ -n "$pid" ]]; then
|
||||
kill -0 "$pid" 2>/dev/null
|
||||
local pidstart; pidstart=$(grep '^pidstart=' "$f" | head -1 | cut -d= -f2-)
|
||||
_pid_alive "$pid" "$pidstart"
|
||||
return $?
|
||||
fi
|
||||
mt=$(_mtime "$f"); age=$(( $(date +%s) - ${mt:-0} ))
|
||||
@ -386,7 +402,7 @@ cmd_run() {
|
||||
echo "started=$(date +%s)"
|
||||
} > "$STATE/$job.meta"
|
||||
run_worker "$doing_file" &
|
||||
echo "pid=$!" >> "$STATE/$job.meta"
|
||||
{ echo "pid=$!"; echo "pidstart=$(_pidstart "$!")"; } >> "$STATE/$job.meta"
|
||||
log "▶ launching $C_BOLD$job$C_RESET (engine=$w_eng, lock=$w_key)"
|
||||
sleep 1
|
||||
running=$(active_workers)
|
||||
@ -420,8 +436,8 @@ cmd_status() {
|
||||
for f in "$STATE"/*.meta; do
|
||||
[[ -e "$f" ]] || continue
|
||||
grep -q '^ended=' "$f" && continue
|
||||
local pid; pid=$(grep '^pid=' "$f" | cut -d= -f2)
|
||||
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null || continue
|
||||
local pid pidstart; pid=$(grep '^pid=' "$f" | cut -d= -f2); pidstart=$(grep '^pidstart=' "$f" | cut -d= -f2-)
|
||||
_pid_alive "$pid" "$pidstart" || continue
|
||||
if ! $printed; then printf ' %sRUNNING%s\n' "$C_BOLD" "$C_RESET"; printed=true; fi
|
||||
local job eng start now el last lmt age stall=""
|
||||
job=$(grep '^job=' "$f" | cut -d= -f2)
|
||||
@ -450,12 +466,12 @@ cmd_dash() {
|
||||
|
||||
cmd_stop() {
|
||||
ensure_dirs
|
||||
local killed=0 f pid
|
||||
local killed=0 f pid pidstart
|
||||
for f in "$STATE"/*.meta; do
|
||||
[[ -e "$f" ]] || continue
|
||||
grep -q '^ended=' "$f" && continue
|
||||
pid=$(grep '^pid=' "$f" | cut -d= -f2)
|
||||
[[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null && { kill "$pid" 2>/dev/null && killed=$((killed+1)); }
|
||||
pid=$(grep '^pid=' "$f" | cut -d= -f2); pidstart=$(grep '^pidstart=' "$f" | cut -d= -f2-)
|
||||
_pid_alive "$pid" "$pidstart" && { kill "$pid" 2>/dev/null && killed=$((killed+1)); }
|
||||
done
|
||||
[[ -f "$STATE/daemon.pid" ]] && kill "$(cat "$STATE/daemon.pid")" 2>/dev/null
|
||||
rm -f "$STATE/daemon.pid"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user