feat: detect stale VM automation
Some checks failed
pre-commit / pre-commit (push) Failing after 32s
Some checks failed
pre-commit / pre-commit (push) Failing after 32s
This commit is contained in:
parent
3d5f369f3d
commit
9210a8890f
@ -357,7 +357,7 @@ Effective `sshd -T` settings showed:
|
|||||||
- [ ] Inventory root/user crontabs, `/etc/cron.d`, systemd timers, Hermes cron, and Gitea Actions schedules.
|
- [ ] Inventory root/user crontabs, `/etc/cron.d`, systemd timers, Hermes cron, and Gitea Actions schedules.
|
||||||
- [x] Remove or update stale `/opt/bytelyst/bytelyst-devops-tools/...` references after confirming replacements.
|
- [x] Remove or update stale `/opt/bytelyst/bytelyst-devops-tools/...` references after confirming replacements.
|
||||||
- [ ] Add owner, purpose, expected output, and alert channel for every job.
|
- [ ] Add owner, purpose, expected output, and alert channel for every job.
|
||||||
- [ ] Add a stale-job detector for missing script paths and failed systemd units.
|
- [x] Add a stale-job detector for missing script paths and failed systemd units.
|
||||||
|
|
||||||
**Acceptance criteria:**
|
**Acceptance criteria:**
|
||||||
|
|
||||||
@ -400,7 +400,7 @@ Effective `sshd -T` settings showed:
|
|||||||
- [ ] Fix/retire unhealthy containers.
|
- [ ] Fix/retire unhealthy containers.
|
||||||
- [x] Resolve `hermes-root-backup.service` failed state.
|
- [x] Resolve `hermes-root-backup.service` failed state.
|
||||||
- [x] Decide and document Gitea runner active/disabled state.
|
- [x] Decide and document Gitea runner active/disabled state.
|
||||||
- [ ] Add missing-script checks. Stale root cron path was fixed on 2026-05-27.
|
- [x] Add missing-script checks. Stale root cron path was fixed on 2026-05-27.
|
||||||
- [ ] Apply pending security/runtime updates in a maintenance window.
|
- [ ] Apply pending security/runtime updates in a maintenance window.
|
||||||
|
|
||||||
**Exit criteria:** no unexpected failed units, no ignored unhealthy required containers, no stale cron paths, and runner state is intentional.
|
**Exit criteria:** no unexpected failed units, no ignored unhealthy required containers, no stale cron paths, and runner state is intentional.
|
||||||
@ -497,6 +497,24 @@ Minimum post-checks for Phase 1:
|
|||||||
- Record a small dedicated smoke workflow that does not need production secrets, so runner health is proven by a controlled workflow rather than incidental queued work.
|
- Record a small dedicated smoke workflow that does not need production secrets, so runner health is proven by a controlled workflow rather than incidental queued work.
|
||||||
- Add runner health to VM observability so enabled-but-inactive drift is caught automatically.
|
- Add runner health to VM observability so enabled-but-inactive drift is caught automatically.
|
||||||
|
|
||||||
|
### 2026-05-27 — Phase 2 stale automation detector
|
||||||
|
|
||||||
|
**Changed:**
|
||||||
|
|
||||||
|
- Extended `scripts/VMs/HostingerVM/vm-health-check.sh` with an `AUTOMATION DRIFT` section.
|
||||||
|
- The daily health check now reports failed systemd units and root crontab script paths that no longer exist.
|
||||||
|
- Made optional `/var/log/vm-health-check.log` writes silent when the script runs in a restricted/read-only context.
|
||||||
|
|
||||||
|
**Verified:**
|
||||||
|
|
||||||
|
- `bash -n scripts/VMs/HostingerVM/vm-health-check.sh` passed.
|
||||||
|
- Restricted `--json` run stayed quiet on log-write failure and reported the new checks.
|
||||||
|
- Host-permission `--json` run reported `failed_units=OK` and `cron_missing_paths=OK`.
|
||||||
|
|
||||||
|
**Residual risk:**
|
||||||
|
|
||||||
|
- The detector currently covers root crontab and failed systemd units. Full ownership inventory still needs `/etc/cron.d`, user crontabs, Hermes cron, Gitea schedules, owners, outputs, and alert channels.
|
||||||
|
|
||||||
## Do Not Start With
|
## Do Not Start With
|
||||||
|
|
||||||
- Rootless Docker migration.
|
- Rootless Docker migration.
|
||||||
|
|||||||
@ -8,6 +8,10 @@ This directory is the preferred home for self-contained operational scripts.
|
|||||||
- Supported.
|
- Supported.
|
||||||
- Purpose: update and harden Ubuntu VMs with unattended upgrades, UFW, and fail2ban.
|
- Purpose: update and harden Ubuntu VMs with unattended upgrades, UFW, and fail2ban.
|
||||||
- Risk level: high, because it modifies packages, firewall rules, and reboot behavior.
|
- Risk level: high, because it modifies packages, firewall rules, and reboot behavior.
|
||||||
|
- `VMs/HostingerVM/vm-health-check.sh`
|
||||||
|
- Supported.
|
||||||
|
- Purpose: read-only VM health and drift check for disk, memory, swap, Docker health, failed systemd units, and stale root crontab script paths.
|
||||||
|
- Risk level: low, because it is read-only apart from an optional local log write.
|
||||||
|
|
||||||
## Conventions
|
## Conventions
|
||||||
|
|
||||||
|
|||||||
@ -69,6 +69,11 @@ declare -A JSON_DATA
|
|||||||
|
|
||||||
# ── Helpers ─────────────────────────────────────────────────────────────────
|
# ── Helpers ─────────────────────────────────────────────────────────────────
|
||||||
#######################################
# Append one UTC-timestamped line to the health-check log, best effort.
# Logging is optional: when the log cannot be written (restricted or
# read-only context) the function stays silent and still succeeds.
# Globals:   LOG_FILE (read) - path of the log file to append to
# Arguments: message words; joined with single spaces into one line
# Outputs:   appends "[YYYY-MM-DDTHH:MM:SSZ] message" to LOG_FILE
# Returns:   0 always
#######################################
log_to_file() {
  local log_dir
  log_dir="$(dirname "$LOG_FILE")"

  # Skip early when the write is known to be impossible: the file exists but
  # is not writable, or it does not exist and its directory is not writable.
  if [[ ( -e "$LOG_FILE" && ! -w "$LOG_FILE" ) || ( ! -e "$LOG_FILE" && ! -w "$log_dir" ) ]]; then
    return 0
  fi

  # Redirections are applied left to right, so stderr must be silenced BEFORE
  # the log file is opened; otherwise a failed open (LOG_FILE is a directory,
  # dangling symlink, race with the check above) still prints a shell error.
  printf '%s\n' "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" 2>/dev/null >> "$LOG_FILE" || true
}
|
||||||
|
|
||||||
@ -256,6 +261,47 @@ check_logs() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Report automation drift: failed systemd units and script paths referenced
# by the invoking user's crontab that no longer exist on disk.
# Globals:   none read directly; results are emitted through `record`
# Arguments: none
# Outputs:   calls `header` once, then `record` twice
#            (keys: failed_units, cron_missing_paths)
# Returns:   status of the final `record` call
#######################################
check_automation_drift() {
  header "AUTOMATION DRIFT"

  # Comma-joined names of failed units; empty when there are none or when
  # systemctl is unavailable (errors are deliberately suppressed).
  local failed_units
  failed_units=$(systemctl --failed --no-legend --plain 2>/dev/null | awk '{print $1}' | paste -sd, - || true)
  if [[ -n "$failed_units" ]]; then
    record failed_units WARN "$failed_units" "Failed systemd units: $failed_units"
  else
    record failed_units OK "0 failed" "No failed systemd units"
  fi

  local missing_paths=()
  local cron_line path clean_path
  while IFS= read -r cron_line; do
    # Skip blank lines, comments, and NAME=value environment assignments.
    [[ -z "$cron_line" || "$cron_line" =~ ^[[:space:]]*# ]] && continue
    [[ "$cron_line" =~ ^[A-Za-z_][A-Za-z0-9_]*= ]] && continue

    while IFS= read -r path; do
      # Strip ALL trailing quoting/punctuation, in any order and any count.
      # A single fixed-order pass would leave e.g. /opt/x/a.sh')" as
      # /opt/x/a.sh' and report an existing script as missing.
      clean_path="$path"
      while :; do
        case "$clean_path" in
          *\" | *\' | *, | *\; | *\)) clean_path="${clean_path%?}" ;;
          *) break ;;
        esac
      done

      # Defensive guard for log/runtime/pseudo filesystems, which never hold
      # scripts (the grep below already limits which prefixes are extracted).
      case "$clean_path" in
        /var/log/* | /run/* | /proc/* | /sys/* | /dev/*) continue ;;
      esac

      # Only script-like paths are checked, so data and log arguments are
      # not reported as missing automation.
      if [[ "$clean_path" == *.sh || "$clean_path" == *.py || "$clean_path" == */scripts/* ]]; then
        [[ -e "$clean_path" ]] || missing_paths+=("$clean_path")
      fi
    done < <(grep -oE '/(opt|root|home|usr/local|etc)/[^[:space:]|;&<>]+' <<< "$cron_line" || true)
  done < <(crontab -l 2>/dev/null || true)

  if (( ${#missing_paths[@]} > 0 )); then
    record cron_missing_paths WARN "${missing_paths[*]}" "Cron references missing path(s): ${missing_paths[*]}"
  else
    record cron_missing_paths OK "0 missing" "No missing script paths in root crontab"
  fi
}
|
||||||
|
|
||||||
# ── Run all checks ───────────────────────────────────────────────────────────
|
# ── Run all checks ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
if ! $JSON_MODE && ! $QUIET; then
|
if ! $JSON_MODE && ! $QUIET; then
|
||||||
@ -269,6 +315,7 @@ check_swap
|
|||||||
check_docker_containers
|
check_docker_containers
|
||||||
check_docker_disk
|
check_docker_disk
|
||||||
check_logs
|
check_logs
|
||||||
|
check_automation_drift
|
||||||
|
|
||||||
# ── Summary ──────────────────────────────────────────────────────────────────
|
# ── Summary ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user