95 lines
4.0 KiB
Python
Executable File
95 lines
4.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Silent-on-success Hermes health watchdog for ByteLyst.
|
|
|
|
Designed for a Hermes no-agent cron job. It prints nothing when healthy and
|
|
prints a concise Telegram-ready alert when an actionable problem is detected.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment, falling back to *default*.

    A malformed or empty value must not abort the watchdog at import time
    (before any check has run), so the problem is noted on stderr and the
    default is used instead.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        print(f"{name}={raw!r} is not an integer; using default {default}", file=sys.stderr)
        return default


# Root filesystem usage (percent) at or above which check_disk alerts.
DISK_WARN_PERCENT = _env_int("HERMES_WATCHDOG_DISK_WARN_PERCENT", 85)
# Maximum age, in minutes, of the newest backup-repo commit before it counts as stale.
BACKUP_STALE_MINUTES = _env_int("HERMES_WATCHDOG_BACKUP_STALE_MINUTES", 90)
# Job name searched for in the output of `hermes cron list`.
BACKUP_JOB_NAME = os.getenv("HERMES_WATCHDOG_BACKUP_JOB_NAME", "Sync Hermes persistent-data backup to GitHub")
# systemd unit queried with `systemctl is-active`.
GATEWAY_SERVICE = os.getenv("HERMES_WATCHDOG_GATEWAY_SERVICE", "hermes-gateway.service")
# Base directory for Hermes data; defaults to ~/.hermes.
HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes")))
|
|
|
|
|
|
def run(cmd: list[str], timeout: int = 20) -> subprocess.CompletedProcess[str]:
    """Execute *cmd* and return the finished process without raising on failure.

    Both output streams are captured and decoded as text. A non-zero exit
    status is left for the caller to inspect (``check=False``).
    ``subprocess.TimeoutExpired`` is raised if the command runs longer than
    *timeout* seconds.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout,
        check=False,
    )
    return completed
|
|
|
|
|
|
def check_gateway(alerts: list[str]) -> None:
    """Append an alert unless ``systemctl is-active`` prints ``active`` for the gateway unit.

    The alert quotes whatever systemctl printed on stdout, falling back to
    stderr and finally to ``unknown`` when both are empty.
    """
    probe = run(["systemctl", "is-active", GATEWAY_SERVICE])
    state = probe.stdout.strip()
    if state == "active":
        return
    detail = state or probe.stderr.strip() or "unknown"
    alerts.append(f"gateway service `{GATEWAY_SERVICE}` is not active: `{detail}`")
|
|
|
|
|
|
def _job_sections(listing: str, job_name: str) -> list[str]:
    """Return the blank-line-delimited sections of *listing* that mention *job_name*."""
    sections: list[str] = []
    current: list[str] = []
    for line in listing.splitlines():
        if line.strip():
            current.append(line)
        elif current:
            sections.append("\n".join(current))
            current = []
    if current:
        sections.append("\n".join(current))
    return [section for section in sections if job_name in section]


def check_backup_cron(alerts: list[str]) -> None:
    """Alert when the backup cron job is missing or its last run is not visibly ok.

    Runs ``hermes cron list`` and appends an alert to *alerts* when:

    * the command exits non-zero;
    * ``BACKUP_JOB_NAME`` does not appear in its combined stdout/stderr;
    * the part of the listing describing the backup job lacks ``Last run:``
      or `` ok``;
    * the sync script under ``HERMES_HOME/scripts`` has a modification time
      in the future.

    NOTE(review): the per-job scoping assumes ``hermes cron list`` separates
    job entries with blank lines -- confirm against the CLI output. If there
    are no blank lines the whole listing is one section and the status test
    covers the entire output.
    """
    result = run(["hermes", "cron", "list"], timeout=30)
    if result.returncode != 0:
        alerts.append(f"`hermes cron list` failed with exit {result.returncode}")
        return

    out = result.stdout + result.stderr
    if BACKUP_JOB_NAME not in out:
        alerts.append(f"backup cron job `{BACKUP_JOB_NAME}` was not found")
        return

    # Judge only the section(s) that name the backup job; testing the whole
    # listing would let any other job's "ok" mask a failed backup run.
    sections = _job_sections(out, BACKUP_JOB_NAME) or [out]
    if not any("Last run:" in section and " ok" in section for section in sections):
        alerts.append("backup cron last-run status is not visibly `ok` in `hermes cron list`")

    script_path = HERMES_HOME / "scripts" / "sync_hermes_persistent_backup.py"
    if script_path.exists():
        age_minutes = (datetime.now(timezone.utc).timestamp() - script_path.stat().st_mtime) / 60
        # Script mtime is not backup freshness; keep this as a weak sanity note only.
        if age_minutes < 0:
            alerts.append("backup sync script has a future modification time")
|
|
|
|
|
|
def check_backup_repo_freshness(alerts: list[str]) -> None:
    """Alert when the newest commit in the local backup repository is too old.

    The first candidate directory containing ``.git`` is inspected: the path
    from ``HERMES_WATCHDOG_BACKUP_REPO`` (default
    ``HERMES_HOME/persistent_backup_repo``), then two fallbacks in the home
    directory. If none exists nothing is reported. Otherwise an alert is
    appended when ``git log`` fails, when its output is not a plain integer
    timestamp, or when the commit is older than ``BACKUP_STALE_MINUTES``.
    """
    configured = os.getenv("HERMES_WATCHDOG_BACKUP_REPO", str(HERMES_HOME / "persistent_backup_repo"))
    search_order = (
        Path(configured),
        Path.home() / "hermes_persistent_backup",
        Path.home() / "hermes_persistent_backup_repo",
    )
    for repo_dir in search_order:
        if (repo_dir / ".git").exists():
            break
    else:
        # The backup cron may use its own path; cron status is the primary check.
        return

    git_log = run(["git", "-C", str(repo_dir), "log", "-1", "--format=%ct"], timeout=20)
    stamp = git_log.stdout.strip()
    if not (git_log.returncode == 0 and stamp.isdigit()):
        alerts.append(f"could not inspect backup repo freshness at `{repo_dir}`")
        return

    now = datetime.now(timezone.utc).timestamp()
    age_minutes = (now - int(stamp)) / 60
    if age_minutes > BACKUP_STALE_MINUTES:
        alerts.append(f"backup repo `{repo_dir}` latest commit is stale: {age_minutes:.0f} minutes old")
|
|
|
|
|
|
def check_disk(alerts: list[str]) -> None:
    """Append an alert when the root filesystem is at or above ``DISK_WARN_PERCENT``.

    The percentage is used/total for ``/``, rounded to the nearest integer.
    """
    total, used, _free = shutil.disk_usage("/")
    pct = round(used / total * 100)
    if pct < DISK_WARN_PERCENT:
        return
    alerts.append(f"root disk usage is high: {pct}% used (threshold {DISK_WARN_PERCENT}%)")
|
|
|
|
|
|
def main() -> int:
    """Run every health check and print one alert report if any of them fired.

    Each check appends its findings to a shared list. A check that raises is
    itself reported as an alert, so the remaining checks still run. Nothing
    is printed when the list stays empty.

    Returns:
        Always 0; problems are signalled by the printed report, not by the
        exit status.
    """
    alerts: list[str] = []
    for check in (check_gateway, check_backup_cron, check_backup_repo_freshness, check_disk):
        try:
            check(alerts)
        except Exception as exc:  # noqa: BLE001 - watchdog should alert, not crash silently
            # Name the exception type as well: some exceptions stringify to
            # an empty or cryptic message on their own.
            alerts.append(f"{check.__name__} errored: {type(exc).__name__}: {exc}")

    if alerts:
        print("🚨 ByteLyst Hermes watchdog alert")
        for item in alerts:
            print(f"- {item}")
        # Point at the unit that was actually checked, which may have been
        # overridden through HERMES_WATCHDOG_GATEWAY_SERVICE.
        print(
            f"\nSuggested first checks: `systemctl status {GATEWAY_SERVICE} --no-pager`, "
            "`hermes cron list`, `df -h /`."
        )
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|