#!/usr/bin/env python3
"""Silent-on-success Hermes health watchdog for ByteLyst.

Designed for a Hermes no-agent cron job. It prints nothing when healthy and
prints a concise Telegram-ready alert when an actionable problem is detected.

Configuration is read from environment variables at import time:

* ``HERMES_WATCHDOG_DISK_WARN_PERCENT`` (default 85)
* ``HERMES_WATCHDOG_MEMORY_WARN_PERCENT`` (default 90)
* ``HERMES_WATCHDOG_BACKUP_STALE_MINUTES`` (default 90)
* ``HERMES_WATCHDOG_BACKUP_JOB_NAME``
* ``HERMES_WATCHDOG_GATEWAY_SERVICE`` (default ``hermes-gateway.service``)
* ``HERMES_WATCHDOG_DOCKER_CONTAINERS`` (comma-separated container names)
* ``HERMES_WATCHDOG_BACKUP_REPO`` (read when the repo freshness check runs)
* ``HERMES_HOME`` (default ``~/.hermes``)
"""

from __future__ import annotations

import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

# Problems found while parsing configuration. They are reported by main() as
# regular alerts so a typo in an env var cannot take the watchdog down silently.
_CONFIG_ALERTS: list[str] = []


def _env_int(name: str, default: int) -> int:
    """Return env var *name* parsed as an int, or *default* if unset or invalid.

    An invalid value is not fatal: a message is queued in ``_CONFIG_ALERTS``
    and *default* is used, so the remaining checks still run.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        _CONFIG_ALERTS.append(
            f"ignoring invalid integer `{raw}` in `{name}`; using default {default}"
        )
        return default


DISK_WARN_PERCENT = _env_int("HERMES_WATCHDOG_DISK_WARN_PERCENT", 85)
MEMORY_WARN_PERCENT = _env_int("HERMES_WATCHDOG_MEMORY_WARN_PERCENT", 90)
BACKUP_STALE_MINUTES = _env_int("HERMES_WATCHDOG_BACKUP_STALE_MINUTES", 90)
BACKUP_JOB_NAME = os.getenv(
    "HERMES_WATCHDOG_BACKUP_JOB_NAME",
    "Sync Hermes persistent-data backup to GitHub",
)
GATEWAY_SERVICE = os.getenv("HERMES_WATCHDOG_GATEWAY_SERVICE", "hermes-gateway.service")
# Comma-separated list; blank entries are dropped, so an empty value disables
# the container check entirely.
DOCKER_CONTAINERS = [
    item.strip()
    for item in os.getenv(
        "HERMES_WATCHDOG_DOCKER_CONTAINERS", "caddy,gitea-npm-registry"
    ).split(",")
    if item.strip()
]
HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes")))


def run(cmd: list[str], timeout: int = 20) -> subprocess.CompletedProcess[str]:
    """Run *cmd* without a shell, capturing stdout/stderr as text.

    A non-zero exit status does not raise (``check=False``); callers inspect
    ``returncode``. A missing executable or an expired *timeout* does raise
    (``OSError`` / ``subprocess.TimeoutExpired``); main() turns those into
    alerts.
    """
    return subprocess.run(
        cmd, text=True, capture_output=True, timeout=timeout, check=False
    )


def check_gateway(alerts: list[str]) -> None:
    """Append an alert unless ``systemctl is-active`` reports the gateway as active."""
    result = run(["systemctl", "is-active", GATEWAY_SERVICE])
    state = result.stdout.strip()
    if state != "active":
        detail = state or result.stderr.strip() or "unknown"
        alerts.append(
            f"gateway service `{GATEWAY_SERVICE}` is not active: `{detail}`"
        )


def check_backup_cron(alerts: list[str]) -> None:
    """Append an alert if the backup cron job is missing or not visibly ok.

    Inspects the combined stdout/stderr of ``hermes cron list``.
    """
    result = run(["hermes", "cron", "list"], timeout=30)
    out = result.stdout + result.stderr
    if result.returncode != 0:
        alerts.append(f"`hermes cron list` failed with exit {result.returncode}")
        return
    if BACKUP_JOB_NAME not in out:
        alerts.append(f"backup cron job `{BACKUP_JOB_NAME}` was not found")
        return
    # NOTE(review): this scans the whole listing, not just the backup job's
    # entry, so another job's "Last run: ... ok" can mask a failed backup.
    # Narrowing it requires knowing the `hermes cron list` output format.
    if "Last run:" not in out or " ok" not in out:
        alerts.append(
            "backup cron last-run status is not visibly `ok` in `hermes cron list`"
        )

    script_path = HERMES_HOME / "scripts" / "sync_hermes_persistent_backup.py"
    if script_path.exists():
        age_minutes = (
            datetime.now(timezone.utc).timestamp() - script_path.stat().st_mtime
        ) / 60
        # Script mtime is not backup freshness; keep this as a weak sanity note only.
        if age_minutes < 0:
            alerts.append("backup sync script has a future modification time")


def check_backup_repo_freshness(alerts: list[str]) -> None:
    """Append an alert if the newest commit in the backup repo is too old.

    The first candidate directory containing ``.git`` is inspected. If none
    exists the check is skipped without alerting.
    """
    repo = Path(
        os.getenv("HERMES_WATCHDOG_BACKUP_REPO", str(HERMES_HOME / "persistent_backup_repo"))
    )
    candidates = [
        repo,
        Path.home() / "hermes_persistent_backup",
        Path.home() / "hermes_persistent_backup_repo",
    ]
    existing = next((p for p in candidates if (p / ".git").exists()), None)
    if not existing:
        # The backup cron may use its own path; cron status is the primary check.
        return

    # %ct is the committer date as a Unix timestamp.
    result = run(["git", "-C", str(existing), "log", "-1", "--format=%ct"], timeout=20)
    stamp = result.stdout.strip()
    if result.returncode != 0 or not stamp.isdigit():
        alerts.append(f"could not inspect backup repo freshness at `{existing}`")
        return

    age_minutes = (datetime.now(timezone.utc).timestamp() - int(stamp)) / 60
    if age_minutes > BACKUP_STALE_MINUTES:
        alerts.append(
            f"backup repo `{existing}` latest commit is stale: {age_minutes:.0f} minutes old"
        )


def check_disk(alerts: list[str]) -> None:
    """Append an alert if root filesystem usage reaches ``DISK_WARN_PERCENT``."""
    usage = shutil.disk_usage("/")
    pct = int(round((usage.used / usage.total) * 100))
    if pct >= DISK_WARN_PERCENT:
        alerts.append(
            f"root disk usage is high: {pct}% used (threshold {DISK_WARN_PERCENT}%)"
        )


def check_memory(alerts: list[str]) -> None:
    """Append an alert if memory use from /proc/meminfo reaches ``MEMORY_WARN_PERCENT``.

    Usage is computed as ``(MemTotal - MemAvailable) / MemTotal``.
    """
    meminfo: dict[str, int] = {}
    for line in Path("/proc/meminfo").read_text(encoding="utf-8").splitlines():
        parts = line.split()
        if len(parts) >= 2:
            meminfo[parts[0].rstrip(":")] = int(parts[1])

    total = meminfo.get("MemTotal", 0)
    available = meminfo.get("MemAvailable")
    # Only a *missing* MemAvailable is unreadable; a reported value of 0 is
    # real exhaustion and must surface as a pressure alert below.
    if total <= 0 or available is None:
        alerts.append("could not read memory pressure from /proc/meminfo")
        return

    used_pct = int(round(((total - available) / total) * 100))
    if used_pct >= MEMORY_WARN_PERCENT:
        alerts.append(
            f"memory pressure is high: {used_pct}% used (threshold {MEMORY_WARN_PERCENT}%)"
        )


def check_docker_containers(alerts: list[str]) -> None:
    """Append an alert for each configured container absent from ``docker ps``.

    Does nothing when ``DOCKER_CONTAINERS`` is empty.
    """
    if not DOCKER_CONTAINERS:
        return
    docker = shutil.which("docker")
    if not docker:
        alerts.append("docker executable not found; cannot verify critical containers")
        return

    result = run([docker, "ps", "--format", "{{.Names}}"], timeout=20)
    if result.returncode != 0:
        alerts.append(
            "`docker ps` failed while checking critical containers: "
            f"{result.stderr.strip() or result.stdout.strip()}"
        )
        return

    running = set(result.stdout.splitlines())
    missing = [name for name in DOCKER_CONTAINERS if name not in running]
    if missing:
        alerts.append(f"critical Docker container(s) not running: {', '.join(missing)}")


def main() -> int:
    """Run every check and print one alert report if anything was flagged.

    Returns:
        Always 0. Alerts are signalled by stdout output, not by exit status.
    """
    # Start with configuration problems found at import time.
    alerts: list[str] = list(_CONFIG_ALERTS)
    checks = (
        check_gateway,
        check_backup_cron,
        check_backup_repo_freshness,
        check_disk,
        check_memory,
        check_docker_containers,
    )
    for check in checks:
        try:
            check(alerts)
        except Exception as exc:  # noqa: BLE001 - watchdog should alert, not crash silently
            alerts.append(f"{check.__name__} errored: {exc}")

    if alerts:
        print("🚨 ByteLyst Hermes watchdog alert")
        for item in alerts:
            print(f"- {item}")
        # Use the configured service name so the hint matches the unit checked.
        print(
            "\nSuggested first checks: "
            f"`systemctl status {GATEWAY_SERVICE} --no-pager`, "
            "`hermes cron list`, `df -h /`, `free -h`, `docker ps`."
        )
    # NOTE(review): exit status is 0 even when alerts were printed, presumably
    # so the cron runner relays stdout instead of reporting a failed job - confirm.
    return 0


if __name__ == "__main__":
    sys.exit(main())