From eb5c3b1c664bfe48622073a98e7a08eaef083c69 Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Thu, 5 Mar 2026 11:56:43 -0800 Subject: [PATCH] docs(a2a): add SafetyMonitorAgent (NomGap) + SyncDiagnosticsAgent (PeakPulse) specs; check off in DOMAIN_PRODUCTS.md --- docs/MCP+A2A/DOMAIN_PRODUCTS.md | 4 +- docs/MCP+A2A/agents/SafetyMonitorAgent.md | 165 +++++++++++++++++ docs/MCP+A2A/agents/SyncDiagnosticsAgent.md | 189 ++++++++++++++++++++ 3 files changed, 356 insertions(+), 2 deletions(-) create mode 100644 docs/MCP+A2A/agents/SafetyMonitorAgent.md create mode 100644 docs/MCP+A2A/agents/SyncDiagnosticsAgent.md diff --git a/docs/MCP+A2A/DOMAIN_PRODUCTS.md b/docs/MCP+A2A/DOMAIN_PRODUCTS.md index ad49880a..f0ab0f6f 100644 --- a/docs/MCP+A2A/DOMAIN_PRODUCTS.md +++ b/docs/MCP+A2A/DOMAIN_PRODUCTS.md @@ -120,7 +120,7 @@ nomgap.social.listGroupFasts(userId) - [x] Wire `@bytelyst/diagnostics-client` (`src/app/_layout.tsx` + `src/lib/diagnostics.ts`) — [6371266](https://github.com/saravanakumardb1/learning_ai_fastgap/commit/6371266) - [x] `nomgap.push.fire` convenience functions — _all 7 trigger types already existed in `src/api/push-api.ts`_ - [x] Fix `getAuthToken` type-safety + `void .start()` + `mmkvStorage` cast — [7a87a6c](https://github.com/saravanakumardb1/learning_ai_fastgap/commit/7a87a6c) -- [ ] SafetyMonitorAgent spec in `docs/agents/` +- [x] SafetyMonitorAgent spec — `docs/MCP+A2A/agents/SafetyMonitorAgent.md` --- @@ -163,7 +163,7 @@ peakpulse.weather.getSnapshot(sessionId) ← WeatherSnapshotDoc for a ses - [x] Wire `ByteLystDiagnostics` into iOS `PeakPulseApp.swift` (`DiagnosticsService.swift`) — [7d4b86f](https://github.com/saravanakumardb1/learning_ai_peakpulse/commit/7d4b86f) - [x] `peakpulse.sessions.export` endpoint (`GET /peak/sessions/:id/export`) — [bfeb465](https://github.com/saravanakumardb1/learning_ai_peakpulse/commit/bfeb465) - [x] Fix `pollIntervalSeconds` → `pollIntervalMs` in `DiagnosticsService.swift` — [690e007](https://github.com/saravanakumardb1/learning_ai_peakpulse/commit/690e007) -- [ ] SyncDiagnosticsAgent spec +- [x] SyncDiagnosticsAgent spec — `docs/MCP+A2A/agents/SyncDiagnosticsAgent.md` --- diff --git a/docs/MCP+A2A/agents/SafetyMonitorAgent.md b/docs/MCP+A2A/agents/SafetyMonitorAgent.md new file mode 100644 index 00000000..d5dbd0b8 --- /dev/null +++ b/docs/MCP+A2A/agents/SafetyMonitorAgent.md @@ -0,0 +1,165 @@ +# SafetyMonitorAgent — A2A Spec + +**Product:** NomGap +**Trigger:** Telemetry event `fasting.duration_milestone` where `hours IN [24, 42, 48, 72]`, or on-demand via `nomgap.safety.check(userId)` +**Output:** Push notification (`refeeding_reminder` or `extended_fast_warning`) + structured safety brief logged as telemetry event + +--- + +## Agent roster + +| Step | Agent | Input | Output | +| ---- | -------------------------- | ----------------------------------- | ------------------------------------------------------------------- | +| 1 | `FastStateInspectorAgent` | `userId`, current timestamp | Active fasting session doc + elapsed hours + protocol name | +| 2 | `ThresholdEvaluatorAgent` | Fasting session + elapsed hours | Safety level (`safe` / `caution` / `critical`) + recommended action | +| 3 | `SafetyNotificationAgent` | Safety level + userId + protocol | Push notification fired + telemetry event logged | +| 4 | `ExtendedFastHandoffAgent` | Session (hours ≥ 48) + safety brief | A2A handoff to `ExtendedFastDetectionAgent` (if escalation needed) | + +--- + +## Agent contracts + +### FastStateInspectorAgent + +```typescript +// Tool: nomgap.fasting.getSession (GET /fasting-sessions/active?userId=) +input: { + userId: string; +} +output: { + sessionId: string; + protocolId: string; + protocolName: string; + startedAt: string; // ISO 8601 + elapsedHours: number; + targetHours: number; + isActive: boolean; +} +``` + +### ThresholdEvaluatorAgent + +```typescript +// Pure in-process evaluation — no external calls +// Thresholds defined in nomgap safety rules: +// 0-24h: safe +// 24-42h: caution (electrolyte reminder at 24h) +// 42-48h: caution (pre-refeeding warning) +// 48h+: critical (refeeding_reminder + medical supervision flag) +// 72h+: critical (escalate to ExtendedFastDetectionAgent) +input: { + sessionId: string; + elapsedHours: number; + protocolName: string; + targetHours: number; +} +output: { + safetyLevel: 'safe' | 'caution' | 'critical'; + pushType: 'refeeding_reminder' | 'extended_fast_warning' | 'electrolyte_reminder' | null; + escalate: boolean; // true when elapsedHours >= 72 + messageVariables: Record; // { hours, protocolName, refeedingTip } +} +``` + +### SafetyNotificationAgent + +```typescript +// Tool: nomgap.push.fire (POST platform-service /push-triggers) +// Tool: platform.telemetry.events (POST platform-service /telemetry/events) +input: { + userId: string; + pushType: string; + messageVariables: Record; + sessionId: string; + safetyLevel: string; +} +output: { + notified: boolean; + pushTriggerId: string; + telemetryEventId: string; +} +``` + +### ExtendedFastHandoffAgent + +```typescript +// Fires only when ThresholdEvaluatorAgent.escalate === true (hours >= 72) +// A2A handoff artifact is emitted as a telemetry event so any subscribed +// ExtendedFastDetectionAgent can pick it up. +input: { + userId: string; + sessionId: string; + elapsedHours: number; + safetyLevel: 'critical'; +} +output: { + handoffEventId: string; + handoffPayload: { + productId: 'nomgap'; + agentTarget: 'ExtendedFastDetectionAgent'; + userId: string; + sessionId: string; + elapsedHours: number; + triggeredAt: string; + } +} +``` + +--- + +## Data model + +```typescript +interface SafetyCheckResult { + userId: string; + productId: 'nomgap'; + sessionId: string; + elapsedHours: number; + safetyLevel: 'safe' | 'caution' | 'critical'; + actionsTaken: string[]; // e.g. ['push:refeeding_reminder', 'telemetry:logged', 'handoff:ExtendedFastDetectionAgent'] + checkedAt: string; // ISO 8601 +} +``` + +--- + +## Threshold table + +| Elapsed hours | Safety level | Push type | Escalate? | +| ------------- | ------------ | ----------------------- | --------- | +| < 24 h | `safe` | none | no | +| 24 h | `caution` | `electrolyte_reminder` | no | +| 42 h | `caution` | `refeeding_reminder` | no | +| 48 h | `critical` | `refeeding_reminder` | no | +| 72 h+ | `critical` | `extended_fast_warning` | **yes** | + +--- + +## Error handling + +- If the user has no active fasting session, `FastStateInspectorAgent` returns `isActive: false` and the pipeline exits with no action. +- If `nomgap.push.fire` fails, the telemetry event is still logged (best-effort notification). +- If `ExtendedFastHandoffAgent` cannot emit the handoff event, the critical push notification still fires — escalation failure is non-blocking. +- Idempotent within a 1-hour window: a safety check that already fired the same `pushType` within the last 60 minutes is skipped. + +--- + +## MCP tool surface + +```typescript +nomgap.safety.check(userId: string): SafetyCheckResult +nomgap.safety.getThresholds(): ThresholdTable // returns the threshold table above +``` + +--- + +## Implementation checklist + +- [ ] `FastStateInspectorAgent` — thin wrapper over `GET /fasting-sessions/active` +- [ ] `ThresholdEvaluatorAgent` — pure evaluation logic (no I/O), unit-testable +- [ ] `SafetyNotificationAgent` — calls `nomgap.push.fire` + logs telemetry event +- [ ] `ExtendedFastHandoffAgent` — emits A2A handoff telemetry event for ≥ 72 h fasts +- [ ] Cron or event subscription: subscribe to `fasting.duration_milestone` telemetry events +- [ ] Deduplication guard: skip if same `(userId, pushType)` fired within 60 minutes +- [ ] MCP tool registration: `nomgap.safety.check` + `nomgap.safety.getThresholds` +- [ ] Unit tests for `ThresholdEvaluatorAgent` covering all 5 threshold rows diff --git a/docs/MCP+A2A/agents/SyncDiagnosticsAgent.md b/docs/MCP+A2A/agents/SyncDiagnosticsAgent.md new file mode 100644 index 00000000..4d264ba1 --- /dev/null +++ b/docs/MCP+A2A/agents/SyncDiagnosticsAgent.md @@ -0,0 +1,189 @@ +# SyncDiagnosticsAgent — A2A Spec + +**Product:** PeakPulse +**Trigger:** Telemetry event `sync_upload_failed` (from `TelemetryService.swift` / Android `SyncRepository`), or on-demand via `peakpulse.sync.diagnose(userId)` +**Output:** Active diagnostics session targeting the failing device + structured sync diagnostic report logged as telemetry event + +--- + +## Agent roster + +| Step | Agent | Input | Output | +| ---- | --------------------------- | -------------------------------------------------- | ------------------------------------------------------------------------ | +| 1 | `SyncFailureInspectorAgent` | `userId`, `deviceId`, failure event payload | Sync queue depth + last successful sync + failure pattern classification | +| 2 | `DiagnosticsSessionAgent` | `userId`, `deviceId`, `productId`, failure context | Active `platform.diagnostics` session ID targeting the device | +| 3 | `SyncRetryObserverAgent` | Session ID + retry window | Collected network traces from next sync attempt (or timeout report) | +| 4 | `SyncDiagnosticReportAgent` | Traces + failure classification + session stats | Structured diagnostic report + telemetry event + optional alert to admin | + +--- + +## Agent contracts + +### SyncFailureInspectorAgent + +```typescript +// Tool: peakpulse.syncStatus (GET /peak/sync-status?userId=) +// Tool: platform.telemetry.query (GET /api/telemetry/query?productId=peakpulse&eventType=sync_upload_failed) +input: { + userId: string; + deviceId?: string; + failureEvent: { + eventType: 'sync_upload_failed'; + platform: 'ios' | 'android'; + appVersion: string; + errorCode: string; // e.g. 'network_timeout', 'auth_401', 'cosmos_conflict' + sessionId?: string; // the PeakPulse session that failed to upload + timestamp: string; + }; +} +output: { + userId: string; + deviceId: string | null; + queueDepth: number; // number of unsynced sessions pending upload + lastSuccessfulSync: string | null; // ISO 8601 + failurePattern: 'transient' | 'auth' | 'persistent' | 'unknown'; + recentFailureCount: number; // failures in last 24 h + shouldStartDiagnostics: boolean; // true if recentFailureCount >= 2 or pattern != 'transient' +} +``` + +### DiagnosticsSessionAgent + +```typescript +// Tool: platform.diagnostics.sessions.create +// POST /api/diagnostics/sessions +// Only called when SyncFailureInspectorAgent.shouldStartDiagnostics === true +input: { + userId: string; + deviceId: string | null; + productId: 'peakpulse'; + failurePattern: string; + collectionLevel: 'debug' | 'trace'; // 'trace' for auth failures, 'debug' otherwise +} +output: { + sessionId: string; + sessionStatus: 'active'; + expiresAt: string; // ISO 8601 + collectionLevel: string; + captureNetwork: true; // always true for sync diagnostics — network traces are the signal +} +``` + +### SyncRetryObserverAgent + +```typescript +// Tool: platform.diagnostics.sessions.get (polls status) +// Tool: platform.diagnostics.sessions.getLogs (fetches captured log entries) +// Tool: platform.diagnostics.sessions.getTraces (fetches OTel network spans) +// Polls until a new sync attempt is captured OR the session expires +input: { + sessionId: string; + pollIntervalSeconds: number; // default 30 + maxWaitMinutes: number; // default 15 +} +output: { + syncAttemptDetected: boolean; + networkTraces: unknown[]; // raw OTel spans from the sync attempt + logEntries: unknown[]; // filtered for 'sync' module + capturedAt: string | null; + timedOut: boolean; +} +``` + +### SyncDiagnosticReportAgent + +```typescript +// Tool: platform.telemetry.events (POST /api/telemetry/events — logs the report as a structured event) +// Tool: extraction.run (POST /api/extract — optional: extract error patterns from log text) +input: { + userId: string; + deviceId: string | null; + sessionId: string; + failurePattern: string; + networkTraces: unknown[]; + logEntries: unknown[]; + syncAttemptDetected: boolean; + timedOut: boolean; +} +output: { + reportId: string; // telemetry event ID + rootCauseSummary: string; // short natural-language summary + recommendedAction: string; // e.g. 'force token refresh', 'check Cosmos conflict resolution', 'retry on wifi' + extractedEntities: unknown[]; // from extraction-service (error codes, URLs, status codes) + diagnosticSessionId: string; +} +``` + +--- + +## Data model + +```typescript +interface SyncDiagnosticReport { + productId: 'peakpulse'; + reportId: string; + userId: string; + deviceId: string | null; + failurePattern: 'transient' | 'auth' | 'persistent' | 'unknown'; + diagnosticsSessionId: string; + syncAttemptDetected: boolean; + rootCauseSummary: string; + recommendedAction: string; + extractedEntities: Array<{ + extraction_class: string; + extraction_text: string; + }>; + generatedAt: string; // ISO 8601 +} +``` + +--- + +## Decision gate: when to start diagnostics + +``` +sync_upload_failed event received + │ + ▼ +recentFailureCount >= 2? ──── yes ──► start DiagnosticsSessionAgent + │ + no + │ +failurePattern == 'auth'? ──── yes ──► start DiagnosticsSessionAgent (trace level) + │ + no + │ + └──► log telemetry only, skip diagnostics (transient, single failure) +``` + +--- + +## Error handling + +- If `DiagnosticsSessionAgent` fails to create a session (e.g., platform-service unavailable), the report is still generated from the failure event alone — diagnostics collection is best-effort. +- If `SyncRetryObserverAgent` times out (no retry within `maxWaitMinutes`), the report is marked `syncAttemptDetected: false` with `timedOut: true`. +- If `extraction-service` is unavailable, `SyncDiagnosticReportAgent` skips entity extraction and sets `extractedEntities: []`. +- The diagnostics session is closed (`status: completed`) after the report is generated regardless of outcome. + +--- + +## MCP tool surface + +```typescript +peakpulse.sync.diagnose(userId: string, deviceId?: string): SyncDiagnosticReport +peakpulse.sync.status(userId: string): { queueDepth: number; lastSuccessfulSync: string | null; recentFailureCount: number } +``` + +--- + +## Implementation checklist + +- [ ] `SyncFailureInspectorAgent` — wraps `peakpulse.syncStatus` + telemetry query for recent failures +- [ ] `DiagnosticsSessionAgent` — calls `platform.diagnostics.sessions.create` with `captureNetwork: true` +- [ ] `SyncRetryObserverAgent` — polls `getLogs` + `getTraces` on active session until retry detected or timeout +- [ ] `SyncDiagnosticReportAgent` — assembles report, calls `extraction.run` (best-effort), logs telemetry event +- [ ] Event subscription: subscribe to `sync_upload_failed` telemetry events for `productId=peakpulse` +- [ ] Decision gate: implement `shouldStartDiagnostics` threshold (≥ 2 failures or auth pattern) +- [ ] Session cleanup: close diagnostics session after report is generated +- [ ] MCP tool registration: `peakpulse.sync.diagnose` + `peakpulse.sync.status` +- [ ] Integration test: simulate `sync_upload_failed` event → verify session created + report generated