From bf7769bdaa15f5e93ba5e4ec49d2f5569874d617 Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Thu, 5 Mar 2026 10:41:02 -0800 Subject: [PATCH] fix(diagnostics-client): use session-scoped ingest endpoints; update MCP+A2A docs --- docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md | 138 +++++++++++ docs/MCP+A2A/DOMAIN_DASHBOARDS.md | 54 ++++ docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md | 70 ++++++ docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md | 82 ++++++ docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md | 120 +++++++++ docs/MCP+A2A/DOMAIN_PRODUCTS.md | 80 ++++++ docs/MCP+A2A/EXECUTION_CHECKLIST.md | 59 +++++ docs/MCP+A2A/IMPLEMENTATION_PLAN.md | 261 ++++++++++++++++++++ docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md | 155 ++++++++++++ docs/MCP+A2A/README.md | 35 +++ docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md | 173 +++++++++++++ packages/diagnostics-client/src/client.ts | 150 +++++++---- 12 files changed, 1325 insertions(+), 52 deletions(-) create mode 100644 docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md create mode 100644 docs/MCP+A2A/DOMAIN_DASHBOARDS.md create mode 100644 docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md create mode 100644 docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md create mode 100644 docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md create mode 100644 docs/MCP+A2A/DOMAIN_PRODUCTS.md create mode 100644 docs/MCP+A2A/EXECUTION_CHECKLIST.md create mode 100644 docs/MCP+A2A/IMPLEMENTATION_PLAN.md create mode 100644 docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md create mode 100644 docs/MCP+A2A/README.md create mode 100644 docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md diff --git a/docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md b/docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md new file mode 100644 index 00000000..18a8c857 --- /dev/null +++ b/docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md @@ -0,0 +1,138 @@ +# A2A Orchestration Framework — Recommended Pattern (ByteLyst) + +## Intent + +Standardize how multiple agents collaborate on platform + product tasks (support, ops, releases, prompt iterations) with: + +- explicit roles +- 
explicit handoff artifacts +- consistent safety + audit + +## Canonical roles (agents) + +### 1) Router / Dispatcher + +- **Responsibility** + - decide which specialist agent(s) to involve + - enforce policy (role gating, PII constraints) +- **Inputs** + - user request + current environment (productId, repo) +- **Outputs** + - sequence of tasks (plan) + handoff payloads + +### 2) Telemetry Analyst + +- **Responsibility** + - find clusters, correlate by version/platform, propose next steps +- **Calls** + - telemetry MCP tools + +### 3) Diagnostics Orchestrator + +- **Responsibility** + - start / monitor remote diagnostics sessions + - summarize results +- **Calls** + - diagnostics MCP tools + +### 4) Extraction Task Designer + +- **Responsibility** + - design extraction task prompts/examples + - coordinate evaluation runs +- **Calls** + - extraction MCP tools + +### 5) Ops Executor + +- **Responsibility** + - execute mutating ops (jobs trigger, maintenance windows, flag changes) +- **Calls** + - platform MCP tools + +### 6) Fix / PR Agent + +- **Responsibility** + - implement code changes + - ensure tests and conventions + +### 7) Report Writer + +- **Responsibility** + - produce a final summary in a consistent format + - include links/IDs (clusterId, sessionId, policyId) + +## Handoff artifacts (contracts) + +Examples below are illustrative. All artifacts must be explicitly scoped to a `productId`. + +### A) Support incident brief + +```json +{ + "productId": "", + "userReport": { + "summary": "dictation inserts nothing in Messages", + "platform": "ios", + "channel": "keyboard_extension", + "appVersion": "1.2.0", + "buildNumber": "35", + "userId": "usr_...", + "anonymousInstallId": "..." + }, + "timeWindow": { "from": "...", "to": "..." } +} +``` + +Mapping note: `userReport.anonymousInstallId` maps to diagnostics session targeting via `targetAnonymousId`. 
+ +### B) Telemetry findings + +```json +{ + "clusters": [{ "clusterId": "...", "pk": "...", "severity": "error" }], + "topHypotheses": ["permission denied", "insertText noop"], + "recommendedActions": ["start diagnostics session", "enable debug policy for one user"] +} +``` + +### C) Diagnostics session plan + +```json +{ + "target": { "userId": "...", "deviceId": "..." }, + "collection": { "level": "trace", "captureNetwork": true, "captureLogs": true }, + "expiresInMinutes": 30 +} +``` + +### D) Patch plan (code) + +- scope, files, risk, tests + +## Routing logic (simple) + +- If request mentions: + - **"crash" / "not working" / "bug"** → Telemetry Analyst → Diagnostics Orchestrator → Fix Agent + - **"extraction" / "entity" / "triage"** → Extraction Task Designer → Eval Runner → Fix Agent + - **"maintenance" / "flag" / "job"** → Ops Executor + +## Safety rules + +- Never include raw user content in telemetry/diagnostics. +- Diagnostics sessions must be time-bounded. +- Mutating actions require: + - explicit approval from dispatcher + - audit log + - optional dry-run + +## Where A2A yields immediate wins in this workspace + +- **Telemetry policy governance** + - Planner + Reviewer pattern +- **Remote diagnostics** + - Orchestrator agent that monitors sessions and summarizes +- **Prompt iteration loops** (extraction) + - Task designer + eval runner separation +- **Release workflows** + - Dedicated agent for quality gates (build/test/typecheck) and a separate agent for publishing diff --git a/docs/MCP+A2A/DOMAIN_DASHBOARDS.md b/docs/MCP+A2A/DOMAIN_DASHBOARDS.md new file mode 100644 index 00000000..c9539483 --- /dev/null +++ b/docs/MCP+A2A/DOMAIN_DASHBOARDS.md @@ -0,0 +1,54 @@ +# Domain — Dashboards (admin-web, tracker-web, ux-lab) + +## Admin dashboard (`dashboards/admin-web`) + +### Existing leverage points + +- It already centralizes many ops capabilities behind UI. 
+- It already has a service client layer (`src/lib/platform-client.ts`) that talks to `platform-service`. + +### MCP opportunities + +- Provide “headless equivalents” of the admin UI actions via MCP tools. +- Use MCP resources to provide the dashboard with richer contextual data: + - module inventories + - policy templates + - incident runbooks + +### A2A opportunities + +- Build an Ops Copilot that: + - proposes actions + - executes via MCP tools + - links back to the relevant admin dashboard pages + +## Tracker dashboard (`dashboards/tracker-web`) + +- Candidate MCP tools (illustrative names; map onto the tracker modules in platform-service such as items/votes/comments/public): + - `tracker.listPublicItems()` + - `tracker.submitFeedback()` + - `tracker.vote(itemId)` + +## UX Lab (`dashboards/ux-lab`) + +### Existing intent + +These micro-apps are greenfield UI experiments that intentionally avoid backend dependencies. + +### MCP opportunity + +- Use MCP as a **dataset generator** for local-only UX experiments. + - Example: generate realistic telemetry events & clusters JSON. 
+ +### Suggested tools + +- `dev.generateSampleTelemetryEvents(count, shape, seed)` (local-only helper) +- `dev.generateSampleClusters(count, seed)` (local-only helper) + +## Dashboard “component extraction” workflow + +An A2A workflow that: + +- identifies patterns in ux-lab (tables/filters/drawers) +- proposes a migration plan into `@bytelyst/dashboard-components` +- auto-generates a PR in the appropriate package diff --git a/docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md b/docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md new file mode 100644 index 00000000..d03f1c3f --- /dev/null +++ b/docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md @@ -0,0 +1,70 @@ +# Domain — extraction-service (MCP + A2A Opportunities) + +## Why extraction-service is ideal for MCP + +It already provides: + +- a single entrypoint (`POST /extract`, `POST /extract/batch`) +- async extraction jobs (`/extract/jobs`) +- model registry +- sidecar health monitoring and circuit breaker +- rate limits + quotas + cache + +Agents can use MCP tools to iterate on prompts/tasks safely and repeatably. + +## High-value MCP tool proposals + +### Core extraction + +- `extraction.extract(text, taskId?, modelId?, productId?)` +- `extraction.extractBatch(inputs, modelId?)` + +### Async jobs + +- `extraction.submitJob(inputs, modelId?, webhookUrl?)` +- `extraction.getJob(jobId)` +- `extraction.listJobs()` + +### Observability + +- `extraction.sidecarHealth()` +- `extraction.metrics()` +- `extraction.cacheStats()` (backs `GET /extract/cache-stats`) +- `extraction.sidecarMonitoringState()` (backs `GET /extract/monitoring/sidecar`) + +### Rate limits / admin utilities + +- `extraction.getProductRateLimitStatus(productId?)` +- `extraction.resetProductRateLimit(productId)` (admin) + +## Recommended MCP resources + +- `extraction.modelRegistry` +- `extraction.taskCatalog` + - list task IDs used across products (triage, reflection-enrichment, memory-insight, etc.) 
+- `extraction.promptGuidelines` + +## Recommended A2A workflows + +### 1) Task design loop + +- **TaskDesignerAgent** drafts: + - task prompt + - a small set of examples +- **EvalRunnerAgent** runs: + - `extractBatch` over an eval set + - compares JSON shape correctness +- **RegressionAgent** checks: + - no degradation vs previous baseline + +### 2) Extraction incident response + +- If extraction errors spike: + - check sidecar health and circuit breaker state + - reduce per-product rate limits + - switch modelId (if supported) + +## Product integration hotspots + +- MindLyst web API routes proxy to extraction-service (`/api/extract` and triage routes). +- Future: other products can standardize on the same tasks and use a shared task registry. diff --git a/docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md b/docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md new file mode 100644 index 00000000..477a3bdf --- /dev/null +++ b/docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md @@ -0,0 +1,82 @@ +# Domain — Shared Packages + SDKs (MCP + A2A Opportunities) + +## Why packages/SDKs matter for MCP/A2A + +They define: + +- the **portable contracts** (telemetry event schema, diagnostics types) +- the **client integration points** (Swift/Kotlin SDKs, TS client packages) +- reusable primitives (offline queue, platform client) + +MCP/A2A should treat these as _the single source of truth_ for: + +- schemas +- naming conventions +- safety constraints (no PII) + +## Key packages and how to leverage them + +### `@bytelyst/telemetry-client` + +- Already provides a browser/RN-safe client. +- MCP can expose resources: + - event schema + - recommended module/eventName conventions + +**Opportunity:** add a companion “policy-aware client” mode that calls `GET /api/telemetry/config` and samples accordingly. + +**Note:** `platform-service` already exposes `GET /api/telemetry/config` (ETag-based). The remaining work is wiring the client to consume it safely. 
+ +### `@bytelyst/diagnostics-client` + +- Provides session polling + capture utilities. + +**Gap (must fix before relying on it):** the client currently flushes batches to `POST /api/diagnostics/ingest`, while `platform-service` routes are session-scoped: + +- `POST /api/diagnostics/sessions/:id/logs` +- `POST /api/diagnostics/sessions/:id/traces` +- `POST /api/diagnostics/sessions/:id/screenshots` (SAS upload) + +Decision: update `@bytelyst/diagnostics-client` to post to the session-scoped endpoints (no backwards-compat alias endpoint needed). + +**Opportunity:** standardize how product apps integrate it (common initialization patterns and user-consent prompts). + +### `@bytelyst/platform-client` + +- Typed fetch wrapper with auth injection. + +**Opportunity:** use it as the basis for a _frontend-side_ MCP client (where appropriate) and for consistent request-id propagation. + +### `@bytelyst/offline-queue` + +- Good candidate for A2A workflows that need reliable retries. + +### Swift / Kotlin Platform SDKs + +- Already provide consistent platform-service integration. + +**Opportunity:** + +- expose SDK version + capabilities as MCP resources +- keep a “compatibility matrix” resource (which products use which SDK features) + +## MCP resources recommended from packages + +- `schemas.telemetry` (from shared types) +- `schemas.diagnostics` (from shared types) +- `sdk.swift.capabilities` +- `sdk.kotlin.capabilities` + +## A2A workflows + +### 1) Cross-repo integration audit + +- Identify product repos that drift from shared SDK patterns. +- Output: a per-repo “alignment report” and suggested PRs. 
+ +### 2) Release impact analysis + +- When changing a shared package: + - agent enumerates downstream consumers + - runs typecheck/build matrix + - updates docs + versioning diff --git a/docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md b/docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md new file mode 100644 index 00000000..1ade4ca7 --- /dev/null +++ b/docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md @@ -0,0 +1,120 @@ +# Domain — platform-service (MCP + A2A Opportunities) + +## Why this domain is ideal for MCP + +`platform-service` is already organized as a set of well-defined modules and stable REST endpoints (Fastify 5 + Zod), with: + +- telemetry ingestion + query + policies +- remote diagnostics sessions +- jobs, delivery, sessions, maintenance +- settings, flags, rate limits +- webhooks subscriptions + +This makes it a near-perfect backing store for MCP tools. + +## High-value MCP tool proposals + +### Telemetry + +- `telemetry.queryEvents(filters)` +- `telemetry.listClusters(filters)` +- `telemetry.updateClusterStatus(clusterId, pk, status)` +- `telemetry.listPolicies()` +- `telemetry.previewPolicy(targeting)` +- `telemetry.createPolicy(input)` +- `telemetry.updatePolicy(id, updates)` +- `telemetry.deletePolicy(id)` +- `telemetry.getMetrics()` +- `telemetry.getGeoDistribution(from?, to?)` + +**A2A use** + +- Telemetry Analyst Agent automates: + - time window selection + - cluster-to-user drilldown + - policy suggestions (targeting + expiry) + +### Remote diagnostics + +- `diagnostics.createSession(input)` +- `diagnostics.listSessions(filters)` +- `diagnostics.getSession(id)` +- `diagnostics.updateSession(id, updates)` +- `diagnostics.cancelSession(id)` +- `diagnostics.getLogs(sessionId, filters)` +- `diagnostics.getTraces(sessionId, filters)` +- `diagnostics.listScreenshots(sessionId)` + +**Target identifiers (schema-aligned):** `targetUserId`, `targetAnonymousId`, `targetDeviceId`. + +**A2A use** + +- Diagnostics Orchestrator Agent monitors session lifecycle and compiles summaries. 
+ +### Jobs + +Current module provides list/update/trigger/list runs. + +- `jobs.list()` +- `jobs.get(id)` +- `jobs.update(id, updates)` +- `jobs.trigger(jobName)` +- `jobs.listRuns(jobName, limit)` + +**Note** + +- Current `jobs/routes.ts` uses `DEFAULT_PRODUCT_ID = 'lysnrai'`. For MCP, prefer explicit productId routing. + +### Settings / kill switch + +- `settings.get(userId)` +- `settings.update(userId, patch)` +- `settings.getDeviceResolved(userId, deviceId)` +- `settings.setDeviceOverrides(userId, deviceId, overrides)` +- `settings.clearDeviceOverrides(userId, deviceId)` +- `settings.checkKillSwitch(productId)` + +### Flags + +- `flags.list(productId)` +- `flags.get(key, productId)` +- `flags.upsert(key, enabled, targeting, description)` + +### Maintenance + +- `maintenance.getCurrent(productId)` +- `maintenance.set(productId, mode, bypassRules, windows)` + +### Webhooks + +- `webhooks.listSubscriptions(productId)` +- `webhooks.createSubscription(input)` +- `webhooks.updateSubscription(id, productId, updates)` +- `webhooks.deleteSubscription(id, productId)` +- `webhooks.listDeliveries(subscriptionId, limit)` +- `webhooks.test(subscriptionId)` +- `webhooks.rotateSecret(subscriptionId)` + +## Recommended MCP resources + +- `platform-service.modules` + - enumerates module names, key endpoints, auth requirements +- `telemetry.eventSchema` +- `diagnostics.sessionSchema` + +## Recommended A2A workflows backed by platform-service + +### 1) Support debug pack + +- input: user report +- output: timeline + clusters + recommended actions + (optional) diagnostics session results + +### 2) Canary rollout of additional telemetry + +- build policy → preview → create with expiry → monitor cluster changes + +### 3) Post-incident cleanup + +- resolve clusters +- remove policies +- export audit log for the incident window diff --git a/docs/MCP+A2A/DOMAIN_PRODUCTS.md b/docs/MCP+A2A/DOMAIN_PRODUCTS.md new file mode 100644 index 00000000..0574bf6b --- /dev/null +++ 
b/docs/MCP+A2A/DOMAIN_PRODUCTS.md @@ -0,0 +1,80 @@ +# Domain — Product Repos (MCP + A2A Opportunities) + +This document captures product-specific “where MCP/A2A helps” patterns, without duplicating each product’s full architecture docs. + +## Cross-product recurring needs + +- Debugging: telemetry clusters + remote diagnostics sessions +- Platform controls: kill switch, feature flags, maintenance +- Content intelligence: extraction tasks +- Release readiness: build/test/typecheck workflows + +## ChronoMind (`learning_ai_clock`) + +- **Opportunities** + - Telemetry-driven quality tracking for timer engine + NL parsing. + - A2A “routine regression” agent: detect changes that affect scheduling. +- **MCP hooks** + - telemetry tools for PWA + - platform-service jobs/webhooks for timer sharing integrations + +## NomGap (`learning_ai_fastgap`) + +- **Opportunities** + - React Native offline-first flows map well onto offline queue + platform-client. + - A2A “protocol tuning” agent: uses telemetry + extraction to correlate adherence patterns. +- **MCP hooks** + - telemetry + kill switch clients already exist; MCP can standardize their usage. + +## PeakPulse (`learning_ai_peakpulse`) + +- **Opportunities** + - Sync reliability: a diagnostics session targeted at a user’s device can capture network failures. + - A2A “safety alerts correctness” agent using telemetry to validate thresholds. +- **MCP hooks** + - platform-service telemetry/diagnostics + - product backend endpoints for session uploads (via sync engine) + +## MindLyst (`learning_multimodal_memory_agents`) + +- **Opportunities** + - Extraction-service is core to triage and insight enrichment; prompt/task iteration loop is high ROI. + - A2A “triage regression” agent that runs eval suites. +- **MCP hooks** + - extraction tools + - telemetry for web + native apps + +## JarvisJr (`learning_ai_jarvis_jr`) + +- **Opportunities** + - Multi-agent product: A2A patterns can be applied to its own internal coaching “crew”. 
+ - Marketplace + certification workflows can be agent-automated. +- **MCP hooks** + - platform-service for auth/telemetry + - product backend for marketplace modules + +## LysnrAI (`learning_voice_ai_agent`) + +- **Opportunities** + - Support/debug workflows are already telemetry-heavy (keyboard + desktop). + - A2A “keyboard bug triage” agent that starts diagnostics sessions + drafts fixes. +- **MCP hooks** + - platform-service telemetry/diagnostics + - admin dashboard tooling + +## Product-specific MCP server (optional) + +For each product, you can optionally add a small MCP namespace that calls the product backend (`backend/` in each repo) for domain actions. + +Examples: + +- ChronoMind: `timers.list/create`, `routines.run/validate` +- PeakPulse: `sessions.upload`, `routes.export` +- JarvisJr: `agents.list/publish`, `marketplace.certify` + +## Recommended first A2A workflows to ship per product + +- **All products**: Support Debug Pack (telemetry + diagnostics) +- **MindLyst**: Extraction task design + eval loop +- **JarvisJr**: Marketplace certification assistant +- **NomGap**: Offline queue flush assistant / sync reliability assistant diff --git a/docs/MCP+A2A/EXECUTION_CHECKLIST.md b/docs/MCP+A2A/EXECUTION_CHECKLIST.md new file mode 100644 index 00000000..97748cd9 --- /dev/null +++ b/docs/MCP+A2A/EXECUTION_CHECKLIST.md @@ -0,0 +1,59 @@ +# MCP + A2A — Execution Checklist + +This is the “ready to start building” checklist that turns the docs in this folder into an executable plan. + +## 1) Decisions to make (30–60 minutes) + +- **MCP server placement** + - Recommended default: create a new service/package under `learning_ai_common_plat` (not colocated inside `platform-service`) to keep runtime concerns separated. +- **Integration mode** + - Recommended default: REST-only calls to `platform-service` and `extraction-service` for Phase 1. + - Defer direct Cosmos reads until you have a clear perf/cost need. 
+- **Auth strategy** + - Recommended default: platform-service JWT for interactive use; platform API tokens only for trusted automation. +- **Where to store A2A handoffs** + - Recommended default: in Phase 1, store handoffs as telemetry events + structured logs; in Phase 2, introduce a dedicated Cosmos container if you need queryability. + +## 2) Must-fix dependency before MVP + +- **Diagnostics client/server route mismatch** + - `platform-service` ingests via session-scoped endpoints: + - `POST /api/diagnostics/sessions/:id/logs` + - `POST /api/diagnostics/sessions/:id/traces` + - screenshots via session-scoped SAS upload + - `@bytelyst/diagnostics-client` currently flushes to `POST /api/diagnostics/ingest`. + +Resolution (fix the client; do not add a server-side alias): + +- Decision: update `@bytelyst/diagnostics-client` to post to session-scoped endpoints. No backwards-compatible `POST /api/diagnostics/ingest` alias endpoint. + +## 3) Phase 1 build steps (P0 slice) + +- **Implement MCP tool namespaces** + - `platform.telemetry.*` + - `platform.diagnostics.*` + - `extraction.*` +- **Enforce hard guardrails in MCP layer** + - `productId` required and forwarded as `x-product-id` + - `x-request-id` required and propagated + - default query caps + max caps + - expiry required for any “amplification” (telemetry policy, diagnostics session) + - role gating (viewer/admin/super_admin) +- **Ship one compound tool** + - `support.createDebugPack(...)` + +## 4) Phase 1 definition of done + +- Read-only tools work end-to-end against real services. +- Mutating tools are role-gated and generate audit trails. +- The compound debug pack produces a single structured artifact with: + - telemetry cluster references + - optional diagnostics session reference + - a short markdown summary + +## 5) Phase 2+ quick sanity checks + +- If you make telemetry policy-aware clients: + - ensure `GET /api/telemetry/config` consumption is cached (ETag) and privacy-safe. 
+- If you add direct Cosmos reads: + - enforce product scoping at query layer and add explicit auditing for sensitive reads. diff --git a/docs/MCP+A2A/IMPLEMENTATION_PLAN.md b/docs/MCP+A2A/IMPLEMENTATION_PLAN.md new file mode 100644 index 00000000..1042f78e --- /dev/null +++ b/docs/MCP+A2A/IMPLEMENTATION_PLAN.md @@ -0,0 +1,261 @@ +# MCP + A2A — Implementation Plan (Execution-Ready) + +## Objective + +Deliver a **safe, auditable, product-aware** MCP + A2A capability layer on top of existing ByteLyst services (primarily `platform-service` and `extraction-service`) so that agents can: + +- diagnose incidents (telemetry + diagnostics) +- manage telemetry policies / rollouts +- orchestrate remote diagnostics sessions +- iterate on extraction tasks/prompts with eval loops +- run repeatable ops workflows (jobs, flags, maintenance) + +This plan intentionally starts with a **minimal P0 slice** and expands in phases. + +## Guiding constraints (must-haves) + +- **Product isolation** + - Every tool call is scoped to an explicit `productId`. +- **Auditability** + - Every mutating tool call must produce an audit record (directly or via existing APIs). +- **Least privilege** + - Query tools available to viewer roles; mutating tools gated to admin/super_admin. +- **Safety defaults** + - Mutations support `dryRun` where feasible. + - Diagnostic amplification (policies/sessions) must require an `expiresAt`. +- **No new “shadow APIs”** + - Prefer calling existing service endpoints; only add endpoints if the tool surface cannot be expressed otherwise. + +## Phase 0 — Baseline readiness (1–2 days) + +### Deliverables + +- A “tool surface” inventory mapped to existing REST endpoints and required headers. +- A role matrix for tool authorization. +- A request-id propagation and logging standard for MCP tool calls. 
+ +### Phase 0 checklist (definition of done) + +- Confirm whether MCP tool implementations will call **REST only** (preferred) or allow **direct Cosmos reads** for selected query paths. +- Choose whether MCP is: + - a new service/package in `learning_ai_common_plat`, or + - colocated under `services/platform-service`. +- Define the initial auth strategy: + - JWT only, or + - JWT + API tokens for automation. +- Define tool-level authorization rules (viewer/admin/super_admin) and how they’re enforced. +- Agree on a consistent product scoping rule: + - `productId` is mandatory input for every tool call and sent as `x-product-id` downstream. + +### Decisions to lock + +- **MCP server shape** + - Single server with namespaces (`platform.*`, `extraction.*`) vs. two servers. +- **Auth mechanism** + - Primary: platform-service JWT. + - Secondary: platform API tokens for trusted automation. + +### Required invariants + +- `x-request-id` propagated on all downstream calls. +- `x-product-id` required and validated for all calls. + +## Phase 1 — MVP MCP server (P0 slice) (3–7 days) + +### Goal + +Enable a Support/Ops agent to answer: _“What’s happening?”_ and _“Start a bounded diagnostics session.”_ + +### Tool surface (MVP) + +#### Read-only (viewer) + +- `telemetry.queryEvents(filters)` +- `telemetry.listClusters(filters)` +- `telemetry.listPolicies()` +- `telemetry.getMetrics()` +- `diagnostics.getSession(sessionId)` +- `diagnostics.listSessions(filters)` +- `diagnostics.getLogs(sessionId, filters)` +- `diagnostics.getTraces(sessionId, filters)` +- `extraction.sidecarHealth()` + +### Phase 1 prerequisites (to avoid hidden integration failures) + +- Align `@bytelyst/diagnostics-client` ingest endpoint with `platform-service`. + - Today the service ingests via `POST /api/diagnostics/sessions/:id/logs|traces` (and session-scoped screenshot upload), while the client posts to `POST /api/diagnostics/ingest`. 
+ - Decision: update the client to use the session-scoped endpoints (no backwards-compat alias endpoint). +- Decide whether `@bytelyst/telemetry-client` should become policy-aware by consuming `GET /api/telemetry/config`. + - If yes, treat it as a Phase 1 deliverable (with caching + ETag). Otherwise, explicitly defer to Phase 2. + +#### Mutating (admin) + +- `telemetry.previewPolicy(targeting)` +- `telemetry.createPolicy(input)` + - requires `expiresAt` +- `telemetry.updatePolicy(id, updates)` +- `telemetry.updateClusterStatus(clusterId, pk, status)` +- `diagnostics.createSession(target, config)` + - requires `expiresAt` or `expiresInMinutes` +- `diagnostics.updateSession(sessionId, updates)` +- `diagnostics.cancelSession(sessionId)` + +#### Compound workflow tool (admin) + +- `support.createDebugPack(input)` + - internally calls: + - telemetry queries + - optional diagnostics session create + polling + - returns a single structured artifact: + - `debugPackId`, `clusterRefs`, `sessionRefs`, and a markdown summary + +### Output contracts (schemas) + +Define explicit JSON schemas (Zod or equivalent) for: + +- `TelemetryFilters` +- `TelemetryPolicyInput` (requires expiry) +- `DiagnosticsSessionTarget` +- `DiagnosticsSessionConfig` +- `DebugPackRequest` +- `DebugPackResponse` + +### Guardrails + +- Query limits: + - default `limit` and max `limit` enforced at MCP layer. 
+- Policy guardrails: + - require `expiresAt` + - require explicit `eventTypes/modules` + - block wildcard collection unless super_admin +- Diagnostics guardrails: + - enforce max duration + - enforce max capture volume per flush + +### Acceptance criteria + +- A single agent can: + - pull clusters for a product + time window + - propose a telemetry policy and preview targeting + - create an expiring policy + - start a diagnostics session and retrieve data + - generate a “Debug Pack” artifact + +### Phase 1 engineering checklist (definition of done) + +- MCP layer enforces: + - request-id propagation (`x-request-id`) + - required product scoping (`productId`) + - default query caps and maximum caps + - expiry requirements for policies/sessions +- Every mutating tool call produces an audit record (either: + - by calling existing audit endpoints, or + - by ensuring the underlying platform-service endpoint already records audit) +- Tool names and inputs are documented and stable (no breaking renames during Phase 1) + +## Phase 2 — A2A orchestration (1–2 weeks) + +### Goal + +Turn multi-step support/ops workflows into **repeatable agent playbooks** with explicit handoffs. + +### Standard agents + +- **DispatcherAgent** +- **TelemetryAnalystAgent** +- **DiagnosticsOrchestratorAgent** +- **OpsExecutorAgent** +- **ReportWriterAgent** + +### Handoff artifacts + +- `SupportIncidentBrief` +- `TelemetryFindings` +- `DiagnosticsSessionPlan` +- `OpsChangePlan` +- `FinalIncidentReport` + +### Acceptance criteria + +- “Support Debug Pack” runs end-to-end via: + - Dispatcher → Telemetry Analyst → Diagnostics Orchestrator → Report Writer +- Every handoff is persisted (even if only in logs initially) with stable IDs. + +## Phase 3 — Extraction task iteration loop (1–3 weeks) + +### Goal + +Make extraction prompt/task improvements safe, testable, and regression-resistant. 
+ +### MCP tools + +- `extraction.extract(text, taskId?, modelId?)` +- `extraction.extractBatch(inputs)` +- `extraction.submitJob(inputs, webhookUrl?)` +- `extraction.getJob(jobId)` +- `extraction.metrics()` / `extraction.cacheStats()` + +### A2A workflow + +- **TaskDesignerAgent** drafts task prompt + examples +- **EvalRunnerAgent** runs batch eval sets +- **RegressionAgent** compares to baseline +- **PublisherAgent** updates task registry + rollout + +### Acceptance criteria + +- A single command/workflow can: + - run eval suite + - compute simple quality metrics (schema validity, required fields coverage) + - produce a report and recommended next edit + +## Phase 4 — Ops expansion (jobs/flags/maintenance/webhooks) (1–2 weeks) + +### Tools + +- `jobs.list`, `jobs.trigger`, `jobs.listRuns` +- `flags.list`, `flags.upsert`, `flags.evaluate` +- `maintenance.get`, `maintenance.set` +- `webhooks.listSubscriptions`, `webhooks.test`, `webhooks.rotateSecret` + +### Acceptance criteria + +- “Ops Copilot” can safely execute a bounded change plan: + - propose change + - dry-run if supported + - execute with audit + - verify outcome + +## Security & privacy checklist + +- Explicit `productId` on every tool call +- Avoid returning raw PII in tool results +- Ensure diagnostics redaction remains enforced server-side +- Enforce expirations on policies and sessions +- Rate limit MCP server endpoints + +## Rollout strategy + +- Start with internal-only usage (super_admin). +- Add admin roles once guardrails are proven. +- Add viewer read-only access for broader teams. +- Add product-specific namespaces only after platform namespaces stabilize. 
+ +## Work breakdown (suggested) + +- **Milestone A**: MVP MCP server + telemetry/diagnostics read-only +- **Milestone B**: mutating tools + dry-run/expiry enforcement +- **Milestone C**: `support.createDebugPack` compound tool +- **Milestone D**: A2A runner + handoff schemas +- **Milestone E**: extraction eval loop + +## Open questions (need decisions) + +- Should the MCP server call services via: + - service REST endpoints only, or + - direct Cosmos reads for some query paths? +- Where should A2A handoff artifacts be stored: + - telemetry events, + - a dedicated Cosmos container, + - or both? +- Do we want one MCP server repo/package, or colocated under `platform-service`? diff --git a/docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md b/docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md new file mode 100644 index 00000000..c30e5b21 --- /dev/null +++ b/docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md @@ -0,0 +1,155 @@ +# MCP Server Framework — Recommended Architecture (ByteLyst) + +## Why an MCP server here + +This workspace already has a clear separation of concerns: + +- **Authoritative services** (Fastify): `platform-service`, `extraction-service`, plus product backends. +- **Dashboards** (Next.js): admin + tracker. +- **Client SDKs**: Swift/Kotlin platform SDKs + TS client packages. + +An MCP server becomes the **single programmatic gateway** that agents can call to: + +- query/act on platform state +- assemble debugging evidence +- run repeatable ops workflows +- safely orchestrate A2A agents + +## Core design constraints + +- **Do not bypass service invariants** + - Prefer calling service endpoints or repositories with the same validation (Zod) and auth. +- **Auditability** + - Every mutating tool should emit audit logs (or call APIs that already do). +- **Least privilege** + - Split tools by role (viewer/admin/super_admin). +- **Product isolation** + - All tools/resources must explicitly bind to `productId`. 
+ +## Reality check: what exists today + +- `platform-service` already exposes: + - `GET /api/telemetry/config` (ETag-based client collection config) + - `GET /api/telemetry/query`, `GET /api/telemetry/clusters`, policies CRUD (admin) + - diagnostics session CRUD + `GET /api/diagnostics/sessions/:id/logs|traces|screenshots` (admin) +- `extraction-service` already exposes: + - `/extract`, `/extract/batch`, `/extract/jobs`, sidecar health, metrics, cache stats + +The primary “new work” for MCP is orchestration, safety gating, and consistent auth/audit — not inventing new primitives. + +## Proposed MCP servers (2-tier) + +### 1) `bytelyst-platform-mcp` (primary) + +Backed by `platform-service` (port 4003) and optionally Cosmos for direct reads. + +- **Responsibilities** + - Telemetry querying + policy management + - Remote diagnostics sessions orchestration + - Jobs trigger/list + - Flags/settings/maintenance + - Webhooks + delivery logs + - Audit query + +### 2) `bytelyst-extraction-mcp` (specialized) + +Backed by `extraction-service` (port 4005). + +- **Responsibilities** + - Extract / batch extract + - Submit and monitor async extraction jobs + - Sidecar health + circuit breaker insight + - Metrics + cache stats + +(Optionally, these can be a single MCP server with two namespaces.) 
+ +## Tool taxonomy + +### A) Read-only tools + +- `telemetry.queryEvents` +- `telemetry.listClusters` +- `telemetry.getMetrics` +- `diagnostics.getSession` +- `diagnostics.getLogs/getTraces` +- `jobs.list/listRuns` +- `flags.list` +- `settings.get` +- `webhooks.listSubscriptions/listDeliveries` +- `extraction.metrics/cacheStats/sidecarHealth` + +### B) Mutating tools (require elevated role) + +- `telemetry.createPolicy/updatePolicy/deletePolicy` +- `telemetry.updateClusterStatus` +- `diagnostics.createSession/updateSession/cancelSession` +- `jobs.trigger` +- `maintenance.set` +- `flags.set` (or flag upserts) +- `webhooks.rotateSecret` / `webhooks.test` +- `extraction.rateLimitReset` (if you keep that admin endpoint) + +### C) Compound tools (“one tool = one workflow”) + +- `support.createDebugPack(reportInput)` + - pulls telemetry timeline + cluster context + - optionally starts diagnostics session + - returns a single structured artifact (markdown/json) + +This reduces prompt fragility vs. requiring the LLM to call 8 tools in the right order. + +## MCP resources + +Resources should be stable references agents can read repeatedly: + +- `platform.modules.index` + - module list + base routes + required headers +- `telemetry.schema` +- `diagnostics.schema` +- `extraction.tasks.catalog` +- `ops.runbooks` + - e.g. “how to debug iOS keyboard insert_noop” +- `product.identity` + - productId, plan tiers, allowed baseUrls + +## Prompts (MCP prompt templates) + +- `prompt.support_triage` +- `prompt.telemetry_policy_proposal` +- `prompt.remote_diagnostics_session_plan` +- `prompt.extraction_task_design` + +## Authentication & authorization + +- **Primary**: platform-service JWT (same `verifyToken` logic). +- **Secondary**: service-to-service API tokens (only for trusted automation). 
+- **Tool gating** + - viewer: query-only + - admin: policy updates, create diagnostics sessions + - super_admin: secret rotation, maintenance, destructive operations + +## Observability for the MCP server + +- Use structured logs (Fastify/pino style) and propagate `x-request-id`. +- Record tool invocation metrics into `telemetry` as the `backend_service` channel: + - module: `mcp` + - eventName: `tool_invoked`, `tool_failed`, `a2a_handoff` + +## Safe defaults / guardrails + +- Any mutating tool should support a `dryRun: true` mode. +- Enforce `expiresAt` on any “diagnostic collection amplification” (telemetry policy, diagnostics session). +- Cap queries by default (limit/pageSize); require explicit `limit` increases. + +## Known integration risk (fix early) + +- Before this change, `@bytelyst/diagnostics-client` flushed to `POST /api/diagnostics/ingest`, while `platform-service` ingests via session-scoped endpoints. +- Verify this mismatch is resolved before using diagnostics tooling as a core MCP/A2A workflow dependency. + - Decision: `@bytelyst/diagnostics-client` is updated to post to `POST /api/diagnostics/sessions/:id/logs` and `POST /api/diagnostics/sessions/:id/traces`. 
+ +## Suggested initial tool surface (minimal viable) + +- `telemetry.queryEvents`, `telemetry.listClusters`, `telemetry.listPolicies`, `telemetry.previewPolicy`, `telemetry.createPolicy` +- `diagnostics.createSession`, `diagnostics.getSession`, `diagnostics.getLogs`, `diagnostics.getTraces` +- `extraction.extract`, `extraction.extractBatch`, `extraction.sidecarHealth` +- `jobs.list`, `jobs.trigger` diff --git a/docs/MCP+A2A/README.md b/docs/MCP+A2A/README.md new file mode 100644 index 00000000..df1b6e72 --- /dev/null +++ b/docs/MCP+A2A/README.md @@ -0,0 +1,35 @@ +# MCP + A2A — Workspace Initiative + +This folder contains a workspace-wide scan of the ByteLyst ecosystem (platform + shared packages + dashboards + product repos) for opportunities to leverage: + +- MCP (Model Context Protocol) +- A2A (agent-to-agent) patterns + +## Documents + +- `WORKSPACE_USE_CASE_CATALOG.md` +- `MCP_SERVER_FRAMEWORK.md` +- `A2A_ORCHESTRATION_FRAMEWORK.md` +- `IMPLEMENTATION_PLAN.md` +- `DOMAIN_PLATFORM_SERVICE.md` +- `DOMAIN_EXTRACTION_SERVICE.md` +- `DOMAIN_DASHBOARDS.md` +- `DOMAIN_PACKAGES_AND_SDKS.md` +- `DOMAIN_PRODUCTS.md` + +## Scope of scan + +Primary sources used: + +- `services/platform-service` (telemetry, diagnostics, jobs, settings, webhooks, auth, etc.) +- `services/extraction-service` (sidecar + tasks + async jobs + rate limits) +- `packages/*` (Swift/Kotlin platform SDKs, TS clients, event bus, telemetry/diagnostics clients) +- `dashboards/admin-web`, `dashboards/tracker-web`, `dashboards/ux-lab` +- Product repos: ChronoMind (`learning_ai_clock`), NomGap (`learning_ai_fastgap`), PeakPulse (`learning_ai_peakpulse`), LysnrAI (`learning_voice_ai_agent`), MindLyst (`learning_multimodal_memory_agents`), JarvisJr (`learning_ai_jarvis_jr`) + +## Notation + +- **Tool**: an MCP tool callable by an LLM/agent. +- **Resource**: an MCP resource (read-only or read-mostly) exposed for context. +- **Prompt**: an MCP prompt template. 
+- **Agent**: an A2A-capable worker with a specific responsibility. diff --git a/docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md b/docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md new file mode 100644 index 00000000..8e4ae29f --- /dev/null +++ b/docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md @@ -0,0 +1,173 @@ +# WORKSPACE — MCP + A2A Use-Case Catalog + +## Goals + +- Identify **high-leverage** MCP tools/resources and A2A agent workflows across the ByteLyst workspace. +- Provide a **prioritized backlog** that maps cleanly onto existing services: + - `platform-service` (telemetry, diagnostics, jobs, settings, auth, webhooks, etc.) + - `extraction-service` (structured extraction, async jobs) + - dashboards (admin/tracker) + - shared packages + mobile SDKs + - product repos (ChronoMind, NomGap, PeakPulse, MindLyst, JarvisJr, LysnrAI) + +## What “MCP” and “A2A” mean in this workspace + +- **MCP server**: exposes authoritative product/platform capabilities as tools/resources. + - Tools should map to stable APIs (prefer calling service endpoints / repositories rather than scraping UI). + - Resources should expose read-mostly context: schemas, runbooks, current status, recent incidents, etc. +- **A2A**: multiple specialized agents collaborating via explicit contracts. + - Example: _Support agent_ → _Triage agent_ → _Diagnostics agent_ → _Fix agent_ → _Release agent_. 
+ +## Reality check: “exists today” vs “proposed tool surface” + +- **Backed by existing services today** + - Telemetry query/clusters/policies (platform-service) + - Telemetry client config (`GET /api/telemetry/config`, ETag-based) + - Diagnostics sessions + ingest (platform-service) + - Extraction extract/batch/jobs/health (extraction-service) + - Jobs/flags/settings/webhooks endpoints (platform-service) +- **Proposed tools that may require new endpoints or privileged integrations** + - `secrets.rotate(...)`, `services.restart(...)` (depends on Key Vault/infra integration) + - UX-lab dataset generators (`dev.generateTelemetryDataset(...)`) (local-only helper tooling) + - Compound tools like `support.createDebugPack(...)` (implemented in MCP layer as orchestration) + +## Prioritization rubric + +- **P0**: removes constant manual toil; immediate engineering impact. +- **P1**: unlocks new ops / reliability capabilities. +- **P2**: enables new product intelligence / automations. +- **P3**: experimental / UX labs. + +## P0 — Highest leverage (do first) + +### 1) Incident / Support “one-click evidence” pack (Telemetry + Diagnostics) + +- **Why** + - You already have rich primitives: `telemetry` (clusters + policies) and `diagnostics` (remote sessions). + - The missing piece is automated _assembly_ of a support-ready packet. +- **MCP tools** + - `telemetry.queryEvents(filters)` + - `telemetry.listClusters(filters)` + - `telemetry.updateClusterStatus(clusterId, pk, status)` + - `diagnostics.createSession(targetUserId|targetAnonymousId|targetDeviceId, config)` + - `diagnostics.getSession(sessionId)` + - `diagnostics.getLogs(sessionId, filters)` + - `diagnostics.getTraces(sessionId, filters)` + - `diagnostics.listScreenshots(sessionId)` +- **A2A agents** + - **SupportTriageAgent**: extracts identifiers from user report (email/userId/anonymousInstallId, appVersion). (Maps to `targetAnonymousId` for diagnostics.) 
+ - **TelemetryAnalystAgent**: pulls clusters + timelines; proposes policy changes. + - **DiagnosticsOrchestratorAgent**: starts a debug session when needed; monitors results. + - **ReportWriterAgent**: produces a “Debug Pack” markdown/PDF. + +### 2) Telemetry policy authoring assistant (safe-by-default) + +- **Why** + - Policies are powerful but easy to over-collect. Assistant can: + - suggest targeting + - set auto-expiry + - choose sampling +- **MCP tools** + - `telemetry.listPolicies()` + - `telemetry.previewPolicy(targeting)` + - `telemetry.createPolicy(input)` + - `telemetry.updatePolicy(id, updates)` +- **A2A agents** + - **PolicyPlannerAgent**: proposes policy with guardrails. + - **PolicyReviewerAgent**: checks privacy constraints + expiry + scope. + +### 3) “Ops Copilot” for platform-service modules + +- **Why** + - platform-service contains many modules (auth, flags, delivery, jobs, settings, maintenance, etc.). + - Most ops tasks are combinations of a few actions (query, toggle, trigger job). +- **MCP tools** + - `jobs.list()` / `jobs.trigger(name)` / `jobs.listRuns(name)` + - `maintenance.get()` / `maintenance.set(mode, schedule)` + - `flags.list()` / `flags.set(key, enabled, targeting)` + - `settings.get(userId)` / `settings.set(userId, patch)` + - `webhooks.listSubscriptions()` / `webhooks.test()` / `webhooks.rotateSecret()` +- **A2A agents** + - **OpsAgent**: executes operations. + - **ComplianceAgent**: ensures audit trail + least privilege. + +### 4) Extraction service “task builder” and evaluation loop + +- **Why** + - extraction-service already has model registry, caching, async jobs, sidecar health. + - Task prompts/examples are high leverage; building & evaluating them is repetitive. 
+- **MCP tools** + - `extraction.extract(text, taskId, modelId)` + - `extraction.extractBatch(inputs)` + - `extraction.submitJob(inputs, webhookUrl?)` + - `extraction.getJob(jobId)` + - `extraction.sidecarHealth()` + - `extraction.metrics()` / `extraction.cacheStats()` +- **A2A agents** + - **TaskDesignerAgent**: drafts taskPrompt + examples. + - **EvalRunnerAgent**: runs eval sets, compares outputs. + - **PromptRegressionAgent**: ensures no quality regressions. + +## P1 — Operational maturity + +### 5) Automated regression watch: telemetry clusters → auto diagnostics session + +- **Why** + - telemetry clusters already have severity escalation + webhook alerts. + - diagnostics supports remote sessions. +- **Flow** + - Cluster crosses threshold → A2A triggers a targeted diagnostics session for a small segment. +- **MCP tools** + - `telemetry.listClusters()` + - `diagnostics.createSession(...)` + +### 6) Secret rotation “assistant” (Key Vault + config propagation) + +- **Why** + - Secrets already resolved centrally via `@bytelyst/config`. + - Rotation is still error-prone across services/dashboards. +- **MCP tools/resources** + - Resource: `secrets.mapping` (which env vars resolve which AKV secrets) + - Tool: `secrets.rotate(name)` (where permitted) + `services.restart(service)` (optional) + +### 7) Webhook subscription lifecycle assistant + +- **Why** + - You have webhooks + delivery logs; assistants can recommend retries, disable rules, test endpoints. +- **MCP tools** + - `webhooks.listSubscriptions(productId)` + - `webhooks.listDeliveries(subscriptionId)` + - `webhooks.rotateSecret(subscriptionId)` + - `webhooks.test(subscriptionId)` + +## P2 — Product intelligence and automation + +### 8) A/B experimentation assistant (platform-service `ab-testing`, `experiments`) + +- **Use** + - Draft experiment plan, compute exposure targeting, monitor telemetry signals. 
+ +### 9) “Changelog writer” from merged PRs + telemetry impact + +- **Use** + - Collect changes; relate to drop in cluster counts; propose release notes. + +## P3 — UX-lab accelerators + +### 10) MCP-driven UX Lab data generators + +- **Why** + - UX lab apps (telemetry explorer, ops UI kit) need rich sample datasets. +- **MCP tool** + - `dev.generateTelemetryDataset(shape, size, seed)` + +## Cross-product patterns that MCP/A2A should standardize + +- **Telemetry** + - Common event naming + “productId is mandatory” invariant. +- **Kill switch** + - Single check path; consistent failure mode (fail-open). +- **Extraction** + - Task IDs used consistently across products (triage, reflection, insights, etc.). +- **Workflows** + - Release/build/test workflows can become A2A playbooks using MCP tools. diff --git a/packages/diagnostics-client/src/client.ts b/packages/diagnostics-client/src/client.ts index a2ca3f73..d18d6c56 100644 --- a/packages/diagnostics-client/src/client.ts +++ b/packages/diagnostics-client/src/client.ts @@ -13,7 +13,6 @@ import type { LogEntry, Breadcrumb, NetworkRequest, - IngestBatch, DeviceState, } from './types.js'; import { BreadcrumbTrail } from './breadcrumbs.js'; @@ -28,7 +27,6 @@ type ErrorEvent = { colno: number; error?: { stack?: string }; }; -type EventListener = (event: unknown) => void; export interface DiagnosticsClientOptions extends DiagnosticsConfig { /** Custom logger */ @@ -217,7 +215,7 @@ export class DiagnosticsClient { level, message, timestamp: new Date().toISOString(), - module: context.module as string ?? 'unknown', + module: (context.module as string) ?? 
'unknown', context, correlationId: context.correlationId as string, }; @@ -254,7 +252,10 @@ export class DiagnosticsClient { span.durationMs = new Date(span.endTime).getTime() - new Date(span.startTime).getTime(); span.status = 'ok'; this.traceBuffer.push(span); - this.breadcrumbs.add('trace', `Completed: ${name}`, { spanId: span.spanId, durationMs: span.durationMs }); + this.breadcrumbs.add('trace', `Completed: ${name}`, { + spanId: span.spanId, + durationMs: span.durationMs, + }); return result; } catch (error) { span.endTime = new Date().toISOString(); @@ -262,7 +263,10 @@ export class DiagnosticsClient { span.status = 'error'; span.statusMessage = error instanceof Error ? error.message : String(error); this.traceBuffer.push(span); - this.breadcrumbs.add('trace', `Failed: ${name}`, { spanId: span.spanId, error: span.statusMessage }); + this.breadcrumbs.add('trace', `Failed: ${name}`, { + spanId: span.spanId, + error: span.statusMessage, + }); throw error; } } @@ -298,7 +302,7 @@ export class DiagnosticsClient { url.searchParams.set('installId', this.config.anonymousInstallId); const headers: Record = { - 'Accept': 'application/json', + Accept: 'application/json', }; if (this.lastEtag) { @@ -347,7 +351,10 @@ export class DiagnosticsClient { this.config.logger.error('[diagnostics] Failed to poll for session', { error: error instanceof Error ? error.message : String(error), }); - this.state = { type: 'error', error: error instanceof Error ? error : new Error(String(error)) }; + this.state = { + type: 'error', + error: error instanceof Error ? 
error : new Error(String(error)), + }; } } @@ -364,56 +371,90 @@ export class DiagnosticsClient { return; } - // Build batch - const batch: IngestBatch = { - sessionId: session.id, - }; + const sessionId = session.id; - if (this.logBuffer.length > 0) { - batch.logs = this.logBuffer.splice(0, 50); // Max 50 per batch - } - - if (this.traceBuffer.length > 0) { - batch.traces = this.traceBuffer.splice(0, 50); - } - - if (this.networkBuffer.length > 0) { - batch.network = this.networkBuffer.splice(0, 50); - } - - // Add breadcrumbs + const logs = this.logBuffer.splice(0, 50); // Server max: 50 + const traces = this.traceBuffer.splice(0, 50); // Server max: 50 + const network = this.networkBuffer.splice(0, 50); const crumbs = this.breadcrumbs.getAll(); - if (crumbs.length > 0) { - batch.breadcrumbs = [...crumbs]; - this.breadcrumbs.clear(); + this.breadcrumbs.clear(); + + // Encode breadcrumbs + network captures as log entries so we can ingest + // without requiring additional server-side schemas/endpoints. + const synthesizedLogs = [] as LogEntry[]; + + for (const c of crumbs) { + synthesizedLogs.push({ + level: 'info', + message: `[breadcrumb] ${c.category}: ${c.message}`, + timestamp: c.timestamp, + module: 'diagnostics.breadcrumb', + context: c.data ?? {}, + }); } - // Skip if nothing to send - if (!batch.logs && !batch.traces && !batch.network && !batch.breadcrumbs) { + for (const n of network) { + synthesizedLogs.push({ + level: n.error ? 'error' : 'info', + message: `[network] ${n.method} ${n.url} ${n.status ?? 
''}`.trim(), + timestamp: n.startTime, + module: 'diagnostics.network', + context: { + requestHeaders: n.requestHeaders, + requestBody: n.requestBody, + status: n.status, + responseHeaders: n.responseHeaders, + responseBody: n.responseBody, + startTime: n.startTime, + endTime: n.endTime, + durationMs: n.durationMs, + error: n.error, + }, + }); + } + + const allLogs = [...logs, ...synthesizedLogs]; + + if (allLogs.length === 0 && traces.length === 0) { return; } + const token = await this.getAuthToken(); + const headers: Record = { + 'Content-Type': 'application/json', + ...(token ? { Authorization: `Bearer ${token}` } : {}), + }; + try { - const url = new URL('/api/diagnostics/ingest', this.config.serverUrl); - const token = await this.getAuthToken(); + if (allLogs.length > 0) { + const url = new URL( + `/api/diagnostics/sessions/${encodeURIComponent(sessionId)}/logs`, + this.config.serverUrl + ); + const response = await fetch(url.toString(), { + method: 'POST', + headers, + body: JSON.stringify({ sessionId, logs: allLogs }), + }); + if (!response.ok) throw new Error(`HTTP ${response.status}`); + } - const response = await fetch(url.toString(), { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - ...(token && { 'Authorization': `Bearer ${token}` }), - }, - body: JSON.stringify(batch), - }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); + if (traces.length > 0) { + const url = new URL( + `/api/diagnostics/sessions/${encodeURIComponent(sessionId)}/traces`, + this.config.serverUrl + ); + const response = await fetch(url.toString(), { + method: 'POST', + headers, + body: JSON.stringify({ sessionId, traces }), + }); + if (!response.ok) throw new Error(`HTTP ${response.status}`); } this.config.logger.debug('[diagnostics] Flushed batch', { - logs: batch.logs?.length ?? 0, - traces: batch.traces?.length ?? 0, - network: batch.network?.length ?? 
0, + logs: allLogs.length, + traces: traces.length, }); } catch (error) { this.config.logger.error('[diagnostics] Failed to flush batch', { @@ -421,9 +462,14 @@ export class DiagnosticsClient { }); // Put items back in buffers for retry - if (batch.logs) this.logBuffer.unshift(...batch.logs); - if (batch.traces) this.traceBuffer.unshift(...batch.traces); - if (batch.network) this.networkBuffer.unshift(...batch.network); + if (logs.length > 0) this.logBuffer.unshift(...logs); + if (traces.length > 0) this.traceBuffer.unshift(...traces); + if (network.length > 0) this.networkBuffer.unshift(...network); + + // Breadcrumbs were converted; keep a small breadcrumb trail hint for later flush. + for (const c of crumbs.slice(-10)) { + this.breadcrumbs.add(c.category, c.message, c.data); + } } } @@ -432,7 +478,7 @@ export class DiagnosticsClient { */ private setupNetworkCapture(): void { this.networkInterceptor = new NetworkInterceptor( - (request) => { + request => { this.networkBuffer.push(request); }, { @@ -455,9 +501,9 @@ export class DiagnosticsClient { }; const capture = (level: LogLevel, args: unknown[]) => { - const message = args.map(a => - typeof a === 'object' ? JSON.stringify(a) : String(a) - ).join(' '); + const message = args + .map(a => (typeof a === 'object' ? JSON.stringify(a) : String(a))) + .join(' '); this.log(level, message, { module: 'console', source: 'captured' }); };