From bf7769bdaa15f5e93ba5e4ec49d2f5569874d617 Mon Sep 17 00:00:00 2001
From: saravanakumardb1 <saravanakumardb1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 10:41:02 -0800
Subject: [PATCH] fix(diagnostics-client): use session-scoped ingest endpoints;
 update MCP+A2A docs

---
 docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md | 138 +++++++++++
 docs/MCP+A2A/DOMAIN_DASHBOARDS.md           |  54 ++++
 docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md   |  70 ++++++
 docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md    |  82 ++++++
 docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md     | 120 +++++++++
 docs/MCP+A2A/DOMAIN_PRODUCTS.md             |  80 ++++++
 docs/MCP+A2A/EXECUTION_CHECKLIST.md         |  59 +++++
 docs/MCP+A2A/IMPLEMENTATION_PLAN.md         | 261 ++++++++++++++++++++
 docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md        | 155 ++++++++++++
 docs/MCP+A2A/README.md                      |  35 +++
 docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md  | 173 +++++++++++++
 packages/diagnostics-client/src/client.ts   | 150 +++++++----
 12 files changed, 1325 insertions(+), 52 deletions(-)
 create mode 100644 docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md
 create mode 100644 docs/MCP+A2A/DOMAIN_DASHBOARDS.md
 create mode 100644 docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md
 create mode 100644 docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md
 create mode 100644 docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md
 create mode 100644 docs/MCP+A2A/DOMAIN_PRODUCTS.md
 create mode 100644 docs/MCP+A2A/EXECUTION_CHECKLIST.md
 create mode 100644 docs/MCP+A2A/IMPLEMENTATION_PLAN.md
 create mode 100644 docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md
 create mode 100644 docs/MCP+A2A/README.md
 create mode 100644 docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md
diff --git a/docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md b/docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md
new file mode 100644
index 00000000..18a8c857
--- /dev/null
+++ b/docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md
@@ -0,0 +1,138 @@
+# A2A Orchestration Framework — Recommended Pattern (ByteLyst)
+
+## Intent
+
+Standardize how multiple agents collaborate on platform + product tasks (support, ops, releases, prompt iterations) with:
+
+- explicit roles
+- explicit handoff artifacts
+- consistent safety + audit
+
+## Canonical roles (agents)
+
+### 1) Router / Dispatcher
+
+- **Responsibility**
+  - decide which specialist agent(s) to involve
+  - enforce policy (role gating, PII constraints)
+- **Inputs**
+  - user request + current environment (productId, repo)
+- **Outputs**
+  - sequence of tasks (plan) + handoff payloads
+
+### 2) Telemetry Analyst
+
+- **Responsibility**
+  - find clusters, correlate by version/platform, propose next steps
+- **Calls**
+  - telemetry MCP tools
+
+### 3) Diagnostics Orchestrator
+
+- **Responsibility**
+  - start / monitor remote diagnostics sessions
+  - summarize results
+- **Calls**
+  - diagnostics MCP tools
+
+### 4) Extraction Task Designer
+
+- **Responsibility**
+  - design extraction task prompts/examples
+  - coordinate evaluation runs
+- **Calls**
+  - extraction MCP tools
+
+### 5) Ops Executor
+
+- **Responsibility**
+  - execute mutating ops (jobs trigger, maintenance windows, flag changes)
+- **Calls**
+  - platform MCP tools
+
+### 6) Fix / PR Agent
+
+- **Responsibility**
+  - implement code changes
+  - ensure tests and conventions
+
+### 7) Report Writer
+
+- **Responsibility**
+  - produce a final summary in a consistent format
+  - include links/IDs (clusterId, sessionId, policyId)
+
+## Handoff artifacts (contracts)
+
+Examples below are illustrative and abbreviated (only example A shows `productId`). In practice, every artifact must be explicitly scoped to a `productId`.
+
+### A) Support incident brief
+
+```json
+{
+  "productId": "<productId>",
+  "userReport": {
+    "summary": "dictation inserts nothing in Messages",
+    "platform": "ios",
+    "channel": "keyboard_extension",
+    "appVersion": "1.2.0",
+    "buildNumber": "35",
+    "userId": "usr_...",
+    "anonymousInstallId": "..."
+  },
+  "timeWindow": { "from": "...", "to": "..." }
+}
+```
+
+Mapping note: `userReport.anonymousInstallId` maps to diagnostics session targeting via `targetAnonymousId`.
+
+### B) Telemetry findings
+
+```json
+{
+  "clusters": [{ "clusterId": "...", "pk": "...", "severity": "error" }],
+  "topHypotheses": ["permission denied", "insertText noop"],
+  "recommendedActions": ["start diagnostics session", "enable debug policy for one user"]
+}
+```
+
+### C) Diagnostics session plan
+
+```json
+{
+  "target": { "userId": "...", "deviceId": "..." },
+  "collection": { "level": "trace", "captureNetwork": true, "captureLogs": true },
+  "expiresInMinutes": 30
+}
+```
+
+### D) Patch plan (code)
+
+- scope, files, risk, tests
+
+## Routing logic (simple)
+
+- If request mentions:
+  - **"crash" / "not working" / "bug"** → Telemetry Analyst → Diagnostics Orchestrator → Fix Agent
+  - **"extraction" / "entity" / "triage"** → Extraction Task Designer → Eval Runner → Fix Agent
+  - **"maintenance" / "flag" / "job"** → Ops Executor
+
+## Safety rules
+
+- Never include raw user content in telemetry/diagnostics.
+- Diagnostics sessions must be time-bounded.
+- Mutating actions require:
+  - explicit approval from dispatcher
+  - audit log
+  - optional dry-run
+
+## Where A2A yields immediate wins in this workspace
+
+- **Telemetry policy governance**
+  - Planner + Reviewer pattern
+- **Remote diagnostics**
+  - Orchestrator agent that monitors sessions and summarizes
+- **Prompt iteration loops** (extraction)
+  - Task designer + eval runner separation
+- **Release workflows**
+  - Dedicated agent for quality gates (build/test/typecheck) and a separate agent for publishing
diff --git a/docs/MCP+A2A/DOMAIN_DASHBOARDS.md b/docs/MCP+A2A/DOMAIN_DASHBOARDS.md
new file mode 100644
index 00000000..c9539483
--- /dev/null
+++ b/docs/MCP+A2A/DOMAIN_DASHBOARDS.md
@@ -0,0 +1,54 @@
+# Domain — Dashboards (admin-web, tracker-web, ux-lab)
+
+## Admin dashboard (`dashboards/admin-web`)
+
+### Existing leverage points
+
+- It already centralizes many ops capabilities behind UI.
+- It already has a service client layer (`src/lib/platform-client.ts`) that talks to `platform-service`.
+
+### MCP opportunities
+
+- Provide “headless equivalents” of the admin UI actions via MCP tools.
+- Use MCP resources to provide the dashboard with richer contextual data:
+  - module inventories
+  - policy templates
+  - incident runbooks
+
+### A2A opportunities
+
+- Build an Ops Copilot that:
+  - proposes actions
+  - executes via MCP tools
+  - links back to the relevant admin dashboard pages
+
+## Tracker dashboard (`dashboards/tracker-web`)
+
+- Candidate MCP tools (illustrative names; map onto the tracker modules in platform-service such as items/votes/comments/public):
+  - `tracker.listPublicItems()`
+  - `tracker.submitFeedback()`
+  - `tracker.vote(itemId)`
+
+## UX Lab (`dashboards/ux-lab`)
+
+### Existing intent
+
+These micro-apps are greenfield UI experiments that intentionally avoid backend dependencies.
+
+### MCP opportunity
+
+- Use MCP as a **dataset generator** for local-only UX experiments.
+  - Example: generate realistic telemetry events & clusters JSON.
+
+### Suggested tools
+
+- `dev.generateSampleTelemetryEvents(count, shape, seed)` (local-only helper)
+- `dev.generateSampleClusters(count, seed)` (local-only helper)
+
+## Dashboard “component extraction” workflow
+
+An A2A workflow that:
+
+- identifies patterns in ux-lab (tables/filters/drawers)
+- proposes a migration plan into `@bytelyst/dashboard-components`
+- auto-generates a PR in the appropriate package
diff --git a/docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md b/docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md
new file mode 100644
index 00000000..d03f1c3f
--- /dev/null
+++ b/docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md
@@ -0,0 +1,70 @@
+# Domain — extraction-service (MCP + A2A Opportunities)
+
+## Why extraction-service is ideal for MCP
+
+It already provides:
+
+- a single entrypoint (`POST /extract`, `POST /extract/batch`)
+- async extraction jobs (`/extract/jobs`)
+- model registry
+- sidecar health monitoring and circuit breaker
+- rate limits + quotas + cache
+
+Agents can use MCP tools to iterate on prompts/tasks safely and repeatably.
+
+## High-value MCP tool proposals
+
+### Core extraction
+
+- `extraction.extract(text, taskId?, modelId?, productId?)`
+- `extraction.extractBatch(inputs, modelId?)`
+
+### Async jobs
+
+- `extraction.submitJob(inputs, modelId?, webhookUrl?)`
+- `extraction.getJob(jobId)`
+- `extraction.listJobs()`
+
+### Observability
+
+- `extraction.sidecarHealth()`
+- `extraction.metrics()`
+- `extraction.cacheStats()` (backs `GET /extract/cache-stats`)
+- `extraction.sidecarMonitoringState()` (backs `GET /extract/monitoring/sidecar`)
+
+### Rate limits / admin utilities
+
+- `extraction.getProductRateLimitStatus(productId?)`
+- `extraction.resetProductRateLimit(productId)` (admin)
+
+## Recommended MCP resources
+
+- `extraction.modelRegistry`
+- `extraction.taskCatalog`
+  - list task IDs used across products (triage, reflection-enrichment, memory-insight, etc.)
+- `extraction.promptGuidelines`
+
+## Recommended A2A workflows
+
+### 1) Task design loop
+
+- **TaskDesignerAgent** drafts:
+  - task prompt
+  - a small set of examples
+- **EvalRunnerAgent** runs:
+  - `extractBatch` over an eval set
+  - compares JSON shape correctness
+- **RegressionAgent** checks:
+  - no degradation vs previous baseline
+
+### 2) Extraction incident response
+
+- If extraction errors spike:
+  - check sidecar health and circuit breaker state
+  - reduce per-product rate limits
+  - switch modelId (if supported)
+
+## Product integration hotspots
+
+- MindLyst web API routes proxy to extraction-service (`/api/extract` and triage routes).
+- Future: other products can standardize on the same tasks and use a shared task registry.
diff --git a/docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md b/docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md
new file mode 100644
index 00000000..477a3bdf
--- /dev/null
+++ b/docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md
@@ -0,0 +1,82 @@
+# Domain — Shared Packages + SDKs (MCP + A2A Opportunities)
+
+## Why packages/SDKs matter for MCP/A2A
+
+They define:
+
+- the **portable contracts** (telemetry event schema, diagnostics types)
+- the **client integration points** (Swift/Kotlin SDKs, TS client packages)
+- reusable primitives (offline queue, platform client)
+
+MCP/A2A should treat these as _the single source of truth_ for:
+
+- schemas
+- naming conventions
+- safety constraints (no PII)
+
+## Key packages and how to leverage them
+
+### `@bytelyst/telemetry-client`
+
+- Already provides a browser/RN-safe client.
+- MCP can expose resources:
+  - event schema
+  - recommended module/eventName conventions
+
+**Opportunity:** add a companion “policy-aware client” mode that calls `GET /api/telemetry/config` and samples accordingly.
+
+**Note:** `platform-service` already exposes `GET /api/telemetry/config` (ETag-based). The remaining work is wiring the client to consume it safely.
+
+### `@bytelyst/diagnostics-client`
+
+- Provides session polling + capture utilities.
+
+**Gap (must fix before relying on it):** the client currently flushes batches to `POST /api/diagnostics/ingest`, while `platform-service` routes are session-scoped:
+
+- `POST /api/diagnostics/sessions/:id/logs`
+- `POST /api/diagnostics/sessions/:id/traces`
+- `POST /api/diagnostics/sessions/:id/screenshots` (SAS upload)
+
+Decision: update `@bytelyst/diagnostics-client` to post to the session-scoped endpoints (no backwards-compat alias endpoint needed).
+
+**Opportunity:** standardize how product apps integrate it (common initialization patterns and user-consent prompts).
+
+### `@bytelyst/platform-client`
+
+- Typed fetch wrapper with auth injection.
+
+**Opportunity:** use it as the basis for a _frontend-side_ MCP client (where appropriate) and for consistent request-id propagation.
+
+### `@bytelyst/offline-queue`
+
+- Good candidate for A2A workflows that need reliable retries.
+
+### Swift / Kotlin Platform SDKs
+
+- Already provide consistent platform-service integration.
+
+**Opportunity:**
+
+- expose SDK version + capabilities as MCP resources
+- keep a “compatibility matrix” resource (which products use which SDK features)
+
+## MCP resources recommended from packages
+
+- `schemas.telemetry` (from shared types)
+- `schemas.diagnostics` (from shared types)
+- `sdk.swift.capabilities`
+- `sdk.kotlin.capabilities`
+
+## A2A workflows
+
+### 1) Cross-repo integration audit
+
+- Identify product repos that drift from shared SDK patterns.
+- Output: a per-repo “alignment report” and suggested PRs.
+
+### 2) Release impact analysis
+
+- When changing a shared package:
+  - agent enumerates downstream consumers
+  - runs typecheck/build matrix
+  - updates docs + versioning
diff --git a/docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md b/docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md
new file mode 100644
index 00000000..1ade4ca7
--- /dev/null
+++ b/docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md
@@ -0,0 +1,120 @@
+# Domain — platform-service (MCP + A2A Opportunities)
+
+## Why this domain is ideal for MCP
+
+`platform-service` is already organized as a set of well-defined modules and stable REST endpoints (Fastify 5 + Zod), with:
+
+- telemetry ingestion + query + policies
+- remote diagnostics sessions
+- jobs, delivery, sessions, maintenance
+- settings, flags, rate limits
+- webhooks subscriptions
+
+This makes it a near-perfect backing store for MCP tools.
+
+## High-value MCP tool proposals
+
+### Telemetry
+
+- `telemetry.queryEvents(filters)`
+- `telemetry.listClusters(filters)`
+- `telemetry.updateClusterStatus(clusterId, pk, status)`
+- `telemetry.listPolicies()`
+- `telemetry.previewPolicy(targeting)`
+- `telemetry.createPolicy(input)`
+- `telemetry.updatePolicy(id, updates)`
+- `telemetry.deletePolicy(id)`
+- `telemetry.getMetrics()`
+- `telemetry.getGeoDistribution(from?, to?)`
+
+**A2A use**
+
+- Telemetry Analyst Agent automates:
+  - time window selection
+  - cluster-to-user drilldown
+  - policy suggestions (targeting + expiry)
+
+### Remote diagnostics
+
+- `diagnostics.createSession(input)`
+- `diagnostics.listSessions(filters)`
+- `diagnostics.getSession(id)`
+- `diagnostics.updateSession(id, updates)`
+- `diagnostics.cancelSession(id)`
+- `diagnostics.getLogs(sessionId, filters)`
+- `diagnostics.getTraces(sessionId, filters)`
+- `diagnostics.listScreenshots(sessionId)`
+
+**Target identifiers (schema-aligned):** `targetUserId`, `targetAnonymousId`, `targetDeviceId`.
+
+**A2A use**
+
+- Diagnostics Orchestrator Agent monitors session lifecycle and compiles summaries.
+
+### Jobs
+
+The current module supports listing jobs, updating a job, triggering a job, and listing job runs.
+
+- `jobs.list()`
+- `jobs.get(id)`
+- `jobs.update(id, updates)`
+- `jobs.trigger(jobName)`
+- `jobs.listRuns(jobName, limit)`
+
+**Note**
+
+- Current `jobs/routes.ts` uses `DEFAULT_PRODUCT_ID = 'lysnrai'`. For MCP, prefer explicit productId routing.
+
+### Settings / kill switch
+
+- `settings.get(userId)`
+- `settings.update(userId, patch)`
+- `settings.getDeviceResolved(userId, deviceId)`
+- `settings.setDeviceOverrides(userId, deviceId, overrides)`
+- `settings.clearDeviceOverrides(userId, deviceId)`
+- `settings.checkKillSwitch(productId)`
+
+### Flags
+
+- `flags.list(productId)`
+- `flags.get(key, productId)`
+- `flags.upsert(key, enabled, targeting, description)`
+
+### Maintenance
+
+- `maintenance.getCurrent(productId)`
+- `maintenance.set(productId, mode, bypassRules, windows)`
+
+### Webhooks
+
+- `webhooks.listSubscriptions(productId)`
+- `webhooks.createSubscription(input)`
+- `webhooks.updateSubscription(id, productId, updates)`
+- `webhooks.deleteSubscription(id, productId)`
+- `webhooks.listDeliveries(subscriptionId, limit)`
+- `webhooks.test(subscriptionId)`
+- `webhooks.rotateSecret(subscriptionId)`
+
+## Recommended MCP resources
+
+- `platform-service.modules`
+  - enumerates module names, key endpoints, auth requirements
+- `telemetry.eventSchema`
+- `diagnostics.sessionSchema`
+
+## Recommended A2A workflows backed by platform-service
+
+### 1) Support debug pack
+
+- input: user report
+- output: timeline + clusters + recommended actions + (optional) diagnostics session results
+
+### 2) Canary rollout of additional telemetry
+
+- build policy → preview → create with expiry → monitor cluster changes
+
+### 3) Post-incident cleanup
+
+- resolve clusters
+- remove policies
+- export audit log for the incident window
diff --git a/docs/MCP+A2A/DOMAIN_PRODUCTS.md b/docs/MCP+A2A/DOMAIN_PRODUCTS.md
new file mode 100644
index 00000000..0574bf6b
--- /dev/null
+++ b/docs/MCP+A2A/DOMAIN_PRODUCTS.md
@@ -0,0 +1,80 @@
+# Domain — Product Repos (MCP + A2A Opportunities)
+
+This document captures product-specific “where MCP/A2A helps” patterns, without duplicating each product’s full architecture docs.
+
+## Cross-product recurring needs
+
+- Debugging: telemetry clusters + remote diagnostics sessions
+- Platform controls: kill switch, feature flags, maintenance
+- Content intelligence: extraction tasks
+- Release readiness: build/test/typecheck workflows
+
+## ChronoMind (`learning_ai_clock`)
+
+- **Opportunities**
+  - Telemetry-driven quality tracking for timer engine + NL parsing.
+  - A2A “routine regression” agent: detect changes that affect scheduling.
+- **MCP hooks**
+  - telemetry tools for PWA
+  - platform-service jobs/webhooks for timer sharing integrations
+
+## NomGap (`learning_ai_fastgap`)
+
+- **Opportunities**
+  - React Native offline-first flows map well onto offline queue + platform-client.
+  - A2A “protocol tuning” agent: uses telemetry + extraction to correlate adherence patterns.
+- **MCP hooks**
+  - telemetry + kill switch clients already exist; MCP can standardize their usage.
+
+## PeakPulse (`learning_ai_peakpulse`)
+
+- **Opportunities**
+  - Sync reliability: a diagnostics session targeted at a user’s device can capture network failures.
+  - A2A “safety alerts correctness” agent using telemetry to validate thresholds.
+- **MCP hooks**
+  - platform-service telemetry/diagnostics
+  - product backend endpoints for session uploads (via sync engine)
+
+## MindLyst (`learning_multimodal_memory_agents`)
+
+- **Opportunities**
+  - Extraction-service is core to triage and insight enrichment; prompt/task iteration loop is high ROI.
+  - A2A “triage regression” agent that runs eval suites.
+- **MCP hooks**
+  - extraction tools
+  - telemetry for web + native apps
+
+## JarvisJr (`learning_ai_jarvis_jr`)
+
+- **Opportunities**
+  - Multi-agent product: A2A patterns can be applied to its own internal coaching “crew”.
+  - Marketplace + certification workflows can be agent-automated.
+- **MCP hooks**
+  - platform-service for auth/telemetry
+  - product backend for marketplace modules
+
+## LysnrAI (`learning_voice_ai_agent`)
+
+- **Opportunities**
+  - Support/debug workflows are already telemetry-heavy (keyboard + desktop).
+  - A2A “keyboard bug triage” agent that starts diagnostics sessions + drafts fixes.
+- **MCP hooks**
+  - platform-service telemetry/diagnostics
+  - admin dashboard tooling
+
+## Product-specific MCP server (optional)
+
+For each product, you can optionally add a small MCP namespace that calls the product backend (`backend/` in each repo) for domain actions.
+
+Examples:
+
+- ChronoMind: `timers.list/create`, `routines.run/validate`
+- PeakPulse: `sessions.upload`, `routes.export`
+- JarvisJr: `agents.list/publish`, `marketplace.certify`
+
+## Recommended first A2A workflows to ship per product
+
+- **All products**: Support Debug Pack (telemetry + diagnostics)
+- **MindLyst**: Extraction task design + eval loop
+- **JarvisJr**: Marketplace certification assistant
+- **NomGap**: Offline queue flush assistant / sync reliability assistant
diff --git a/docs/MCP+A2A/EXECUTION_CHECKLIST.md b/docs/MCP+A2A/EXECUTION_CHECKLIST.md
new file mode 100644
index 00000000..97748cd9
--- /dev/null
+++ b/docs/MCP+A2A/EXECUTION_CHECKLIST.md
@@ -0,0 +1,59 @@
+# MCP + A2A — Execution Checklist
+
+This is the “ready to start building” checklist that turns the docs in this folder into an executable plan.
+
+## 1) Decisions to make (30–60 minutes)
+
+- **MCP server placement**
+  - Recommended default: create a new service/package under `learning_ai_common_plat` (not colocated inside `platform-service`) to keep runtime concerns separated.
+- **Integration mode**
+  - Recommended default: REST-only calls to `platform-service` and `extraction-service` for Phase 1.
+  - Defer direct Cosmos reads until you have a clear perf/cost need.
+- **Auth strategy**
+  - Recommended default: platform-service JWT for interactive use; platform API tokens only for trusted automation.
+- **Where to store A2A handoffs**
+  - Recommended default: in Phase 1, store handoffs as telemetry events + structured logs; in Phase 2, introduce a dedicated Cosmos container if you need queryability.
+
+## 2) Must-fix dependency before MVP
+
+- **Diagnostics client/server route mismatch**
+  - `platform-service` ingests via session-scoped endpoints:
+    - `POST /api/diagnostics/sessions/:id/logs`
+    - `POST /api/diagnostics/sessions/:id/traces`
+    - screenshots via session-scoped SAS upload
+  - `@bytelyst/diagnostics-client` currently flushes to `POST /api/diagnostics/ingest`.
+
+Resolution (the client is updated; the server is left unchanged):
+
+- Decision: update `@bytelyst/diagnostics-client` to post to the session-scoped endpoints. Do not add a backwards-compatible `POST /api/diagnostics/ingest` alias endpoint.
+
+## 3) Phase 1 build steps (P0 slice)
+
+- **Implement MCP tool namespaces**
+  - `platform.telemetry.*`
+  - `platform.diagnostics.*`
+  - `extraction.*`
+- **Enforce hard guardrails in MCP layer**
+  - `productId` required and forwarded as `x-product-id`
+  - `x-request-id` required and propagated
+  - default query caps + max caps
+  - expiry required for any “amplification” (telemetry policy, diagnostics session)
+  - role gating (viewer/admin/super_admin)
+- **Ship one compound tool**
+  - `support.createDebugPack(...)`
+
+## 4) Phase 1 definition of done
+
+- Read-only tools work end-to-end against real services.
+- Mutating tools are role-gated and generate audit trails.
+- The compound debug pack produces a single structured artifact with:
+  - telemetry cluster references
+  - optional diagnostics session reference
+  - a short markdown summary
+
+## 5) Phase 2+ quick sanity checks
+
+- If you make telemetry policy-aware clients:
+  - ensure `GET /api/telemetry/config` consumption is cached (ETag) and privacy-safe.
+- If you add direct Cosmos reads:
+  - enforce product scoping at query layer and add explicit auditing for sensitive reads.
diff --git a/docs/MCP+A2A/IMPLEMENTATION_PLAN.md b/docs/MCP+A2A/IMPLEMENTATION_PLAN.md
new file mode 100644
index 00000000..1042f78e
--- /dev/null
+++ b/docs/MCP+A2A/IMPLEMENTATION_PLAN.md
@@ -0,0 +1,261 @@
+# MCP + A2A — Implementation Plan (Execution-Ready)
+
+## Objective
+
+Deliver a **safe, auditable, product-aware** MCP + A2A capability layer on top of existing ByteLyst services (primarily `platform-service` and `extraction-service`) so that agents can:
+
+- diagnose incidents (telemetry + diagnostics)
+- manage telemetry policies / rollouts
+- orchestrate remote diagnostics sessions
+- iterate on extraction tasks/prompts with eval loops
+- run repeatable ops workflows (jobs, flags, maintenance)
+
+This plan intentionally starts with a **minimal P0 slice** and expands in phases.
+
+## Guiding constraints (must-haves)
+
+- **Product isolation**
+  - Every tool call is scoped to an explicit `productId`.
+- **Auditability**
+  - Every mutating tool call must produce an audit record (directly or via existing APIs).
+- **Least privilege**
+  - Query tools available to viewer roles; mutating tools gated to admin/super_admin.
+- **Safety defaults**
+  - Mutations support `dryRun` where feasible.
+  - Diagnostic amplification (policies/sessions) must require an expiry: `expiresAt` for telemetry policies, and `expiresAt` or `expiresInMinutes` for diagnostics sessions.
+- **No new “shadow APIs”**
+  - Prefer calling existing service endpoints; only add endpoints if the tool surface cannot be expressed otherwise.
+
+## Phase 0 — Baseline readiness (1–2 days)
+
+### Deliverables
+
+- A “tool surface” inventory mapped to existing REST endpoints and required headers.
+- A role matrix for tool authorization.
+- A request-id propagation and logging standard for MCP tool calls.
+
+### Phase 0 checklist (definition of done)
+
+- Confirm whether MCP tool implementations will call **REST only** (preferred) or allow **direct Cosmos reads** for selected query paths.
+- Choose whether MCP is:
+  - a new service/package in `learning_ai_common_plat`, or
+  - colocated under `services/platform-service`.
+- Define the initial auth strategy:
+  - JWT only, or
+  - JWT + API tokens for automation.
+- Define tool-level authorization rules (viewer/admin/super_admin) and how they’re enforced.
+- Agree on a consistent product scoping rule:
+  - `productId` is mandatory input for every tool call and sent as `x-product-id` downstream.
+
+### Decisions to lock
+
+- **MCP server shape**
+  - Single server with namespaces (`platform.*`, `extraction.*`) vs. two servers.
+- **Auth mechanism**
+  - Primary: platform-service JWT.
+  - Secondary: platform API tokens for trusted automation.
+
+### Required invariants
+
+- `x-request-id` propagated on all downstream calls.
+- `x-product-id` required and validated for all calls.
+
+## Phase 1 — MVP MCP server (P0 slice) (3–7 days)
+
+### Goal
+
+Enable a Support/Ops agent to answer: _“What’s happening?”_ and _“Start a bounded diagnostics session.”_
+
+### Tool surface (MVP)
+
+#### Read-only (viewer)
+
+- `telemetry.queryEvents(filters)`
+- `telemetry.listClusters(filters)`
+- `telemetry.listPolicies()`
+- `telemetry.getMetrics()`
+- `diagnostics.getSession(sessionId)`
+- `diagnostics.listSessions(filters)`
+- `diagnostics.getLogs(sessionId, filters)`
+- `diagnostics.getTraces(sessionId, filters)`
+- `extraction.sidecarHealth()`
+
+### Phase 1 prerequisites (to avoid hidden integration failures)
+
+- Align `@bytelyst/diagnostics-client` ingest endpoint with `platform-service`.
+  - Today the service ingests via `POST /api/diagnostics/sessions/:id/logs|traces` (and session-scoped screenshot upload), while the client posts to `POST /api/diagnostics/ingest`.
+  - Decision: update the client to use the session-scoped endpoints (no backwards-compat alias endpoint).
+- Decide whether `@bytelyst/telemetry-client` should become policy-aware by consuming `GET /api/telemetry/config`.
+  - If yes, treat it as a Phase 1 deliverable (with caching + ETag). Otherwise, explicitly defer to Phase 2.
+
+#### Mutating (admin)
+
+- `telemetry.previewPolicy(targeting)`
+- `telemetry.createPolicy(input)`
+  - requires `expiresAt`
+- `telemetry.updatePolicy(id, updates)`
+- `telemetry.updateClusterStatus(clusterId, pk, status)`
+- `diagnostics.createSession(target, config)`
+  - requires `expiresAt` or `expiresInMinutes`
+- `diagnostics.updateSession(sessionId, updates)`
+- `diagnostics.cancelSession(sessionId)`
+
+#### Compound workflow tool (admin)
+
+- `support.createDebugPack(input)`
+  - internally calls:
+    - telemetry queries
+    - optional diagnostics session create + polling
+  - returns a single structured artifact:
+    - `debugPackId`, `clusterRefs`, `sessionRefs`, and a markdown summary
+
+### Output contracts (schemas)
+
+Define explicit JSON schemas (Zod or equivalent) for:
+
+- `TelemetryFilters`
+- `TelemetryPolicyInput` (requires expiry)
+- `DiagnosticsSessionTarget`
+- `DiagnosticsSessionConfig`
+- `DebugPackRequest`
+- `DebugPackResponse`
+
+### Guardrails
+
+- Query limits:
+  - default `limit` and max `limit` enforced at MCP layer.
+- Policy guardrails:
+  - require `expiresAt`
+  - require explicit `eventTypes/modules`
+  - block wildcard collection unless super_admin
+- Diagnostics guardrails:
+  - enforce max duration
+  - enforce max capture volume per flush
+
+### Acceptance criteria
+
+- A single agent can:
+  - pull clusters for a product + time window
+  - propose a telemetry policy and preview targeting
+  - create an expiring policy
+  - start a diagnostics session and retrieve data
+  - generate a “Debug Pack” artifact
+
+### Phase 1 engineering checklist (definition of done)
+
+- MCP layer enforces:
+  - request-id propagation (`x-request-id`)
+  - required product scoping (`productId`)
+  - default query caps and maximum caps
+  - expiry requirements for policies/sessions
+- Every mutating tool call produces an audit record, either:
+  - by calling existing audit endpoints, or
+  - by ensuring the underlying platform-service endpoint already records an audit entry.
+- Tool names and inputs are documented and stable (no breaking renames during Phase 1)
+
+## Phase 2 — A2A orchestration (1–2 weeks)
+
+### Goal
+
+Turn multi-step support/ops workflows into **repeatable agent playbooks** with explicit handoffs.
+
+### Standard agents
+
+- **DispatcherAgent**
+- **TelemetryAnalystAgent**
+- **DiagnosticsOrchestratorAgent**
+- **OpsExecutorAgent**
+- **ReportWriterAgent**
+
+### Handoff artifacts
+
+- `SupportIncidentBrief`
+- `TelemetryFindings`
+- `DiagnosticsSessionPlan`
+- `OpsChangePlan`
+- `FinalIncidentReport`
+
+### Acceptance criteria
+
+- “Support Debug Pack” runs end-to-end via:
+  - Dispatcher → Telemetry Analyst → Diagnostics Orchestrator → Report Writer
+- Every handoff is persisted (even if only in logs initially) with stable IDs.
+
+## Phase 3 — Extraction task iteration loop (1–3 weeks)
+
+### Goal
+
+Make extraction prompt/task improvements safe, testable, and regression-resistant.
+
+### MCP tools
+
+- `extraction.extract(text, taskId?, modelId?)`
+- `extraction.extractBatch(inputs)`
+- `extraction.submitJob(inputs, webhookUrl?)`
+- `extraction.getJob(jobId)`
+- `extraction.metrics()` / `extraction.cacheStats()`
+
+### A2A workflow
+
+- **TaskDesignerAgent** drafts task prompt + examples
+- **EvalRunnerAgent** runs batch eval sets
+- **RegressionAgent** compares to baseline
+- **PublisherAgent** updates task registry + rollout
+
+### Acceptance criteria
+
+- A single command/workflow can:
+  - run eval suite
+  - compute simple quality metrics (schema validity, required fields coverage)
+  - produce a report and recommended next edit
+
+## Phase 4 — Ops expansion (jobs/flags/maintenance/webhooks) (1–2 weeks)
+
+### Tools
+
+- `jobs.list`, `jobs.trigger`, `jobs.listRuns`
+- `flags.list`, `flags.upsert`, `flags.evaluate`
+- `maintenance.getCurrent`, `maintenance.set`
+- `webhooks.listSubscriptions`, `webhooks.test`, `webhooks.rotateSecret`
+
+### Acceptance criteria
+
+- “Ops Copilot” can safely execute a bounded change plan:
+  - propose change
+  - dry-run if supported
+  - execute with audit
+  - verify outcome
+
+## Security & privacy checklist
+
+- Explicit `productId` on every tool call
+- Avoid returning raw PII in tool results
+- Ensure diagnostics redaction remains enforced server-side
+- Enforce expirations on policies and sessions
+- Rate limit MCP server endpoints
+
+## Rollout strategy
+
+- Start with internal-only usage (super_admin).
+- Add admin roles once guardrails are proven.
+- Add viewer read-only access for broader teams.
+- Add product-specific namespaces only after platform namespaces stabilize.
+
+## Work breakdown (suggested)
+
+- **Milestone A**: MVP MCP server + telemetry/diagnostics read-only
+- **Milestone B**: mutating tools + dry-run/expiry enforcement
+- **Milestone C**: `support.createDebugPack` compound tool
+- **Milestone D**: A2A runner + handoff schemas
+- **Milestone E**: extraction eval loop
+
+## Open questions (need decisions)
+
+- Should the MCP server call services via:
+  - service REST endpoints only, or
+  - direct Cosmos reads for some query paths?
+- Where should A2A handoff artifacts be stored:
+  - telemetry events,
+  - a dedicated Cosmos container,
+  - or both?
+- Do we want a standalone MCP server repo/package, or an MCP server colocated under `platform-service`?
diff --git a/docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md b/docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md
new file mode 100644
index 00000000..c30e5b21
--- /dev/null
+++ b/docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md
@@ -0,0 +1,155 @@
+# MCP Server Framework — Recommended Architecture (ByteLyst)
+
+## Why an MCP server here
+
+This workspace already has a clear separation of concerns:
+
+- **Authoritative services** (Fastify): `platform-service`, `extraction-service`, plus product backends.
+- **Dashboards** (Next.js): admin + tracker.
+- **Client SDKs**: Swift/Kotlin platform SDKs + TS client packages.
+
+An MCP server becomes the **single programmatic gateway** that agents can call to:
+
+- query/act on platform state
+- assemble debugging evidence
+- run repeatable ops workflows
+- safely orchestrate A2A agents
+
+## Core design constraints
+
+- **Do not bypass service invariants**
+  - Prefer calling service endpoints or repositories with the same validation (Zod) and auth.
+- **Auditability**
+  - Every mutating tool should emit audit logs (or call APIs that already do).
+- **Least privilege**
+  - Split tools by role (viewer/admin/super_admin).
+- **Product isolation**
+  - All tools/resources must explicitly bind to `productId`.
+
+## Reality check: what exists today
+
+- `platform-service` already exposes:
+  - `GET /api/telemetry/config` (ETag-based client collection config)
+  - `GET /api/telemetry/query`, `GET /api/telemetry/clusters`, policies CRUD (admin)
+  - diagnostics session CRUD + `GET /api/diagnostics/sessions/:id/logs|traces|screenshots` (admin)
+- `extraction-service` already exposes:
+  - `/extract`, `/extract/batch`, `/extract/jobs`, sidecar health, metrics, cache stats
+
+The primary “new work” for MCP is orchestration, safety gating, and consistent auth/audit — not inventing new primitives.
+
+## Proposed MCP servers (2-tier)
+
+### 1) `bytelyst-platform-mcp` (primary)
+
+Backed by `platform-service` (port 4003) and optionally Cosmos for direct reads.
+
+- **Responsibilities**
+  - Telemetry querying + policy management
+  - Remote diagnostics sessions orchestration
+  - Jobs trigger/list
+  - Flags/settings/maintenance
+  - Webhooks + delivery logs
+  - Audit query
+
+### 2) `bytelyst-extraction-mcp` (specialized)
+
+Backed by `extraction-service` (port 4005).
+
+- **Responsibilities**
+  - Extract / batch extract
+  - Submit and monitor async extraction jobs
+  - Sidecar health + circuit breaker insight
+  - Metrics + cache stats
+
+(Optionally, these can be a single MCP server with two namespaces.)
+
+## Tool taxonomy
+
+### A) Read-only tools
+
+- `telemetry.queryEvents`
+- `telemetry.listClusters`
+- `telemetry.getMetrics`
+- `diagnostics.getSession`
+- `diagnostics.getLogs/getTraces`
+- `jobs.list/listRuns`
+- `flags.list`
+- `settings.get`
+- `webhooks.listSubscriptions/listDeliveries`
+- `extraction.metrics/cacheStats/sidecarHealth`
+
+### B) Mutating tools (require elevated role)
+
+- `telemetry.createPolicy/updatePolicy/deletePolicy`
+- `telemetry.updateClusterStatus`
+- `diagnostics.createSession/updateSession/cancelSession`
+- `jobs.trigger`
+- `maintenance.set`
+- `flags.set` (or `flags.upsert`)
+- `webhooks.rotateSecret` / `webhooks.test`
+- `extraction.rateLimitReset` (if you keep that admin endpoint)
+
+### C) Compound tools (“one tool = one workflow”)
+
+- `support.createDebugPack(reportInput)`
+  - pulls telemetry timeline + cluster context
+  - optionally starts diagnostics session
+  - returns a single structured artifact (markdown/json)
+
+This reduces prompt fragility vs. requiring the LLM to call 8 tools in the right order.
+
+## MCP resources
+
+Resources should be stable references agents can read repeatedly:
+
+- `platform.modules.index`
+  - module list + base routes + required headers
+- `telemetry.schema`
+- `diagnostics.schema`
+- `extraction.tasks.catalog`
+- `ops.runbooks`
+  - e.g. “how to debug iOS keyboard insert_noop”
+- `product.identity`
+  - productId, plan tiers, allowed baseUrls
+
+## Prompts (MCP prompt templates)
+
+- `prompt.support_triage`
+- `prompt.telemetry_policy_proposal`
+- `prompt.remote_diagnostics_session_plan`
+- `prompt.extraction_task_design`
+
+## Authentication & authorization
+
+- **Primary**: platform-service JWT (same `verifyToken` logic).
+- **Secondary**: service-to-service API tokens (only for trusted automation).
+- **Tool gating**
+  - viewer: query-only
+  - admin: policy updates, create diagnostics sessions
+  - super_admin: secret rotation, maintenance, destructive operations
+
+## Observability for the MCP server
+
+- Use structured logs (Fastify/pino style) and propagate `x-request-id`.
+- Record tool invocation metrics into `telemetry` on the `backend_service` channel:
+  - module: `mcp`
+  - eventName: `tool_invoked`, `tool_failed`, `a2a_handoff`
+
+## Safe defaults / guardrails
+
+- Any mutating tool should support a `dryRun: true` mode.
+- Enforce `expiresAt` on any “diagnostic collection amplification” (telemetry policy, diagnostics session).
+- Cap queries by default (limit/pageSize) and require an explicit `limit` to raise the cap.
+
+## Known integration risk (fix early)
+
+- Until the session-scoped ingest fix, `@bytelyst/diagnostics-client` flushed to `POST /api/diagnostics/ingest`, while `platform-service` ingests via session-scoped endpoints.
+- Confirm this mismatch is resolved before using diagnostics tooling as a core MCP/A2A workflow dependency.
+  - Decision: update `@bytelyst/diagnostics-client` to post to `POST /api/diagnostics/sessions/:id/logs` and `POST /api/diagnostics/sessions/:id/traces`.
+
+## Suggested initial tool surface (minimal viable)
+
+- `telemetry.queryEvents`, `telemetry.listClusters`, `telemetry.listPolicies`, `telemetry.previewPolicy`, `telemetry.createPolicy`
+- `diagnostics.createSession`, `diagnostics.getSession`, `diagnostics.getLogs`, `diagnostics.getTraces`
+- `extraction.extract`, `extraction.extractBatch`, `extraction.sidecarHealth`
+- `jobs.list`, `jobs.trigger`
diff --git a/docs/MCP+A2A/README.md b/docs/MCP+A2A/README.md
new file mode 100644
index 00000000..df1b6e72
--- /dev/null
+++ b/docs/MCP+A2A/README.md
@@ -0,0 +1,35 @@
+# MCP + A2A — Workspace Initiative
+
+This folder contains a workspace-wide scan of the ByteLyst ecosystem (platform + shared packages + dashboards + product repos) for opportunities to leverage:
+
+- MCP (Model Context Protocol)
+- A2A (agent-to-agent) patterns
+
+## Documents
+
+- `WORKSPACE_USE_CASE_CATALOG.md`
+- `MCP_SERVER_FRAMEWORK.md`
+- `A2A_ORCHESTRATION_FRAMEWORK.md`
+- `IMPLEMENTATION_PLAN.md` (with its companion `EXECUTION_CHECKLIST.md`)
+- `DOMAIN_PLATFORM_SERVICE.md`
+- `DOMAIN_EXTRACTION_SERVICE.md`
+- `DOMAIN_DASHBOARDS.md`
+- `DOMAIN_PACKAGES_AND_SDKS.md`
+- `DOMAIN_PRODUCTS.md`
+
+## Scope of scan
+
+Primary sources used:
+
+- `services/platform-service` (telemetry, diagnostics, jobs, settings, webhooks, auth, etc.)
+- `services/extraction-service` (sidecar + tasks + async jobs + rate limits)
+- `packages/*` (Swift/Kotlin platform SDKs, TS clients, event bus, telemetry/diagnostics clients)
+- `dashboards/admin-web`, `dashboards/tracker-web`, `dashboards/ux-lab`
+- Product repos: ChronoMind (`learning_ai_clock`), NomGap (`learning_ai_fastgap`), PeakPulse (`learning_ai_peakpulse`), LysnrAI (`learning_voice_ai_agent`), MindLyst (`learning_multimodal_memory_agents`), JarvisJr (`learning_ai_jarvis_jr`)
+
+## Notation
+
+- **Tool**: an MCP tool callable by an LLM/agent.
+- **Resource**: an MCP resource (read-only or read-mostly) exposed for context.
+- **Prompt**: an MCP prompt template.
+- **Agent**: an A2A-capable worker with a specific responsibility.
diff --git a/docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md b/docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md
new file mode 100644
index 00000000..8e4ae29f
--- /dev/null
+++ b/docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md
@@ -0,0 +1,173 @@
+# WORKSPACE — MCP + A2A Use-Case Catalog
+
+## Goals
+
+- Identify **high-leverage** MCP tools/resources and A2A agent workflows across the ByteLyst workspace.
+- Provide a **prioritized backlog** that maps cleanly onto existing services:
+  - `platform-service` (telemetry, diagnostics, jobs, settings, auth, webhooks, etc.)
+  - `extraction-service` (structured extraction, async jobs)
+  - dashboards (admin/tracker)
+  - shared packages + mobile SDKs
+  - product repos (ChronoMind, NomGap, PeakPulse, MindLyst, JarvisJr, LysnrAI)
+
+## What “MCP” and “A2A” mean in this workspace
+
+- **MCP server**: exposes authoritative product/platform capabilities as tools/resources.
+  - Tools should map to stable APIs (prefer calling service endpoints / repositories rather than scraping UI).
+  - Resources should expose read-mostly context: schemas, runbooks, current status, recent incidents, etc.
+- **A2A**: multiple specialized agents collaborating via explicit contracts.
+  - Example: _Support agent_ → _Triage agent_ → _Diagnostics agent_ → _Fix agent_ → _Release agent_.
+
+## Reality check: “exists today” vs “proposed tool surface”
+
+- **Backed by existing services today**
+  - Telemetry query/clusters/policies (platform-service)
+  - Telemetry client config (`GET /api/telemetry/config`, ETag-based)
+  - Diagnostics sessions + ingest (platform-service)
+  - Extraction extract/batch/jobs/health (extraction-service)
+  - Jobs/flags/settings/webhooks endpoints (platform-service)
+- **Proposed tools that may require new endpoints or privileged integrations**
+  - `secrets.rotate(...)`, `services.restart(...)` (depends on Key Vault/infra integration)
+  - UX-lab dataset generators (`dev.generateTelemetryDataset(...)`) (local-only helper tooling)
+  - Compound tools like `support.createDebugPack(...)` (implemented in MCP layer as orchestration)
+
+## Prioritization rubric
+
+- **P0**: removes constant manual toil; immediate engineering impact.
+- **P1**: unlocks new ops / reliability capabilities.
+- **P2**: enables new product intelligence / automations.
+- **P3**: experimental / UX labs.
+
+## P0 — Highest leverage (do first)
+
+### 1) Incident / Support “one-click evidence” pack (Telemetry + Diagnostics)
+
+- **Why**
+  - You already have rich primitives: `telemetry` (clusters + policies) and `diagnostics` (remote sessions).
+  - The missing piece is automated _assembly_ of a support-ready packet.
+- **MCP tools**
+  - `telemetry.queryEvents(filters)`
+  - `telemetry.listClusters(filters)`
+  - `telemetry.updateClusterStatus(clusterId, pk, status)`
+  - `diagnostics.createSession(targetUserId|targetAnonymousId|targetDeviceId, config)`
+  - `diagnostics.getSession(sessionId)`
+  - `diagnostics.getLogs(sessionId, filters)`
+  - `diagnostics.getTraces(sessionId, filters)`
+  - `diagnostics.listScreenshots(sessionId)`
+- **A2A agents**
+  - **SupportTriageAgent**: extracts identifiers from the user report (email/userId/anonymousInstallId, appVersion); `anonymousInstallId` maps to `targetAnonymousId` for diagnostics.
+  - **TelemetryAnalystAgent**: pulls clusters + timelines; proposes policy changes.
+  - **DiagnosticsOrchestratorAgent**: starts a debug session when needed; monitors results.
+  - **ReportWriterAgent**: produces a “Debug Pack” markdown/PDF.
+
+### 2) Telemetry policy authoring assistant (safe-by-default)
+
+- **Why**
+  - Policies are powerful, but it is easy to over-collect with them. An assistant can:
+    - suggest targeting
+    - set auto-expiry
+    - choose sampling
+- **MCP tools**
+  - `telemetry.listPolicies()`
+  - `telemetry.previewPolicy(targeting)`
+  - `telemetry.createPolicy(input)`
+  - `telemetry.updatePolicy(id, updates)`
+- **A2A agents**
+  - **PolicyPlannerAgent**: proposes policy with guardrails.
+  - **PolicyReviewerAgent**: checks privacy constraints + expiry + scope.
+
+### 3) “Ops Copilot” for platform-service modules
+
+- **Why**
+  - platform-service contains many modules (auth, flags, delivery, jobs, settings, maintenance, etc.).
+  - Most ops tasks are combinations of a few actions (query, toggle, trigger job).
+- **MCP tools**
+  - `jobs.list()` / `jobs.trigger(name)` / `jobs.listRuns(name)`
+  - `maintenance.get()` / `maintenance.set(mode, schedule)`
+  - `flags.list()` / `flags.set(key, enabled, targeting)`
+  - `settings.get(userId)` / `settings.set(userId, patch)`
+  - `webhooks.listSubscriptions()` / `webhooks.test()` / `webhooks.rotateSecret()`
+- **A2A agents**
+  - **OpsAgent**: executes operations.
+  - **ComplianceAgent**: ensures audit trail + least privilege.
+
+### 4) Extraction service “task builder” and evaluation loop
+
+- **Why**
+  - extraction-service already has model registry, caching, async jobs, sidecar health.
+  - Task prompts/examples are high leverage; building & evaluating them is repetitive.
+- **MCP tools**
+  - `extraction.extract(text, taskId, modelId)`
+  - `extraction.extractBatch(inputs)`
+  - `extraction.submitJob(inputs, webhookUrl?)`
+  - `extraction.getJob(jobId)`
+  - `extraction.sidecarHealth()`
+  - `extraction.metrics()` / `extraction.cacheStats()`
+- **A2A agents**
+  - **TaskDesignerAgent**: drafts taskPrompt + examples.
+  - **EvalRunnerAgent**: runs eval sets, compares outputs.
+  - **PromptRegressionAgent**: ensures no quality regressions.
+
+## P1 — Operational maturity
+
+### 5) Automated regression watch: telemetry clusters → auto diagnostics session
+
+- **Why**
+  - telemetry clusters already have severity escalation + webhook alerts.
+  - diagnostics supports remote sessions.
+- **Flow**
+  - Cluster crosses threshold → A2A triggers a targeted diagnostics session for a small segment.
+- **MCP tools**
+  - `telemetry.listClusters()`
+  - `diagnostics.createSession(...)`
+
+### 6) Secret rotation “assistant” (Key Vault + config propagation)
+
+- **Why**
+  - Secrets are already resolved centrally via `@bytelyst/config`.
+  - Rotation is still error-prone across services/dashboards.
+- **MCP tools/resources**
+  - Resource: `secrets.mapping` (which env vars resolve to which Azure Key Vault secrets)
+  - Tool: `secrets.rotate(name)` (where permitted) + `services.restart(service)` (optional)
+
+### 7) Webhook subscription lifecycle assistant
+
+- **Why**
+  - You have webhooks + delivery logs; assistants can recommend retries, disable rules, test endpoints.
+- **MCP tools**
+  - `webhooks.listSubscriptions(productId)`
+  - `webhooks.listDeliveries(subscriptionId)`
+  - `webhooks.rotateSecret(subscriptionId)`
+  - `webhooks.test(subscriptionId)`
+
+## P2 — Product intelligence and automation
+
+### 8) A/B experimentation assistant (platform-service `ab-testing`, `experiments`)
+
+- **Use**
+  - Draft experiment plan, compute exposure targeting, monitor telemetry signals.
+
+### 9) “Changelog writer” from merged PRs + telemetry impact
+
+- **Use**
+  - Collect changes; relate to drop in cluster counts; propose release notes.
+
+## P3 — UX-lab accelerators
+
+### 10) MCP-driven UX Lab data generators
+
+- **Why**
+  - UX lab apps (telemetry explorer, ops UI kit) need rich sample datasets.
+- **MCP tool**
+  - `dev.generateTelemetryDataset(shape, size, seed)`
+
+## Cross-product patterns that MCP/A2A should standardize
+
+- **Telemetry**
+  - Common event naming + “productId is mandatory” invariant.
+- **Kill switch**
+  - Single check path; consistent failure mode (fail-open).
+- **Extraction**
+  - Task IDs used consistently across products (triage, reflection, insights, etc.).
+- **Workflows**
+  - Release/build/test workflows can become A2A playbooks using MCP tools.
diff --git a/packages/diagnostics-client/src/client.ts b/packages/diagnostics-client/src/client.ts
index a2ca3f73..d18d6c56 100644
--- a/packages/diagnostics-client/src/client.ts
+++ b/packages/diagnostics-client/src/client.ts
@@ -13,7 +13,6 @@ import type {
   LogEntry,
   Breadcrumb,
   NetworkRequest,
-  IngestBatch,
   DeviceState,
 } from './types.js';
 import { BreadcrumbTrail } from './breadcrumbs.js';
@@ -28,7 +27,6 @@ type ErrorEvent = {
   colno: number;
   error?: { stack?: string };
 };
-type EventListener = (event: unknown) => void;
 
 export interface DiagnosticsClientOptions extends DiagnosticsConfig {
   /** Custom logger */
@@ -217,7 +215,7 @@ export class DiagnosticsClient {
       level,
       message,
       timestamp: new Date().toISOString(),
-      module: context.module as string ?? 'unknown',
+      module: (context.module as string) ?? 'unknown',
       context,
       correlationId: context.correlationId as string,
     };
@@ -254,7 +252,10 @@ export class DiagnosticsClient {
       span.durationMs = new Date(span.endTime).getTime() - new Date(span.startTime).getTime();
       span.status = 'ok';
       this.traceBuffer.push(span);
-      this.breadcrumbs.add('trace', `Completed: ${name}`, { spanId: span.spanId, durationMs: span.durationMs });
+      this.breadcrumbs.add('trace', `Completed: ${name}`, {
+        spanId: span.spanId,
+        durationMs: span.durationMs,
+      });
       return result;
     } catch (error) {
       span.endTime = new Date().toISOString();
@@ -262,7 +263,10 @@ export class DiagnosticsClient {
       span.status = 'error';
       span.statusMessage = error instanceof Error ? error.message : String(error);
       this.traceBuffer.push(span);
-      this.breadcrumbs.add('trace', `Failed: ${name}`, { spanId: span.spanId, error: span.statusMessage });
+      this.breadcrumbs.add('trace', `Failed: ${name}`, {
+        spanId: span.spanId,
+        error: span.statusMessage,
+      });
       throw error;
     }
   }
@@ -298,7 +302,7 @@ export class DiagnosticsClient {
       url.searchParams.set('installId', this.config.anonymousInstallId);
 
       const headers: Record<string, string> = {
-        'Accept': 'application/json',
+        Accept: 'application/json',
       };
 
       if (this.lastEtag) {
@@ -347,7 +351,10 @@ export class DiagnosticsClient {
       this.config.logger.error('[diagnostics] Failed to poll for session', {
         error: error instanceof Error ? error.message : String(error),
       });
-      this.state = { type: 'error', error: error instanceof Error ? error : new Error(String(error)) };
+      this.state = {
+        type: 'error',
+        error: error instanceof Error ? error : new Error(String(error)),
+      };
     }
   }
 
@@ -364,56 +371,90 @@ export class DiagnosticsClient {
       return;
     }
 
-    // Build batch
-    const batch: IngestBatch = {
-      sessionId: session.id,
-    };
+    const sessionId = session.id;
 
-    if (this.logBuffer.length > 0) {
-      batch.logs = this.logBuffer.splice(0, 50); // Max 50 per batch
-    }
-
-    if (this.traceBuffer.length > 0) {
-      batch.traces = this.traceBuffer.splice(0, 50);
-    }
-
-    if (this.networkBuffer.length > 0) {
-      batch.network = this.networkBuffer.splice(0, 50);
-    }
-
-    // Add breadcrumbs
+    const logs = this.logBuffer.splice(0, 50); // Server max: 50
+    const traces = this.traceBuffer.splice(0, 50); // Server max: 50
+    const network = this.networkBuffer.splice(0, 50);
     const crumbs = this.breadcrumbs.getAll();
-    if (crumbs.length > 0) {
-      batch.breadcrumbs = [...crumbs];
-      this.breadcrumbs.clear();
+    this.breadcrumbs.clear();
+
+    // Encode breadcrumbs + network captures as log entries so we can ingest
+    // without requiring additional server-side schemas/endpoints.
+    const synthesizedLogs = [] as LogEntry[];
+
+    for (const c of crumbs) {
+      synthesizedLogs.push({
+        level: 'info',
+        message: `[breadcrumb] ${c.category}: ${c.message}`,
+        timestamp: c.timestamp,
+        module: 'diagnostics.breadcrumb',
+        context: c.data ?? {},
+      });
     }
 
-    // Skip if nothing to send
-    if (!batch.logs && !batch.traces && !batch.network && !batch.breadcrumbs) {
+    for (const n of network) {
+      synthesizedLogs.push({
+        level: n.error ? 'error' : 'info',
+        message: `[network] ${n.method} ${n.url} ${n.status ?? ''}`.trim(),
+        timestamp: n.startTime,
+        module: 'diagnostics.network',
+        context: {
+          requestHeaders: n.requestHeaders,
+          requestBody: n.requestBody,
+          status: n.status,
+          responseHeaders: n.responseHeaders,
+          responseBody: n.responseBody,
+          startTime: n.startTime,
+          endTime: n.endTime,
+          durationMs: n.durationMs,
+          error: n.error,
+        },
+      });
+    }
+
+    const allLogs = [...logs, ...synthesizedLogs];
+
+    if (allLogs.length === 0 && traces.length === 0) {
       return;
     }
 
+    const token = await this.getAuthToken();
+    const headers: Record<string, string> = {
+      'Content-Type': 'application/json',
+      ...(token ? { Authorization: `Bearer ${token}` } : {}),
+    };
+
     try {
-      const url = new URL('/api/diagnostics/ingest', this.config.serverUrl);
-      const token = await this.getAuthToken();
+      if (allLogs.length > 0) {
+        const url = new URL(
+          `/api/diagnostics/sessions/${encodeURIComponent(sessionId)}/logs`,
+          this.config.serverUrl
+        );
+        const response = await fetch(url.toString(), {
+          method: 'POST',
+          headers,
+          body: JSON.stringify({ sessionId, logs: allLogs }),
+        });
+        if (!response.ok) throw new Error(`HTTP ${response.status}`);
+      }
 
-      const response = await fetch(url.toString(), {
-        method: 'POST',
-        headers: {
-          'Content-Type': 'application/json',
-          ...(token && { 'Authorization': `Bearer ${token}` }),
-        },
-        body: JSON.stringify(batch),
-      });
-
-      if (!response.ok) {
-        throw new Error(`HTTP ${response.status}`);
+      if (traces.length > 0) {
+        const url = new URL(
+          `/api/diagnostics/sessions/${encodeURIComponent(sessionId)}/traces`,
+          this.config.serverUrl
+        );
+        const response = await fetch(url.toString(), {
+          method: 'POST',
+          headers,
+          body: JSON.stringify({ sessionId, traces }),
+        });
+        if (!response.ok) throw new Error(`HTTP ${response.status}`);
       }
 
       this.config.logger.debug('[diagnostics] Flushed batch', {
-        logs: batch.logs?.length ?? 0,
-        traces: batch.traces?.length ?? 0,
-        network: batch.network?.length ?? 0,
+        logs: allLogs.length,
+        traces: traces.length,
       });
     } catch (error) {
       this.config.logger.error('[diagnostics] Failed to flush batch', {
@@ -421,9 +462,14 @@ export class DiagnosticsClient {
       });
 
       // Put items back in buffers for retry
-      if (batch.logs) this.logBuffer.unshift(...batch.logs);
-      if (batch.traces) this.traceBuffer.unshift(...batch.traces);
-      if (batch.network) this.networkBuffer.unshift(...batch.network);
+      if (logs.length > 0) this.logBuffer.unshift(...logs);
+      if (traces.length > 0) this.traceBuffer.unshift(...traces);
+      if (network.length > 0) this.networkBuffer.unshift(...network);
+
+      // Breadcrumbs were converted; keep a small breadcrumb trail hint for later flush.
+      for (const c of crumbs.slice(-10)) {
+        this.breadcrumbs.add(c.category, c.message, c.data);
+      }
     }
   }
 
@@ -432,7 +478,7 @@ export class DiagnosticsClient {
    */
   private setupNetworkCapture(): void {
     this.networkInterceptor = new NetworkInterceptor(
-      (request) => {
+      request => {
         this.networkBuffer.push(request);
       },
       {
@@ -455,9 +501,9 @@ export class DiagnosticsClient {
     };
 
     const capture = (level: LogLevel, args: unknown[]) => {
-      const message = args.map(a =>
-        typeof a === 'object' ? JSON.stringify(a) : String(a)
-      ).join(' ');
+      const message = args
+        .map(a => (typeof a === 'object' ? JSON.stringify(a) : String(a)))
+        .join(' ');
       this.log(level, message, { module: 'console', source: 'captured' });
     };