fix(diagnostics-client): use session-scoped ingest endpoints; update MCP+A2A docs
This commit is contained in:
parent
435d873436
commit
bf7769bdaa
138
docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md
Normal file
138
docs/MCP+A2A/A2A_ORCHESTRATION_FRAMEWORK.md
Normal file
@ -0,0 +1,138 @@
|
||||
# A2A Orchestration Framework — Recommended Pattern (ByteLyst)
|
||||
|
||||
## Intent
|
||||
|
||||
Standardize how multiple agents collaborate on platform + product tasks (support, ops, releases, prompt iterations) with:
|
||||
|
||||
- explicit roles
|
||||
- explicit handoff artifacts
|
||||
- consistent safety + audit
|
||||
|
||||
## Canonical roles (agents)
|
||||
|
||||
### 1) Router / Dispatcher
|
||||
|
||||
- **Responsibility**
|
||||
- decide which specialist agent(s) to involve
|
||||
- enforce policy (role gating, PII constraints)
|
||||
- **Inputs**
|
||||
- user request + current environment (productId, repo)
|
||||
- **Outputs**
|
||||
- sequence of tasks (plan) + handoff payloads
|
||||
|
||||
### 2) Telemetry Analyst
|
||||
|
||||
- **Responsibility**
|
||||
- find clusters, correlate by version/platform, propose next steps
|
||||
- **Calls**
|
||||
- telemetry MCP tools
|
||||
|
||||
### 3) Diagnostics Orchestrator
|
||||
|
||||
- **Responsibility**
|
||||
- start / monitor remote diagnostics sessions
|
||||
- summarize results
|
||||
- **Calls**
|
||||
- diagnostics MCP tools
|
||||
|
||||
### 4) Extraction Task Designer
|
||||
|
||||
- **Responsibility**
|
||||
- design extraction task prompts/examples
|
||||
- coordinate evaluation runs
|
||||
- **Calls**
|
||||
- extraction MCP tools
|
||||
|
||||
### 5) Ops Executor
|
||||
|
||||
- **Responsibility**
|
||||
- execute mutating ops (jobs trigger, maintenance windows, flag changes)
|
||||
- **Calls**
|
||||
- platform MCP tools
|
||||
|
||||
### 6) Fix / PR Agent
|
||||
|
||||
- **Responsibility**
|
||||
- implement code changes
|
||||
- ensure tests and conventions
|
||||
|
||||
### 7) Report Writer
|
||||
|
||||
- **Responsibility**
|
||||
- produce a final summary in a consistent format
|
||||
- include links/IDs (clusterId, sessionId, policyId)
|
||||
|
||||
## Handoff artifacts (contracts)
|
||||
|
||||
Examples below are illustrative. All artifacts must be explicitly scoped to a `productId`.
|
||||
|
||||
### A) Support incident brief
|
||||
|
||||
```json
|
||||
{
|
||||
"productId": "<productId>",
|
||||
"userReport": {
|
||||
"summary": "dictation inserts nothing in Messages",
|
||||
"platform": "ios",
|
||||
"channel": "keyboard_extension",
|
||||
"appVersion": "1.2.0",
|
||||
"buildNumber": "35",
|
||||
"userId": "usr_...",
|
||||
"anonymousInstallId": "..."
|
||||
},
|
||||
"timeWindow": { "from": "...", "to": "..." }
|
||||
}
|
||||
```
|
||||
|
||||
Mapping note: `userReport.anonymousInstallId` maps to diagnostics session targeting via `targetAnonymousId`.
|
||||
|
||||
### B) Telemetry findings
|
||||
|
||||
```json
|
||||
{
|
||||
"clusters": [{ "clusterId": "...", "pk": "...", "severity": "error" }],
|
||||
"topHypotheses": ["permission denied", "insertText noop"],
|
||||
"recommendedActions": ["start diagnostics session", "enable debug policy for one user"]
|
||||
}
|
||||
```
|
||||
|
||||
### C) Diagnostics session plan
|
||||
|
||||
```json
|
||||
{
|
||||
"target": { "userId": "...", "deviceId": "..." },
|
||||
"collection": { "level": "trace", "captureNetwork": true, "captureLogs": true },
|
||||
"expiresInMinutes": 30
|
||||
}
|
||||
```
|
||||
|
||||
### D) Patch plan (code)
|
||||
|
||||
- scope, files, risk, tests
|
||||
|
||||
## Routing logic (simple)
|
||||
|
||||
- If request mentions:
|
||||
- **"crash" / "not working" / "bug"** → Telemetry Analyst → Diagnostics Orchestrator → Fix / PR Agent
|
||||
- **"extraction" / "entity" / "triage"** → Extraction Task Designer → Eval Runner → Fix / PR Agent
|
||||
- **"maintenance" / "flag" / "job"** → Ops Executor
|
||||
|
||||
## Safety rules
|
||||
|
||||
- Never include raw user content in telemetry/diagnostics.
|
||||
- Diagnostics sessions must be time-bounded.
|
||||
- Mutating actions require:
|
||||
- explicit approval from dispatcher
|
||||
- audit log
|
||||
- optional dry-run
|
||||
|
||||
## Where A2A yields immediate wins in this workspace
|
||||
|
||||
- **Telemetry policy governance**
|
||||
- Planner + Reviewer pattern
|
||||
- **Remote diagnostics**
|
||||
- Orchestrator agent that monitors sessions and summarizes
|
||||
- **Prompt iteration loops** (extraction)
|
||||
- Task designer + eval runner separation
|
||||
- **Release workflows**
|
||||
- Dedicated agent for quality gates (build/test/typecheck) and a separate agent for publishing
|
||||
54
docs/MCP+A2A/DOMAIN_DASHBOARDS.md
Normal file
54
docs/MCP+A2A/DOMAIN_DASHBOARDS.md
Normal file
@ -0,0 +1,54 @@
|
||||
# Domain — Dashboards (admin-web, tracker-web, ux-lab)
|
||||
|
||||
## Admin dashboard (`dashboards/admin-web`)
|
||||
|
||||
### Existing leverage points
|
||||
|
||||
- It already centralizes many ops capabilities behind UI.
|
||||
- It already has a service client layer (`src/lib/platform-client.ts`) that talks to `platform-service`.
|
||||
|
||||
### MCP opportunities
|
||||
|
||||
- Provide “headless equivalents” of the admin UI actions via MCP tools.
|
||||
- Use MCP resources to provide the dashboard with richer contextual data:
|
||||
- module inventories
|
||||
- policy templates
|
||||
- incident runbooks
|
||||
|
||||
### A2A opportunities
|
||||
|
||||
- Build an Ops Copilot that:
|
||||
- proposes actions
|
||||
- executes via MCP tools
|
||||
- links back to the relevant admin dashboard pages
|
||||
|
||||
## Tracker dashboard (`dashboards/tracker-web`)
|
||||
|
||||
- Candidate MCP tools (illustrative names; map onto the tracker modules in platform-service such as items/votes/comments/public):
|
||||
- `tracker.listPublicItems()`
|
||||
- `tracker.submitFeedback()`
|
||||
- `tracker.vote(itemId)`
|
||||
|
||||
## UX Lab (`dashboards/ux-lab`)
|
||||
|
||||
### Existing intent
|
||||
|
||||
These micro-apps are greenfield UI experiments that intentionally avoid backend dependencies.
|
||||
|
||||
### MCP opportunity
|
||||
|
||||
- Use MCP as a **dataset generator** for local-only UX experiments.
|
||||
- Example: generate realistic telemetry events & clusters JSON.
|
||||
|
||||
### Suggested tools
|
||||
|
||||
- `dev.generateSampleTelemetryEvents(count, shape, seed)` (local-only helper)
|
||||
- `dev.generateSampleClusters(count, seed)` (local-only helper)
|
||||
|
||||
## Dashboard “component extraction” workflow
|
||||
|
||||
An A2A workflow that:
|
||||
|
||||
- identifies patterns in ux-lab (tables/filters/drawers)
|
||||
- proposes a migration plan into `@bytelyst/dashboard-components`
|
||||
- auto-generates a PR in the appropriate package
|
||||
70
docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md
Normal file
70
docs/MCP+A2A/DOMAIN_EXTRACTION_SERVICE.md
Normal file
@ -0,0 +1,70 @@
|
||||
# Domain — extraction-service (MCP + A2A Opportunities)
|
||||
|
||||
## Why extraction-service is ideal for MCP
|
||||
|
||||
It already provides:
|
||||
|
||||
- a single entrypoint (`POST /extract`, `POST /extract/batch`)
|
||||
- async extraction jobs (`/extract/jobs`)
|
||||
- model registry
|
||||
- sidecar health monitoring and circuit breaker
|
||||
- rate limits + quotas + cache
|
||||
|
||||
Agents can use MCP tools to iterate on prompts/tasks safely and repeatably.
|
||||
|
||||
## High-value MCP tool proposals
|
||||
|
||||
### Core extraction
|
||||
|
||||
- `extraction.extract(text, taskId?, modelId?, productId?)`
|
||||
- `extraction.extractBatch(inputs, modelId?)`
|
||||
|
||||
### Async jobs
|
||||
|
||||
- `extraction.submitJob(inputs, modelId?, webhookUrl?)`
|
||||
- `extraction.getJob(jobId)`
|
||||
- `extraction.listJobs()`
|
||||
|
||||
### Observability
|
||||
|
||||
- `extraction.sidecarHealth()`
|
||||
- `extraction.metrics()`
|
||||
- `extraction.cacheStats()` (backs `GET /extract/cache-stats`)
|
||||
- `extraction.sidecarMonitoringState()` (backs `GET /extract/monitoring/sidecar`)
|
||||
|
||||
### Rate limits / admin utilities
|
||||
|
||||
- `extraction.getProductRateLimitStatus(productId?)`
|
||||
- `extraction.resetProductRateLimit(productId)` (admin)
|
||||
|
||||
## Recommended MCP resources
|
||||
|
||||
- `extraction.modelRegistry`
|
||||
- `extraction.taskCatalog`
|
||||
- list task IDs used across products (triage, reflection-enrichment, memory-insight, etc.)
|
||||
- `extraction.promptGuidelines`
|
||||
|
||||
## Recommended A2A workflows
|
||||
|
||||
### 1) Task design loop
|
||||
|
||||
- **TaskDesignerAgent** drafts:
|
||||
- task prompt
|
||||
- a small set of examples
|
||||
- **EvalRunnerAgent** runs:
|
||||
- `extractBatch` over an eval set
|
||||
- compares JSON shape correctness
|
||||
- **RegressionAgent** checks:
|
||||
- no degradation vs previous baseline
|
||||
|
||||
### 2) Extraction incident response
|
||||
|
||||
- If extraction errors spike:
|
||||
- check sidecar health and circuit breaker state
|
||||
- reduce per-product rate limits
|
||||
- switch modelId (if supported)
|
||||
|
||||
## Product integration hotspots
|
||||
|
||||
- MindLyst web API routes proxy to extraction-service (`/api/extract` and triage routes).
|
||||
- Future: other products can standardize on the same tasks and use a shared task registry.
|
||||
82
docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md
Normal file
82
docs/MCP+A2A/DOMAIN_PACKAGES_AND_SDKS.md
Normal file
@ -0,0 +1,82 @@
|
||||
# Domain — Shared Packages + SDKs (MCP + A2A Opportunities)
|
||||
|
||||
## Why packages/SDKs matter for MCP/A2A
|
||||
|
||||
They define:
|
||||
|
||||
- the **portable contracts** (telemetry event schema, diagnostics types)
|
||||
- the **client integration points** (Swift/Kotlin SDKs, TS client packages)
|
||||
- reusable primitives (offline queue, platform client)
|
||||
|
||||
MCP/A2A should treat these as _the single source of truth_ for:
|
||||
|
||||
- schemas
|
||||
- naming conventions
|
||||
- safety constraints (no PII)
|
||||
|
||||
## Key packages and how to leverage them
|
||||
|
||||
### `@bytelyst/telemetry-client`
|
||||
|
||||
- Already provides a browser/RN-safe client.
|
||||
- MCP can expose resources:
|
||||
- event schema
|
||||
- recommended module/eventName conventions
|
||||
|
||||
**Opportunity:** add a companion “policy-aware client” mode that calls `GET /api/telemetry/config` and samples events according to the returned collection config.
|
||||
|
||||
**Note:** `platform-service` already exposes `GET /api/telemetry/config` (ETag-based). The remaining work is wiring the client to consume it safely.
|
||||
|
||||
### `@bytelyst/diagnostics-client`
|
||||
|
||||
- Provides session polling + capture utilities.
|
||||
|
||||
**Gap (must fix before relying on it):** the client currently flushes batches to `POST /api/diagnostics/ingest`, while `platform-service` routes are session-scoped:
|
||||
|
||||
- `POST /api/diagnostics/sessions/:id/logs`
|
||||
- `POST /api/diagnostics/sessions/:id/traces`
|
||||
- `POST /api/diagnostics/sessions/:id/screenshots` (SAS upload)
|
||||
|
||||
Decision: update `@bytelyst/diagnostics-client` to post to the session-scoped endpoints (no backwards-compat alias endpoint needed).
|
||||
|
||||
**Opportunity:** standardize how product apps integrate it (common initialization patterns and user-consent prompts).
|
||||
|
||||
### `@bytelyst/platform-client`
|
||||
|
||||
- Typed fetch wrapper with auth injection.
|
||||
|
||||
**Opportunity:** use it as the basis for a _frontend-side_ MCP client (where appropriate) and for consistent request-id propagation.
|
||||
|
||||
### `@bytelyst/offline-queue`
|
||||
|
||||
- Good candidate for A2A workflows that need reliable retries.
|
||||
|
||||
### Swift / Kotlin Platform SDKs
|
||||
|
||||
- Already provide consistent platform-service integration.
|
||||
|
||||
**Opportunity:**
|
||||
|
||||
- expose SDK version + capabilities as MCP resources
|
||||
- keep a “compatibility matrix” resource (which products use which SDK features)
|
||||
|
||||
## MCP resources recommended from packages
|
||||
|
||||
- `schemas.telemetry` (from shared types)
|
||||
- `schemas.diagnostics` (from shared types)
|
||||
- `sdk.swift.capabilities`
|
||||
- `sdk.kotlin.capabilities`
|
||||
|
||||
## A2A workflows
|
||||
|
||||
### 1) Cross-repo integration audit
|
||||
|
||||
- Identify product repos that drift from shared SDK patterns.
|
||||
- Output: a per-repo “alignment report” and suggested PRs.
|
||||
|
||||
### 2) Release impact analysis
|
||||
|
||||
- When changing a shared package:
|
||||
- agent enumerates downstream consumers
|
||||
- runs typecheck/build matrix
|
||||
- updates docs + versioning
|
||||
120
docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md
Normal file
120
docs/MCP+A2A/DOMAIN_PLATFORM_SERVICE.md
Normal file
@ -0,0 +1,120 @@
|
||||
# Domain — platform-service (MCP + A2A Opportunities)
|
||||
|
||||
## Why this domain is ideal for MCP
|
||||
|
||||
`platform-service` is already organized as a set of well-defined modules and stable REST endpoints (Fastify 5 + Zod), with:
|
||||
|
||||
- telemetry ingestion + query + policies
|
||||
- remote diagnostics sessions
|
||||
- jobs, delivery, sessions, maintenance
|
||||
- settings, flags, rate limits
|
||||
- webhooks subscriptions
|
||||
|
||||
This makes it a near-perfect backing store for MCP tools.
|
||||
|
||||
## High-value MCP tool proposals
|
||||
|
||||
### Telemetry
|
||||
|
||||
- `telemetry.queryEvents(filters)`
|
||||
- `telemetry.listClusters(filters)`
|
||||
- `telemetry.updateClusterStatus(clusterId, pk, status)`
|
||||
- `telemetry.listPolicies()`
|
||||
- `telemetry.previewPolicy(targeting)`
|
||||
- `telemetry.createPolicy(input)`
|
||||
- `telemetry.updatePolicy(id, updates)`
|
||||
- `telemetry.deletePolicy(id)`
|
||||
- `telemetry.getMetrics()`
|
||||
- `telemetry.getGeoDistribution(from?, to?)`
|
||||
|
||||
**A2A use**
|
||||
|
||||
- Telemetry Analyst Agent automates:
|
||||
- time window selection
|
||||
- cluster-to-user drilldown
|
||||
- policy suggestions (targeting + expiry)
|
||||
|
||||
### Remote diagnostics
|
||||
|
||||
- `diagnostics.createSession(input)`
|
||||
- `diagnostics.listSessions(filters)`
|
||||
- `diagnostics.getSession(id)`
|
||||
- `diagnostics.updateSession(id, updates)`
|
||||
- `diagnostics.cancelSession(id)`
|
||||
- `diagnostics.getLogs(sessionId, filters)`
|
||||
- `diagnostics.getTraces(sessionId, filters)`
|
||||
- `diagnostics.listScreenshots(sessionId)`
|
||||
|
||||
**Target identifiers (schema-aligned):** `targetUserId`, `targetAnonymousId`, `targetDeviceId`.
|
||||
|
||||
**A2A use**
|
||||
|
||||
- Diagnostics Orchestrator Agent monitors session lifecycle and compiles summaries.
|
||||
|
||||
### Jobs
|
||||
|
||||
The current module supports listing jobs, updating a job, triggering a job, and listing a job's runs.
|
||||
|
||||
- `jobs.list()`
|
||||
- `jobs.get(id)`
|
||||
- `jobs.update(id, updates)`
|
||||
- `jobs.trigger(jobName)`
|
||||
- `jobs.listRuns(jobName, limit)`
|
||||
|
||||
**Note**
|
||||
|
||||
- Current `jobs/routes.ts` uses `DEFAULT_PRODUCT_ID = 'lysnrai'`. For MCP, prefer explicit productId routing.
|
||||
|
||||
### Settings / kill switch
|
||||
|
||||
- `settings.get(userId)`
|
||||
- `settings.update(userId, patch)`
|
||||
- `settings.getDeviceResolved(userId, deviceId)`
|
||||
- `settings.setDeviceOverrides(userId, deviceId, overrides)`
|
||||
- `settings.clearDeviceOverrides(userId, deviceId)`
|
||||
- `settings.checkKillSwitch(productId)`
|
||||
|
||||
### Flags
|
||||
|
||||
- `flags.list(productId)`
|
||||
- `flags.get(key, productId)`
|
||||
- `flags.upsert(key, enabled, targeting, description)`
|
||||
|
||||
### Maintenance
|
||||
|
||||
- `maintenance.getCurrent(productId)`
|
||||
- `maintenance.set(productId, mode, bypassRules, windows)`
|
||||
|
||||
### Webhooks
|
||||
|
||||
- `webhooks.listSubscriptions(productId)`
|
||||
- `webhooks.createSubscription(input)`
|
||||
- `webhooks.updateSubscription(id, productId, updates)`
|
||||
- `webhooks.deleteSubscription(id, productId)`
|
||||
- `webhooks.listDeliveries(subscriptionId, limit)`
|
||||
- `webhooks.test(subscriptionId)`
|
||||
- `webhooks.rotateSecret(subscriptionId)`
|
||||
|
||||
## Recommended MCP resources
|
||||
|
||||
- `platform-service.modules`
|
||||
- enumerates module names, key endpoints, auth requirements
|
||||
- `telemetry.eventSchema`
|
||||
- `diagnostics.sessionSchema`
|
||||
|
||||
## Recommended A2A workflows backed by platform-service
|
||||
|
||||
### 1) Support debug pack
|
||||
|
||||
- input: user report
|
||||
- output: timeline + clusters + recommended actions + (optional) diagnostics session results
|
||||
|
||||
### 2) Canary rollout of additional telemetry
|
||||
|
||||
- build policy → preview → create with expiry → monitor cluster changes
|
||||
|
||||
### 3) Post-incident cleanup
|
||||
|
||||
- resolve clusters
|
||||
- remove policies
|
||||
- export audit log for the incident window
|
||||
80
docs/MCP+A2A/DOMAIN_PRODUCTS.md
Normal file
80
docs/MCP+A2A/DOMAIN_PRODUCTS.md
Normal file
@ -0,0 +1,80 @@
|
||||
# Domain — Product Repos (MCP + A2A Opportunities)
|
||||
|
||||
This document captures product-specific “where MCP/A2A helps” patterns, without duplicating each product’s full architecture docs.
|
||||
|
||||
## Cross-product recurring needs
|
||||
|
||||
- Debugging: telemetry clusters + remote diagnostics sessions
|
||||
- Platform controls: kill switch, feature flags, maintenance
|
||||
- Content intelligence: extraction tasks
|
||||
- Release readiness: build/test/typecheck workflows
|
||||
|
||||
## ChronoMind (`learning_ai_clock`)
|
||||
|
||||
- **Opportunities**
|
||||
- Telemetry-driven quality tracking for timer engine + NL parsing.
|
||||
- A2A “routine regression” agent: detect changes that affect scheduling.
|
||||
- **MCP hooks**
|
||||
- telemetry tools for PWA
|
||||
- platform-service jobs/webhooks for timer sharing integrations
|
||||
|
||||
## NomGap (`learning_ai_fastgap`)
|
||||
|
||||
- **Opportunities**
|
||||
- React Native offline-first flows map well onto offline queue + platform-client.
|
||||
- A2A “protocol tuning” agent: uses telemetry + extraction to correlate adherence patterns.
|
||||
- **MCP hooks**
|
||||
- telemetry + kill switch clients already exist; MCP can standardize their usage.
|
||||
|
||||
## PeakPulse (`learning_ai_peakpulse`)
|
||||
|
||||
- **Opportunities**
|
||||
- Sync reliability: a diagnostics session targeted at a user’s device can capture network failures.
|
||||
- A2A “safety alerts correctness” agent using telemetry to validate thresholds.
|
||||
- **MCP hooks**
|
||||
- platform-service telemetry/diagnostics
|
||||
- product backend endpoints for session uploads (via sync engine)
|
||||
|
||||
## MindLyst (`learning_multimodal_memory_agents`)
|
||||
|
||||
- **Opportunities**
|
||||
- Extraction-service is core to triage and insight enrichment; prompt/task iteration loop is high ROI.
|
||||
- A2A “triage regression” agent that runs eval suites.
|
||||
- **MCP hooks**
|
||||
- extraction tools
|
||||
- telemetry for web + native apps
|
||||
|
||||
## JarvisJr (`learning_ai_jarvis_jr`)
|
||||
|
||||
- **Opportunities**
|
||||
- Multi-agent product: A2A patterns can be applied to its own internal coaching “crew”.
|
||||
- Marketplace + certification workflows can be agent-automated.
|
||||
- **MCP hooks**
|
||||
- platform-service for auth/telemetry
|
||||
- product backend for marketplace modules
|
||||
|
||||
## LysnrAI (`learning_voice_ai_agent`)
|
||||
|
||||
- **Opportunities**
|
||||
- Support/debug workflows are already telemetry-heavy (keyboard + desktop).
|
||||
- A2A “keyboard bug triage” agent that starts diagnostics sessions + drafts fixes.
|
||||
- **MCP hooks**
|
||||
- platform-service telemetry/diagnostics
|
||||
- admin dashboard tooling
|
||||
|
||||
## Product-specific MCP server (optional)
|
||||
|
||||
For each product, you can optionally add a small MCP namespace that calls the product backend (`backend/` in each repo) for domain actions.
|
||||
|
||||
Examples:
|
||||
|
||||
- ChronoMind: `timers.list/create`, `routines.run/validate`
|
||||
- PeakPulse: `sessions.upload`, `routes.export`
|
||||
- JarvisJr: `agents.list/publish`, `marketplace.certify`
|
||||
|
||||
## Recommended first A2A workflows to ship per product
|
||||
|
||||
- **All products**: Support Debug Pack (telemetry + diagnostics)
|
||||
- **MindLyst**: Extraction task design + eval loop
|
||||
- **JarvisJr**: Marketplace certification assistant
|
||||
- **NomGap**: Offline queue flush assistant / sync reliability assistant
|
||||
59
docs/MCP+A2A/EXECUTION_CHECKLIST.md
Normal file
59
docs/MCP+A2A/EXECUTION_CHECKLIST.md
Normal file
@ -0,0 +1,59 @@
|
||||
# MCP + A2A — Execution Checklist
|
||||
|
||||
This is the “ready to start building” checklist that turns the docs in this folder into an executable plan.
|
||||
|
||||
## 1) Decisions to make (30–60 minutes)
|
||||
|
||||
- **MCP server placement**
|
||||
- Recommended default: create a new service/package under `learning_ai_common_plat` (not colocated inside `platform-service`) to keep runtime concerns separated.
|
||||
- **Integration mode**
|
||||
- Recommended default: REST-only calls to `platform-service` and `extraction-service` for Phase 1.
|
||||
- Defer direct Cosmos reads until you have a clear perf/cost need.
|
||||
- **Auth strategy**
|
||||
- Recommended default: platform-service JWT for interactive use; platform API tokens only for trusted automation.
|
||||
- **Where to store A2A handoffs**
|
||||
- Recommended default: in Phase 1, store handoffs as telemetry events + structured logs; in Phase 2, introduce a dedicated Cosmos container if you need queryability.
|
||||
|
||||
## 2) Must-fix dependency before MVP
|
||||
|
||||
- **Diagnostics client/server route mismatch**
|
||||
- `platform-service` ingests via session-scoped endpoints:
|
||||
- `POST /api/diagnostics/sessions/:id/logs`
|
||||
- `POST /api/diagnostics/sessions/:id/traces`
|
||||
- screenshots via session-scoped SAS upload
|
||||
- `@bytelyst/diagnostics-client` currently flushes to `POST /api/diagnostics/ingest`.
|
||||
|
||||
Resolution (update the client rather than adding a server-side alias):
|
||||
|
||||
- Decision: update `@bytelyst/diagnostics-client` to post to session-scoped endpoints. No backwards-compatible `POST /api/diagnostics/ingest` alias endpoint.
|
||||
|
||||
## 3) Phase 1 build steps (P0 slice)
|
||||
|
||||
- **Implement MCP tool namespaces**
|
||||
- `platform.telemetry.*`
|
||||
- `platform.diagnostics.*`
|
||||
- `extraction.*`
|
||||
- **Enforce hard guardrails in MCP layer**
|
||||
- `productId` required and forwarded as `x-product-id`
|
||||
- `x-request-id` required and propagated
|
||||
- default query caps + max caps
|
||||
- expiry required for any “amplification” (telemetry policy, diagnostics session)
|
||||
- role gating (viewer/admin/super_admin)
|
||||
- **Ship one compound tool**
|
||||
- `support.createDebugPack(...)`
|
||||
|
||||
## 4) Phase 1 definition of done
|
||||
|
||||
- Read-only tools work end-to-end against real services.
|
||||
- Mutating tools are role-gated and generate audit trails.
|
||||
- The compound debug pack produces a single structured artifact with:
|
||||
- telemetry cluster references
|
||||
- optional diagnostics session reference
|
||||
- a short markdown summary
|
||||
|
||||
## 5) Phase 2+ quick sanity checks
|
||||
|
||||
- If you make telemetry policy-aware clients:
|
||||
- ensure `GET /api/telemetry/config` consumption is cached (ETag) and privacy-safe.
|
||||
- If you add direct Cosmos reads:
|
||||
- enforce product scoping at query layer and add explicit auditing for sensitive reads.
|
||||
261
docs/MCP+A2A/IMPLEMENTATION_PLAN.md
Normal file
261
docs/MCP+A2A/IMPLEMENTATION_PLAN.md
Normal file
@ -0,0 +1,261 @@
|
||||
# MCP + A2A — Implementation Plan (Execution-Ready)
|
||||
|
||||
## Objective
|
||||
|
||||
Deliver a **safe, auditable, product-aware** MCP + A2A capability layer on top of existing ByteLyst services (primarily `platform-service` and `extraction-service`) so that agents can:
|
||||
|
||||
- diagnose incidents (telemetry + diagnostics)
|
||||
- manage telemetry policies / rollouts
|
||||
- orchestrate remote diagnostics sessions
|
||||
- iterate on extraction tasks/prompts with eval loops
|
||||
- run repeatable ops workflows (jobs, flags, maintenance)
|
||||
|
||||
This plan intentionally starts with a **minimal P0 slice** and expands in phases.
|
||||
|
||||
## Guiding constraints (must-haves)
|
||||
|
||||
- **Product isolation**
|
||||
- Every tool call is scoped to an explicit `productId`.
|
||||
- **Auditability**
|
||||
- Every mutating tool call must produce an audit record (directly or via existing APIs).
|
||||
- **Least privilege**
|
||||
- Query tools available to viewer roles; mutating tools gated to admin/super_admin.
|
||||
- **Safety defaults**
|
||||
- Mutations support `dryRun` where feasible.
|
||||
- Diagnostic amplification (policies/sessions) must require an explicit expiry (`expiresAt`, or `expiresInMinutes` for diagnostics sessions).
|
||||
- **No new “shadow APIs”**
|
||||
- Prefer calling existing service endpoints; only add endpoints if the tool surface cannot be expressed otherwise.
|
||||
|
||||
## Phase 0 — Baseline readiness (1–2 days)
|
||||
|
||||
### Deliverables
|
||||
|
||||
- A “tool surface” inventory mapped to existing REST endpoints and required headers.
|
||||
- A role matrix for tool authorization.
|
||||
- A request-id propagation and logging standard for MCP tool calls.
|
||||
|
||||
### Phase 0 checklist (definition of done)
|
||||
|
||||
- Confirm whether MCP tool implementations will call **REST only** (preferred) or allow **direct Cosmos reads** for selected query paths.
|
||||
- Choose whether MCP is:
|
||||
- a new service/package in `learning_ai_common_plat`, or
|
||||
- colocated under `services/platform-service`.
|
||||
- Define the initial auth strategy:
|
||||
- JWT only, or
|
||||
- JWT + API tokens for automation.
|
||||
- Define tool-level authorization rules (viewer/admin/super_admin) and how they’re enforced.
|
||||
- Agree on a consistent product scoping rule:
|
||||
- `productId` is mandatory input for every tool call and sent as `x-product-id` downstream.
|
||||
|
||||
### Decisions to lock
|
||||
|
||||
- **MCP server shape**
|
||||
- Single server with namespaces (`platform.*`, `extraction.*`) vs. two servers.
|
||||
- **Auth mechanism**
|
||||
- Primary: platform-service JWT.
|
||||
- Secondary: platform API tokens for trusted automation.
|
||||
|
||||
### Required invariants
|
||||
|
||||
- `x-request-id` propagated on all downstream calls.
|
||||
- `x-product-id` required and validated for all calls.
|
||||
|
||||
## Phase 1 — MVP MCP server (P0 slice) (3–7 days)
|
||||
|
||||
### Goal
|
||||
|
||||
Enable a Support/Ops agent to answer: _“What’s happening?”_ and _“Start a bounded diagnostics session.”_
|
||||
|
||||
### Tool surface (MVP)
|
||||
|
||||
#### Read-only (viewer)
|
||||
|
||||
- `telemetry.queryEvents(filters)`
|
||||
- `telemetry.listClusters(filters)`
|
||||
- `telemetry.listPolicies()`
|
||||
- `telemetry.getMetrics()`
|
||||
- `diagnostics.getSession(sessionId)`
|
||||
- `diagnostics.listSessions(filters)`
|
||||
- `diagnostics.getLogs(sessionId, filters)`
|
||||
- `diagnostics.getTraces(sessionId, filters)`
|
||||
- `extraction.sidecarHealth()`
|
||||
|
||||
### Phase 1 prerequisites (to avoid hidden integration failures)
|
||||
|
||||
- Align `@bytelyst/diagnostics-client` ingest endpoint with `platform-service`.
|
||||
- Today the service ingests via `POST /api/diagnostics/sessions/:id/logs|traces` (and session-scoped screenshot upload), while the client posts to `POST /api/diagnostics/ingest`.
|
||||
- Decision: update the client to use the session-scoped endpoints (no backwards-compat alias endpoint).
|
||||
- Decide whether `@bytelyst/telemetry-client` should become policy-aware by consuming `GET /api/telemetry/config`.
|
||||
- If yes, treat it as a Phase 1 deliverable (with caching + ETag). Otherwise, explicitly defer to Phase 2.
|
||||
|
||||
#### Mutating (admin)
|
||||
|
||||
- `telemetry.previewPolicy(targeting)`
|
||||
- `telemetry.createPolicy(input)`
|
||||
- requires `expiresAt`
|
||||
- `telemetry.updatePolicy(id, updates)`
|
||||
- `telemetry.updateClusterStatus(clusterId, pk, status)`
|
||||
- `diagnostics.createSession(target, config)`
|
||||
- requires `expiresAt` or `expiresInMinutes`
|
||||
- `diagnostics.updateSession(sessionId, updates)`
|
||||
- `diagnostics.cancelSession(sessionId)`
|
||||
|
||||
#### Compound workflow tool (admin)
|
||||
|
||||
- `support.createDebugPack(input)`
|
||||
- internally calls:
|
||||
- telemetry queries
|
||||
- optional diagnostics session create + polling
|
||||
- returns a single structured artifact:
|
||||
- `debugPackId`, `clusterRefs`, `sessionRefs`, and a markdown summary
|
||||
|
||||
### Output contracts (schemas)
|
||||
|
||||
Define explicit JSON schemas (Zod or equivalent) for:
|
||||
|
||||
- `TelemetryFilters`
|
||||
- `TelemetryPolicyInput` (requires expiry)
|
||||
- `DiagnosticsSessionTarget`
|
||||
- `DiagnosticsSessionConfig`
|
||||
- `DebugPackRequest`
|
||||
- `DebugPackResponse`
|
||||
|
||||
### Guardrails
|
||||
|
||||
- Query limits:
|
||||
- default `limit` and max `limit` enforced at MCP layer.
|
||||
- Policy guardrails:
|
||||
- require `expiresAt`
|
||||
- require explicit `eventTypes/modules`
|
||||
- block wildcard collection unless super_admin
|
||||
- Diagnostics guardrails:
|
||||
- enforce max duration
|
||||
- enforce max capture volume per flush
|
||||
|
||||
### Acceptance criteria
|
||||
|
||||
- A single agent can:
|
||||
- pull clusters for a product + time window
|
||||
- propose a telemetry policy and preview targeting
|
||||
- create an expiring policy
|
||||
- start a diagnostics session and retrieve data
|
||||
- generate a “Debug Pack” artifact
|
||||
|
||||
### Phase 1 engineering checklist (definition of done)
|
||||
|
||||
- MCP layer enforces:
|
||||
- request-id propagation (`x-request-id`)
|
||||
- required product scoping (`productId`)
|
||||
- default query caps and maximum caps
|
||||
- expiry requirements for policies/sessions
|
||||
- Every mutating tool call produces an audit record, either:
|
||||
- by calling existing audit endpoints, or
|
||||
- by ensuring the underlying platform-service endpoint already records an audit entry
|
||||
- Tool names and inputs are documented and stable (no breaking renames during Phase 1)
|
||||
|
||||
## Phase 2 — A2A orchestration (1–2 weeks)
|
||||
|
||||
### Goal
|
||||
|
||||
Turn multi-step support/ops workflows into **repeatable agent playbooks** with explicit handoffs.
|
||||
|
||||
### Standard agents
|
||||
|
||||
- **DispatcherAgent**
|
||||
- **TelemetryAnalystAgent**
|
||||
- **DiagnosticsOrchestratorAgent**
|
||||
- **OpsExecutorAgent**
|
||||
- **ReportWriterAgent**
|
||||
|
||||
### Handoff artifacts
|
||||
|
||||
- `SupportIncidentBrief`
|
||||
- `TelemetryFindings`
|
||||
- `DiagnosticsSessionPlan`
|
||||
- `OpsChangePlan`
|
||||
- `FinalIncidentReport`
|
||||
|
||||
### Acceptance criteria
|
||||
|
||||
- “Support Debug Pack” runs end-to-end via:
|
||||
- Dispatcher → Telemetry Analyst → Diagnostics Orchestrator → Report Writer
|
||||
- Every handoff is persisted (even if only in logs initially) with stable IDs.
|
||||
|
||||
## Phase 3 — Extraction task iteration loop (1–3 weeks)
|
||||
|
||||
### Goal
|
||||
|
||||
Make extraction prompt/task improvements safe, testable, and regression-resistant.
|
||||
|
||||
### MCP tools
|
||||
|
||||
- `extraction.extract(text, taskId?, modelId?)`
|
||||
- `extraction.extractBatch(inputs)`
|
||||
- `extraction.submitJob(inputs, webhookUrl?)`
|
||||
- `extraction.getJob(jobId)`
|
||||
- `extraction.metrics()` / `extraction.cacheStats()`
|
||||
|
||||
### A2A workflow
|
||||
|
||||
- **TaskDesignerAgent** drafts task prompt + examples
|
||||
- **EvalRunnerAgent** runs batch eval sets
|
||||
- **RegressionAgent** compares to baseline
|
||||
- **PublisherAgent** updates task registry + rollout
|
||||
|
||||
### Acceptance criteria
|
||||
|
||||
- A single command/workflow can:
|
||||
- run eval suite
|
||||
- compute simple quality metrics (schema validity, required fields coverage)
|
||||
- produce a report and recommended next edit
|
||||
|
||||
## Phase 4 — Ops expansion (jobs/flags/maintenance/webhooks) (1–2 weeks)
|
||||
|
||||
### Tools
|
||||
|
||||
- `jobs.list`, `jobs.trigger`, `jobs.listRuns`
|
||||
- `flags.list`, `flags.upsert`, `flags.evaluate`
|
||||
- `maintenance.getCurrent`, `maintenance.set`
|
||||
- `webhooks.listSubscriptions`, `webhooks.test`, `webhooks.rotateSecret`
|
||||
|
||||
### Acceptance criteria
|
||||
|
||||
- “Ops Copilot” can safely execute a bounded change plan:
|
||||
- propose change
|
||||
- dry-run if supported
|
||||
- execute with audit
|
||||
- verify outcome
|
||||
|
||||
## Security & privacy checklist
|
||||
|
||||
- Explicit `productId` on every tool call
|
||||
- Avoid returning raw PII in tool results
|
||||
- Ensure diagnostics redaction remains enforced server-side
|
||||
- Enforce expirations on policies and sessions
|
||||
- Rate limit MCP server endpoints
|
||||
|
||||
## Rollout strategy
|
||||
|
||||
- Start with internal-only usage (super_admin).
|
||||
- Add admin roles once guardrails are proven.
|
||||
- Add viewer read-only access for broader teams.
|
||||
- Add product-specific namespaces only after platform namespaces stabilize.
|
||||
|
||||
## Work breakdown (suggested)
|
||||
|
||||
- **Milestone A**: MVP MCP server + telemetry/diagnostics read-only
|
||||
- **Milestone B**: mutating tools + dry-run/expiry enforcement
|
||||
- **Milestone C**: `support.createDebugPack` compound tool
|
||||
- **Milestone D**: A2A runner + handoff schemas
|
||||
- **Milestone E**: extraction eval loop
|
||||
|
||||
## Open questions (need decisions)
|
||||
|
||||
- Should the MCP server call services via:
|
||||
- service REST endpoints only, or
|
||||
- direct Cosmos reads for some query paths?
|
||||
- Where should A2A handoff artifacts be stored:
|
||||
- telemetry events,
|
||||
- a dedicated Cosmos container,
|
||||
- or both?
|
||||
- Do we want the MCP server to live in its own repo/package, or should it be colocated under `platform-service`?
|
||||
155
docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md
Normal file
155
docs/MCP+A2A/MCP_SERVER_FRAMEWORK.md
Normal file
@ -0,0 +1,155 @@
|
||||
# MCP Server Framework — Recommended Architecture (ByteLyst)
|
||||
|
||||
## Why an MCP server here
|
||||
|
||||
This workspace already has a clear separation of concerns:
|
||||
|
||||
- **Authoritative services** (Fastify): `platform-service`, `extraction-service`, plus product backends.
|
||||
- **Dashboards** (Next.js): admin + tracker.
|
||||
- **Client SDKs**: Swift/Kotlin platform SDKs + TS client packages.
|
||||
|
||||
An MCP server becomes the **single programmatic gateway** that agents can call to:
|
||||
|
||||
- query/act on platform state
|
||||
- assemble debugging evidence
|
||||
- run repeatable ops workflows
|
||||
- safely orchestrate A2A agents
|
||||
|
||||
## Core design constraints
|
||||
|
||||
- **Do not bypass service invariants**
|
||||
- Prefer calling service endpoints or repositories with the same validation (Zod) and auth.
|
||||
- **Auditability**
|
||||
- Every mutating tool should emit audit logs (or call APIs that already do).
|
||||
- **Least privilege**
|
||||
- Split tools by role (viewer/admin/super_admin).
|
||||
- **Product isolation**
|
||||
- All tools/resources must explicitly bind to `productId`.
|
||||
|
||||
## Reality check: what exists today
|
||||
|
||||
- `platform-service` already exposes:
|
||||
- `GET /api/telemetry/config` (ETag-based client collection config)
|
||||
- `GET /api/telemetry/query`, `GET /api/telemetry/clusters`, policies CRUD (admin)
|
||||
- diagnostics session CRUD + `GET /api/diagnostics/sessions/:id/logs|traces|screenshots` (admin)
|
||||
- `extraction-service` already exposes:
|
||||
- `/extract`, `/extract/batch`, `/extract/jobs`, sidecar health, metrics, cache stats
|
||||
|
||||
The primary “new work” for MCP is orchestration, safety gating, and consistent auth/audit — not inventing new primitives.
|
||||
|
||||
## Proposed MCP servers (2-tier)
|
||||
|
||||
### 1) `bytelyst-platform-mcp` (primary)
|
||||
|
||||
Backed by `platform-service` (port 4003) and optionally Cosmos for direct reads.
|
||||
|
||||
- **Responsibilities**
|
||||
- Telemetry querying + policy management
|
||||
- Remote diagnostics sessions orchestration
|
||||
- Jobs trigger/list
|
||||
- Flags/settings/maintenance
|
||||
- Webhooks + delivery logs
|
||||
- Audit query
|
||||
|
||||
### 2) `bytelyst-extraction-mcp` (specialized)
|
||||
|
||||
Backed by `extraction-service` (port 4005).
|
||||
|
||||
- **Responsibilities**
|
||||
- Extract / batch extract
|
||||
- Submit and monitor async extraction jobs
|
||||
- Sidecar health + circuit breaker insight
|
||||
- Metrics + cache stats
|
||||
|
||||
(Optionally, these can be a single MCP server with two namespaces.)
|
||||
|
||||
## Tool taxonomy
|
||||
|
||||
### A) Read-only tools
|
||||
|
||||
- `telemetry.queryEvents`
|
||||
- `telemetry.listClusters`
|
||||
- `telemetry.getMetrics`
|
||||
- `diagnostics.getSession`
|
||||
- `diagnostics.getLogs/getTraces`
|
||||
- `jobs.list/listRuns`
|
||||
- `flags.list`
|
||||
- `settings.get`
|
||||
- `webhooks.listSubscriptions/listDeliveries`
|
||||
- `extraction.metrics/cacheStats/sidecarHealth`
|
||||
|
||||
### B) Mutating tools (require elevated role)
|
||||
|
||||
- `telemetry.createPolicy/updatePolicy/deletePolicy`
|
||||
- `telemetry.updateClusterStatus`
|
||||
- `diagnostics.createSession/updateSession/cancelSession`
|
||||
- `jobs.trigger`
|
||||
- `maintenance.set`
|
||||
- `flags.set` (or flag upserts)
|
||||
- `webhooks.rotateSecret` / `webhooks.test`
|
||||
- `extraction.rateLimitReset` (if you keep that admin endpoint)
|
||||
|
||||
### C) Compound tools (“one tool = one workflow”)
|
||||
|
||||
- `support.createDebugPack(reportInput)`
|
||||
- pulls telemetry timeline + cluster context
|
||||
- optionally starts diagnostics session
|
||||
- returns a single structured artifact (markdown/json)
|
||||
|
||||
This reduces prompt fragility vs. requiring the LLM to call 8 tools in the right order.
|
||||
|
||||
## MCP resources
|
||||
|
||||
Resources should be stable references agents can read repeatedly:
|
||||
|
||||
- `platform.modules.index`
|
||||
- module list + base routes + required headers
|
||||
- `telemetry.schema`
|
||||
- `diagnostics.schema`
|
||||
- `extraction.tasks.catalog`
|
||||
- `ops.runbooks`
|
||||
- e.g. “how to debug iOS keyboard insert_noop”
|
||||
- `product.identity`
|
||||
- productId, plan tiers, allowed baseUrls
|
||||
|
||||
## Prompts (MCP prompt templates)
|
||||
|
||||
- `prompt.support_triage`
|
||||
- `prompt.telemetry_policy_proposal`
|
||||
- `prompt.remote_diagnostics_session_plan`
|
||||
- `prompt.extraction_task_design`
|
||||
|
||||
## Authentication & authorization
|
||||
|
||||
- **Primary**: platform-service JWT (same `verifyToken` logic).
|
||||
- **Secondary**: service-to-service API tokens (only for trusted automation).
|
||||
- **Tool gating**
|
||||
- viewer: query-only
|
||||
- admin: policy updates, create diagnostics sessions
|
||||
- super_admin: secret rotation, maintenance, destructive operations
|
||||
|
||||
## Observability for the MCP server
|
||||
|
||||
- Use structured logs (Fastify/pino style) and propagate `x-request-id`.
|
||||
- Record tool invocation metrics into `telemetry` using the `backend_service` channel:
|
||||
- module: `mcp`
|
||||
- eventName: `tool_invoked`, `tool_failed`, `a2a_handoff`
|
||||
|
||||
## Safe defaults / guardrails
|
||||
|
||||
- Any mutating tool should support a `dryRun: true` mode.
|
||||
- Enforce `expiresAt` on any “diagnostic collection amplification” (telemetry policy, diagnostics session).
|
||||
- Cap queries by default (limit/pageSize); callers must explicitly pass a higher `limit` to exceed the default cap.
|
||||
|
||||
## Known integration risk (fix early)
|
||||
|
||||
- `@bytelyst/diagnostics-client` currently flushes to `POST /api/diagnostics/ingest`, while `platform-service` ingests via session-scoped endpoints.
|
||||
- Resolve this mismatch before using diagnostics tooling as a core MCP/A2A workflow dependency.
|
||||
- Decision: update `@bytelyst/diagnostics-client` to post to the session-scoped endpoints `POST /api/diagnostics/sessions/:id/logs` and `POST /api/diagnostics/sessions/:id/traces`.
|
||||
|
||||
## Suggested initial tool surface (minimal viable)
|
||||
|
||||
- `telemetry.queryEvents`, `telemetry.listClusters`, `telemetry.listPolicies`, `telemetry.previewPolicy`, `telemetry.createPolicy`
|
||||
- `diagnostics.createSession`, `diagnostics.getSession`, `diagnostics.getLogs`, `diagnostics.getTraces`
|
||||
- `extraction.extract`, `extraction.extractBatch`, `extraction.sidecarHealth`
|
||||
- `jobs.list`, `jobs.trigger`
|
||||
35
docs/MCP+A2A/README.md
Normal file
35
docs/MCP+A2A/README.md
Normal file
@ -0,0 +1,35 @@
|
||||
# MCP + A2A — Workspace Initiative
|
||||
|
||||
This folder contains a workspace-wide scan of the ByteLyst ecosystem (platform + shared packages + dashboards + product repos) for opportunities to leverage:
|
||||
|
||||
- MCP (Model Context Protocol)
|
||||
- A2A (agent-to-agent) patterns
|
||||
|
||||
## Documents
|
||||
|
||||
- `WORKSPACE_USE_CASE_CATALOG.md`
|
||||
- `MCP_SERVER_FRAMEWORK.md`
|
||||
- `A2A_ORCHESTRATION_FRAMEWORK.md`
|
||||
- `IMPLEMENTATION_PLAN.md`
|
||||
- `DOMAIN_PLATFORM_SERVICE.md`
|
||||
- `DOMAIN_EXTRACTION_SERVICE.md`
|
||||
- `DOMAIN_DASHBOARDS.md`
|
||||
- `DOMAIN_PACKAGES_AND_SDKS.md`
|
||||
- `DOMAIN_PRODUCTS.md`
|
||||
|
||||
## Scope of scan
|
||||
|
||||
Primary sources used:
|
||||
|
||||
- `services/platform-service` (telemetry, diagnostics, jobs, settings, webhooks, auth, etc.)
|
||||
- `services/extraction-service` (sidecar + tasks + async jobs + rate limits)
|
||||
- `packages/*` (Swift/Kotlin platform SDKs, TS clients, event bus, telemetry/diagnostics clients)
|
||||
- `dashboards/admin-web`, `dashboards/tracker-web`, `dashboards/ux-lab`
|
||||
- Product repos: ChronoMind (`learning_ai_clock`), NomGap (`learning_ai_fastgap`), PeakPulse (`learning_ai_peakpulse`), LysnrAI (`learning_voice_ai_agent`), MindLyst (`learning_multimodal_memory_agents`), JarvisJr (`learning_ai_jarvis_jr`)
|
||||
|
||||
## Notation
|
||||
|
||||
- **Tool**: an MCP tool callable by an LLM/agent.
|
||||
- **Resource**: an MCP resource (read-only or read-mostly) exposed for context.
|
||||
- **Prompt**: an MCP prompt template.
|
||||
- **Agent**: an A2A-capable worker with a specific responsibility.
|
||||
173
docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md
Normal file
173
docs/MCP+A2A/WORKSPACE_USE_CASE_CATALOG.md
Normal file
@ -0,0 +1,173 @@
|
||||
# WORKSPACE — MCP + A2A Use-Case Catalog
|
||||
|
||||
## Goals
|
||||
|
||||
- Identify **high-leverage** MCP tools/resources and A2A agent workflows across the ByteLyst workspace.
|
||||
- Provide a **prioritized backlog** that maps cleanly onto existing services:
|
||||
- `platform-service` (telemetry, diagnostics, jobs, settings, auth, webhooks, etc.)
|
||||
- `extraction-service` (structured extraction, async jobs)
|
||||
- dashboards (admin/tracker)
|
||||
- shared packages + mobile SDKs
|
||||
- product repos (ChronoMind, NomGap, PeakPulse, MindLyst, JarvisJr, LysnrAI)
|
||||
|
||||
## What “MCP” and “A2A” mean in this workspace
|
||||
|
||||
- **MCP server**: exposes authoritative product/platform capabilities as tools/resources.
|
||||
- Tools should map to stable APIs (prefer calling service endpoints / repositories rather than scraping UI).
|
||||
- Resources should expose read-mostly context: schemas, runbooks, current status, recent incidents, etc.
|
||||
- **A2A**: multiple specialized agents collaborating via explicit contracts.
|
||||
- Example: _Support agent_ → _Triage agent_ → _Diagnostics agent_ → _Fix agent_ → _Release agent_.
|
||||
|
||||
## Reality check: “exists today” vs “proposed tool surface”
|
||||
|
||||
- **Backed by existing services today**
|
||||
- Telemetry query/clusters/policies (platform-service)
|
||||
- Telemetry client config (`GET /api/telemetry/config`, ETag-based)
|
||||
- Diagnostics sessions + ingest (platform-service)
|
||||
- Extraction extract/batch/jobs/health (extraction-service)
|
||||
- Jobs/flags/settings/webhooks endpoints (platform-service)
|
||||
- **Proposed tools that may require new endpoints or privileged integrations**
|
||||
- `secrets.rotate(...)`, `services.restart(...)` (depends on Key Vault/infra integration)
|
||||
- UX-lab dataset generators (`dev.generateTelemetryDataset(...)`) (local-only helper tooling)
|
||||
- Compound tools like `support.createDebugPack(...)` (implemented in MCP layer as orchestration)
|
||||
|
||||
## Prioritization rubric
|
||||
|
||||
- **P0**: removes constant manual toil; immediate engineering impact.
|
||||
- **P1**: unlocks new ops / reliability capabilities.
|
||||
- **P2**: enables new product intelligence / automations.
|
||||
- **P3**: experimental / UX labs.
|
||||
|
||||
## P0 — Highest leverage (do first)
|
||||
|
||||
### 1) Incident / Support “one-click evidence” pack (Telemetry + Diagnostics)
|
||||
|
||||
- **Why**
|
||||
- You already have rich primitives: `telemetry` (clusters + policies) and `diagnostics` (remote sessions).
|
||||
- The missing piece is automated _assembly_ of a support-ready packet.
|
||||
- **MCP tools**
|
||||
- `telemetry.queryEvents(filters)`
|
||||
- `telemetry.listClusters(filters)`
|
||||
- `telemetry.updateClusterStatus(clusterId, pk, status)`
|
||||
- `diagnostics.createSession(targetUserId|targetAnonymousId|targetDeviceId, config)`
|
||||
- `diagnostics.getSession(sessionId)`
|
||||
- `diagnostics.getLogs(sessionId, filters)`
|
||||
- `diagnostics.getTraces(sessionId, filters)`
|
||||
- `diagnostics.listScreenshots(sessionId)`
|
||||
- **A2A agents**
|
||||
- **SupportTriageAgent**: extracts identifiers from the user report (email/userId/anonymousInstallId, appVersion). (`anonymousInstallId` maps to `targetAnonymousId` for diagnostics.)
|
||||
- **TelemetryAnalystAgent**: pulls clusters + timelines; proposes policy changes.
|
||||
- **DiagnosticsOrchestratorAgent**: starts a debug session when needed; monitors results.
|
||||
- **ReportWriterAgent**: produces a “Debug Pack” markdown/PDF.
|
||||
|
||||
### 2) Telemetry policy authoring assistant (safe-by-default)
|
||||
|
||||
- **Why**
|
||||
- Policies are powerful but make it easy to over-collect. The assistant can:
|
||||
- suggest targeting
|
||||
- set auto-expiry
|
||||
- choose sampling
|
||||
- **MCP tools**
|
||||
- `telemetry.listPolicies()`
|
||||
- `telemetry.previewPolicy(targeting)`
|
||||
- `telemetry.createPolicy(input)`
|
||||
- `telemetry.updatePolicy(id, updates)`
|
||||
- **A2A agents**
|
||||
- **PolicyPlannerAgent**: proposes policy with guardrails.
|
||||
- **PolicyReviewerAgent**: checks privacy constraints + expiry + scope.
|
||||
|
||||
### 3) “Ops Copilot” for platform-service modules
|
||||
|
||||
- **Why**
|
||||
- platform-service contains many modules (auth, flags, delivery, jobs, settings, maintenance, etc.).
|
||||
- Most ops tasks are combinations of a few actions (query, toggle, trigger job).
|
||||
- **MCP tools**
|
||||
- `jobs.list()` / `jobs.trigger(name)` / `jobs.listRuns(name)`
|
||||
- `maintenance.get()` / `maintenance.set(mode, schedule)`
|
||||
- `flags.list()` / `flags.set(key, enabled, targeting)`
|
||||
- `settings.get(userId)` / `settings.set(userId, patch)`
|
||||
- `webhooks.listSubscriptions()` / `webhooks.test()` / `webhooks.rotateSecret()`
|
||||
- **A2A agents**
|
||||
- **OpsAgent**: executes operations.
|
||||
- **ComplianceAgent**: ensures audit trail + least privilege.
|
||||
|
||||
### 4) Extraction service “task builder” and evaluation loop
|
||||
|
||||
- **Why**
|
||||
- extraction-service already has model registry, caching, async jobs, sidecar health.
|
||||
- Task prompts/examples are high leverage; building & evaluating them is repetitive.
|
||||
- **MCP tools**
|
||||
- `extraction.extract(text, taskId, modelId)`
|
||||
- `extraction.extractBatch(inputs)`
|
||||
- `extraction.submitJob(inputs, webhookUrl?)`
|
||||
- `extraction.getJob(jobId)`
|
||||
- `extraction.sidecarHealth()`
|
||||
- `extraction.metrics()` / `extraction.cacheStats()`
|
||||
- **A2A agents**
|
||||
- **TaskDesignerAgent**: drafts taskPrompt + examples.
|
||||
- **EvalRunnerAgent**: runs eval sets, compares outputs.
|
||||
- **PromptRegressionAgent**: ensures no quality regressions.
|
||||
|
||||
## P1 — Operational maturity
|
||||
|
||||
### 5) Automated regression watch: telemetry clusters → auto diagnostics session
|
||||
|
||||
- **Why**
|
||||
- telemetry clusters already have severity escalation + webhook alerts.
|
||||
- diagnostics supports remote sessions.
|
||||
- **Flow**
|
||||
- Cluster crosses threshold → A2A triggers a targeted diagnostics session for a small segment.
|
||||
- **MCP tools**
|
||||
- `telemetry.listClusters()`
|
||||
- `diagnostics.createSession(...)`
|
||||
|
||||
### 6) Secret rotation “assistant” (Key Vault + config propagation)
|
||||
|
||||
- **Why**
|
||||
- Secrets are already resolved centrally via `@bytelyst/config`.
|
||||
- Rotation is still error-prone across services/dashboards.
|
||||
- **MCP tools/resources**
|
||||
- Resource: `secrets.mapping` (which env vars resolve to which Azure Key Vault secrets)
|
||||
- Tool: `secrets.rotate(name)` (where permitted) + `services.restart(service)` (optional)
|
||||
|
||||
### 7) Webhook subscription lifecycle assistant
|
||||
|
||||
- **Why**
|
||||
- You have webhooks + delivery logs; assistants can recommend retries, disable rules, test endpoints.
|
||||
- **MCP tools**
|
||||
- `webhooks.listSubscriptions(productId)`
|
||||
- `webhooks.listDeliveries(subscriptionId)`
|
||||
- `webhooks.rotateSecret(subscriptionId)`
|
||||
- `webhooks.test(subscriptionId)`
|
||||
|
||||
## P2 — Product intelligence and automation
|
||||
|
||||
### 8) A/B experimentation assistant (platform-service `ab-testing`, `experiments`)
|
||||
|
||||
- **Use**
|
||||
- Draft experiment plan, compute exposure targeting, monitor telemetry signals.
|
||||
|
||||
### 9) “Changelog writer” from merged PRs + telemetry impact
|
||||
|
||||
- **Use**
|
||||
- Collect merged changes; correlate them with drops in telemetry cluster counts; propose release notes.
|
||||
|
||||
## P3 — UX-lab accelerators
|
||||
|
||||
### 10) MCP-driven UX Lab data generators
|
||||
|
||||
- **Why**
|
||||
- UX lab apps (telemetry explorer, ops UI kit) need rich sample datasets.
|
||||
- **MCP tool**
|
||||
- `dev.generateTelemetryDataset(shape, size, seed)`
|
||||
|
||||
## Cross-product patterns that MCP/A2A should standardize
|
||||
|
||||
- **Telemetry**
|
||||
- Common event naming + “productId is mandatory” invariant.
|
||||
- **Kill switch**
|
||||
- Single check path; consistent failure mode (fail-open).
|
||||
- **Extraction**
|
||||
- Task IDs used consistently across products (triage, reflection, insights, etc.).
|
||||
- **Workflows**
|
||||
- Release/build/test workflows can become A2A playbooks using MCP tools.
|
||||
@ -13,7 +13,6 @@ import type {
|
||||
LogEntry,
|
||||
Breadcrumb,
|
||||
NetworkRequest,
|
||||
IngestBatch,
|
||||
DeviceState,
|
||||
} from './types.js';
|
||||
import { BreadcrumbTrail } from './breadcrumbs.js';
|
||||
@ -28,7 +27,6 @@ type ErrorEvent = {
|
||||
colno: number;
|
||||
error?: { stack?: string };
|
||||
};
|
||||
type EventListener = (event: unknown) => void;
|
||||
|
||||
export interface DiagnosticsClientOptions extends DiagnosticsConfig {
|
||||
/** Custom logger */
|
||||
@ -217,7 +215,7 @@ export class DiagnosticsClient {
|
||||
level,
|
||||
message,
|
||||
timestamp: new Date().toISOString(),
|
||||
module: context.module as string ?? 'unknown',
|
||||
module: (context.module as string) ?? 'unknown',
|
||||
context,
|
||||
correlationId: context.correlationId as string,
|
||||
};
|
||||
@ -254,7 +252,10 @@ export class DiagnosticsClient {
|
||||
span.durationMs = new Date(span.endTime).getTime() - new Date(span.startTime).getTime();
|
||||
span.status = 'ok';
|
||||
this.traceBuffer.push(span);
|
||||
this.breadcrumbs.add('trace', `Completed: ${name}`, { spanId: span.spanId, durationMs: span.durationMs });
|
||||
this.breadcrumbs.add('trace', `Completed: ${name}`, {
|
||||
spanId: span.spanId,
|
||||
durationMs: span.durationMs,
|
||||
});
|
||||
return result;
|
||||
} catch (error) {
|
||||
span.endTime = new Date().toISOString();
|
||||
@ -262,7 +263,10 @@ export class DiagnosticsClient {
|
||||
span.status = 'error';
|
||||
span.statusMessage = error instanceof Error ? error.message : String(error);
|
||||
this.traceBuffer.push(span);
|
||||
this.breadcrumbs.add('trace', `Failed: ${name}`, { spanId: span.spanId, error: span.statusMessage });
|
||||
this.breadcrumbs.add('trace', `Failed: ${name}`, {
|
||||
spanId: span.spanId,
|
||||
error: span.statusMessage,
|
||||
});
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
@ -298,7 +302,7 @@ export class DiagnosticsClient {
|
||||
url.searchParams.set('installId', this.config.anonymousInstallId);
|
||||
|
||||
const headers: Record<string, string> = {
|
||||
'Accept': 'application/json',
|
||||
Accept: 'application/json',
|
||||
};
|
||||
|
||||
if (this.lastEtag) {
|
||||
@ -347,7 +351,10 @@ export class DiagnosticsClient {
|
||||
this.config.logger.error('[diagnostics] Failed to poll for session', {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
});
|
||||
this.state = { type: 'error', error: error instanceof Error ? error : new Error(String(error)) };
|
||||
this.state = {
|
||||
type: 'error',
|
||||
error: error instanceof Error ? error : new Error(String(error)),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@ -364,56 +371,90 @@ export class DiagnosticsClient {
|
||||
return;
|
||||
}
|
||||
|
||||
// Build batch
|
||||
const batch: IngestBatch = {
|
||||
sessionId: session.id,
|
||||
};
|
||||
const sessionId = session.id;
|
||||
|
||||
if (this.logBuffer.length > 0) {
|
||||
batch.logs = this.logBuffer.splice(0, 50); // Max 50 per batch
|
||||
}
|
||||
|
||||
if (this.traceBuffer.length > 0) {
|
||||
batch.traces = this.traceBuffer.splice(0, 50);
|
||||
}
|
||||
|
||||
if (this.networkBuffer.length > 0) {
|
||||
batch.network = this.networkBuffer.splice(0, 50);
|
||||
}
|
||||
|
||||
// Add breadcrumbs
|
||||
const logs = this.logBuffer.splice(0, 50); // Server max: 50
|
||||
const traces = this.traceBuffer.splice(0, 50); // Server max: 50
|
||||
const network = this.networkBuffer.splice(0, 50);
|
||||
const crumbs = this.breadcrumbs.getAll();
|
||||
if (crumbs.length > 0) {
|
||||
batch.breadcrumbs = [...crumbs];
|
||||
this.breadcrumbs.clear();
|
||||
this.breadcrumbs.clear();
|
||||
|
||||
// Encode breadcrumbs + network captures as log entries so we can ingest
|
||||
// without requiring additional server-side schemas/endpoints.
|
||||
const synthesizedLogs = [] as LogEntry[];
|
||||
|
||||
for (const c of crumbs) {
|
||||
synthesizedLogs.push({
|
||||
level: 'info',
|
||||
message: `[breadcrumb] ${c.category}: ${c.message}`,
|
||||
timestamp: c.timestamp,
|
||||
module: 'diagnostics.breadcrumb',
|
||||
context: c.data ?? {},
|
||||
});
|
||||
}
|
||||
|
||||
// Skip if nothing to send
|
||||
if (!batch.logs && !batch.traces && !batch.network && !batch.breadcrumbs) {
|
||||
for (const n of network) {
|
||||
synthesizedLogs.push({
|
||||
level: n.error ? 'error' : 'info',
|
||||
message: `[network] ${n.method} ${n.url} ${n.status ?? ''}`.trim(),
|
||||
timestamp: n.startTime,
|
||||
module: 'diagnostics.network',
|
||||
context: {
|
||||
requestHeaders: n.requestHeaders,
|
||||
requestBody: n.requestBody,
|
||||
status: n.status,
|
||||
responseHeaders: n.responseHeaders,
|
||||
responseBody: n.responseBody,
|
||||
startTime: n.startTime,
|
||||
endTime: n.endTime,
|
||||
durationMs: n.durationMs,
|
||||
error: n.error,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
const allLogs = [...logs, ...synthesizedLogs];
|
||||
|
||||
if (allLogs.length === 0 && traces.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const token = await this.getAuthToken();
|
||||
const headers: Record<string, string> = {
|
||||
'Content-Type': 'application/json',
|
||||
...(token ? { Authorization: `Bearer ${token}` } : {}),
|
||||
};
|
||||
|
||||
try {
|
||||
const url = new URL('/api/diagnostics/ingest', this.config.serverUrl);
|
||||
const token = await this.getAuthToken();
|
||||
if (allLogs.length > 0) {
|
||||
const url = new URL(
|
||||
`/api/diagnostics/sessions/${encodeURIComponent(sessionId)}/logs`,
|
||||
this.config.serverUrl
|
||||
);
|
||||
const response = await fetch(url.toString(), {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: JSON.stringify({ sessionId, logs: allLogs }),
|
||||
});
|
||||
if (!response.ok) throw new Error(`HTTP ${response.status}`);
|
||||
}
|
||||
|
||||
const response = await fetch(url.toString(), {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
...(token && { 'Authorization': `Bearer ${token}` }),
|
||||
},
|
||||
body: JSON.stringify(batch),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}`);
|
||||
if (traces.length > 0) {
|
||||
const url = new URL(
|
||||
`/api/diagnostics/sessions/${encodeURIComponent(sessionId)}/traces`,
|
||||
this.config.serverUrl
|
||||
);
|
||||
const response = await fetch(url.toString(), {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: JSON.stringify({ sessionId, traces }),
|
||||
});
|
||||
if (!response.ok) throw new Error(`HTTP ${response.status}`);
|
||||
}
|
||||
|
||||
this.config.logger.debug('[diagnostics] Flushed batch', {
|
||||
logs: batch.logs?.length ?? 0,
|
||||
traces: batch.traces?.length ?? 0,
|
||||
network: batch.network?.length ?? 0,
|
||||
logs: allLogs.length,
|
||||
traces: traces.length,
|
||||
});
|
||||
} catch (error) {
|
||||
this.config.logger.error('[diagnostics] Failed to flush batch', {
|
||||
@ -421,9 +462,14 @@ export class DiagnosticsClient {
|
||||
});
|
||||
|
||||
// Put items back in buffers for retry
|
||||
if (batch.logs) this.logBuffer.unshift(...batch.logs);
|
||||
if (batch.traces) this.traceBuffer.unshift(...batch.traces);
|
||||
if (batch.network) this.networkBuffer.unshift(...batch.network);
|
||||
if (logs.length > 0) this.logBuffer.unshift(...logs);
|
||||
if (traces.length > 0) this.traceBuffer.unshift(...traces);
|
||||
if (network.length > 0) this.networkBuffer.unshift(...network);
|
||||
|
||||
// Breadcrumbs were converted; keep a small breadcrumb trail hint for later flush.
|
||||
for (const c of crumbs.slice(-10)) {
|
||||
this.breadcrumbs.add(c.category, c.message, c.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -432,7 +478,7 @@ export class DiagnosticsClient {
|
||||
*/
|
||||
private setupNetworkCapture(): void {
|
||||
this.networkInterceptor = new NetworkInterceptor(
|
||||
(request) => {
|
||||
request => {
|
||||
this.networkBuffer.push(request);
|
||||
},
|
||||
{
|
||||
@ -455,9 +501,9 @@ export class DiagnosticsClient {
|
||||
};
|
||||
|
||||
const capture = (level: LogLevel, args: unknown[]) => {
|
||||
const message = args.map(a =>
|
||||
typeof a === 'object' ? JSON.stringify(a) : String(a)
|
||||
).join(' ');
|
||||
const message = args
|
||||
.map(a => (typeof a === 'object' ? JSON.stringify(a) : String(a)))
|
||||
.join(' ');
|
||||
this.log(level, message, { module: 'console', source: 'captured' });
|
||||
};
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user