diff --git a/docs/MCP+A2A/EXECUTION_CHECKLIST.md b/docs/MCP+A2A/EXECUTION_CHECKLIST.md index 33277c30..f1d7fe59 100644 --- a/docs/MCP+A2A/EXECUTION_CHECKLIST.md +++ b/docs/MCP+A2A/EXECUTION_CHECKLIST.md @@ -15,29 +15,36 @@ This is the “ready to start building” checklist that turns the docs in this flushes to session-scoped endpoints (`/api/diagnostics/sessions/:id/logs|traces`). No change needed. - Confirmed in `packages/diagnostics-client/src/client.ts` flush() method, lines 430–453 -## 3) Phase 1 build steps (P0 slice) +## 3) Phase 1 build steps -- **Implement MCP tool namespaces** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216) - - [x] `platform.telemetry.*` — query, clusters, metrics (3 tools) - - [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools) - - [x] `extraction.*` — run, models, cacheStats (3 tools) -- **Enforce hard guardrails in MCP layer** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216) - - [x] `productId` required in all query tools, forwarded as `x-product-id` - - [x] `x-request-id` propagated via `req.id` on every upstream call - - [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`) - - [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()` - - [ ] expiry enforcement for diagnostics sessions — delegated to platform-service `maxDurationMinutes` -- **Ship one compound tool** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216) - - [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)` +### Milestone A + C (P0 slice) ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216) -## 4) Phase 1 definition of done +- [x] `platform.telemetry.*` — query, clusters, metrics (3 tools) +- [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools) +- [x] 
`extraction.*` — run, models, cacheStats (3 tools) +- [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)` + +### Milestone B (mutating tools + expiry + cluster status) ✅ — [26d3403](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/26d3403) + +- [x] `platform.telemetry.policies.*` — list, preview, create (expiresAt required), update, delete (5 tools) +- [x] `platform.telemetry.clusters.updateStatus` — resolve/ignore/reopen clusters +- [x] `platform.diagnostics.sessions.cancel` — dedicated cancel tool +- [x] `extraction.sidecarHealth` — Python sidecar health check +- [x] `expiresAt` enforced as **required** field on `telemetry.policies.create` at MCP layer +- [x] `productId` required in all query tools, forwarded as `x-product-id` +- [x] `x-request-id` propagated via `req.id` on every upstream call +- [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`) +- [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()` +- [x] expiry for diagnostics sessions delegated to platform-service `maxDurationMinutes` + +## 4) Phase 1 definition of done ✅ COMPLETE - [x] Read-only tools work end-to-end against real services (proxy to platform-service + extraction-service) - [x] Mutating tools are role-gated (`admin` minimum) and log audit entries via `req.log` -- [x] Compound debug pack produces a single structured artifact with: - - [x] telemetry cluster references (up to 10 shown, count included) - - [x] optional diagnostics session reference (id, status, expiresAt) - - [x] short markdown summary +- [x] Compound debug pack produces a single structured artifact with cluster refs, session ref, markdown summary +- [x] Telemetry policy CRUD with mandatory `expiresAt` guardrail +- [x] Error cluster triage (resolve / ignore / reopen) +- [x] Python sidecar health visible via `extraction.sidecarHealth` - [ ] End-to-end integration test with real platform-service (Phase 2) ## 5) Phase 2+ quick 
sanity checks diff --git a/services/mcp-server/src/lib/extraction-client.ts b/services/mcp-server/src/lib/extraction-client.ts index bb17a036..5df3534d 100644 --- a/services/mcp-server/src/lib/extraction-client.ts +++ b/services/mcp-server/src/lib/extraction-client.ts @@ -61,3 +61,20 @@ export async function extractionCacheStats(opts: { requestId?: string }): Promis if (!res.ok) throw new Error(`extraction-service GET /api/extract/cache-stats → ${res.status}`); return res.json(); } + +/** + * Fetches the sidecar health report from extraction-service (GET /api/extract/sidecar-health). + * Forwards `x-request-id` when `opts.requestId` is set; the request is aborted after 10 s. + * @returns The parsed JSON response body. + * @throws Error when extraction-service responds with a non-2xx status. + */ +export async function extractionSidecarHealth(opts: { requestId?: string }): Promise { + const url = `${config.EXTRACTION_SERVICE_URL}/api/extract/sidecar-health`; + const headers: Record = { + ...(opts.requestId ? { 'x-request-id': opts.requestId } : {}), + }; + const res = await fetch(url, { headers, signal: AbortSignal.timeout(10_000) }); + // Fail loudly like extractionCacheStats does instead of returning an error body as a health result. + if (!res.ok) throw new Error(`extraction-service GET /api/extract/sidecar-health → ${res.status}`); + return res.json(); +} diff --git a/services/mcp-server/src/lib/platform-client.ts b/services/mcp-server/src/lib/platform-client.ts index 2578f1b1..a53a7934 100644 --- a/services/mcp-server/src/lib/platform-client.ts +++ b/services/mcp-server/src/lib/platform-client.ts @@ -90,6 +90,69 @@ export async function telemetryMetrics(opts: PlatformClientOptions): Promise('/api/telemetry/metrics', { method: 'GET' }, opts); } +export async function telemetryListPolicies(opts: PlatformClientOptions): Promise { + return platformFetch('/api/telemetry/policies', { method: 'GET' }, opts); +} + +export async function telemetryPreviewPolicy( + body: { targeting?: Record }, + opts: PlatformClientOptions +): Promise { + return platformFetch( + '/api/telemetry/policies/preview', + { method: 'POST', body: JSON.stringify(body) }, + opts + ); +} + +export async function telemetryCreatePolicy( + body: Record, + opts: PlatformClientOptions +): Promise { + return platformFetch( + '/api/telemetry/policies', + { method: 'POST', body: JSON.stringify(body) }, + opts + ); +} + +export async function telemetryUpdatePolicy( + policyId: string, + body: Record, + opts: 
PlatformClientOptions +): Promise { + return platformFetch( + `/api/telemetry/policies/${encodeURIComponent(policyId)}`, + { method: 'PUT', body: JSON.stringify(body) }, + opts + ); +} + +export async function telemetryDeletePolicy( + policyId: string, + opts: PlatformClientOptions +): Promise<{ success: boolean }> { + return platformFetch<{ success: boolean }>( + `/api/telemetry/policies/${encodeURIComponent(policyId)}`, + { method: 'DELETE' }, + opts + ); +} + +export async function telemetryUpdateCluster( + clusterId: string, + pk: string, + status: 'open' | 'resolved' | 'ignored', + opts: PlatformClientOptions +): Promise { + const qs = new URLSearchParams({ pk }); + return platformFetch( + `/api/telemetry/clusters/${encodeURIComponent(clusterId)}?${qs}`, + { method: 'PATCH', body: JSON.stringify({ status }) }, + opts + ); +} + // ── Diagnostics ─────────────────────────────────────────────────────────────── export interface DebugSession { diff --git a/services/mcp-server/src/modules/extraction/extraction-tools.ts b/services/mcp-server/src/modules/extraction/extraction-tools.ts index 7560c84c..711fbce3 100644 --- a/services/mcp-server/src/modules/extraction/extraction-tools.ts +++ b/services/mcp-server/src/modules/extraction/extraction-tools.ts @@ -4,6 +4,7 @@ import { extractionRun, extractionModels, extractionCacheStats, + extractionSidecarHealth, } from '../../lib/extraction-client.js'; registerTool({ @@ -43,3 +44,14 @@ registerTool({ return extractionCacheStats({ requestId: req.id }); }, }); + +registerTool({ + name: 'extraction.sidecarHealth', + description: + 'Check the health of the Python extraction sidecar process. Returns status and last-seen timestamp. 
Requires admin role.', + requiredRole: 'admin', + inputSchema: z.object({}), + async execute(_args, req) { + return extractionSidecarHealth({ requestId: req.id }); + }, +}); diff --git a/services/mcp-server/src/modules/platform/diagnostics-tools.ts b/services/mcp-server/src/modules/platform/diagnostics-tools.ts index ed00e838..60c0febf 100644 --- a/services/mcp-server/src/modules/platform/diagnostics-tools.ts +++ b/services/mcp-server/src/modules/platform/diagnostics-tools.ts @@ -139,3 +139,20 @@ registerTool({ }); }, }); + +registerTool({ + name: 'platform.diagnostics.sessions.cancel', + description: + 'Cancel an active remote debug session immediately. Stops further data collection. Requires admin role.', + requiredRole: 'admin', + inputSchema: z.object({ + sessionId: z.string().min(1).describe('Debug session ID to cancel'), + }), + async execute(args, req) { + return diagnosticsUpdateSession( + args.sessionId, + { status: 'cancelled' }, + { token: tokenOf(req), requestId: req.id } + ); + }, +}); diff --git a/services/mcp-server/src/modules/platform/telemetry-policy-tools.ts b/services/mcp-server/src/modules/platform/telemetry-policy-tools.ts new file mode 100644 index 00000000..1f1010d6 --- /dev/null +++ b/services/mcp-server/src/modules/platform/telemetry-policy-tools.ts @@ -0,0 +1,180 @@ +import { z } from 'zod'; +import { registerTool } from '../tools/registry.js'; +import { + telemetryListPolicies, + telemetryPreviewPolicy, + telemetryCreatePolicy, + telemetryUpdatePolicy, + telemetryDeletePolicy, + telemetryUpdateCluster, +} from '../../lib/platform-client.js'; +import type { McpToolRequest } from '../tools/types.js'; + +const tokenOf = (req: McpToolRequest) => req.headers.authorization?.slice(7); + +// ── Shared targeting sub-schema ──────────────────────────────────────────── +const TargetingSchema = z + .object({ + userIds: z.array(z.string()).optional(), + platforms: z.array(z.string()).optional().describe('e.g. 
ios, android, web'), + channels: z.array(z.string()).optional().describe('e.g. release, beta, internal'), + osFamilies: z.array(z.string()).optional(), + appVersions: z.array(z.string()).optional(), + releaseChannels: z.array(z.string()).optional(), + countryCodes: z.array(z.string()).optional(), + percentage: z.coerce.number().min(0).max(100).optional().describe('% of installs to target'), + }) + .optional() + .describe('Targeting criteria — omit to apply to all installs'); + +// ── platform.telemetry.policies.list ────────────────────────────────────── + +registerTool({ + name: 'platform.telemetry.policies.list', + description: 'List all telemetry collection policies for a product. Requires admin role.', + requiredRole: 'admin', + inputSchema: z.object({ + productId: z.string().min(1).describe('Product ID to scope the query'), + }), + async execute(args, req) { + return telemetryListPolicies({ + token: tokenOf(req), + requestId: req.id, + productId: args.productId, + }); + }, +}); + +// ── platform.telemetry.policies.preview ────────────────────────────────── + +registerTool({ + name: 'platform.telemetry.policies.preview', + description: + 'Preview how many clients a targeting config would match (dry-run before creating a policy). Requires admin role.', + requiredRole: 'admin', + inputSchema: z.object({ + productId: z.string().min(1).describe('Product ID to scope the preview'), + targeting: TargetingSchema, + }), + async execute(args, req) { + return telemetryPreviewPolicy( + { targeting: args.targeting as Record | undefined }, + { token: tokenOf(req), requestId: req.id, productId: args.productId } + ); + }, +}); + +// ── platform.telemetry.policies.create ─────────────────────────────────── + +registerTool({ + name: 'platform.telemetry.policies.create', + description: [ + 'Create a telemetry collection policy. expiresAt is REQUIRED — never create an open-ended policy.', + 'eventTypes defaults to [warn, error, fatal]. samplingRate 0.0–1.0 (1.0 = 100%). 
Requires admin role.', + ].join(' '), + requiredRole: 'admin', + inputSchema: z.object({ + productId: z.string().min(1).describe('Product ID'), + name: z.string().min(1).max(200).describe('Human-readable policy name'), + description: z.string().optional(), + expiresAt: z + .string() + .datetime() + .describe('ISO 8601 expiry — REQUIRED. Policies must not run indefinitely.'), + eventTypes: z + .array(z.enum(['debug', 'info', 'warn', 'error', 'fatal'])) + .optional() + .describe('Event types to collect (default: warn, error, fatal)'), + modules: z.array(z.string()).optional().describe('Module names to target (empty = all)'), + samplingRate: z.coerce + .number() + .min(0) + .max(1) + .optional() + .describe('Fraction of events to collect (default: 1.0)'), + enabled: z.boolean().optional().describe('Enable immediately (default: true)'), + priority: z.coerce.number().min(0).max(999).optional(), + startsAt: z.string().datetime().optional(), + targeting: TargetingSchema, + }), + async execute(args, req) { + const { productId, ...body } = args; + return telemetryCreatePolicy(body as Record, { + token: tokenOf(req), + requestId: req.id, + productId, + }); + }, +}); + +// ── platform.telemetry.policies.update ────────────────────────────────── + +registerTool({ + name: 'platform.telemetry.policies.update', + description: 'Update an existing telemetry policy. All fields are optional. 
Requires admin role.', + requiredRole: 'admin', + inputSchema: z.object({ + productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'), + policyId: z.string().min(1).describe('Policy ID to update'), + name: z.string().min(1).max(200).optional(), + description: z.string().optional(), + enabled: z.boolean().optional(), + expiresAt: z.string().datetime().optional().describe('Update or extend expiry (ISO 8601)'), + eventTypes: z.array(z.enum(['debug', 'info', 'warn', 'error', 'fatal'])).optional(), + modules: z.array(z.string()).optional(), + samplingRate: z.coerce.number().min(0).max(1).optional(), + targeting: TargetingSchema, + }), + async execute(args, req) { + const { productId, policyId, ...body } = args; + return telemetryUpdatePolicy(policyId, body as Record, { + token: tokenOf(req), + requestId: req.id, + productId, + }); + }, +}); + +// ── platform.telemetry.policies.delete ────────────────────────────────── + +registerTool({ + name: 'platform.telemetry.policies.delete', + description: 'Delete a telemetry policy by ID. Requires admin role.', + requiredRole: 'admin', + inputSchema: z.object({ + productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'), + policyId: z.string().min(1).describe('Policy ID to delete'), + }), + async execute(args, req) { + return telemetryDeletePolicy(args.policyId, { + token: tokenOf(req), + requestId: req.id, + productId: args.productId, + }); + }, +}); + +// ── platform.telemetry.clusters.updateStatus ──────────────────────────── + +registerTool({ + name: 'platform.telemetry.clusters.updateStatus', + description: + 'Resolve, ignore, or reopen an error cluster (fingerprinted error group). Use resolved when fixed, ignored to suppress, open to reopen. 
Requires admin role.', + requiredRole: 'admin', + inputSchema: z.object({ + productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'), + clusterId: z.string().min(1).describe('Cluster ID (fingerprint:yyyyMM)'), + pk: z + .string() + .min(1) + .describe('Cluster partition key (productId:platform:module — from clusters list result)'), + status: z.enum(['open', 'resolved', 'ignored']).describe('New cluster status'), + }), + async execute(args, req) { + return telemetryUpdateCluster(args.clusterId, args.pk, args.status, { + token: tokenOf(req), + requestId: req.id, + productId: args.productId, + }); + }, +}); diff --git a/services/mcp-server/src/server.ts b/services/mcp-server/src/server.ts index bc9882bd..ddacff64 100644 --- a/services/mcp-server/src/server.ts +++ b/services/mcp-server/src/server.ts @@ -18,6 +18,7 @@ import { toolRoutes } from './modules/tools/routes.js'; // Register all tool namespaces (side-effect: populates the tool registry) import './modules/platform/telemetry-tools.js'; +import './modules/platform/telemetry-policy-tools.js'; import './modules/platform/diagnostics-tools.js'; import './modules/extraction/extraction-tools.js'; import './modules/support/debug-pack.js';