feat(mcp-server): Milestone B — telemetry policy CRUD, cluster status, diagnostics cancel, extraction sidecar health; enforce expiresAt on createPolicy

2026-03-05 12:17:15 -08:00 · 2026-03-05 12:17:15 -08:00 · d1d643f782
commit d1d643f782
parent 26d3403d5a
7 changed files with 307 additions and 18 deletions
--- a/docs/MCP+A2A/EXECUTION_CHECKLIST.md
+++ b/docs/MCP+A2A/EXECUTION_CHECKLIST.md
@ -15,29 +15,36 @@ This is the “ready to start building” checklist that turns the docs in this
  flushes to session-scoped endpoints (`/api/diagnostics/sessions/:id/logs|traces`). No change needed.
  - Confirmed in `packages/diagnostics-client/src/client.ts` flush() method, lines 430–453

-## 3) Phase 1 build steps (P0 slice)
+## 3) Phase 1 build steps

- **Implement MCP tool namespaces** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
-  - [x] `platform.telemetry.*` — query, clusters, metrics (3 tools)
-  - [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools)
-  - [x] `extraction.*` — run, models, cacheStats (3 tools)
- **Enforce hard guardrails in MCP layer** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
-  - [x] `productId` required in all query tools, forwarded as `x-product-id`
-  - [x] `x-request-id` propagated via `req.id` on every upstream call
-  - [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`)
-  - [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()`
-  - [ ] expiry enforcement for diagnostics sessions — delegated to platform-service `maxDurationMinutes`
- **Ship one compound tool** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
-  - [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)`
+### Milestone A + C (P0 slice) ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)

-## 4) Phase 1 definition of done
+- [x] `platform.telemetry.*` — query, clusters, metrics (3 tools)
+- [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools)
+- [x] `extraction.*` — run, models, cacheStats (3 tools)
+- [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)`
+
+### Milestone B (mutating tools + expiry + cluster status) ✅ — [26d3403](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/26d3403)
+
+- [x] `platform.telemetry.policies.*` — list, preview, create (expiresAt required), update, delete (5 tools)
+- [x] `platform.telemetry.clusters.updateStatus` — resolve/ignore/reopen clusters
+- [x] `platform.diagnostics.sessions.cancel` — dedicated cancel tool
+- [x] `extraction.sidecarHealth` — Python sidecar health check
+- [x] `expiresAt` enforced as **required** field on `telemetry.policies.create` at MCP layer
+- [x] `productId` required in all query tools, forwarded as `x-product-id`
+- [x] `x-request-id` propagated via `req.id` on every upstream call
+- [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`)
+- [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()`
+- [x] expiry for diagnostics sessions delegated to platform-service `maxDurationMinutes`
+
+## 4) Phase 1 definition of done ✅ COMPLETE

 - [x] Read-only tools work end-to-end against real services (proxy to platform-service + extraction-service)
 - [x] Mutating tools are role-gated (`admin` minimum) and log audit entries via `req.log`
- [x] Compound debug pack produces a single structured artifact with:
-  - [x] telemetry cluster references (up to 10 shown, count included)
-  - [x] optional diagnostics session reference (id, status, expiresAt)
-  - [x] short markdown summary
+- [x] Compound debug pack produces a single structured artifact with cluster refs, an optional session ref, and a short markdown summary
+- [x] Telemetry policy CRUD with mandatory `expiresAt` guardrail
+- [x] Error cluster triage (resolve / ignore / reopen)
+- [x] Python sidecar health visible via `extraction.sidecarHealth`
 - [ ] End-to-end integration test with real platform-service (Phase 2)

 ## 5) Phase 2+ quick sanity checks
--- a/services/mcp-server/src/lib/extraction-client.ts
+++ b/services/mcp-server/src/lib/extraction-client.ts
@ -61,3 +61,12 @@ export async function extractionCacheStats(opts: { requestId?: string }): Promis
  if (!res.ok) throw new Error(`extraction-service GET /api/extract/cache-stats → ${res.status}`);
  return res.json();
 }
+
+/**
+ * Fetches the Python sidecar health report from extraction-service.
+ *
+ * Issues `GET /api/extract/sidecar-health` with a 10-second timeout and returns the
+ * parsed JSON body as-is, whatever the HTTP status.
+ * NOTE(review): presumably this lets an "unhealthy" report served with a non-2xx
+ * status still reach the caller — confirm against extraction-service.
+ *
+ * @param opts.requestId - forwarded as the `x-request-id` header when present
+ * @returns the decoded JSON body (its shape is not validated here)
+ * @throws Error carrying the HTTP status when a non-2xx response has no parseable
+ *   JSON body (same message format as the other helpers in this file); the original
+ *   parse error when a 2xx response has an invalid body; fetch/timeout errors as-is
+ */
+export async function extractionSidecarHealth(opts: { requestId?: string }): Promise<unknown> {
+  const url = `${config.EXTRACTION_SERVICE_URL}/api/extract/sidecar-health`;
+  const headers: Record<string, string> = {
+    ...(opts.requestId ? { 'x-request-id': opts.requestId } : {}),
+  };
+  const res = await fetch(url, { headers, signal: AbortSignal.timeout(10_000) });
+  try {
+    // `return await` keeps a rejected parse inside this try block.
+    return await res.json();
+  } catch (err: unknown) {
+    // A 2xx response with a broken body is a genuine parse failure: rethrow untouched.
+    if (res.ok) throw err;
+    // Non-2xx with a non-JSON body (e.g. a gateway error page): report the status
+    // instead of an opaque JSON syntax error.
+    throw new Error(`extraction-service GET /api/extract/sidecar-health → ${res.status}`);
+  }
+}
--- a/services/mcp-server/src/lib/platform-client.ts
+++ b/services/mcp-server/src/lib/platform-client.ts
@ -90,6 +90,69 @@ export async function telemetryMetrics(opts: PlatformClientOptions): Promise<unk
  return platformFetch<unknown>('/api/telemetry/metrics', { method: 'GET' }, opts);
 }

+/**
+ * Requests the telemetry policy collection.
+ *
+ * Delegates to `platformFetch` with a GET on `/api/telemetry/policies`; the
+ * response is returned untyped (`unknown`) and is not validated here.
+ */
+export async function telemetryListPolicies(opts: PlatformClientOptions): Promise<unknown> {
+  const path = '/api/telemetry/policies';
+  return platformFetch<unknown>(path, { method: 'GET' }, opts);
+}
+
+/**
+ * Posts a targeting configuration to the policy preview endpoint.
+ *
+ * @param body - object with an optional `targeting` record; serialized as JSON
+ * @param opts - client options handed through to `platformFetch`
+ * @returns whatever `platformFetch` resolves with (untyped)
+ */
+export async function telemetryPreviewPolicy(
+  body: { targeting?: Record<string, unknown> },
+  opts: PlatformClientOptions
+): Promise<unknown> {
+  const path = '/api/telemetry/policies/preview';
+  const payload = JSON.stringify(body);
+  return platformFetch<unknown>(path, { method: 'POST', body: payload }, opts);
+}
+
+/**
+ * Creates a telemetry policy by POSTing the given fields as JSON to
+ * `/api/telemetry/policies`.
+ *
+ * No validation happens at this layer; `body` is serialized exactly as received.
+ *
+ * @param body - policy fields to send
+ * @param opts - client options handed through to `platformFetch`
+ */
+export async function telemetryCreatePolicy(
+  body: Record<string, unknown>,
+  opts: PlatformClientOptions
+): Promise<unknown> {
+  const path = '/api/telemetry/policies';
+  const payload = JSON.stringify(body);
+  return platformFetch<unknown>(path, { method: 'POST', body: payload }, opts);
+}
+
+/**
+ * Sends a PUT with the given fields to `/api/telemetry/policies/{policyId}`.
+ *
+ * @param policyId - policy identifier; URI-encoded before being placed in the path
+ * @param body - fields to send, serialized as JSON
+ * @param opts - client options handed through to `platformFetch`
+ */
+export async function telemetryUpdatePolicy(
+  policyId: string,
+  body: Record<string, unknown>,
+  opts: PlatformClientOptions
+): Promise<unknown> {
+  const safeId = encodeURIComponent(policyId);
+  const payload = JSON.stringify(body);
+  return platformFetch<unknown>(
+    `/api/telemetry/policies/${safeId}`,
+    { method: 'PUT', body: payload },
+    opts
+  );
+}
+
+/**
+ * Issues a DELETE for `/api/telemetry/policies/{policyId}`.
+ *
+ * @param policyId - policy identifier; URI-encoded before being placed in the path
+ * @param opts - client options handed through to `platformFetch`
+ * @returns the response typed as `{ success: boolean }` (not checked at runtime here)
+ */
+export async function telemetryDeletePolicy(
+  policyId: string,
+  opts: PlatformClientOptions
+): Promise<{ success: boolean }> {
+  const safeId = encodeURIComponent(policyId);
+  return platformFetch<{ success: boolean }>(
+    `/api/telemetry/policies/${safeId}`,
+    { method: 'DELETE' },
+    opts
+  );
+}
+
+/**
+ * Changes the status of one telemetry cluster.
+ *
+ * Sends a PATCH to `/api/telemetry/clusters/{clusterId}?pk=…` with the JSON body
+ * `{ status }`.
+ *
+ * @param clusterId - cluster identifier; URI-encoded before being placed in the path
+ * @param pk - value sent as the `pk` query parameter
+ * @param status - new status to store
+ * @param opts - client options handed through to `platformFetch`
+ */
+export async function telemetryUpdateCluster(
+  clusterId: string,
+  pk: string,
+  status: 'open' | 'resolved' | 'ignored',
+  opts: PlatformClientOptions
+): Promise<unknown> {
+  const query = new URLSearchParams({ pk }).toString();
+  const safeId = encodeURIComponent(clusterId);
+  const payload = JSON.stringify({ status });
+  return platformFetch<unknown>(
+    `/api/telemetry/clusters/${safeId}?${query}`,
+    { method: 'PATCH', body: payload },
+    opts
+  );
+}
+
 // ── Diagnostics ───────────────────────────────────────────────────────────────

 export interface DebugSession {
--- a/services/mcp-server/src/modules/extraction/extraction-tools.ts
+++ b/services/mcp-server/src/modules/extraction/extraction-tools.ts
@ -4,6 +4,7 @@ import {
  extractionRun,
  extractionModels,
  extractionCacheStats,
+  extractionSidecarHealth,
 } from '../../lib/extraction-client.js';

 registerTool({
@ -43,3 +44,14 @@ registerTool({
    return extractionCacheStats({ requestId: req.id });
  },
 });
+
+// Admin-only tool with an empty input object; forwards the request id to the
+// extraction client's sidecar health call and returns its result unchanged.
+registerTool({
+  name: 'extraction.sidecarHealth',
+  description:
+    'Check the health of the Python extraction sidecar process. Returns status and last-seen timestamp. Requires admin role.',
+  requiredRole: 'admin',
+  inputSchema: z.object({}),
+  async execute(_args, req) {
+    const requestId = req.id;
+    return extractionSidecarHealth({ requestId });
+  },
+});
--- a/services/mcp-server/src/modules/platform/diagnostics-tools.ts
+++ b/services/mcp-server/src/modules/platform/diagnostics-tools.ts
@ -139,3 +139,20 @@ registerTool({
    });
  },
 });
+
+// Admin-only convenience tool: a session update whose only change is
+// status → 'cancelled'.
+registerTool({
+  name: 'platform.diagnostics.sessions.cancel',
+  description:
+    'Cancel an active remote debug session immediately. Stops further data collection. Requires admin role.',
+  requiredRole: 'admin',
+  inputSchema: z.object({
+    sessionId: z.string().min(1).describe('Debug session ID to cancel'),
+  }),
+  async execute(args, req) {
+    const { sessionId } = args;
+    const upstream = { token: tokenOf(req), requestId: req.id };
+    return diagnosticsUpdateSession(sessionId, { status: 'cancelled' }, upstream);
+  },
+});
--- a/services/mcp-server/src/modules/platform/telemetry-policy-tools.ts
+++ b/services/mcp-server/src/modules/platform/telemetry-policy-tools.ts
@ -0,0 +1,180 @@
+import { z } from 'zod';
+import { registerTool } from '../tools/registry.js';
+import {
+  telemetryListPolicies,
+  telemetryPreviewPolicy,
+  telemetryCreatePolicy,
+  telemetryUpdatePolicy,
+  telemetryDeletePolicy,
+  telemetryUpdateCluster,
+} from '../../lib/platform-client.js';
+import type { McpToolRequest } from '../tools/types.js';
+
+/**
+ * Reads the `authorization` request header and drops its first seven characters
+ * (the length of a `Bearer ` prefix — the scheme itself is not checked).
+ * Yields `undefined` when the header is absent.
+ */
+const tokenOf = (req: McpToolRequest) => {
+  const authHeader = req.headers.authorization;
+  return authHeader?.slice(7);
+};
+
+/* ── Shared targeting sub-schema ────────────────────────────────────────────
+ * Optional object reused as the `targeting` field of the preview, create and
+ * update policy tools in this file. Every criterion inside it is optional as
+ * well; list-valued criteria accept any strings (no enum or format check).
+ */
+const TargetingSchema = z
+  .object({
+    userIds: z.array(z.string()).optional(),
+    platforms: z.array(z.string()).optional().describe('e.g. ios, android, web'),
+    channels: z.array(z.string()).optional().describe('e.g. release, beta, internal'),
+    osFamilies: z.array(z.string()).optional(),
+    appVersions: z.array(z.string()).optional(),
+    releaseChannels: z.array(z.string()).optional(), // NOTE(review): overlaps by name with `channels` — confirm both are intended
+    countryCodes: z.array(z.string()).optional(),
+    percentage: z.coerce.number().min(0).max(100).optional().describe('% of installs to target'), // coerced to a number before the 0–100 range check
+  })
+  .optional()
+  .describe('Targeting criteria — omit to apply to all installs');
+
+// ── platform.telemetry.policies.list ──────────────────────────────────────
+
+// Admin-only: returns the policy list obtained from the platform client for the
+// product named in the input.
+registerTool({
+  name: 'platform.telemetry.policies.list',
+  description: 'List all telemetry collection policies for a product. Requires admin role.',
+  requiredRole: 'admin',
+  inputSchema: z.object({
+    productId: z.string().min(1).describe('Product ID to scope the query'),
+  }),
+  async execute(args, req) {
+    const { productId } = args;
+    const upstream = { token: tokenOf(req), requestId: req.id, productId };
+    return telemetryListPolicies(upstream);
+  },
+});
+
+// ── platform.telemetry.policies.preview ──────────────────────────────────
+
+// Admin-only: sends the (optional) targeting block to the preview client call.
+// `productId` is passed as a client option and is not part of the request body.
+registerTool({
+  name: 'platform.telemetry.policies.preview',
+  description:
+    'Preview how many clients a targeting config would match (dry-run before creating a policy). Requires admin role.',
+  requiredRole: 'admin',
+  inputSchema: z.object({
+    productId: z.string().min(1).describe('Product ID to scope the preview'),
+    targeting: TargetingSchema,
+  }),
+  async execute(args, req) {
+    const { productId, targeting } = args;
+    const upstream = { token: tokenOf(req), requestId: req.id, productId };
+    return telemetryPreviewPolicy(
+      { targeting: targeting as Record<string, unknown> | undefined },
+      upstream
+    );
+  },
+});
+
+// ── platform.telemetry.policies.create ───────────────────────────────────
+
+// Admin-only: creates a policy. `expiresAt` is the one policy field (besides
+// `name`) that the schema does not mark optional, so input lacking it is
+// rejected by validation. Everything except `productId` is forwarded as the
+// request body.
+// NOTE(review): the schema checks only that `expiresAt` is an ISO datetime
+// string, not that it lies in the future or after `startsAt` — confirm the
+// upstream service covers that.
+registerTool({
+  name: 'platform.telemetry.policies.create',
+  description: [
+    'Create a telemetry collection policy. expiresAt is REQUIRED — never create an open-ended policy.',
+    'eventTypes defaults to [warn, error, fatal]. samplingRate 0.0–1.0 (1.0 = 100%). Requires admin role.',
+  ].join(' '),
+  requiredRole: 'admin',
+  inputSchema: z.object({
+    productId: z.string().min(1).describe('Product ID'),
+    name: z.string().min(1).max(200).describe('Human-readable policy name'),
+    description: z.string().optional(),
+    expiresAt: z
+      .string()
+      .datetime()
+      .describe('ISO 8601 expiry — REQUIRED. Policies must not run indefinitely.'),
+    eventTypes: z
+      .array(z.enum(['debug', 'info', 'warn', 'error', 'fatal']))
+      .optional()
+      .describe('Event types to collect (default: warn, error, fatal)'),
+    modules: z.array(z.string()).optional().describe('Module names to target (empty = all)'),
+    samplingRate: z.coerce
+      .number()
+      .min(0)
+      .max(1)
+      .optional()
+      .describe('Fraction of events to collect (default: 1.0)'),
+    enabled: z.boolean().optional().describe('Enable immediately (default: true)'),
+    priority: z.coerce.number().min(0).max(999).optional(),
+    startsAt: z.string().datetime().optional(),
+    targeting: TargetingSchema,
+  }),
+  async execute(args, req) {
+    // Split the scoping id from the fields that make up the policy itself.
+    const { productId, ...policyFields } = args;
+    const upstream = { token: tokenOf(req), requestId: req.id, productId };
+    return telemetryCreatePolicy(policyFields as Record<string, unknown>, upstream);
+  },
+});
+
+// ── platform.telemetry.policies.update ──────────────────────────────────
+
+// Admin-only: updates a policy. `productId` and `policyId` are stripped from the
+// input; whatever other fields the caller supplied become the request body.
+// NOTE(review): unlike create, this schema has no `priority` or `startsAt` —
+// confirm that omission is intentional.
+registerTool({
+  name: 'platform.telemetry.policies.update',
+  description: 'Update an existing telemetry policy. All fields are optional. Requires admin role.',
+  requiredRole: 'admin',
+  inputSchema: z.object({
+    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
+    policyId: z.string().min(1).describe('Policy ID to update'),
+    name: z.string().min(1).max(200).optional(),
+    description: z.string().optional(),
+    enabled: z.boolean().optional(),
+    expiresAt: z.string().datetime().optional().describe('Update or extend expiry (ISO 8601)'),
+    eventTypes: z.array(z.enum(['debug', 'info', 'warn', 'error', 'fatal'])).optional(),
+    modules: z.array(z.string()).optional(),
+    samplingRate: z.coerce.number().min(0).max(1).optional(),
+    targeting: TargetingSchema,
+  }),
+  async execute(args, req) {
+    const { productId, policyId, ...changes } = args;
+    const upstream = { token: tokenOf(req), requestId: req.id, productId };
+    return telemetryUpdatePolicy(policyId, changes as Record<string, unknown>, upstream);
+  },
+});
+
+// ── platform.telemetry.policies.delete ──────────────────────────────────
+
+// Admin-only: deletes the policy identified by `policyId`; `productId` is passed
+// as a client option alongside the caller's token and request id.
+registerTool({
+  name: 'platform.telemetry.policies.delete',
+  description: 'Delete a telemetry policy by ID. Requires admin role.',
+  requiredRole: 'admin',
+  inputSchema: z.object({
+    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
+    policyId: z.string().min(1).describe('Policy ID to delete'),
+  }),
+  async execute(args, req) {
+    const { policyId, productId } = args;
+    const upstream = { token: tokenOf(req), requestId: req.id, productId };
+    return telemetryDeletePolicy(policyId, upstream);
+  },
+});
+
+// ── platform.telemetry.clusters.updateStatus ────────────────────────────
+
+// Admin-only: sets an error cluster's status to open, resolved or ignored.
+// `clusterId` and `pk` are both required and passed to the client call along
+// with the new status.
+// The description names all three accepted statuses so callers can discover
+// that a cluster can be reopened, not just resolved or ignored.
+// NOTE(review): nothing here checks that `pk` belongs to `productId` — presumably
+// enforced upstream; confirm.
+registerTool({
+  name: 'platform.telemetry.clusters.updateStatus',
+  description:
+    'Resolve, ignore, or reopen an error cluster (fingerprinted error group). Use resolved when fixed, ignored to suppress, open to reopen. Requires admin role.',
+  requiredRole: 'admin',
+  inputSchema: z.object({
+    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
+    clusterId: z.string().min(1).describe('Cluster ID (fingerprint:yyyyMM)'),
+    pk: z
+      .string()
+      .min(1)
+      .describe('Cluster partition key (productId:platform:module — from clusters list result)'),
+    status: z.enum(['open', 'resolved', 'ignored']).describe('New cluster status'),
+  }),
+  async execute(args, req) {
+    return telemetryUpdateCluster(args.clusterId, args.pk, args.status, {
+      token: tokenOf(req),
+      requestId: req.id,
+      productId: args.productId,
+    });
+  },
+});
--- a/services/mcp-server/src/server.ts
+++ b/services/mcp-server/src/server.ts
@ -18,6 +18,7 @@ import { toolRoutes } from './modules/tools/routes.js';

 // Register all tool namespaces (side-effect: populates the tool registry)
 import './modules/platform/telemetry-tools.js';
+import './modules/platform/telemetry-policy-tools.js';
 import './modules/platform/diagnostics-tools.js';
 import './modules/extraction/extraction-tools.js';
 import './modules/support/debug-pack.js';