feat(mcp-server): Milestone B — telemetry policy CRUD, cluster status, diagnostics cancel, extraction sidecar health; enforce expiresAt on createPolicy

This commit is contained in:
saravanakumardb1 2026-03-05 12:17:15 -08:00
parent 26d3403d5a
commit d1d643f782
7 changed files with 307 additions and 18 deletions

View File

@ -15,29 +15,36 @@ This is the “ready to start building” checklist that turns the docs in this
flushes to session-scoped endpoints (`/api/diagnostics/sessions/:id/logs|traces`). No change needed.
- Confirmed in `packages/diagnostics-client/src/client.ts` flush() method, lines 430–453
## 3) Phase 1 build steps (P0 slice)
## 3) Phase 1 build steps
- **Implement MCP tool namespaces** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
- [x] `platform.telemetry.*` — query, clusters, metrics (3 tools)
- [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools)
- [x] `extraction.*` — run, models, cacheStats (3 tools)
- **Enforce hard guardrails in MCP layer** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
- [x] `productId` required in all query tools, forwarded as `x-product-id`
- [x] `x-request-id` propagated via `req.id` on every upstream call
- [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`)
- [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()`
- [ ] expiry enforcement for diagnostics sessions — delegated to platform-service `maxDurationMinutes`
- **Ship one compound tool** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
- [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)`
### Milestone A + C (P0 slice) ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
## 4) Phase 1 definition of done
- [x] `platform.telemetry.*` — query, clusters, metrics (3 tools)
- [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools)
- [x] `extraction.*` — run, models, cacheStats (3 tools)
- [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)`
### Milestone B (mutating tools + expiry + cluster status) ✅ — [26d3403](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/26d3403)
- [x] `platform.telemetry.policies.*` — list, preview, create (expiresAt required), update, delete (5 tools)
- [x] `platform.telemetry.clusters.updateStatus` — resolve/ignore/reopen clusters
- [x] `platform.diagnostics.sessions.cancel` — dedicated cancel tool
- [x] `extraction.sidecarHealth` — Python sidecar health check
- [x] `expiresAt` enforced as **required** field on `telemetry.policies.create` at MCP layer
- [x] `productId` required in all query tools, forwarded as `x-product-id`
- [x] `x-request-id` propagated via `req.id` on every upstream call
- [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`)
- [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()`
- [x] expiry for diagnostics sessions delegated to platform-service `maxDurationMinutes`
## 4) Phase 1 definition of done ✅ COMPLETE
- [x] Read-only tools work end-to-end against real services (proxy to platform-service + extraction-service)
- [x] Mutating tools are role-gated (`admin` minimum) and log audit entries via `req.log`
- [x] Compound debug pack produces a single structured artifact with:
- [x] telemetry cluster references (up to 10 shown, count included)
- [x] optional diagnostics session reference (id, status, expiresAt)
- [x] short markdown summary
- [x] Compound debug pack produces a single structured artifact with cluster refs, session ref, markdown summary
- [x] Telemetry policy CRUD with mandatory `expiresAt` guardrail
- [x] Error cluster triage (resolve / ignore / reopen)
- [x] Python sidecar health visible via `extraction.sidecarHealth`
- [ ] End-to-end integration test with real platform-service (Phase 2)
## 5) Phase 2+ quick sanity checks

View File

@ -61,3 +61,12 @@ export async function extractionCacheStats(opts: { requestId?: string }): Promis
if (!res.ok) throw new Error(`extraction-service GET /api/extract/cache-stats → ${res.status}`);
return res.json();
}
/**
 * Fetches the sidecar health report from extraction-service.
 *
 * Issues `GET /api/extract/sidecar-health` with a 10-second abort timeout and
 * forwards `x-request-id` when a request ID is supplied.
 *
 * @param opts.requestId - Optional correlation ID sent as the `x-request-id` header.
 * @returns The parsed JSON response body; its shape is not validated here.
 * @throws Error when extraction-service answers with a non-2xx status.
 */
export async function extractionSidecarHealth(opts: { requestId?: string }): Promise<unknown> {
  const url = `${config.EXTRACTION_SERVICE_URL}/api/extract/sidecar-health`;
  const headers: Record<string, string> = {
    ...(opts.requestId ? { 'x-request-id': opts.requestId } : {}),
  };
  const res = await fetch(url, { headers, signal: AbortSignal.timeout(10_000) });
  // Surface upstream failures explicitly (same pattern as extractionCacheStats) instead of
  // trying to parse an error page as JSON or returning an error payload as if it were healthy.
  if (!res.ok) throw new Error(`extraction-service GET /api/extract/sidecar-health → ${res.status}`);
  return res.json();
}

View File

@ -90,6 +90,69 @@ export async function telemetryMetrics(opts: PlatformClientOptions): Promise<unk
return platformFetch<unknown>('/api/telemetry/metrics', { method: 'GET' }, opts);
}
/**
 * Lists telemetry policies by issuing `GET /api/telemetry/policies` through `platformFetch`.
 *
 * @param opts - Client options passed through unchanged to `platformFetch`.
 * @returns Whatever `platformFetch` resolves with; the payload is not validated here.
 */
export async function telemetryListPolicies(opts: PlatformClientOptions): Promise<unknown> {
  const init = { method: 'GET' };
  return platformFetch<unknown>('/api/telemetry/policies', init, opts);
}
/**
 * Sends a targeting config to `POST /api/telemetry/policies/preview` through `platformFetch`.
 *
 * @param body - Object with an optional `targeting` record; serialized as the JSON request body.
 * @param opts - Client options passed through unchanged to `platformFetch`.
 * @returns Whatever `platformFetch` resolves with; the payload is not validated here.
 */
export async function telemetryPreviewPolicy(
  body: { targeting?: Record<string, unknown> },
  opts: PlatformClientOptions
): Promise<unknown> {
  const payload = JSON.stringify(body);
  const init = { method: 'POST', body: payload };
  return platformFetch<unknown>('/api/telemetry/policies/preview', init, opts);
}
/**
 * Creates a telemetry policy by issuing `POST /api/telemetry/policies` through `platformFetch`.
 *
 * @param body - Policy fields; serialized as-is into the JSON request body (no validation here).
 * @param opts - Client options passed through unchanged to `platformFetch`.
 * @returns Whatever `platformFetch` resolves with; the payload is not validated here.
 */
export async function telemetryCreatePolicy(
  body: Record<string, unknown>,
  opts: PlatformClientOptions
): Promise<unknown> {
  const payload = JSON.stringify(body);
  const init = { method: 'POST', body: payload };
  return platformFetch<unknown>('/api/telemetry/policies', init, opts);
}
/**
 * Updates a telemetry policy by issuing `PUT /api/telemetry/policies/:policyId` through `platformFetch`.
 *
 * @param policyId - Policy identifier; URI-component-encoded before being placed in the path.
 * @param body - Fields to send; serialized as-is into the JSON request body.
 * @param opts - Client options passed through unchanged to `platformFetch`.
 * @returns Whatever `platformFetch` resolves with; the payload is not validated here.
 */
export async function telemetryUpdatePolicy(
  policyId: string,
  body: Record<string, unknown>,
  opts: PlatformClientOptions
): Promise<unknown> {
  const path = `/api/telemetry/policies/${encodeURIComponent(policyId)}`;
  const init = { method: 'PUT', body: JSON.stringify(body) };
  return platformFetch<unknown>(path, init, opts);
}
/**
 * Deletes a telemetry policy by issuing `DELETE /api/telemetry/policies/:policyId` through `platformFetch`.
 *
 * @param policyId - Policy identifier; URI-component-encoded before being placed in the path.
 * @param opts - Client options passed through unchanged to `platformFetch`.
 * @returns The value `platformFetch` resolves with, typed as `{ success: boolean }`.
 */
export async function telemetryDeletePolicy(
  policyId: string,
  opts: PlatformClientOptions
): Promise<{ success: boolean }> {
  const path = `/api/telemetry/policies/${encodeURIComponent(policyId)}`;
  return platformFetch<{ success: boolean }>(path, { method: 'DELETE' }, opts);
}
/**
 * Changes the status of a telemetry cluster by issuing
 * `PATCH /api/telemetry/clusters/:clusterId?pk=…` through `platformFetch`.
 *
 * @param clusterId - Cluster identifier; URI-component-encoded before being placed in the path.
 * @param pk - Value sent as the `pk` query-string parameter.
 * @param status - New status, sent as `{ status }` in the JSON request body.
 * @param opts - Client options passed through unchanged to `platformFetch`.
 * @returns Whatever `platformFetch` resolves with; the payload is not validated here.
 */
export async function telemetryUpdateCluster(
  clusterId: string,
  pk: string,
  status: 'open' | 'resolved' | 'ignored',
  opts: PlatformClientOptions
): Promise<unknown> {
  const query = new URLSearchParams({ pk }).toString();
  const path = `/api/telemetry/clusters/${encodeURIComponent(clusterId)}?${query}`;
  const init = { method: 'PATCH', body: JSON.stringify({ status }) };
  return platformFetch<unknown>(path, init, opts);
}
// ── Diagnostics ───────────────────────────────────────────────────────────────
export interface DebugSession {

View File

@ -4,6 +4,7 @@ import {
extractionRun,
extractionModels,
extractionCacheStats,
extractionSidecarHealth,
} from '../../lib/extraction-client.js';
registerTool({
@ -43,3 +44,14 @@ registerTool({
return extractionCacheStats({ requestId: req.id });
},
});
// Tool: extraction.sidecarHealth
// Declares an admin-only tool with an empty input object; execution simply
// delegates to extractionSidecarHealth, forwarding the request ID.
registerTool({
  name: 'extraction.sidecarHealth',
  description:
    'Check the health of the Python extraction sidecar process. Returns status and last-seen timestamp. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({}),
  execute: async (_input, req) => {
    const requestId = req.id;
    return extractionSidecarHealth({ requestId });
  },
});

View File

@ -139,3 +139,20 @@ registerTool({
});
},
});
// Tool: platform.diagnostics.sessions.cancel
// Declares an admin-only tool that cancels a debug session by sending a
// `{ status: 'cancelled' }` update through diagnosticsUpdateSession.
registerTool({
  name: 'platform.diagnostics.sessions.cancel',
  description:
    'Cancel an active remote debug session immediately. Stops further data collection. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    sessionId: z.string().min(1).describe('Debug session ID to cancel'),
  }),
  execute: async ({ sessionId }, req) => {
    // Caller's token and request ID travel with the upstream call.
    const upstream = { token: tokenOf(req), requestId: req.id };
    return diagnosticsUpdateSession(sessionId, { status: 'cancelled' }, upstream);
  },
});

View File

@ -0,0 +1,180 @@
import { z } from 'zod';
import { registerTool } from '../tools/registry.js';
import {
telemetryListPolicies,
telemetryPreviewPolicy,
telemetryCreatePolicy,
telemetryUpdatePolicy,
telemetryDeletePolicy,
telemetryUpdateCluster,
} from '../../lib/platform-client.js';
import type { McpToolRequest } from '../tools/types.js';
/**
 * Extracts the bearer token from the request's `Authorization` header.
 *
 * Returns `undefined` when the header is absent or does not use the `Bearer`
 * scheme (matched case-insensitively). Previously the first seven characters
 * were dropped unconditionally, so a header such as `Basic …` produced a
 * meaningless token fragment that was then forwarded upstream.
 */
const tokenOf = (req: McpToolRequest) => {
  const header = req.headers.authorization;
  if (typeof header !== 'string' || !/^bearer /i.test(header)) return undefined;
  // 'Bearer ' is 7 characters; everything after it is the token.
  return header.slice(7);
};
// ── Shared targeting sub-schema ────────────────────────────────────────────
// Every list-valued targeting field is an optional array of strings.
const optionalStringArray = () => z.array(z.string()).optional();

// Optional targeting object reused by the policy preview, create and update tools.
// `percentage` is coerced to a number and constrained to the range 0..100.
const TargetingSchema = z
  .object({
    userIds: optionalStringArray(),
    platforms: optionalStringArray().describe('e.g. ios, android, web'),
    channels: optionalStringArray().describe('e.g. release, beta, internal'),
    osFamilies: optionalStringArray(),
    appVersions: optionalStringArray(),
    releaseChannels: optionalStringArray(),
    countryCodes: optionalStringArray(),
    percentage: z.coerce.number().min(0).max(100).optional().describe('% of installs to target'),
  })
  .optional()
  .describe('Targeting criteria — omit to apply to all installs');
// ── platform.telemetry.policies.list ──────────────────────────────────────
// Declares an admin-only tool that lists policies for the given productId by
// delegating to telemetryListPolicies with the caller's token and request ID.
registerTool({
  name: 'platform.telemetry.policies.list',
  description: 'List all telemetry collection policies for a product. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID to scope the query'),
  }),
  execute: async ({ productId }, req) => {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryListPolicies(upstream);
  },
});
// ── platform.telemetry.policies.preview ──────────────────────────────────
// Declares an admin-only tool that forwards an optional targeting config to
// telemetryPreviewPolicy; productId goes into the client options, not the body.
registerTool({
  name: 'platform.telemetry.policies.preview',
  description:
    'Preview how many clients a targeting config would match (dry-run before creating a policy). Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID to scope the preview'),
    targeting: TargetingSchema,
  }),
  execute: async ({ productId, targeting }, req) => {
    const payload = { targeting: targeting as Record<string, unknown> | undefined };
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryPreviewPolicy(payload, upstream);
  },
});
// ── platform.telemetry.policies.create ───────────────────────────────────
// Declares an admin-only tool that creates a telemetry policy.
// - `expiresAt` is the only non-optional policy field besides `productId` and `name`,
//   so a request without an expiry fails schema validation before any upstream call.
// - `productId` is split off into the client options; every other validated field is
//   forwarded verbatim as the request body to telemetryCreatePolicy.
registerTool({
  name: 'platform.telemetry.policies.create',
  description: [
    'Create a telemetry collection policy. expiresAt is REQUIRED — never create an open-ended policy.',
    // Range restored: the previous text read "0.01.0", which hid the 0.0–1.0 bounds from tool callers.
    'eventTypes defaults to [warn, error, fatal]. samplingRate 0.0–1.0 (1.0 = 100%). Requires admin role.',
  ].join(' '),
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID'),
    name: z.string().min(1).max(200).describe('Human-readable policy name'),
    description: z.string().optional(),
    expiresAt: z
      .string()
      .datetime()
      .describe('ISO 8601 expiry — REQUIRED. Policies must not run indefinitely.'),
    eventTypes: z
      .array(z.enum(['debug', 'info', 'warn', 'error', 'fatal']))
      .optional()
      .describe('Event types to collect (default: warn, error, fatal)'),
    modules: z.array(z.string()).optional().describe('Module names to target (empty = all)'),
    samplingRate: z.coerce
      .number()
      .min(0)
      .max(1)
      .optional()
      .describe('Fraction of events to collect (default: 1.0)'),
    enabled: z.boolean().optional().describe('Enable immediately (default: true)'),
    priority: z.coerce.number().min(0).max(999).optional(),
    startsAt: z.string().datetime().optional(),
    targeting: TargetingSchema,
  }),
  async execute(args, req) {
    const { productId, ...body } = args;
    return telemetryCreatePolicy(body as Record<string, unknown>, {
      token: tokenOf(req),
      requestId: req.id,
      productId,
    });
  },
});
// ── platform.telemetry.policies.update ──────────────────────────────────
// Declares an admin-only tool that updates a policy. `productId` and `policyId`
// are routing inputs; all remaining validated fields become the request body
// passed to telemetryUpdatePolicy.
registerTool({
  name: 'platform.telemetry.policies.update',
  description: 'Update an existing telemetry policy. All fields are optional. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
    policyId: z.string().min(1).describe('Policy ID to update'),
    name: z.string().min(1).max(200).optional(),
    description: z.string().optional(),
    enabled: z.boolean().optional(),
    expiresAt: z.string().datetime().optional().describe('Update or extend expiry (ISO 8601)'),
    eventTypes: z.array(z.enum(['debug', 'info', 'warn', 'error', 'fatal'])).optional(),
    modules: z.array(z.string()).optional(),
    samplingRate: z.coerce.number().min(0).max(1).optional(),
    targeting: TargetingSchema,
  }),
  execute: async ({ productId, policyId, ...changes }, req) => {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryUpdatePolicy(policyId, changes as Record<string, unknown>, upstream);
  },
});
// ── platform.telemetry.policies.delete ──────────────────────────────────
// Declares an admin-only tool that deletes one policy by ID via
// telemetryDeletePolicy, scoping the call with the supplied productId.
registerTool({
  name: 'platform.telemetry.policies.delete',
  description: 'Delete a telemetry policy by ID. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
    policyId: z.string().min(1).describe('Policy ID to delete'),
  }),
  execute: async ({ productId, policyId }, req) => {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryDeletePolicy(policyId, upstream);
  },
});
// ── platform.telemetry.clusters.updateStatus ────────────────────────────
// Declares an admin-only tool that sets a cluster's status through
// telemetryUpdateCluster. The schema accepts 'open' as well as 'resolved' and
// 'ignored', so the same tool can move a cluster back to open.
registerTool({
  name: 'platform.telemetry.clusters.updateStatus',
  description:
    'Resolve or ignore an error cluster (fingerprinted error group). Use resolved when fixed, ignored to suppress. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
    clusterId: z.string().min(1).describe('Cluster ID (fingerprint:yyyyMM)'),
    pk: z
      .string()
      .min(1)
      .describe('Cluster partition key (productId:platform:module — from clusters list result)'),
    status: z.enum(['open', 'resolved', 'ignored']).describe('New cluster status'),
  }),
  execute: async ({ productId, clusterId, pk, status }, req) => {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryUpdateCluster(clusterId, pk, status, upstream);
  },
});

View File

@ -18,6 +18,7 @@ import { toolRoutes } from './modules/tools/routes.js';
// Register all tool namespaces (side-effect: populates the tool registry)
import './modules/platform/telemetry-tools.js';
import './modules/platform/telemetry-policy-tools.js';
import './modules/platform/diagnostics-tools.js';
import './modules/extraction/extraction-tools.js';
import './modules/support/debug-pack.js';