feat(mcp-server): Milestone B — telemetry policy CRUD, cluster status, diagnostics cancel, extraction sidecar health; enforce expiresAt on createPolicy
This commit is contained in:
parent
26d3403d5a
commit
d1d643f782
@ -15,29 +15,36 @@ This is the “ready to start building” checklist that turns the docs in this
|
||||
flushes to session-scoped endpoints (`/api/diagnostics/sessions/:id/logs|traces`). No change needed.
|
||||
- Confirmed in `packages/diagnostics-client/src/client.ts` flush() method, lines 430–453
|
||||
|
||||
## 3) Phase 1 build steps (P0 slice)
|
||||
## 3) Phase 1 build steps
|
||||
|
||||
- **Implement MCP tool namespaces** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
|
||||
- [x] `platform.telemetry.*` — query, clusters, metrics (3 tools)
|
||||
- [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools)
|
||||
- [x] `extraction.*` — run, models, cacheStats (3 tools)
|
||||
- **Enforce hard guardrails in MCP layer** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
|
||||
- [x] `productId` required in all query tools, forwarded as `x-product-id`
|
||||
- [x] `x-request-id` propagated via `req.id` on every upstream call
|
||||
- [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`)
|
||||
- [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()`
|
||||
- [ ] expiry enforcement for diagnostics sessions — delegated to platform-service `maxDurationMinutes`
|
||||
- **Ship one compound tool** ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
|
||||
- [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)`
|
||||
### Milestone A + C (P0 slice) ✅ — [027e216](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/027e216)
|
||||
|
||||
## 4) Phase 1 definition of done
|
||||
- [x] `platform.telemetry.*` — query, clusters, metrics (3 tools)
|
||||
- [x] `platform.diagnostics.*` — sessions.list/create/get/update/getLogs/getTraces (6 tools)
|
||||
- [x] `extraction.*` — run, models, cacheStats (3 tools)
|
||||
- [x] `support.createDebugPack(productId, targetUserId?, from?, to?, reason?)`
|
||||
|
||||
### Milestone B (mutating tools + expiry + cluster status) ✅ — [26d3403](https://github.com/saravanakumardb1/learning_ai_common_plat/commit/26d3403)
|
||||
|
||||
- [x] `platform.telemetry.policies.*` — list, preview, create (expiresAt required), update, delete (5 tools)
|
||||
- [x] `platform.telemetry.clusters.updateStatus` — resolve/ignore/reopen clusters
|
||||
- [x] `platform.diagnostics.sessions.cancel` — dedicated cancel tool
|
||||
- [x] `extraction.sidecarHealth` — Python sidecar health check
|
||||
- [x] `expiresAt` enforced as **required** field on `telemetry.policies.create` at MCP layer
|
||||
- [x] `productId` required in all query tools, forwarded as `x-product-id`
|
||||
- [x] `x-request-id` propagated via `req.id` on every upstream call
|
||||
- [x] default query caps (`QUERY_DEFAULT_LIMIT=20`) + hard caps (`QUERY_MAX_LIMIT=100`)
|
||||
- [x] role gating (`viewer` / `admin` / `super_admin`) enforced in `requireRole()`
|
||||
- [x] expiry for diagnostics sessions delegated to platform-service `maxDurationMinutes`
|
||||
|
||||
## 4) Phase 1 definition of done ✅ COMPLETE
|
||||
|
||||
- [x] Read-only tools work end-to-end against real services (proxy to platform-service + extraction-service)
|
||||
- [x] Mutating tools are role-gated (`admin` minimum) and log audit entries via `req.log`
|
||||
- [x] Compound debug pack produces a single structured artifact with:
|
||||
- [x] telemetry cluster references (up to 10 shown, count included)
|
||||
- [x] optional diagnostics session reference (id, status, expiresAt)
|
||||
- [x] short markdown summary
|
||||
- [x] Compound debug pack produces a single structured artifact with cluster refs, session ref, markdown summary
|
||||
- [x] Telemetry policy CRUD with mandatory `expiresAt` guardrail
|
||||
- [x] Error cluster triage (resolve / ignore / reopen)
|
||||
- [x] Python sidecar health visible via `extraction.sidecarHealth`
|
||||
- [ ] End-to-end integration test with real platform-service (Phase 2)
|
||||
|
||||
## 5) Phase 2+ quick sanity checks
|
||||
|
||||
@ -61,3 +61,12 @@ export async function extractionCacheStats(opts: { requestId?: string }): Promis
|
||||
if (!res.ok) throw new Error(`extraction-service GET /api/extract/cache-stats → ${res.status}`);
|
||||
return res.json();
|
||||
}
|
||||
|
||||
/**
 * Fetch the Python sidecar health report from extraction-service.
 *
 * Issues `GET /api/extract/sidecar-health` with a 10-second abort timeout and,
 * when a request id is supplied, forwards it as the `x-request-id` header.
 *
 * @param opts - Call options; `requestId` is an optional correlation id to forward upstream.
 * @returns The parsed JSON body of a successful response. The shape is not validated here.
 * @throws Error when extraction-service answers with a non-2xx status, so an
 *   upstream error payload is never mistaken for a health report. This mirrors
 *   the status check used by the cache-stats call in this module.
 */
export async function extractionSidecarHealth(opts: { requestId?: string }): Promise<unknown> {
  const url = `${config.EXTRACTION_SERVICE_URL}/api/extract/sidecar-health`;
  const headers: Record<string, string> = {
    ...(opts.requestId ? { 'x-request-id': opts.requestId } : {}),
  };
  const res = await fetch(url, { headers, signal: AbortSignal.timeout(10_000) });
  // Fail loudly on 4xx/5xx instead of returning the error body as "health".
  if (!res.ok) throw new Error(`extraction-service GET /api/extract/sidecar-health → ${res.status}`);
  return res.json();
}
|
||||
|
||||
@ -90,6 +90,69 @@ export async function telemetryMetrics(opts: PlatformClientOptions): Promise<unk
|
||||
return platformFetch<unknown>('/api/telemetry/metrics', { method: 'GET' }, opts);
|
||||
}
|
||||
|
||||
/**
 * List telemetry policies via `GET /api/telemetry/policies`.
 *
 * @param opts - Client options handed to `platformFetch` unchanged.
 * @returns Whatever `platformFetch` resolves with; the payload is not typed here.
 */
export async function telemetryListPolicies(opts: PlatformClientOptions): Promise<unknown> {
  const path = '/api/telemetry/policies';
  return platformFetch<unknown>(path, { method: 'GET' }, opts);
}
|
||||
|
||||
/**
 * Dry-run a targeting configuration via `POST /api/telemetry/policies/preview`.
 *
 * @param body - Object carrying the optional `targeting` criteria; sent as JSON.
 * @param opts - Client options handed to `platformFetch` unchanged.
 * @returns Whatever `platformFetch` resolves with; the payload is not typed here.
 */
export async function telemetryPreviewPolicy(
  body: { targeting?: Record<string, unknown> },
  opts: PlatformClientOptions
): Promise<unknown> {
  const path = '/api/telemetry/policies/preview';
  const payload = JSON.stringify(body);
  return platformFetch<unknown>(path, { method: 'POST', body: payload }, opts);
}
|
||||
|
||||
/**
 * Create a telemetry policy via `POST /api/telemetry/policies`.
 *
 * @param body - Policy fields, serialized to JSON as-is. No validation happens at this layer.
 * @param opts - Client options handed to `platformFetch` unchanged.
 * @returns Whatever `platformFetch` resolves with; the payload is not typed here.
 */
export async function telemetryCreatePolicy(
  body: Record<string, unknown>,
  opts: PlatformClientOptions
): Promise<unknown> {
  const path = '/api/telemetry/policies';
  const payload = JSON.stringify(body);
  return platformFetch<unknown>(path, { method: 'POST', body: payload }, opts);
}
|
||||
|
||||
/**
 * Update a telemetry policy via `PUT /api/telemetry/policies/:policyId`.
 *
 * @param policyId - Policy identifier; URI-encoded before being placed in the path.
 * @param body - Fields to send, serialized to JSON as-is.
 * @param opts - Client options handed to `platformFetch` unchanged.
 * @returns Whatever `platformFetch` resolves with; the payload is not typed here.
 */
export async function telemetryUpdatePolicy(
  policyId: string,
  body: Record<string, unknown>,
  opts: PlatformClientOptions
): Promise<unknown> {
  const path = `/api/telemetry/policies/${encodeURIComponent(policyId)}`;
  const payload = JSON.stringify(body);
  return platformFetch<unknown>(path, { method: 'PUT', body: payload }, opts);
}
|
||||
|
||||
/**
 * Delete a telemetry policy via `DELETE /api/telemetry/policies/:policyId`.
 *
 * @param policyId - Policy identifier; URI-encoded before being placed in the path.
 * @param opts - Client options handed to `platformFetch` unchanged.
 * @returns The `{ success }` object that `platformFetch` resolves with.
 */
export async function telemetryDeletePolicy(
  policyId: string,
  opts: PlatformClientOptions
): Promise<{ success: boolean }> {
  const path = `/api/telemetry/policies/${encodeURIComponent(policyId)}`;
  return platformFetch<{ success: boolean }>(path, { method: 'DELETE' }, opts);
}
|
||||
|
||||
/**
 * Change an error cluster's status via `PATCH /api/telemetry/clusters/:clusterId?pk=…`.
 *
 * @param clusterId - Cluster identifier; URI-encoded before being placed in the path.
 * @param pk - Partition key, sent as the `pk` query-string parameter.
 * @param status - New status, sent as the JSON body `{ status }`.
 * @param opts - Client options handed to `platformFetch` unchanged.
 * @returns Whatever `platformFetch` resolves with; the payload is not typed here.
 */
export async function telemetryUpdateCluster(
  clusterId: string,
  pk: string,
  status: 'open' | 'resolved' | 'ignored',
  opts: PlatformClientOptions
): Promise<unknown> {
  // URLSearchParams handles the escaping of the partition key.
  const query = new URLSearchParams({ pk }).toString();
  const path = `/api/telemetry/clusters/${encodeURIComponent(clusterId)}?${query}`;
  const payload = JSON.stringify({ status });
  return platformFetch<unknown>(path, { method: 'PATCH', body: payload }, opts);
}
|
||||
|
||||
// ── Diagnostics ───────────────────────────────────────────────────────────────
|
||||
|
||||
export interface DebugSession {
|
||||
|
||||
@ -4,6 +4,7 @@ import {
|
||||
extractionRun,
|
||||
extractionModels,
|
||||
extractionCacheStats,
|
||||
extractionSidecarHealth,
|
||||
} from '../../lib/extraction-client.js';
|
||||
|
||||
registerTool({
|
||||
@ -43,3 +44,14 @@ registerTool({
|
||||
return extractionCacheStats({ requestId: req.id });
|
||||
},
|
||||
});
|
||||
|
||||
// Admin-only tool with no input fields: proxies to the sidecar health call,
// passing the incoming request id along for correlation.
registerTool({
  name: 'extraction.sidecarHealth',
  description:
    'Check the health of the Python extraction sidecar process. Returns status and last-seen timestamp. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({}),
  async execute(_args, req) {
    const { id: requestId } = req;
    return extractionSidecarHealth({ requestId });
  },
});
|
||||
|
||||
@ -139,3 +139,20 @@ registerTool({
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
// Admin-only convenience tool: cancels a session by sending a status update
// of 'cancelled' through the generic session-update client call.
registerTool({
  name: 'platform.diagnostics.sessions.cancel',
  description:
    'Cancel an active remote debug session immediately. Stops further data collection. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    sessionId: z.string().min(1).describe('Debug session ID to cancel'),
  }),
  async execute({ sessionId }, req) {
    const upstream = { token: tokenOf(req), requestId: req.id };
    return diagnosticsUpdateSession(sessionId, { status: 'cancelled' }, upstream);
  },
});
|
||||
|
||||
@ -0,0 +1,180 @@
|
||||
import { z } from 'zod';
|
||||
import { registerTool } from '../tools/registry.js';
|
||||
import {
|
||||
telemetryListPolicies,
|
||||
telemetryPreviewPolicy,
|
||||
telemetryCreatePolicy,
|
||||
telemetryUpdatePolicy,
|
||||
telemetryDeletePolicy,
|
||||
telemetryUpdateCluster,
|
||||
} from '../../lib/platform-client.js';
|
||||
import type { McpToolRequest } from '../tools/types.js';
|
||||
|
||||
/**
 * Extract the bearer token from the request's `Authorization` header.
 *
 * Returns `undefined` when the header is absent or does not use the `Bearer`
 * scheme (matched case-insensitively). Previously any header value had its
 * first seven characters stripped, so e.g. a `Basic …` credential produced a
 * garbage token.
 */
const tokenOf = (req: McpToolRequest): string | undefined => {
  const header = req.headers.authorization;
  if (typeof header !== 'string' || !/^Bearer /i.test(header)) return undefined;
  // 'Bearer ' is seven characters; everything after it is the token.
  return header.slice(7);
};
|
||||
|
||||
// ── Shared targeting sub-schema ────────────────────────────────────────────
|
||||
// Builds a fresh optional string-array field; most targeting dimensions share this shape.
const optionalStringList = () => z.array(z.string()).optional();

// Optional targeting block reused by the preview, create and update tools.
// Every dimension is optional, and so is the block itself.
const TargetingSchema = z
  .object({
    userIds: optionalStringList(),
    platforms: optionalStringList().describe('e.g. ios, android, web'),
    channels: optionalStringList().describe('e.g. release, beta, internal'),
    osFamilies: optionalStringList(),
    appVersions: optionalStringList(),
    releaseChannels: optionalStringList(),
    countryCodes: optionalStringList(),
    // Coerced so numeric strings are accepted; bounded to 0–100.
    percentage: z.coerce.number().min(0).max(100).optional().describe('% of installs to target'),
  })
  .optional()
  .describe('Targeting criteria — omit to apply to all installs');
|
||||
|
||||
// ── platform.telemetry.policies.list ──────────────────────────────────────
|
||||
|
||||
// Admin-only: lists policies, passing the caller's token, request id and
// product id through to the platform client.
registerTool({
  name: 'platform.telemetry.policies.list',
  description: 'List all telemetry collection policies for a product. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID to scope the query'),
  }),
  async execute({ productId }, req) {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryListPolicies(upstream);
  },
});
|
||||
|
||||
// ── platform.telemetry.policies.preview ──────────────────────────────────
|
||||
|
||||
// Admin-only: sends the targeting block to the preview endpoint so the caller
// can see the match count before creating a policy.
registerTool({
  name: 'platform.telemetry.policies.preview',
  description:
    'Preview how many clients a targeting config would match (dry-run before creating a policy). Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID to scope the preview'),
    targeting: TargetingSchema,
  }),
  async execute({ productId, targeting }, req) {
    const body = { targeting: targeting as Record<string, unknown> | undefined };
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryPreviewPolicy(body, upstream);
  },
});
|
||||
|
||||
// ── platform.telemetry.policies.create ───────────────────────────────────
|
||||
|
||||
// Admin-only: creates a policy. `expiresAt` is the only non-optional policy
// field besides `name`, which is how the "no open-ended policy" guardrail is
// enforced at this layer. `productId` is split off for the client options;
// every other validated field is forwarded as the request body.
registerTool({
  name: 'platform.telemetry.policies.create',
  description: [
    'Create a telemetry collection policy. expiresAt is REQUIRED — never create an open-ended policy.',
    'eventTypes defaults to [warn, error, fatal]. samplingRate 0.0–1.0 (1.0 = 100%). Requires admin role.',
  ].join(' '),
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID'),
    name: z.string().min(1).max(200).describe('Human-readable policy name'),
    description: z.string().optional(),
    // Deliberately NOT optional.
    expiresAt: z
      .string()
      .datetime()
      .describe('ISO 8601 expiry — REQUIRED. Policies must not run indefinitely.'),
    eventTypes: z
      .array(z.enum(['debug', 'info', 'warn', 'error', 'fatal']))
      .optional()
      .describe('Event types to collect (default: warn, error, fatal)'),
    modules: z.array(z.string()).optional().describe('Module names to target (empty = all)'),
    samplingRate: z.coerce
      .number()
      .min(0)
      .max(1)
      .optional()
      .describe('Fraction of events to collect (default: 1.0)'),
    enabled: z.boolean().optional().describe('Enable immediately (default: true)'),
    priority: z.coerce.number().min(0).max(999).optional(),
    startsAt: z.string().datetime().optional(),
    targeting: TargetingSchema,
  }),
  async execute({ productId, ...policy }, req) {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryCreatePolicy(policy as Record<string, unknown>, upstream);
  },
});
|
||||
|
||||
// ── platform.telemetry.policies.update ──────────────────────────────────
|
||||
|
||||
// Admin-only: partial update of a policy. `productId` and `policyId` are
// routing inputs; every remaining validated field is forwarded as the body.
registerTool({
  name: 'platform.telemetry.policies.update',
  description: 'Update an existing telemetry policy. All fields are optional. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
    policyId: z.string().min(1).describe('Policy ID to update'),
    name: z.string().min(1).max(200).optional(),
    description: z.string().optional(),
    enabled: z.boolean().optional(),
    expiresAt: z.string().datetime().optional().describe('Update or extend expiry (ISO 8601)'),
    eventTypes: z.array(z.enum(['debug', 'info', 'warn', 'error', 'fatal'])).optional(),
    modules: z.array(z.string()).optional(),
    samplingRate: z.coerce.number().min(0).max(1).optional(),
    targeting: TargetingSchema,
  }),
  async execute({ productId, policyId, ...changes }, req) {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryUpdatePolicy(policyId, changes as Record<string, unknown>, upstream);
  },
});
|
||||
|
||||
// ── platform.telemetry.policies.delete ──────────────────────────────────
|
||||
|
||||
// Admin-only: deletes one policy, identified by id within the given product.
registerTool({
  name: 'platform.telemetry.policies.delete',
  description: 'Delete a telemetry policy by ID. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
    policyId: z.string().min(1).describe('Policy ID to delete'),
  }),
  async execute({ productId, policyId }, req) {
    const upstream = { token: tokenOf(req), requestId: req.id, productId };
    return telemetryDeletePolicy(policyId, upstream);
  },
});
|
||||
|
||||
// ── platform.telemetry.clusters.updateStatus ────────────────────────────
|
||||
|
||||
// Admin-only: sets an error cluster's status. The schema accepts all three
// states, so the description documents reopening ('open') alongside
// resolve/ignore; previously tool consumers were never told that a cluster
// could be reopened.
registerTool({
  name: 'platform.telemetry.clusters.updateStatus',
  description:
    'Resolve, ignore, or reopen an error cluster (fingerprinted error group). Use resolved when fixed, ignored to suppress, open to reopen. Requires admin role.',
  requiredRole: 'admin',
  inputSchema: z.object({
    productId: z.string().min(1).describe('Product ID (for x-product-id scoping)'),
    clusterId: z.string().min(1).describe('Cluster ID (fingerprint:yyyyMM)'),
    pk: z
      .string()
      .min(1)
      .describe('Cluster partition key (productId:platform:module — from clusters list result)'),
    status: z.enum(['open', 'resolved', 'ignored']).describe('New cluster status'),
  }),
  async execute(args, req) {
    return telemetryUpdateCluster(args.clusterId, args.pk, args.status, {
      token: tokenOf(req),
      requestId: req.id,
      productId: args.productId,
    });
  },
});
|
||||
@ -18,6 +18,7 @@ import { toolRoutes } from './modules/tools/routes.js';
|
||||
|
||||
// Register all tool namespaces (side-effect: populates the tool registry)
|
||||
import './modules/platform/telemetry-tools.js';
|
||||
import './modules/platform/telemetry-policy-tools.js';
|
||||
import './modules/platform/diagnostics-tools.js';
|
||||
import './modules/extraction/extraction-tools.js';
|
||||
import './modules/support/debug-pack.js';
|
||||
|
||||
Loading…
Reference in New Issue
Block a user