diff --git a/services/mcp-server/src/lib/extraction-client.ts b/services/mcp-server/src/lib/extraction-client.ts index e4809be8..ae4351bb 100644 --- a/services/mcp-server/src/lib/extraction-client.ts +++ b/services/mcp-server/src/lib/extraction-client.ts @@ -192,12 +192,12 @@ export async function extractionGetProductRateLimitStatus( export async function extractionResetProductRateLimit( productId: string, - opts: { requestId?: string } + opts?: { requestId?: string } ): Promise { const url = `${config.EXTRACTION_SERVICE_URL}/api/extract/rate-limits/product/reset`; const headers: Record = { 'Content-Type': 'application/json', - ...(opts.requestId ? { 'x-request-id': opts.requestId } : {}), + ...(opts?.requestId ? { 'x-request-id': opts.requestId } : {}), }; const res = await fetch(url, { method: 'POST', diff --git a/services/mcp-server/src/modules/a2a/post-incident-cleanup-pipeline.ts b/services/mcp-server/src/modules/a2a/post-incident-cleanup-pipeline.ts index 79ec304b..2624652f 100644 --- a/services/mcp-server/src/modules/a2a/post-incident-cleanup-pipeline.ts +++ b/services/mcp-server/src/modules/a2a/post-incident-cleanup-pipeline.ts @@ -148,17 +148,30 @@ export async function runPostIncidentCleanupPipeline( continue; } - await telemetryDeletePolicy(policy.id, { - token: req.headers.authorization?.replace('Bearer ', '') || '', - requestId: req.id, - }); + try { + await telemetryDeletePolicy(policy.id, { + token: req.headers.authorization?.replace('Bearer ', '') || '', + requestId: req.id, + }); - policiesDeleted++; + policiesDeleted++; - req.log.info( - { runId, policyId: policy.id, policyName: policy.name }, - 'Deleted telemetry policy' - ); + req.log.info( + { runId, policyId: policy.id, policyName: policy.name }, + 'Deleted telemetry policy' + ); + } catch (policyError) { + req.log.warn( + { + runId, + policyId: policy.id, + policyName: policy.name, + error: policyError instanceof Error ? policyError.message : String(policyError), + }, + 'Failed to delete telemetry policy (continuing)' + ); + // Continue with next policy instead of failing entire pipeline + } } } @@ -169,25 +182,37 @@ export async function runPostIncidentCleanupPipeline( auditLogExported = true; auditLogSize = 0; } else { - const auditResponse = await telemetryQuery( - { - productId, - eventType: 'audit', - from: incidentTimeWindow.from, - to: incidentTimeWindow.to, - limit: 10000, - }, - { token: req.headers.authorization?.replace('Bearer ', '') || '', requestId: req.id } - ); + try { + const auditResponse = await telemetryQuery( + { + productId, + eventType: 'audit', + from: incidentTimeWindow.from, + to: incidentTimeWindow.to, + limit: 10000, + }, + { token: req.headers.authorization?.replace('Bearer ', '') || '', requestId: req.id } + ); - const auditEvents = auditResponse.events || []; - auditLogExported = true; - auditLogSize = JSON.stringify(auditEvents).length; + const auditEvents = (auditResponse as { events?: unknown[] }).events || []; + auditLogExported = true; + auditLogSize = JSON.stringify(auditEvents).length; - req.log.info( - { runId, eventCount: auditEvents.length, sizeBytes: auditLogSize }, - 'Exported audit log' - ); + req.log.info( + { runId, eventCount: auditEvents.length, sizeBytes: auditLogSize }, + 'Exported audit log' + ); + } catch (auditError) { + req.log.warn( + { + runId, + timeWindow: incidentTimeWindow, + error: auditError instanceof Error ? auditError.message : String(auditError), + }, + 'Failed to export audit log (continuing)' + ); + // Continue without audit log instead of failing entire pipeline + } } } diff --git a/services/mcp-server/src/modules/a2a/regression-watch-pipeline.ts b/services/mcp-server/src/modules/a2a/regression-watch-pipeline.ts index 2ef28d91..0137a821 100644 --- a/services/mcp-server/src/modules/a2a/regression-watch-pipeline.ts +++ b/services/mcp-server/src/modules/a2a/regression-watch-pipeline.ts @@ -97,6 +97,7 @@ export async function runRegressionWatchPipeline( // Step 2: Create diagnostics sessions for representative clusters const sessionsToCreate = Math.min(clustersAboveThreshold.length, maxSessionsToCreate); const sessionIds: string[] = []; + let sessionsCreated = 0; for (let i = 0; i < sessionsToCreate; i++) { const cluster = clustersAboveThreshold[i]; @@ -107,40 +108,55 @@ export async function runRegressionWatchPipeline( '[DRY RUN] Would create diagnostics session' ); sessionIds.push(`dry-run-session-${i + 1}`); + sessionsCreated++; continue; } - // Extract target from cluster - use fingerprint as identifier - const sessionResponse = await diagnosticsCreateSession( - { - productId, - targetUserId: 'system', // Use system user for cluster-based diagnostics - collectionLevel: 'trace', - captureLogs: true, - captureNetwork: true, - maxDurationMinutes: 30, - }, - { token: req.headers.authorization?.replace('Bearer ', '') || '', requestId: req.id } - ); + try { + // Extract target from cluster - use fingerprint as identifier + const sessionResponse = await diagnosticsCreateSession( + { + productId, + targetUserId: 'system', // Use system user for cluster-based diagnostics + collectionLevel: 'trace', + captureLogs: true, + captureNetwork: true, + maxDurationMinutes: 30, + }, + { token: req.headers.authorization?.replace('Bearer ', '') || '', requestId: req.id } + ); - sessionIds.push(sessionResponse.id); + sessionIds.push(sessionResponse.id); + sessionsCreated++; - req.log.info( - { runId, clusterId: cluster.id, sessionId: sessionResponse.id }, - 'Created diagnostics session for cluster' - ); + req.log.info( + { runId, clusterId: cluster.id, sessionId: sessionResponse.id }, + 'Created diagnostics session for cluster' + ); + } catch (sessionError) { + req.log.warn( + { + runId, + clusterId: cluster.id, + pk: cluster.pk, + error: sessionError instanceof Error ? sessionError.message : String(sessionError), + }, + 'Failed to create diagnostics session for cluster (continuing)' + ); + // Continue with next cluster instead of failing entire pipeline + } } const summary = dryRun ? `[DRY RUN] Would create ${sessionsToCreate} diagnostics sessions for ${clustersAboveThreshold.length} clusters at ${severityThreshold} severity.` - : `Created ${sessionsToCreate} diagnostics sessions for ${clustersAboveThreshold.length} clusters at ${severityThreshold} severity.`; + : `Created ${sessionsCreated} diagnostics sessions for ${clustersAboveThreshold.length} clusters at ${severityThreshold} severity.`; return { runId, productId, clustersFound: clusters.length, clustersAboveThreshold: clustersAboveThreshold.length, - sessionsCreated: dryRun ? 0 : sessionsToCreate, + sessionsCreated: dryRun ? 0 : sessionsCreated, sessionIds, dryRun, summary,