diff --git a/services/platform-service/src/modules/agent-evals/repository.ts b/services/platform-service/src/modules/agent-evals/repository.ts
index d55d28a1..e7767637 100644
--- a/services/platform-service/src/modules/agent-evals/repository.ts
+++ b/services/platform-service/src/modules/agent-evals/repository.ts
@@ -102,3 +102,54 @@ export async function listResults(runId: string): Promise
     limit: 1000,
   });
 }
+
+/**
+ * Run documents as returned by the run collection's `findMany`.
+ *
+ * NOTE(review): the return annotations of the two functions below had lost their type
+ * argument in the patch as reviewed, so the type is derived from the collection rather than
+ * guessing a name. Swap in the module's own run document type if one is declared.
+ */
+type EvalRunList = Awaited<ReturnType<ReturnType<typeof runCollection>['findMany']>>;
+
+/**
+ * Lists a suite's runs within one product, sorted by `startedAt` with direction -1 (the
+ * routes in this patch treat the first element as the latest run).
+ *
+ * @param productId - Product the runs must belong to.
+ * @param suiteId - Suite whose runs are listed.
+ * @param limit - Maximum number of runs to return.
+ * @returns The matching run documents.
+ */
+export async function listRunsBySuite(
+  productId: string,
+  suiteId: string,
+  limit = 50
+): Promise<EvalRunList> {
+  return runCollection().findMany({
+    filter: { productId, suiteId },
+    sort: { startedAt: -1 },
+    limit,
+  });
+}
+
+/**
+ * Lists an agent's runs within one product, sorted by `startedAt` with direction -1 (the
+ * routes in this patch treat the first element as the latest run).
+ *
+ * @param productId - Product the runs must belong to.
+ * @param agentId - Agent whose runs are listed.
+ * @param limit - Maximum number of runs to return.
+ * @returns The matching run documents.
+ */
+export async function listRunsByAgent(
+  productId: string,
+  agentId: string,
+  limit = 50
+): Promise<EvalRunList> {
+  return runCollection().findMany({
+    filter: { productId, agentId },
+    sort: { startedAt: -1 },
+    limit,
+  });
+}
diff --git a/services/platform-service/src/modules/agent-evals/routes.test.ts b/services/platform-service/src/modules/agent-evals/routes.test.ts
index 9b0cd9a7..fb245f0a 100644
--- a/services/platform-service/src/modules/agent-evals/routes.test.ts
+++ b/services/platform-service/src/modules/agent-evals/routes.test.ts
@@ -13,8 +13,13 @@ const repoMock = {
   createResults: vi.fn(),
   listResults: vi.fn(),
   updateRun: vi.fn(),
+  listRunsBySuite: vi.fn(),
+  listRunsByAgent: vi.fn(),
 };
 
+const jobRunnerMock = { ensureJobDefinitions: vi.fn() };
+const jobRegistryMock = { registerJob: vi.fn() };
+
 const agentRepoMock = {
   getAgent: vi.fn(),
   getAgentVersion: vi.fn(),
@@ -22,6 +27,8 @@ const agentRepoMock = {
 
 vi.mock('./repository.js', () => repoMock);
 vi.mock('../agents/repository.js', () => agentRepoMock);
+vi.mock('../jobs/runner.js', () => jobRunnerMock);
+vi.mock('../jobs/registry.js', () => jobRegistryMock);
 
 async function buildApp(payload?: { sub: string; productId: string; role?: string }) {
   const { agentEvalRoutes } = await import('./routes.js');
@@ -140,4 +147,224 @@ describe('agentEvalRoutes', () => {
       })
     );
   });
+
+  // ── Run History ─────────────────────────────────────────
+
+  it('GET /agent-evals/suites/:id/runs returns run history', async () => {
+    repoMock.listRunsBySuite.mockResolvedValue([{ id: 'evalrun_1' }]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({ method: 'GET', url: '/api/agent-evals/suites/evals_1/runs' });
+    expect(res.statusCode).toBe(200);
+    expect(repoMock.listRunsBySuite).toHaveBeenCalledWith('lysnrai', 'evals_1', 20);
+  });
+
+  it('GET /agent-evals/suites/:id/runs ignores an invalid limit', async () => {
+    repoMock.listRunsBySuite.mockResolvedValue([]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/runs?limit=abc',
+    });
+    expect(res.statusCode).toBe(200);
+    expect(repoMock.listRunsBySuite).toHaveBeenLastCalledWith('lysnrai', 'evals_1', 20);
+  });
+
+  it('GET /agent-evals/suites/:id/runs caps an oversized limit', async () => {
+    repoMock.listRunsBySuite.mockResolvedValue([]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/runs?limit=100000',
+    });
+    expect(res.statusCode).toBe(200);
+    expect(repoMock.listRunsBySuite).toHaveBeenLastCalledWith('lysnrai', 'evals_1', 100);
+  });
+
+  // ── Regression Comparison ───────────────────────────────
+
+  it('GET /agent-evals/suites/:id/regression detects score regression', async () => {
+    repoMock.listRunsBySuite.mockResolvedValue([
+      {
+        id: 'run_2',
+        status: 'completed',
+        score: 0.7,
+        agentVersion: 2,
+        verdict: 'fail',
+        passedCases: 7,
+        failedCases: 3,
+        startedAt: '2026-03-16',
+      },
+      {
+        id: 'run_1',
+        status: 'completed',
+        score: 0.9,
+        agentVersion: 1,
+        verdict: 'pass',
+        passedCases: 9,
+        failedCases: 1,
+        startedAt: '2026-03-15',
+      },
+    ]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/regression',
+    });
+    expect(res.statusCode).toBe(200);
+    const body = res.json();
+    expect(body.regression).toBe(true);
+    expect(body.scoreDelta).toBeLessThan(0);
+  });
+
+  it('GET /agent-evals/suites/:id/regression returns false with insufficient runs', async () => {
+    repoMock.listRunsBySuite.mockResolvedValue([{ id: 'run_1', status: 'completed', score: 0.9 }]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/regression',
+    });
+    expect(res.statusCode).toBe(200);
+    expect(res.json().regression).toBe(false);
+  });
+
+  it('GET /agent-evals/suites/:id/regression ignores a drop of exactly 0.05', async () => {
+    repoMock.listRunsBySuite.mockResolvedValue([
+      { id: 'run_2', status: 'completed', score: 0.85 },
+      { id: 'run_1', status: 'completed', score: 0.9 },
+    ]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/regression',
+    });
+    expect(res.statusCode).toBe(200);
+    const body = res.json();
+    expect(body.regression).toBe(false);
+    expect(body.scoreDelta).toBe(-0.05);
+  });
+
+  // ── Release Gate ────────────────────────────────────────
+
+  it('GET /agent-evals/suites/:id/gate returns allowed when gate passed', async () => {
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1', passThreshold: 0.8 });
+    repoMock.listRunsBySuite.mockResolvedValue([
+      {
+        id: 'run_1',
+        releaseGate: true,
+        status: 'completed',
+        verdict: 'pass',
+        agentVersion: 2,
+        score: 0.95,
+      },
+    ]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({ method: 'GET', url: '/api/agent-evals/suites/evals_1/gate' });
+    expect(res.statusCode).toBe(200);
+    expect(res.json().allowed).toBe(true);
+  });
+
+  it('GET /agent-evals/suites/:id/gate returns not allowed when no gate run', async () => {
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1', passThreshold: 0.8 });
+    repoMock.listRunsBySuite.mockResolvedValue([]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({ method: 'GET', url: '/api/agent-evals/suites/evals_1/gate' });
+    expect(res.statusCode).toBe(200);
+    expect(res.json().allowed).toBe(false);
+  });
+
+  // ── Compliance Report ───────────────────────────────────
+
+  it('GET /agent-evals/agents/:agentId/report returns compliance summary', async () => {
+    repoMock.listSuites.mockResolvedValue([
+      { id: 'evals_1', status: 'active' },
+      { id: 'evals_2', status: 'draft' },
+    ]);
+    repoMock.listRunsByAgent.mockResolvedValue([
+      {
+        id: 'run_1',
+        status: 'completed',
+        verdict: 'pass',
+        score: 0.95,
+        agentVersion: 1,
+        suiteId: 'evals_1',
+        startedAt: '2026-03-15',
+        completedAt: '2026-03-15',
+      },
+      {
+        id: 'run_2',
+        status: 'completed',
+        verdict: 'fail',
+        score: 0.6,
+        agentVersion: 2,
+        suiteId: 'evals_1',
+        startedAt: '2026-03-16',
+        completedAt: '2026-03-16',
+      },
+      { id: 'run_3', status: 'running' },
+    ]);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({ method: 'GET', url: '/api/agent-evals/agents/agt_1/report' });
+    expect(res.statusCode).toBe(200);
+    const body = res.json();
+    expect(body.totalSuites).toBe(2);
+    expect(body.activeSuites).toBe(1);
+    expect(body.completedRuns).toBe(2);
+    expect(body.passed).toBe(1);
+    expect(body.failed).toBe(1);
+    expect(body.passRate).toBe(50);
+  });
+
+  // ── Schedule Eval ───────────────────────────────────────
+
+  it('POST /agent-evals/suites/:id/schedule creates scheduled eval job', async () => {
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1', name: 'Release Gate', agentId: 'agt_1' });
+    jobRunnerMock.ensureJobDefinitions.mockResolvedValue(undefined);
+    jobRegistryMock.registerJob.mockReturnValue(undefined);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'POST',
+      url: '/api/agent-evals/suites/evals_1/schedule',
+      payload: { cronExpression: '0 0 * * *' },
+    });
+
+    expect(res.statusCode).toBe(200);
+    expect(res.json()).toEqual({
+      scheduled: true,
+      jobName: 'eval:evals_1',
+      cronExpression: '0 0 * * *',
+    });
+  });
+
+  it('POST /agent-evals/suites/:id/schedule requires cronExpression', async () => {
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1' });
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'POST',
+      url: '/api/agent-evals/suites/evals_1/schedule',
+      payload: {},
+    });
+    expect(res.statusCode).toBe(400);
+  });
+
+  it('POST /agent-evals/suites/:id/schedule rejects a request without a body', async () => {
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1' });
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'POST',
+      url: '/api/agent-evals/suites/evals_1/schedule',
+    });
+    expect(res.statusCode).toBe(400);
+  });
 });
diff --git a/services/platform-service/src/modules/agent-evals/routes.ts b/services/platform-service/src/modules/agent-evals/routes.ts
index 86079b17..7d345a30 100644
--- a/services/platform-service/src/modules/agent-evals/routes.ts
+++ b/services/platform-service/src/modules/agent-evals/routes.ts
@@ -229,4 +229,257 @@ export async function agentEvalRoutes(app: FastifyInstance) {
       reviewRequired,
     });
   });
+
+  // ── Run History ─────────────────────────────────────────
+
+  // Page-size bounds for the run history endpoint.
+  const defaultRunHistoryLimit = 20;
+  const maxRunHistoryLimit = 100;
+
+  /**
+   * GET /agent-evals/suites/:id/runs — lists a suite's runs via `repo.listRunsBySuite`.
+   * The optional `limit` query parameter is clamped to 1..maxRunHistoryLimit; anything
+   * unusable falls back to defaultRunHistoryLimit.
+   */
+  app.get('/agent-evals/suites/:id/runs', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    const query = req.query as { limit?: string };
+
+    // parseInt yields NaN for non-numeric input and accepts negatives, so validate the
+    // value before handing it to the repository.
+    const requested = Number.parseInt(query.limit ?? '', 10);
+    const limit =
+      Number.isInteger(requested) && requested > 0
+        ? Math.min(requested, maxRunHistoryLimit)
+        : defaultRunHistoryLimit;
+
+    return repo.listRunsBySuite(access.productId, id, limit);
+  });
+
+  // ── Regression Comparison ───────────────────────────────
+
+  /**
+   * GET /agent-evals/suites/:id/regression — compares the first two completed, scored runs
+   * among the ten fetched and flags a regression when the score dropped by more than 0.05.
+   */
+  app.get('/agent-evals/suites/:id/regression', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+
+    const runs = await repo.listRunsBySuite(access.productId, id, 10);
+    // Require a numeric score: a null score would otherwise be compared as 0.
+    const completedRuns = runs.filter(
+      r => r.status === 'completed' && typeof r.score === 'number'
+    );
+
+    if (completedRuns.length < 2) {
+      return {
+        regression: false,
+        message: 'Need at least 2 completed runs to compare',
+        runs: completedRuns,
+      };
+    }
+
+    const latest = completedRuns[0];
+    const previous = completedRuns[1];
+    // Round before comparing so the flag always agrees with the reported scoreDelta; raw
+    // float subtraction turns e.g. 0.85 - 0.9 into -0.05000000000000004.
+    const scoreDelta = Math.round(((latest.score ?? 0) - (previous.score ?? 0)) * 1000) / 1000;
+    const regression = scoreDelta < -0.05; // score dropped by more than 0.05
+
+    return {
+      regression,
+      latest: {
+        runId: latest.id,
+        version: latest.agentVersion,
+        score: latest.score,
+        verdict: latest.verdict,
+        passedCases: latest.passedCases,
+        failedCases: latest.failedCases,
+      },
+      previous: {
+        runId: previous.id,
+        version: previous.agentVersion,
+        score: previous.score,
+        verdict: previous.verdict,
+        passedCases: previous.passedCases,
+        failedCases: previous.failedCases,
+      },
+      scoreDelta,
+      trend: completedRuns.slice(0, 5).map(r => ({
+        runId: r.id,
+        version: r.agentVersion,
+        score: r.score,
+        date: r.startedAt,
+      })),
+    };
+  });
+
+  // ── Release Gate Check ──────────────────────────────────
+
+  /**
+   * GET /agent-evals/suites/:id/gate — reports whether the first completed release-gate run
+   * in the suite's run listing has verdict 'pass'.
+   */
+  app.get('/agent-evals/suites/:id/gate', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    const suite = await repo.getSuite(id, access.productId);
+
+    // Scheduled runs are created with releaseGate: false, so a window of only five runs lets
+    // routine runs hide the latest gate run. Use the repository's default page size instead.
+    const runs = await repo.listRunsBySuite(access.productId, id);
+    const latestGateRun = runs.find(r => r.releaseGate && r.status === 'completed');
+
+    if (!latestGateRun) {
+      return { allowed: false, reason: 'No completed release gate run found' };
+    }
+
+    const { verdict, score } = latestGateRun;
+    const passed = verdict === 'pass';
+    // Prefer the threshold recorded on the run; the suite's may have been edited since.
+    const passThreshold = latestGateRun.passThreshold ?? suite.passThreshold;
+
+    let reason: string;
+    if (passed) {
+      reason = `Score ${score} >= threshold ${passThreshold}`;
+    } else if (verdict === 'fail') {
+      reason = `Score ${score} below threshold ${passThreshold}`;
+    } else {
+      // e.g. 'needs_review': claiming the score is below the threshold could be wrong.
+      reason = `Latest release gate run has verdict ${verdict}`;
+    }
+
+    return {
+      allowed: passed,
+      runId: latestGateRun.id,
+      version: latestGateRun.agentVersion,
+      score,
+      verdict,
+      passThreshold,
+      reason,
+    };
+  });
+
+  // ── Agent Compliance Report ─────────────────────────────
+
+  /**
+   * GET /agent-evals/agents/:agentId/report — summarizes up to 100 suites and 100 runs of an
+   * agent: verdict counts, average score, pass rate and the first five completed runs.
+   */
+  app.get('/agent-evals/agents/:agentId/report', async req => {
+    const access = requireAdmin(req);
+    const { agentId } = req.params as { agentId: string };
+
+    // The two lookups are independent, so issue them together.
+    const [suites, allRuns] = await Promise.all([
+      repo.listSuites(access.productId, { agentId, limit: 100 }),
+      repo.listRunsByAgent(access.productId, agentId, 100),
+    ]);
+
+    const completedRuns = allRuns.filter(r => r.status === 'completed');
+    const passedRuns = completedRuns.filter(r => r.verdict === 'pass');
+    const failedRuns = completedRuns.filter(r => r.verdict === 'fail');
+    const reviewRequired = completedRuns.filter(r => r.verdict === 'needs_review');
+
+    const scores = completedRuns
+      .map(r => r.score)
+      .filter((s): s is number => typeof s === 'number');
+    const avgScore =
+      scores.length > 0
+        ? Math.round((scores.reduce((a, b) => a + b, 0) / scores.length) * 1000) / 1000
+        : 0;
+
+    return {
+      agentId,
+      totalSuites: suites.length,
+      activeSuites: suites.filter(s => s.status === 'active').length,
+      totalRuns: allRuns.length,
+      completedRuns: completedRuns.length,
+      passed: passedRuns.length,
+      failed: failedRuns.length,
+      needsReview: reviewRequired.length,
+      averageScore: avgScore,
+      passRate:
+        completedRuns.length > 0 ? Math.round((passedRuns.length / completedRuns.length) * 100) : 0,
+      recentRuns: completedRuns.slice(0, 5).map(r => ({
+        runId: r.id,
+        suiteId: r.suiteId,
+        version: r.agentVersion,
+        score: r.score,
+        verdict: r.verdict,
+        date: r.completedAt ?? r.startedAt,
+      })),
+    };
+  });
+
+  // ── Schedule Eval Suite ─────────────────────────────────
+
+  /**
+   * POST /agent-evals/suites/:id/schedule — registers a job named `eval:<suiteId>` whose
+   * handler creates a queued eval run, then passes its cron definition to
+   * `ensureJobDefinitions`. Body: `{ cronExpression: string }`.
+   */
+  app.post('/agent-evals/suites/:id/schedule', async req => {
+    const access = requireAdmin(req);
+    const { productId } = access;
+    const { id } = req.params as { id: string };
+
+    // A POST without a JSON payload leaves req.body undefined, and a non-string value must
+    // not reach the scheduler; both are reported as a missing cronExpression.
+    const body = (req.body ?? {}) as { cronExpression?: unknown };
+    const cronExpression =
+      typeof body.cronExpression === 'string' ? body.cronExpression.trim() : '';
+
+    if (!cronExpression) {
+      validationError('cronExpression is required');
+    }
+
+    const suite = await repo.getSuite(id, productId);
+    const { ensureJobDefinitions } = await import('../jobs/runner.js');
+    const { registerJob } = await import('../jobs/registry.js');
+
+    const jobName = `eval:${id}`;
+    registerJob(jobName, async () => {
+      // Re-read the suite on every execution: the job may fire long after scheduling, and a
+      // snapshot captured here would pin a stale targetVersion / passThreshold / agentId.
+      const current = await repo.getSuite(id, productId);
+      const agent = await agentRepo.getAgent(current.agentId, productId);
+      const versionId = `${current.agentId}:v${current.targetVersion ?? agent.currentVersion}`;
+      const version = await agentRepo.getAgentVersion(versionId, current.agentId);
+      const cases = await repo.listCases(id);
+
+      // Create a new eval run (queued status, to be picked up by eval executor)
+      await repo.createRun({
+        id: `evalrun_${randomUUID()}`,
+        suiteId: id,
+        agentId: current.agentId,
+        productId,
+        agentVersionId: version.id,
+        agentVersion: version.version,
+        status: 'queued',
+        passThreshold: current.passThreshold,
+        releaseGate: false,
+        totalCases: cases.length,
+        passedCases: 0,
+        failedCases: 0,
+        triggeredBy: 'scheduler',
+        startedAt: new Date().toISOString(),
+        reviewRequired: false,
+      });
+
+      return { success: true, message: `Eval run queued for suite ${id}` };
+    });
+
+    await ensureJobDefinitions([
+      {
+        name: jobName,
+        cron: cronExpression,
+        description: `Scheduled eval for suite ${suite.name}`,
+      },
+    ]);
+
+    return { scheduled: true, jobName, cronExpression };
+  });
 }