feat(platform): Phase 4 — AI Governance & Evals

- Run history: GET /agent-evals/suites/:id/runs with limit param
- Regression comparison: GET /agent-evals/suites/:id/regression
  - Detects 5%+ score drop between consecutive runs
  - Returns latest vs previous comparison + trend data
- Release gate check: GET /agent-evals/suites/:id/gate
  - Checks if latest release-gate run passed threshold
- Agent compliance report: GET /agent-evals/agents/:agentId/report
  - Aggregates pass rate, avg score, suite counts, recent runs
- Eval scheduling: POST /agent-evals/suites/:id/schedule
  - Wires eval suite to job runner with cron expression
- New repo functions: listRunsBySuite, listRunsByAgent
- 1,324 tests passing (8 new)
2026-03-20 03:30:03 -07:00 · 2026-03-20 03:30:03 -07:00 · 9758192377
commit 9758192377
parent 05acacd400
3 changed files with 384 additions and 0 deletions
--- a/services/platform-service/src/modules/agent-evals/repository.ts
+++ b/services/platform-service/src/modules/agent-evals/repository.ts
@ -102,3 +102,27 @@ export async function listResults(runId: string): Promise<EvaluationResultDoc[]>
    limit: 1000,
  });
 }
+
+/**
+ * Fetches evaluation runs that belong to one suite within a product.
+ *
+ * The query sorts on `startedAt` with direction `-1`; the route handlers treat
+ * index 0 of the result as the most recent run.
+ *
+ * @param productId - Product the runs must belong to.
+ * @param suiteId - Suite whose runs are requested.
+ * @param limit - Passed through to `findMany` as the page size (defaults to 50).
+ * @returns The matching run documents.
+ */
+export async function listRunsBySuite(
+  productId: string,
+  suiteId: string,
+  limit = 50
+): Promise<EvaluationRunDoc[]> {
+  const collection = runCollection();
+  const suiteRuns = await collection.findMany({
+    filter: { productId, suiteId },
+    sort: { startedAt: -1 },
+    limit,
+  });
+  return suiteRuns;
+}
+
+/**
+ * Fetches evaluation runs recorded for one agent within a product, across all
+ * of that agent's suites.
+ *
+ * The query sorts on `startedAt` with direction `-1`, the same ordering used
+ * by `listRunsBySuite`.
+ *
+ * @param productId - Product the runs must belong to.
+ * @param agentId - Agent whose runs are requested.
+ * @param limit - Passed through to `findMany` as the page size (defaults to 50).
+ * @returns The matching run documents.
+ */
+export async function listRunsByAgent(
+  productId: string,
+  agentId: string,
+  limit = 50
+): Promise<EvaluationRunDoc[]> {
+  const collection = runCollection();
+  const agentRuns = await collection.findMany({
+    filter: { productId, agentId },
+    sort: { startedAt: -1 },
+    limit,
+  });
+  return agentRuns;
+}
--- a/services/platform-service/src/modules/agent-evals/routes.test.ts
+++ b/services/platform-service/src/modules/agent-evals/routes.test.ts
@ -13,8 +13,13 @@ const repoMock = {
  createResults: vi.fn(),
  listResults: vi.fn(),
  updateRun: vi.fn(),
+  listRunsBySuite: vi.fn(),
+  listRunsByAgent: vi.fn(),
 };

+// Stand-ins for the job runner/registry modules that the schedule route loads via dynamic import.
+const jobRunnerMock = { ensureJobDefinitions: vi.fn() };
+const jobRegistryMock = { registerJob: vi.fn() };
+
 const agentRepoMock = {
  getAgent: vi.fn(),
  getAgentVersion: vi.fn(),
@ -22,6 +27,8 @@ const agentRepoMock = {

 vi.mock('./repository.js', () => repoMock);
 vi.mock('../agents/repository.js', () => agentRepoMock);
+// The schedule route pulls these in with `await import(...)`; replace both with the mocks above.
+vi.mock('../jobs/runner.js', () => jobRunnerMock);
+vi.mock('../jobs/registry.js', () => jobRegistryMock);

 async function buildApp(payload?: { sub: string; productId: string; role?: string }) {
  const { agentEvalRoutes } = await import('./routes.js');
@ -140,4 +147,172 @@ describe('agentEvalRoutes', () => {
      })
    );
  });
+
+  // ── Run History ─────────────────────────────────────────
+
+  it('GET /agent-evals/suites/:id/runs returns run history', async () => {
+    // No ?limit= in the URL, so the route is expected to use its default of 20.
+    const history = [{ id: 'evalrun_1' }];
+    repoMock.listRunsBySuite.mockResolvedValue(history);
+
+    const server = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+    const response = await server.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/runs',
+    });
+
+    expect(response.statusCode).toBe(200);
+    expect(repoMock.listRunsBySuite).toHaveBeenCalledWith('lysnrai', 'evals_1', 20);
+  });
+
+  // ── Regression Comparison ───────────────────────────────
+
+  it('GET /agent-evals/suites/:id/regression detects score regression', async () => {
+    // Fixtures are listed newest-first, the order the route reads them in:
+    // run_2 (score 0.7) is the latest, run_1 (score 0.9) the one before it.
+    const newerRun = {
+      id: 'run_2',
+      status: 'completed',
+      score: 0.7,
+      agentVersion: 2,
+      verdict: 'fail',
+      passedCases: 7,
+      failedCases: 3,
+      startedAt: '2026-03-16',
+    };
+    const olderRun = {
+      id: 'run_1',
+      status: 'completed',
+      score: 0.9,
+      agentVersion: 1,
+      verdict: 'pass',
+      passedCases: 9,
+      failedCases: 1,
+      startedAt: '2026-03-15',
+    };
+    repoMock.listRunsBySuite.mockResolvedValue([newerRun, olderRun]);
+
+    const server = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+    const response = await server.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/regression',
+    });
+
+    expect(response.statusCode).toBe(200);
+    const { regression, scoreDelta } = response.json();
+    expect(regression).toBe(true);
+    expect(scoreDelta).toBeLessThan(0);
+  });
+
+  it('GET /agent-evals/suites/:id/regression returns false with insufficient runs', async () => {
+    // A single completed run leaves nothing to compare against.
+    const onlyRun = { id: 'run_1', status: 'completed', score: 0.9 };
+    repoMock.listRunsBySuite.mockResolvedValue([onlyRun]);
+
+    const server = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+    const response = await server.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/regression',
+    });
+
+    expect(response.statusCode).toBe(200);
+    const { regression } = response.json();
+    expect(regression).toBe(false);
+  });
+
+  // ── Release Gate ────────────────────────────────────────
+
+  it('GET /agent-evals/suites/:id/gate returns allowed when gate passed', async () => {
+    // One completed release-gate run with a passing verdict.
+    const passingGateRun = {
+      id: 'run_1',
+      releaseGate: true,
+      status: 'completed',
+      verdict: 'pass',
+      agentVersion: 2,
+      score: 0.95,
+    };
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1', passThreshold: 0.8 });
+    repoMock.listRunsBySuite.mockResolvedValue([passingGateRun]);
+
+    const server = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+    const response = await server.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/gate',
+    });
+
+    expect(response.statusCode).toBe(200);
+    const { allowed } = response.json();
+    expect(allowed).toBe(true);
+  });
+
+  it('GET /agent-evals/suites/:id/gate returns not allowed when no gate run', async () => {
+    // The suite exists but has no runs at all.
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1', passThreshold: 0.8 });
+    repoMock.listRunsBySuite.mockResolvedValue([]);
+
+    const server = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+    const response = await server.inject({
+      method: 'GET',
+      url: '/api/agent-evals/suites/evals_1/gate',
+    });
+
+    expect(response.statusCode).toBe(200);
+    const { allowed } = response.json();
+    expect(allowed).toBe(false);
+  });
+
+  // ── Compliance Report ───────────────────────────────────
+
+  it('GET /agent-evals/agents/:agentId/report returns compliance summary', async () => {
+    // Two suites (one active), two completed runs (one pass, one fail) and one
+    // run still in progress that must not count towards the completed totals.
+    const suites = [
+      { id: 'evals_1', status: 'active' },
+      { id: 'evals_2', status: 'draft' },
+    ];
+    const passingRun = {
+      id: 'run_1',
+      status: 'completed',
+      verdict: 'pass',
+      score: 0.95,
+      agentVersion: 1,
+      suiteId: 'evals_1',
+      startedAt: '2026-03-15',
+      completedAt: '2026-03-15',
+    };
+    const failingRun = {
+      id: 'run_2',
+      status: 'completed',
+      verdict: 'fail',
+      score: 0.6,
+      agentVersion: 2,
+      suiteId: 'evals_1',
+      startedAt: '2026-03-16',
+      completedAt: '2026-03-16',
+    };
+    const inProgressRun = { id: 'run_3', status: 'running' };
+    repoMock.listSuites.mockResolvedValue(suites);
+    repoMock.listRunsByAgent.mockResolvedValue([passingRun, failingRun, inProgressRun]);
+
+    const server = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+    const response = await server.inject({
+      method: 'GET',
+      url: '/api/agent-evals/agents/agt_1/report',
+    });
+
+    expect(response.statusCode).toBe(200);
+    const report = response.json();
+    expect(report.totalSuites).toBe(2);
+    expect(report.activeSuites).toBe(1);
+    expect(report.completedRuns).toBe(2);
+    expect(report.passed).toBe(1);
+    expect(report.failed).toBe(1);
+    expect(report.passRate).toBe(50);
+  });
+
+  // ── Schedule Eval ───────────────────────────────────────
+
+  it('POST /agent-evals/suites/:id/schedule creates scheduled eval job', async () => {
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1', name: 'Release Gate', agentId: 'agt_1' });
+    jobRunnerMock.ensureJobDefinitions.mockResolvedValue(undefined);
+    jobRegistryMock.registerJob.mockReturnValue(undefined);
+    const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+
+    const res = await app.inject({
+      method: 'POST',
+      url: '/api/agent-evals/suites/evals_1/schedule',
+      payload: { cronExpression: '0 0 * * *' },
+    });
+
+    expect(res.statusCode).toBe(200);
+    expect(res.json()).toEqual({
+      scheduled: true,
+      jobName: 'eval:evals_1',
+      cronExpression: '0 0 * * *',
+    });
+
+    // The response alone does not prove anything was scheduled: also verify that a
+    // handler was registered under the job name and that the cron definition was
+    // handed to the job runner.
+    expect(jobRegistryMock.registerJob).toHaveBeenCalledWith('eval:evals_1', expect.any(Function));
+    expect(jobRunnerMock.ensureJobDefinitions).toHaveBeenCalledWith([
+      {
+        name: 'eval:evals_1',
+        cron: '0 0 * * *',
+        description: 'Scheduled eval for suite Release Gate',
+      },
+    ]);
+  });
+
+  it('POST /agent-evals/suites/:id/schedule requires cronExpression', async () => {
+    // An empty JSON body carries no cronExpression, so the route must reject it.
+    repoMock.getSuite.mockResolvedValue({ id: 'evals_1' });
+
+    const server = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
+    const response = await server.inject({
+      method: 'POST',
+      url: '/api/agent-evals/suites/evals_1/schedule',
+      payload: {},
+    });
+
+    expect(response.statusCode).toBe(400);
+  });
 });
--- a/services/platform-service/src/modules/agent-evals/routes.ts
+++ b/services/platform-service/src/modules/agent-evals/routes.ts
@ -229,4 +229,189 @@ export async function agentEvalRoutes(app: FastifyInstance) {
      reviewRequired,
    });
  });
+
+  // ── Run History ─────────────────────────────────────────
+
+  // Returns the run history of a suite for the caller's product.
+  // `?limit=` is optional; a missing or unusable value falls back to 20.
+  app.get('/agent-evals/suites/:id/runs', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    const query = req.query as { limit?: string };
+
+    // parseInt returns NaN for non-numeric input and happily accepts negatives;
+    // neither should reach the repository. The upper bound keeps one request from
+    // asking for an arbitrarily large page.
+    const requested = Number.parseInt(query.limit ?? '', 10);
+    const limit = Number.isInteger(requested) && requested > 0 ? Math.min(requested, 1000) : 20;
+
+    return repo.listRunsBySuite(access.productId, id, limit);
+  });
+
+  // ── Regression Comparison ───────────────────────────────
+
+  // Compares the two most recent completed, scored runs of a suite (taken from the
+  // last 10 runs) and flags a regression when the score fell by 0.05 or more.
+  // With fewer than two comparable runs it answers `regression: false` plus a message.
+  app.get('/agent-evals/suites/:id/regression', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+
+    const runs = await repo.listRunsBySuite(access.productId, id, 10);
+    // `typeof` also rejects a null score, which a `!== undefined` check lets through.
+    const completedRuns = runs.filter(
+      r => r.status === 'completed' && typeof r.score === 'number'
+    );
+
+    if (completedRuns.length < 2) {
+      return {
+        regression: false,
+        message: 'Need at least 2 completed runs to compare',
+        runs: completedRuns,
+      };
+    }
+
+    // The repository returns runs newest-first, so index 0 is the latest.
+    const latest = completedRuns[0];
+    const previous = completedRuns[1];
+    const scoreDelta = (latest.score ?? 0) - (previous.score ?? 0);
+    // Binary floating point makes an exact 5-point drop land on either side of -0.05
+    // (0.90 - 0.95 is -0.04999…, 0.85 - 0.90 is -0.05000…04). The small tolerance
+    // makes "a drop of 5% or more" behave the same for every such pair.
+    const regression = scoreDelta <= -0.05 + 1e-9; // 5% drop = regression
+
+    return {
+      regression,
+      latest: {
+        runId: latest.id,
+        version: latest.agentVersion,
+        score: latest.score,
+        verdict: latest.verdict,
+        passedCases: latest.passedCases,
+        failedCases: latest.failedCases,
+      },
+      previous: {
+        runId: previous.id,
+        version: previous.agentVersion,
+        score: previous.score,
+        verdict: previous.verdict,
+        passedCases: previous.passedCases,
+        failedCases: previous.failedCases,
+      },
+      // Rounded to three decimals for display only; the comparison above uses the raw delta.
+      scoreDelta: Math.round(scoreDelta * 1000) / 1000,
+      trend: completedRuns.slice(0, 5).map(r => ({
+        runId: r.id,
+        version: r.agentVersion,
+        score: r.score,
+        date: r.startedAt,
+      })),
+    };
+  });
+
+  // ── Release Gate Check ──────────────────────────────────
+
+  // Release gate check: reports whether the most recent completed release-gate run
+  // of the suite ended with verdict `pass`. Answers `allowed: false` when no such
+  // run is found in the inspected window.
+  app.get('/agent-evals/suites/:id/gate', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    const suite = await repo.getSuite(id, access.productId);
+
+    // Scheduled runs are created with `releaseGate: false`, so a handful of them is
+    // enough to push the last real gate run out of a 5-run window and wrongly close
+    // the gate. Look further back to make that much less likely.
+    const runs = await repo.listRunsBySuite(access.productId, id, 100);
+    const latestGateRun = runs.find(r => r.releaseGate && r.status === 'completed');
+
+    if (!latestGateRun) {
+      return { allowed: false, reason: 'No completed release gate run found' };
+    }
+
+    const passed = latestGateRun.verdict === 'pass';
+
+    // Only a `fail` verdict is explained as a threshold miss; any other non-pass
+    // verdict (e.g. `needs_review`) may well have a score above the threshold, so
+    // claiming "below threshold" for it would be misleading.
+    let reason: string;
+    if (passed) {
+      reason = `Score ${latestGateRun.score} >= threshold ${suite.passThreshold}`;
+    } else if (latestGateRun.verdict === 'fail') {
+      reason = `Score ${latestGateRun.score} below threshold ${suite.passThreshold}`;
+    } else {
+      reason = `Release gate run verdict is ${latestGateRun.verdict ?? 'unknown'}`;
+    }
+
+    return {
+      allowed: passed,
+      runId: latestGateRun.id,
+      version: latestGateRun.agentVersion,
+      score: latestGateRun.score,
+      verdict: latestGateRun.verdict,
+      passThreshold: suite.passThreshold,
+      reason,
+    };
+  });
+
+  // ── Agent Compliance Report ─────────────────────────────
+
+  // Agent compliance report: summarises up to 100 suites and the 100 most recent
+  // runs of an agent into counts per verdict, an average score, a pass rate
+  // (whole percent of completed runs) and the five latest completed runs.
+  app.get('/agent-evals/agents/:agentId/report', async req => {
+    const access = requireAdmin(req);
+    const { agentId } = req.params as { agentId: string };
+
+    // The two lookups do not depend on each other, so issue them concurrently.
+    const [suites, allRuns] = await Promise.all([
+      repo.listSuites(access.productId, { agentId, limit: 100 }),
+      repo.listRunsByAgent(access.productId, agentId, 100),
+    ]);
+
+    const completedRuns = allRuns.filter(r => r.status === 'completed');
+    const passedRuns = completedRuns.filter(r => r.verdict === 'pass');
+    const failedRuns = completedRuns.filter(r => r.verdict === 'fail');
+    const reviewRequired = completedRuns.filter(r => r.verdict === 'needs_review');
+
+    // Keep real numbers only: a null score would otherwise be summed as 0 and
+    // still counted in the divisor, dragging the average down.
+    const scores = completedRuns
+      .map(r => r.score)
+      .filter((s): s is number => typeof s === 'number');
+    const avgScore =
+      scores.length > 0
+        ? Math.round((scores.reduce((a, b) => a + b, 0) / scores.length) * 1000) / 1000
+        : 0;
+
+    return {
+      agentId,
+      totalSuites: suites.length,
+      activeSuites: suites.filter(s => s.status === 'active').length,
+      totalRuns: allRuns.length,
+      completedRuns: completedRuns.length,
+      passed: passedRuns.length,
+      failed: failedRuns.length,
+      needsReview: reviewRequired.length,
+      averageScore: avgScore,
+      // Guard the division so an agent without completed runs reports 0 rather than NaN.
+      passRate:
+        completedRuns.length > 0 ? Math.round((passedRuns.length / completedRuns.length) * 100) : 0,
+      recentRuns: completedRuns.slice(0, 5).map(r => ({
+        runId: r.id,
+        suiteId: r.suiteId,
+        version: r.agentVersion,
+        score: r.score,
+        verdict: r.verdict,
+        date: r.completedAt ?? r.startedAt,
+      })),
+    };
+  });
+
+  // ── Schedule Eval Suite ─────────────────────────────────
+
+  // Schedules recurring runs of a suite: registers a job handler named `eval:<suiteId>`
+  // and hands the job runner a definition with the caller's cron expression.
+  // Each tick of the job creates one run in `queued` status.
+  app.post('/agent-evals/suites/:id/schedule', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    // A request without a payload can leave req.body undefined/null; fall back to an
+    // empty object so the check below rejects it instead of a TypeError surfacing.
+    const body = (req.body ?? {}) as { cronExpression?: string };
+
+    // Reject non-string and blank values as well as a missing field.
+    // validationError is expected to throw (the route test expects a 400 here).
+    if (typeof body.cronExpression !== 'string' || body.cronExpression.trim() === '') {
+      validationError('cronExpression is required');
+    }
+
+    // Fetched up front so an unknown suite fails the request before anything is registered.
+    const suite = await repo.getSuite(id, access.productId);
+    const { ensureJobDefinitions } = await import('../jobs/runner.js');
+    const { registerJob } = await import('../jobs/registry.js');
+
+    const jobName = `eval:${id}`;
+    registerJob(jobName, async () => {
+      // Re-read the suite on every tick: the handler can run long after this request,
+      // and a snapshot captured here would ignore later edits to the suite's
+      // threshold or target version.
+      const current = await repo.getSuite(id, access.productId);
+
+      // Create a new eval run (queued status, to be picked up by eval executor)
+      const agent = await agentRepo.getAgent(current.agentId, access.productId);
+      const versionId = `${current.agentId}:v${current.targetVersion ?? agent.currentVersion}`;
+      const version = await agentRepo.getAgentVersion(versionId, current.agentId);
+      const cases = await repo.listCases(id);
+
+      await repo.createRun({
+        id: `evalrun_${randomUUID()}`,
+        suiteId: id,
+        agentId: current.agentId,
+        productId: access.productId,
+        agentVersionId: version.id,
+        agentVersion: version.version,
+        status: 'queued',
+        passThreshold: current.passThreshold,
+        releaseGate: false,
+        totalCases: cases.length,
+        passedCases: 0,
+        failedCases: 0,
+        triggeredBy: 'scheduler',
+        startedAt: new Date().toISOString(),
+        reviewRequired: false,
+      });
+
+      return { success: true, message: `Eval run queued for suite ${id}` };
+    });
+
+    await ensureJobDefinitions([
+      {
+        name: jobName,
+        cron: body.cronExpression,
+        description: `Scheduled eval for suite ${suite.name}`,
+      },
+    ]);
+
+    return { scheduled: true, jobName, cronExpression: body.cronExpression };
+  });
 }