feat(platform): Phase 4 — AI Governance & Evals

- Run history: GET /agent-evals/suites/:id/runs with limit param
- Regression comparison: GET /agent-evals/suites/:id/regression
  - Detects 5%+ score drop between consecutive runs
  - Returns latest vs previous comparison + trend data
- Release gate check: GET /agent-evals/suites/:id/gate
  - Checks if latest release-gate run passed threshold
- Agent compliance report: GET /agent-evals/agents/:agentId/report
  - Aggregates pass rate, avg score, suite counts, recent runs
- Eval scheduling: POST /agent-evals/suites/:id/schedule
  - Wires eval suite to job runner with cron expression
- New repo functions: listRunsBySuite, listRunsByAgent
- 1,324 tests passing (8 new)
This commit is contained in:
saravanakumardb1 2026-03-20 03:30:03 -07:00
parent 05acacd400
commit 9758192377
3 changed files with 384 additions and 0 deletions

View File

@ -102,3 +102,27 @@ export async function listResults(runId: string): Promise<EvaluationResultDoc[]>
limit: 1000,
});
}
/**
 * Lists evaluation runs recorded for one suite of a product, newest first.
 *
 * @param productId - Product the runs must belong to.
 * @param suiteId - Suite the runs must belong to.
 * @param limit - Maximum number of runs to return (defaults to 50).
 * @returns Matching run documents ordered by `startedAt` descending.
 */
export async function listRunsBySuite(
  productId: string,
  suiteId: string,
  limit = 50
): Promise<EvaluationRunDoc[]> {
  const runs = runCollection();
  const scope = { productId, suiteId };
  return runs.findMany({ filter: scope, sort: { startedAt: -1 }, limit });
}
/**
 * Lists evaluation runs recorded for one agent of a product, across all of
 * its suites, newest first.
 *
 * @param productId - Product the runs must belong to.
 * @param agentId - Agent the runs must belong to.
 * @param limit - Maximum number of runs to return (defaults to 50).
 * @returns Matching run documents ordered by `startedAt` descending.
 */
export async function listRunsByAgent(
  productId: string,
  agentId: string,
  limit = 50
): Promise<EvaluationRunDoc[]> {
  const runs = runCollection();
  const scope = { productId, agentId };
  return runs.findMany({ filter: scope, sort: { startedAt: -1 }, limit });
}

View File

@ -13,8 +13,13 @@ const repoMock = {
createResults: vi.fn(),
listResults: vi.fn(),
updateRun: vi.fn(),
listRunsBySuite: vi.fn(),
listRunsByAgent: vi.fn(),
};
// Stand-ins for the job modules that the schedule route loads via dynamic
// import ('../jobs/runner.js' and '../jobs/registry.js').
const jobRunnerMock = { ensureJobDefinitions: vi.fn() };
const jobRegistryMock = { registerJob: vi.fn() };
const agentRepoMock = {
getAgent: vi.fn(),
getAgentVersion: vi.fn(),
@ -22,6 +27,8 @@ const agentRepoMock = {
// Register the mock objects as module replacements so that imports of these
// paths (including the routes' dynamic imports of the job modules) resolve to
// the mocks instead of the real implementations.
vi.mock('./repository.js', () => repoMock);
vi.mock('../agents/repository.js', () => agentRepoMock);
vi.mock('../jobs/runner.js', () => jobRunnerMock);
vi.mock('../jobs/registry.js', () => jobRegistryMock);
async function buildApp(payload?: { sub: string; productId: string; role?: string }) {
const { agentEvalRoutes } = await import('./routes.js');
@ -140,4 +147,172 @@ describe('agentEvalRoutes', () => {
})
);
});
// ── Run History ─────────────────────────────────────────
// No `limit` query parameter is sent, so this pins the route's default page
// size: the repository must be called with a limit of 20.
it('GET /agent-evals/suites/:id/runs returns run history', async () => {
repoMock.listRunsBySuite.mockResolvedValue([{ id: 'evalrun_1' }]);
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({ method: 'GET', url: '/api/agent-evals/suites/evals_1/runs' });
expect(res.statusCode).toBe(200);
expect(repoMock.listRunsBySuite).toHaveBeenCalledWith('lysnrai', 'evals_1', 20);
});
// ── Regression Comparison ───────────────────────────────
// Two completed runs, newest first: the score falls from 0.9 (run_1) to 0.7
// (run_2), a 0.2 drop that is well past the 0.05 regression threshold.
it('GET /agent-evals/suites/:id/regression detects score regression', async () => {
repoMock.listRunsBySuite.mockResolvedValue([
{
id: 'run_2',
status: 'completed',
score: 0.7,
agentVersion: 2,
verdict: 'fail',
passedCases: 7,
failedCases: 3,
startedAt: '2026-03-16',
},
{
id: 'run_1',
status: 'completed',
score: 0.9,
agentVersion: 1,
verdict: 'pass',
passedCases: 9,
failedCases: 1,
startedAt: '2026-03-15',
},
]);
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({
method: 'GET',
url: '/api/agent-evals/suites/evals_1/regression',
});
expect(res.statusCode).toBe(200);
const body = res.json();
expect(body.regression).toBe(true);
// Delta is latest minus previous, so a drop shows up as a negative number.
expect(body.scoreDelta).toBeLessThan(0);
});
// With only one completed run there is nothing to compare against; the route
// still answers 200 and reports no regression rather than failing.
it('GET /agent-evals/suites/:id/regression returns false with insufficient runs', async () => {
repoMock.listRunsBySuite.mockResolvedValue([{ id: 'run_1', status: 'completed', score: 0.9 }]);
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({
method: 'GET',
url: '/api/agent-evals/suites/evals_1/regression',
});
expect(res.statusCode).toBe(200);
expect(res.json().regression).toBe(false);
});
// ── Release Gate ────────────────────────────────────────
// The only run is a completed release-gate run with verdict 'pass', so the
// gate must report `allowed: true`.
it('GET /agent-evals/suites/:id/gate returns allowed when gate passed', async () => {
repoMock.getSuite.mockResolvedValue({ id: 'evals_1', passThreshold: 0.8 });
repoMock.listRunsBySuite.mockResolvedValue([
{
id: 'run_1',
releaseGate: true,
status: 'completed',
verdict: 'pass',
agentVersion: 2,
score: 0.95,
},
]);
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({ method: 'GET', url: '/api/agent-evals/suites/evals_1/gate' });
expect(res.statusCode).toBe(200);
expect(res.json().allowed).toBe(true);
});
// An empty run list means no release-gate run exists; the gate answers 200
// with `allowed: false` instead of an error status.
it('GET /agent-evals/suites/:id/gate returns not allowed when no gate run', async () => {
repoMock.getSuite.mockResolvedValue({ id: 'evals_1', passThreshold: 0.8 });
repoMock.listRunsBySuite.mockResolvedValue([]);
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({ method: 'GET', url: '/api/agent-evals/suites/evals_1/gate' });
expect(res.statusCode).toBe(200);
expect(res.json().allowed).toBe(false);
});
// ── Compliance Report ───────────────────────────────────
// Fixture: two suites (one active, one draft) and three runs, of which two
// are completed (one pass, one fail) and one is still running.
it('GET /agent-evals/agents/:agentId/report returns compliance summary', async () => {
repoMock.listSuites.mockResolvedValue([
{ id: 'evals_1', status: 'active' },
{ id: 'evals_2', status: 'draft' },
]);
repoMock.listRunsByAgent.mockResolvedValue([
{
id: 'run_1',
status: 'completed',
verdict: 'pass',
score: 0.95,
agentVersion: 1,
suiteId: 'evals_1',
startedAt: '2026-03-15',
completedAt: '2026-03-15',
},
{
id: 'run_2',
status: 'completed',
verdict: 'fail',
score: 0.6,
agentVersion: 2,
suiteId: 'evals_1',
startedAt: '2026-03-16',
completedAt: '2026-03-16',
},
{ id: 'run_3', status: 'running' },
]);
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({ method: 'GET', url: '/api/agent-evals/agents/agt_1/report' });
expect(res.statusCode).toBe(200);
const body = res.json();
expect(body.totalSuites).toBe(2);
expect(body.activeSuites).toBe(1);
// The running run is excluded from the completed count and from the pass rate.
expect(body.completedRuns).toBe(2);
expect(body.passed).toBe(1);
expect(body.failed).toBe(1);
// 1 passed out of 2 completed runs, expressed as a whole-number percentage.
expect(body.passRate).toBe(50);
});
// ── Schedule Eval ───────────────────────────────────────
// Happy path: with a cron expression supplied, the route answers 200 and
// echoes the expression together with the job name derived from the suite id.
it('POST /agent-evals/suites/:id/schedule creates scheduled eval job', async () => {
repoMock.getSuite.mockResolvedValue({ id: 'evals_1', name: 'Release Gate', agentId: 'agt_1' });
jobRunnerMock.ensureJobDefinitions.mockResolvedValue(undefined);
jobRegistryMock.registerJob.mockReturnValue(undefined);
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({
method: 'POST',
url: '/api/agent-evals/suites/evals_1/schedule',
payload: { cronExpression: '0 0 * * *' },
});
expect(res.statusCode).toBe(200);
expect(res.json()).toEqual({
scheduled: true,
jobName: 'eval:evals_1',
cronExpression: '0 0 * * *',
});
});
// An empty JSON body carries no cronExpression, so the request must be
// rejected with HTTP 400.
it('POST /agent-evals/suites/:id/schedule requires cronExpression', async () => {
repoMock.getSuite.mockResolvedValue({ id: 'evals_1' });
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({
method: 'POST',
url: '/api/agent-evals/suites/evals_1/schedule',
payload: {},
});
expect(res.statusCode).toBe(400);
});
});

View File

@ -229,4 +229,189 @@ export async function agentEvalRoutes(app: FastifyInstance) {
reviewRequired,
});
});
// ── Run History ─────────────────────────────────────────
// Run history for one suite, newest first (admin only).
// Query: `limit` — optional positive integer page size. Defaults to 20 and is
// capped at 1000 so a single request cannot ask for an unbounded result set.
app.get('/agent-evals/suites/:id/runs', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };
  const query = req.query as { limit?: string };
  // parseInt yields NaN for a missing or non-numeric value; NaN, zero and
  // negative values all fall back to the default instead of reaching the
  // repository as an invalid limit.
  const requested = Number.parseInt(query.limit ?? '', 10);
  const limit = Number.isInteger(requested) && requested > 0 ? Math.min(requested, 1000) : 20;
  return repo.listRunsBySuite(access.productId, id, limit);
});
// ── Regression Comparison ───────────────────────────────
// Regression check for one suite (admin only).
// Compares the two most recent completed runs that carry a numeric score,
// taken from the suite's 10 latest runs. A drop of 0.05 (5%) or more between
// them is reported as a regression. With fewer than two comparable runs the
// response is `regression: false` plus an explanatory message.
app.get('/agent-evals/suites/:id/regression', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };
  const runs = await repo.listRunsBySuite(access.productId, id, 10);
  // Require a real number: a missing or null score must not be compared as 0,
  // which would fabricate a large drop or gain.
  const completedRuns = runs.filter(r => r.status === 'completed' && typeof r.score === 'number');
  if (completedRuns.length < 2) {
    return {
      regression: false,
      message: 'Need at least 2 completed runs to compare',
      runs: completedRuns,
    };
  }
  const latest = completedRuns[0];
  const previous = completedRuns[1];
  // Round before comparing. The raw float difference of an exact 5% drop can
  // land on either side of -0.05 (0.85 - 0.9 vs 0.9 - 0.95), so comparing the
  // unrounded value classifies identical drops inconsistently.
  const scoreDelta = Math.round(((latest.score ?? 0) - (previous.score ?? 0)) * 1000) / 1000;
  const regression = scoreDelta <= -0.05; // a drop of 5% or more = regression
  return {
    regression,
    latest: {
      runId: latest.id,
      version: latest.agentVersion,
      score: latest.score,
      verdict: latest.verdict,
      passedCases: latest.passedCases,
      failedCases: latest.failedCases,
    },
    previous: {
      runId: previous.id,
      version: previous.agentVersion,
      score: previous.score,
      verdict: previous.verdict,
      passedCases: previous.passedCases,
      failedCases: previous.failedCases,
    },
    scoreDelta,
    // Up to five most recent comparable runs, newest first.
    trend: completedRuns.slice(0, 5).map(r => ({
      runId: r.id,
      version: r.agentVersion,
      score: r.score,
      date: r.startedAt,
    })),
  };
});
// ── Release Gate Check ──────────────────────────────────
// Release gate check for one suite (admin only).
// Finds the most recent completed run flagged `releaseGate` and allows the
// release only when that run's verdict is 'pass'. When no such run exists the
// response is `allowed: false` with a reason, not an error.
app.get('/agent-evals/suites/:id/gate', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };
  const suite = await repo.getSuite(id, access.productId);
  // Look back 50 runs rather than 5: scheduled runs are created with
  // releaseGate: false, so a handful of them would otherwise push the last
  // gate run out of the window and wrongly deny the gate.
  const runs = await repo.listRunsBySuite(access.productId, id, 50);
  const latestGateRun = runs.find(r => r.releaseGate && r.status === 'completed');
  if (!latestGateRun) {
    return { allowed: false, reason: 'No completed release gate run found' };
  }
  const passed = latestGateRun.verdict === 'pass';
  // Only claim a score/threshold comparison for pass and fail verdicts. Any
  // other verdict (e.g. 'needs_review') blocks the gate without implying that
  // the score fell below the threshold.
  let reason: string;
  if (passed) {
    reason = `Score ${latestGateRun.score} >= threshold ${suite.passThreshold}`;
  } else if (latestGateRun.verdict === 'fail') {
    reason = `Score ${latestGateRun.score} below threshold ${suite.passThreshold}`;
  } else {
    reason = `Release gate run verdict is '${latestGateRun.verdict ?? 'unknown'}', not 'pass'`;
  }
  return {
    allowed: passed,
    runId: latestGateRun.id,
    version: latestGateRun.agentVersion,
    score: latestGateRun.score,
    verdict: latestGateRun.verdict,
    passThreshold: suite.passThreshold,
    reason,
  };
});
// ── Agent Compliance Report ─────────────────────────────
// Compliance summary for one agent (admin only): suite counts, run counts per
// verdict, average score and pass rate over completed runs, and the five most
// recent completed runs. Based on at most 100 suites and 100 runs.
app.get('/agent-evals/agents/:agentId/report', async req => {
  const { productId } = requireAdmin(req);
  const { agentId } = req.params as { agentId: string };
  const suites = await repo.listSuites(productId, { agentId, limit: 100 });
  const allRuns = await repo.listRunsByAgent(productId, agentId, 100);

  // Every statistic below is computed over completed runs only.
  const finished = allRuns.filter(run => run.status === 'completed');
  const countWithVerdict = (verdict: string): number =>
    finished.filter(run => run.verdict === verdict).length;
  const passedCount = countWithVerdict('pass');

  // Average over the runs that actually carry a score, rounded to 3 decimals.
  let scoreSum = 0;
  let scoredRuns = 0;
  for (const { score } of finished) {
    if (score !== undefined) {
      scoreSum += score;
      scoredRuns += 1;
    }
  }
  const averageScore = scoredRuns > 0 ? Math.round((scoreSum / scoredRuns) * 1000) / 1000 : 0;

  // Whole-number percentage of completed runs that passed.
  const passRate = finished.length > 0 ? Math.round((passedCount / finished.length) * 100) : 0;

  return {
    agentId,
    totalSuites: suites.length,
    activeSuites: suites.filter(suite => suite.status === 'active').length,
    totalRuns: allRuns.length,
    completedRuns: finished.length,
    passed: passedCount,
    failed: countWithVerdict('fail'),
    needsReview: countWithVerdict('needs_review'),
    averageScore,
    passRate,
    recentRuns: finished.slice(0, 5).map(run => ({
      runId: run.id,
      suiteId: run.suiteId,
      version: run.agentVersion,
      score: run.score,
      verdict: run.verdict,
      date: run.completedAt ?? run.startedAt,
    })),
  };
});
// ── Schedule Eval Suite ─────────────────────────────────
// Schedules recurring evaluation runs for one suite (admin only).
// Body: `{ cronExpression: string }` — required, non-blank.
// Registers a job handler named `eval:<suiteId>` that queues a new run each
// time it fires, then hands the job definition (name, cron, description) to
// the job runner. Responds with `{ scheduled, jobName, cronExpression }`.
app.post('/agent-evals/suites/:id/schedule', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };
  // A request sent without any body leaves req.body undefined; default to an
  // empty object so it is reported as a validation error, not a TypeError.
  const body = (req.body ?? {}) as { cronExpression?: string };
  if (typeof body.cronExpression !== 'string' || body.cronExpression.trim() === '') {
    validationError('cronExpression is required');
  }
  // Looked up before registering anything; also supplies the suite name used
  // in the job description below.
  const suite = await repo.getSuite(id, access.productId);
  const { ensureJobDefinitions } = await import('../jobs/runner.js');
  const { registerJob } = await import('../jobs/registry.js');
  const jobName = `eval:${id}`;
  registerJob(jobName, async () => {
    // Re-read the suite on every firing: the job runs long after it was
    // scheduled, and a snapshot captured at scheduling time would ignore later
    // changes to the suite's target version or pass threshold.
    const current = await repo.getSuite(id, access.productId);
    // Create a new eval run (queued status, to be picked up by eval executor)
    const agent = await agentRepo.getAgent(current.agentId, access.productId);
    const versionId = `${current.agentId}:v${current.targetVersion ?? agent.currentVersion}`;
    const version = await agentRepo.getAgentVersion(versionId, current.agentId);
    const cases = await repo.listCases(id);
    await repo.createRun({
      id: `evalrun_${randomUUID()}`,
      suiteId: id,
      agentId: current.agentId,
      productId: access.productId,
      agentVersionId: version.id,
      agentVersion: version.version,
      status: 'queued',
      passThreshold: current.passThreshold,
      // Scheduled runs are never release-gate runs.
      releaseGate: false,
      totalCases: cases.length,
      passedCases: 0,
      failedCases: 0,
      triggeredBy: 'scheduler',
      startedAt: new Date().toISOString(),
      reviewRequired: false,
    });
    return { success: true, message: `Eval run queued for suite ${id}` };
  });
  await ensureJobDefinitions([
    {
      name: jobName,
      cron: body.cronExpression,
      description: `Scheduled eval for suite ${suite.name}`,
    },
  ]);
  return { scheduled: true, jobName, cronExpression: body.cronExpression };
});
}