feat(platform): Phase 4 — AI Governance & Evals
- Run history: GET /agent-evals/suites/:id/runs with limit param
- Regression comparison: GET /agent-evals/suites/:id/regression
  - Detects 5%+ score drop between consecutive runs
  - Returns latest vs previous comparison + trend data
- Release gate check: GET /agent-evals/suites/:id/gate
  - Checks if latest release-gate run passed threshold
- Agent compliance report: GET /agent-evals/agents/:agentId/report
  - Aggregates pass rate, avg score, suite counts, recent runs
- Eval scheduling: POST /agent-evals/suites/:id/schedule
  - Wires eval suite to job runner with cron expression
- New repo functions: listRunsBySuite, listRunsByAgent
- 1,324 tests passing (8 new)
This commit is contained in:
parent
05acacd400
commit
9758192377
@ -102,3 +102,27 @@ export async function listResults(runId: string): Promise<EvaluationResultDoc[]>
|
||||
limit: 1000,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Looks up evaluation runs that belong to a single suite within a product.
 *
 * The query sorts on `startedAt: -1`; callers in the routes module treat the
 * first element as the most recent run.
 *
 * @param productId - Product the runs must belong to.
 * @param suiteId - Suite whose runs are requested.
 * @param limit - Maximum number of runs to fetch (defaults to 50).
 * @returns The run documents produced by the run collection's `findMany`.
 */
export async function listRunsBySuite(
  productId: string,
  suiteId: string,
  limit = 50
): Promise<EvaluationRunDoc[]> {
  const collection = runCollection();
  const suiteRuns = await collection.findMany({
    filter: { productId, suiteId },
    sort: { startedAt: -1 },
    limit,
  });
  return suiteRuns;
}
|
||||
|
||||
/**
 * Looks up evaluation runs recorded for a single agent within a product,
 * across all of that agent's suites.
 *
 * The query sorts on `startedAt: -1`, the same ordering used by
 * `listRunsBySuite`.
 *
 * @param productId - Product the runs must belong to.
 * @param agentId - Agent whose runs are requested.
 * @param limit - Maximum number of runs to fetch (defaults to 50).
 * @returns The run documents produced by the run collection's `findMany`.
 */
export async function listRunsByAgent(
  productId: string,
  agentId: string,
  limit = 50
): Promise<EvaluationRunDoc[]> {
  const collection = runCollection();
  const agentRuns = await collection.findMany({
    filter: { productId, agentId },
    sort: { startedAt: -1 },
    limit,
  });
  return agentRuns;
}
|
||||
|
||||
@ -13,8 +13,13 @@ const repoMock = {
|
||||
createResults: vi.fn(),
|
||||
listResults: vi.fn(),
|
||||
updateRun: vi.fn(),
|
||||
listRunsBySuite: vi.fn(),
|
||||
listRunsByAgent: vi.fn(),
|
||||
};
|
||||
|
||||
// Stand-in for '../jobs/runner.js' (installed through vi.mock); lets tests stub ensureJobDefinitions.
const jobRunnerMock = { ensureJobDefinitions: vi.fn() };
|
||||
// Stand-in for '../jobs/registry.js' (installed through vi.mock); lets tests stub registerJob.
const jobRegistryMock = { registerJob: vi.fn() };
|
||||
|
||||
const agentRepoMock = {
|
||||
getAgent: vi.fn(),
|
||||
getAgentVersion: vi.fn(),
|
||||
@ -22,6 +27,8 @@ const agentRepoMock = {
|
||||
|
||||
vi.mock('./repository.js', () => repoMock);
|
||||
vi.mock('../agents/repository.js', () => agentRepoMock);
|
||||
vi.mock('../jobs/runner.js', () => jobRunnerMock);
|
||||
vi.mock('../jobs/registry.js', () => jobRegistryMock);
|
||||
|
||||
async function buildApp(payload?: { sub: string; productId: string; role?: string }) {
|
||||
const { agentEvalRoutes } = await import('./routes.js');
|
||||
@ -140,4 +147,172 @@ describe('agentEvalRoutes', () => {
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
// ── Run History ─────────────────────────────────────────
|
||||
|
||||
// With no `limit` query parameter the route must pass the caller's product,
// the suite id from the URL and the default page size (20) to the repository.
it('GET /agent-evals/suites/:id/runs returns run history', async () => {
  const storedRuns = [{ id: 'evalrun_1' }];
  repoMock.listRunsBySuite.mockResolvedValue(storedRuns);

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const response = await app.inject({
    method: 'GET',
    url: '/api/agent-evals/suites/evals_1/runs',
  });

  expect(response.statusCode).toBe(200);
  expect(repoMock.listRunsBySuite).toHaveBeenCalledWith('lysnrai', 'evals_1', 20);
});
|
||||
|
||||
// ── Regression Comparison ───────────────────────────────
|
||||
|
||||
// Two completed runs, newest first, where the score falls from 0.9 to 0.7:
// the endpoint must report a regression with a negative delta.
it('GET /agent-evals/suites/:id/regression detects score regression', async () => {
  const newerRun = {
    id: 'run_2',
    status: 'completed',
    score: 0.7,
    agentVersion: 2,
    verdict: 'fail',
    passedCases: 7,
    failedCases: 3,
    startedAt: '2026-03-16',
  };
  const olderRun = {
    id: 'run_1',
    status: 'completed',
    score: 0.9,
    agentVersion: 1,
    verdict: 'pass',
    passedCases: 9,
    failedCases: 1,
    startedAt: '2026-03-15',
  };
  repoMock.listRunsBySuite.mockResolvedValue([newerRun, olderRun]);

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const response = await app.inject({
    method: 'GET',
    url: '/api/agent-evals/suites/evals_1/regression',
  });

  expect(response.statusCode).toBe(200);
  const { regression, scoreDelta } = response.json();
  expect(regression).toBe(true);
  expect(scoreDelta).toBeLessThan(0);
});
|
||||
|
||||
// A single completed run leaves nothing to compare against, so the endpoint
// must answer 200 with `regression: false`.
it('GET /agent-evals/suites/:id/regression returns false with insufficient runs', async () => {
  const onlyRun = { id: 'run_1', status: 'completed', score: 0.9 };
  repoMock.listRunsBySuite.mockResolvedValue([onlyRun]);

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const response = await app.inject({
    method: 'GET',
    url: '/api/agent-evals/suites/evals_1/regression',
  });

  expect(response.statusCode).toBe(200);
  const payload = response.json();
  expect(payload.regression).toBe(false);
});
|
||||
|
||||
// ── Release Gate ────────────────────────────────────────
|
||||
|
||||
// A completed release-gate run with a passing verdict must open the gate.
it('GET /agent-evals/suites/:id/gate returns allowed when gate passed', async () => {
  const suite = { id: 'evals_1', passThreshold: 0.8 };
  const passingGateRun = {
    id: 'run_1',
    releaseGate: true,
    status: 'completed',
    verdict: 'pass',
    agentVersion: 2,
    score: 0.95,
  };
  repoMock.getSuite.mockResolvedValue(suite);
  repoMock.listRunsBySuite.mockResolvedValue([passingGateRun]);

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const response = await app.inject({
    method: 'GET',
    url: '/api/agent-evals/suites/evals_1/gate',
  });

  expect(response.statusCode).toBe(200);
  const { allowed } = response.json();
  expect(allowed).toBe(true);
});
|
||||
|
||||
// With no runs at all there is no release-gate evidence, so the gate stays closed.
it('GET /agent-evals/suites/:id/gate returns not allowed when no gate run', async () => {
  const suite = { id: 'evals_1', passThreshold: 0.8 };
  repoMock.getSuite.mockResolvedValue(suite);
  repoMock.listRunsBySuite.mockResolvedValue([]);

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const response = await app.inject({
    method: 'GET',
    url: '/api/agent-evals/suites/evals_1/gate',
  });

  expect(response.statusCode).toBe(200);
  const { allowed } = response.json();
  expect(allowed).toBe(false);
});
|
||||
|
||||
// ── Compliance Report ───────────────────────────────────
|
||||
|
||||
// Two suites (one active) and three runs (one pass, one fail, one still
// running): only completed runs may count towards the pass/fail totals.
it('GET /agent-evals/agents/:agentId/report returns compliance summary', async () => {
  const suites = [
    { id: 'evals_1', status: 'active' },
    { id: 'evals_2', status: 'draft' },
  ];
  const passingRun = {
    id: 'run_1',
    status: 'completed',
    verdict: 'pass',
    score: 0.95,
    agentVersion: 1,
    suiteId: 'evals_1',
    startedAt: '2026-03-15',
    completedAt: '2026-03-15',
  };
  const failingRun = {
    id: 'run_2',
    status: 'completed',
    verdict: 'fail',
    score: 0.6,
    agentVersion: 2,
    suiteId: 'evals_1',
    startedAt: '2026-03-16',
    completedAt: '2026-03-16',
  };
  const inFlightRun = { id: 'run_3', status: 'running' };
  repoMock.listSuites.mockResolvedValue(suites);
  repoMock.listRunsByAgent.mockResolvedValue([passingRun, failingRun, inFlightRun]);

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const response = await app.inject({
    method: 'GET',
    url: '/api/agent-evals/agents/agt_1/report',
  });

  expect(response.statusCode).toBe(200);
  const { totalSuites, activeSuites, completedRuns, passed, failed, passRate } = response.json();
  expect(totalSuites).toBe(2);
  expect(activeSuites).toBe(1);
  expect(completedRuns).toBe(2);
  expect(passed).toBe(1);
  expect(failed).toBe(1);
  expect(passRate).toBe(50);
});
|
||||
|
||||
// ── Schedule Eval ───────────────────────────────────────
|
||||
|
||||
// Scheduling an existing suite with a cron expression must succeed and echo
// back the derived job name together with the expression.
it('POST /agent-evals/suites/:id/schedule creates scheduled eval job', async () => {
  const cronExpression = '0 0 * * *';
  repoMock.getSuite.mockResolvedValue({ id: 'evals_1', name: 'Release Gate', agentId: 'agt_1' });
  jobRunnerMock.ensureJobDefinitions.mockResolvedValue(undefined);
  jobRegistryMock.registerJob.mockReturnValue(undefined);

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const response = await app.inject({
    method: 'POST',
    url: '/api/agent-evals/suites/evals_1/schedule',
    payload: { cronExpression },
  });

  expect(response.statusCode).toBe(200);
  const expectedBody = {
    scheduled: true,
    jobName: 'eval:evals_1',
    cronExpression: '0 0 * * *',
  };
  expect(response.json()).toEqual(expectedBody);
});
|
||||
|
||||
// An empty JSON body carries no cron expression and must be rejected with 400.
it('POST /agent-evals/suites/:id/schedule requires cronExpression', async () => {
  repoMock.getSuite.mockResolvedValue({ id: 'evals_1' });

  const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
  const emptyPayload = {};
  const response = await app.inject({
    method: 'POST',
    url: '/api/agent-evals/suites/evals_1/schedule',
    payload: emptyPayload,
  });

  expect(response.statusCode).toBe(400);
});
|
||||
});
|
||||
|
||||
@ -229,4 +229,189 @@ export async function agentEvalRoutes(app: FastifyInstance) {
|
||||
reviewRequired,
|
||||
});
|
||||
});
|
||||
|
||||
// ── Run History ─────────────────────────────────────────
|
||||
|
||||
/**
 * GET /agent-evals/suites/:id/runs
 *
 * Returns the run history of one suite for the caller's product.
 *
 * Query: `limit` (optional) — number of runs to return. Missing, non-numeric
 * or non-positive values fall back to 20; values above 1000 are capped so a
 * single request cannot ask the repository for an unbounded page.
 *
 * Access is resolved through `requireAdmin`, which supplies the product id.
 */
app.get('/agent-evals/suites/:id/runs', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };
  const query = req.query as { limit?: string };

  const DEFAULT_LIMIT = 20;
  const MAX_LIMIT = 1000;

  // parseInt yields NaN for input such as "abc"; never forward that (or a
  // zero/negative number) to the repository as a page size.
  const requested = Number.parseInt(query.limit ?? '', 10);
  const limit =
    Number.isFinite(requested) && requested > 0 ? Math.min(requested, MAX_LIMIT) : DEFAULT_LIMIT;

  return repo.listRunsBySuite(access.productId, id, limit);
});
|
||||
|
||||
// ── Regression Comparison ───────────────────────────────
|
||||
|
||||
/**
 * GET /agent-evals/suites/:id/regression
 *
 * Compares the two most recent completed, scored runs of a suite (taken from
 * the 10 latest runs) and reports whether the score regressed.
 *
 * Response when fewer than two comparable runs exist:
 *   `{ regression: false, message, runs }`.
 * Otherwise: `{ regression, latest, previous, scoreDelta, trend }`, where
 * `scoreDelta` is `latest - previous` rounded to three decimals and `trend`
 * lists up to five of the most recent comparable runs.
 *
 * A regression is a drop of 0.05 (5 points) or more, judged on the rounded
 * delta so the verdict always agrees with the `scoreDelta` that is returned.
 */
app.get('/agent-evals/suites/:id/regression', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };

  // Score drop (on a 0..1 scale) at or beyond which a run counts as a regression.
  const REGRESSION_THRESHOLD = 0.05;

  const runs = await repo.listRunsBySuite(access.productId, id, 10);
  // Require a real numeric score: a null score would otherwise slip through
  // and be compared as 0, producing a bogus delta.
  const completedRuns = runs.filter(r => r.status === 'completed' && typeof r.score === 'number');

  if (completedRuns.length < 2) {
    return {
      regression: false,
      message: 'Need at least 2 completed runs to compare',
      runs: completedRuns,
    };
  }

  const latest = completedRuns[0];
  const previous = completedRuns[1];

  // Round BEFORE comparing. Raw floating-point subtraction makes an exact 5%
  // drop land on either side of the threshold depending on the operands
  // (0.85 - 0.9 is slightly below -0.05, 0.25 - 0.3 slightly above).
  const rawDelta = (latest.score ?? 0) - (previous.score ?? 0);
  const scoreDelta = Math.round(rawDelta * 1000) / 1000;
  const regression = scoreDelta <= -REGRESSION_THRESHOLD;

  // Shared projection for the two runs being compared.
  const summarize = (run: (typeof completedRuns)[number]) => ({
    runId: run.id,
    version: run.agentVersion,
    score: run.score,
    verdict: run.verdict,
    passedCases: run.passedCases,
    failedCases: run.failedCases,
  });

  return {
    regression,
    latest: summarize(latest),
    previous: summarize(previous),
    scoreDelta,
    trend: completedRuns.slice(0, 5).map(r => ({
      runId: r.id,
      version: r.agentVersion,
      score: r.score,
      date: r.startedAt,
    })),
  };
});
|
||||
|
||||
// ── Release Gate Check ──────────────────────────────────
|
||||
|
||||
/**
 * GET /agent-evals/suites/:id/gate
 *
 * Release-gate check: finds the most recent completed run of the suite that
 * was flagged `releaseGate` and reports whether its verdict was `pass`.
 *
 * Response without such a run: `{ allowed: false, reason }`.
 * Otherwise: `{ allowed, runId, version, score, verdict, passThreshold, reason }`.
 *
 * NOTE: the search is bounded to the latest GATE_LOOKBACK runs. Scheduled
 * runs are created with `releaseGate: false`, so a small window lets them
 * push the last gate run out of view; the window is therefore kept generous.
 */
app.get('/agent-evals/suites/:id/gate', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };
  const suite = await repo.getSuite(id, access.productId);

  // How many recent runs to scan for a release-gate run.
  const GATE_LOOKBACK = 50;
  const runs = await repo.listRunsBySuite(access.productId, id, GATE_LOOKBACK);
  const latestGateRun = runs.find(r => r.releaseGate && r.status === 'completed');

  if (!latestGateRun) {
    return { allowed: false, reason: 'No completed release gate run found' };
  }

  const { score, verdict } = latestGateRun;
  const passed = verdict === 'pass';

  // Only claim "below threshold" when that is actually true; a non-pass
  // verdict can also come from a run without a score or one awaiting review.
  let reason: string;
  if (passed) {
    reason = `Score ${score} >= threshold ${suite.passThreshold}`;
  } else if (
    typeof score === 'number' &&
    typeof suite.passThreshold === 'number' &&
    score < suite.passThreshold
  ) {
    reason = `Score ${score} below threshold ${suite.passThreshold}`;
  } else {
    reason = `Latest release gate run verdict is '${verdict}'`;
  }

  return {
    allowed: passed,
    runId: latestGateRun.id,
    version: latestGateRun.agentVersion,
    score,
    verdict,
    passThreshold: suite.passThreshold,
    reason,
  };
});
|
||||
|
||||
// ── Agent Compliance Report ─────────────────────────────
|
||||
|
||||
/**
 * GET /agent-evals/agents/:agentId/report
 *
 * Compliance summary for one agent, built from up to 100 suites and up to 100
 * runs of the caller's product.
 *
 * Response fields: suite counts (`totalSuites`, `activeSuites`), run counts
 * (`totalRuns`, `completedRuns`, `passed`, `failed`, `needsReview`),
 * `averageScore` (mean of numeric scores of completed runs, 3 decimals, 0 when
 * none), `passRate` (whole-number percentage of completed runs that passed,
 * 0 when none) and `recentRuns` (first five completed runs as returned by the
 * repository).
 */
app.get('/agent-evals/agents/:agentId/report', async req => {
  const access = requireAdmin(req);
  const { agentId } = req.params as { agentId: string };

  // The two lookups are independent, so issue them concurrently.
  const [suites, allRuns] = await Promise.all([
    repo.listSuites(access.productId, { agentId, limit: 100 }),
    repo.listRunsByAgent(access.productId, agentId, 100),
  ]);

  const completedRuns = allRuns.filter(r => r.status === 'completed');
  const countByVerdict = (verdict: string): number =>
    completedRuns.filter(r => r.verdict === verdict).length;
  const passedCount = countByVerdict('pass');

  // Keep only real numbers: a null score would be summed as 0 and still be
  // counted in the divisor, dragging the average down.
  const scores = completedRuns
    .map(r => r.score)
    .filter((s): s is number => typeof s === 'number');
  const scoreSum = scores.reduce((total, s) => total + s, 0);
  const averageScore = scores.length > 0 ? Math.round((scoreSum / scores.length) * 1000) / 1000 : 0;

  const passRate =
    completedRuns.length > 0 ? Math.round((passedCount / completedRuns.length) * 100) : 0;

  return {
    agentId,
    totalSuites: suites.length,
    activeSuites: suites.filter(s => s.status === 'active').length,
    totalRuns: allRuns.length,
    completedRuns: completedRuns.length,
    passed: passedCount,
    failed: countByVerdict('fail'),
    needsReview: countByVerdict('needs_review'),
    averageScore,
    passRate,
    recentRuns: completedRuns.slice(0, 5).map(r => ({
      runId: r.id,
      suiteId: r.suiteId,
      version: r.agentVersion,
      score: r.score,
      verdict: r.verdict,
      // Fall back to the start time for runs that carry no completion time.
      date: r.completedAt ?? r.startedAt,
    })),
  };
});
|
||||
|
||||
// ── Schedule Eval Suite ─────────────────────────────────
|
||||
|
||||
/**
 * POST /agent-evals/suites/:id/schedule
 *
 * Registers a job named `eval:<suiteId>` and hands a matching job definition
 * (name, cron, description) to the job runner. Each time the job handler runs
 * it creates a new evaluation run in `queued` status, marked
 * `triggeredBy: 'scheduler'` and `releaseGate: false`.
 *
 * Body: `{ cronExpression: string }` — required, must be a non-blank string.
 * The expression itself is not parsed here; it is passed to the job runner
 * as-is.
 *
 * Response: `{ scheduled: true, jobName, cronExpression }`.
 * A missing or invalid `cronExpression` is reported through `validationError`
 * (the route tests expect HTTP 400).
 */
app.post('/agent-evals/suites/:id/schedule', async req => {
  const access = requireAdmin(req);
  const { id } = req.params as { id: string };
  // A POST without a JSON payload leaves req.body undefined; default it so the
  // request fails validation instead of throwing a TypeError.
  const body = (req.body ?? {}) as { cronExpression?: unknown };

  // Accept only non-blank strings; truthy non-strings (numbers, objects) must
  // not reach the job runner as a cron value.
  const cronExpression = typeof body.cronExpression === 'string' ? body.cronExpression : '';
  if (cronExpression.trim() === '') {
    validationError('cronExpression is required');
  }

  // Fetched up front so the job description can use the suite's name.
  const suite = await repo.getSuite(id, access.productId);
  const { ensureJobDefinitions } = await import('../jobs/runner.js');
  const { registerJob } = await import('../jobs/registry.js');

  const jobName = `eval:${id}`;
  registerJob(jobName, async () => {
    // Re-read the suite on every execution so later edits to its threshold,
    // target version or agent are honoured instead of the schedule-time copy.
    const currentSuite = await repo.getSuite(id, access.productId);

    // Create a new eval run (queued status, to be picked up by eval executor)
    const agent = await agentRepo.getAgent(currentSuite.agentId, access.productId);
    const versionId = `${currentSuite.agentId}:v${currentSuite.targetVersion ?? agent.currentVersion}`;
    const version = await agentRepo.getAgentVersion(versionId, currentSuite.agentId);
    const cases = await repo.listCases(id);

    await repo.createRun({
      id: `evalrun_${randomUUID()}`,
      suiteId: id,
      agentId: currentSuite.agentId,
      productId: access.productId,
      agentVersionId: version.id,
      agentVersion: version.version,
      status: 'queued',
      passThreshold: currentSuite.passThreshold,
      releaseGate: false,
      totalCases: cases.length,
      passedCases: 0,
      failedCases: 0,
      triggeredBy: 'scheduler',
      startedAt: new Date().toISOString(),
      reviewRequired: false,
    });

    return { success: true, message: `Eval run queued for suite ${id}` };
  });

  await ensureJobDefinitions([
    {
      name: jobName,
      cron: cronExpression,
      description: `Scheduled eval for suite ${suite.name}`,
    },
  ]);

  return { scheduled: true, jobName, cronExpression };
});
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user