feat(platform-service): add agent evaluation governance
This commit is contained in:
parent
8d78b6ce59
commit
3f06427038
@ -74,6 +74,11 @@ const CONTAINER_DEFS: Record<string, ContainerConfig> = {
|
||||
// Agent registry and versioned prompt/config definitions
|
||||
agent_registry: { partitionKeyPath: '/productId' },
|
||||
agent_versions: { partitionKeyPath: '/agentId' },
|
||||
// Agent governance / evaluations
|
||||
agent_evaluation_suites: { partitionKeyPath: '/productId' },
|
||||
agent_evaluation_cases: { partitionKeyPath: '/suiteId' },
|
||||
agent_evaluation_runs: { partitionKeyPath: '/productId', defaultTtl: 30 * 86400 },
|
||||
agent_evaluation_results: { partitionKeyPath: '/runId', defaultTtl: 30 * 86400 },
|
||||
// Telemetry (client diagnostics — see docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md)
|
||||
telemetry_events: { partitionKeyPath: '/pk', defaultTtl: 30 * 86400 },
|
||||
telemetry_error_clusters: { partitionKeyPath: '/pk', defaultTtl: 90 * 86400 },
|
||||
|
||||
@ -0,0 +1,80 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { MemoryDatastoreProvider } from '@bytelyst/datastore';
|
||||
import { _resetDatastoreProvider, setProvider } from '../../lib/datastore.js';
|
||||
import * as repo from './repository.js';
|
||||
|
||||
// Round-trip test for the agent-evals repository, run against the in-memory datastore provider.
describe('agent eval repository', () => {
  // Identifiers and timestamps shared by the fixture documents below.
  const productId = 'lysnrai';
  const agentId = 'agt_1';
  const suiteId = 'evals_1';
  const caseId = `${suiteId}:case:1`;
  const runId = 'evalrun_1';
  const adminId = 'admin_1';
  const createdAt = '2026-03-15T00:00:00.000Z';

  // Install a fresh in-memory provider before every test.
  beforeEach(() => {
    setProvider(new MemoryDatastoreProvider());
  });

  // Remove the provider again so state does not leak between tests.
  afterEach(() => {
    _resetDatastoreProvider();
  });

  it('stores suites, cases, runs, and results', async () => {
    // Write one document of each kind.
    await repo.createSuite({
      id: suiteId,
      productId,
      agentId,
      name: 'Release Gate',
      status: 'draft',
      passThreshold: 0.85,
      tags: ['release'],
      createdBy: adminId,
      createdAt,
      updatedAt: createdAt,
    });

    await repo.createCase({
      id: caseId,
      suiteId,
      productId,
      name: 'Handles incident prompt',
      input: { prompt: 'Investigate outage' },
      critical: true,
      tags: ['critical'],
      createdAt,
    });

    await repo.createRun({
      id: runId,
      suiteId,
      agentId,
      productId,
      agentVersionId: 'agt_1:v2',
      agentVersion: 2,
      status: 'queued',
      passThreshold: 0.85,
      releaseGate: true,
      totalCases: 1,
      passedCases: 0,
      failedCases: 0,
      triggeredBy: adminId,
      startedAt: createdAt,
      reviewRequired: false,
    });

    await repo.createResults([
      {
        id: `${runId}:${caseId}`,
        runId,
        caseId,
        productId,
        passed: true,
        score: 1,
        createdAt: '2026-03-15T00:01:00.000Z',
      },
    ]);

    // Read everything back through the repository's query functions.
    const suites = await repo.listSuites(productId, { limit: 20 });
    const cases = await repo.listCases(suiteId);
    const run = await repo.getRun(runId, productId);
    const results = await repo.listResults(runId);

    expect(suites).toHaveLength(1);
    expect(cases[0].critical).toBe(true);
    expect(run.agentVersion).toBe(2);
    expect(results[0].passed).toBe(true);
  });
});
|
||||
104
services/platform-service/src/modules/agent-evals/repository.ts
Normal file
104
services/platform-service/src/modules/agent-evals/repository.ts
Normal file
@ -0,0 +1,104 @@
|
||||
import { NotFoundError } from '../../lib/errors.js';
|
||||
import { getCollection } from '../../lib/datastore.js';
|
||||
import type {
|
||||
EvaluationCaseDoc,
|
||||
EvaluationResultDoc,
|
||||
EvaluationRunDoc,
|
||||
EvaluationSuiteDoc,
|
||||
ListEvaluationSuitesQuery,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Resolves the typed datastore collection that holds evaluation suites
 * (`agent_evaluation_suites`, requested with the `/productId` key path).
 */
function suiteCollection() {
  const suites = getCollection<EvaluationSuiteDoc>('agent_evaluation_suites', '/productId');
  return suites;
}
|
||||
|
||||
/**
 * Resolves the typed datastore collection that holds evaluation cases
 * (`agent_evaluation_cases`, requested with the `/suiteId` key path).
 */
function caseCollection() {
  const cases = getCollection<EvaluationCaseDoc>('agent_evaluation_cases', '/suiteId');
  return cases;
}
|
||||
|
||||
/**
 * Resolves the typed datastore collection that holds evaluation runs
 * (`agent_evaluation_runs`, requested with the `/productId` key path).
 */
function runCollection() {
  const runs = getCollection<EvaluationRunDoc>('agent_evaluation_runs', '/productId');
  return runs;
}
|
||||
|
||||
/**
 * Resolves the typed datastore collection that holds per-case evaluation results
 * (`agent_evaluation_results`, requested with the `/runId` key path).
 */
function resultCollection() {
  const results = getCollection<EvaluationResultDoc>('agent_evaluation_results', '/runId');
  return results;
}
|
||||
|
||||
/**
 * Persists a new evaluation suite document.
 *
 * @param doc Fully populated suite document to store.
 * @returns Whatever the collection's `create` call resolves to for that document.
 */
export async function createSuite(doc: EvaluationSuiteDoc): Promise<EvaluationSuiteDoc> {
  const suites = suiteCollection();
  return suites.create(doc);
}
|
||||
|
||||
/**
 * Lists the evaluation suites of one product, sorted by `createdAt` with direction `-1`.
 *
 * @param productId Product whose suites are listed; always part of the filter.
 * @param query Optional `agentId` / `status` filters (applied only when truthy) and the result `limit`.
 * @returns The matching suite documents.
 */
export async function listSuites(
  productId: string,
  query: ListEvaluationSuitesQuery
): Promise<EvaluationSuiteDoc[]> {
  const { agentId, status, limit } = query;

  // Optional criteria contribute a key only when the caller supplied a value.
  const agentCriteria = agentId ? { agentId } : {};
  const statusCriteria = status ? { status } : {};

  return suiteCollection().findMany({
    filter: { productId, ...agentCriteria, ...statusCriteria },
    sort: { createdAt: -1 },
    limit,
  });
}
|
||||
|
||||
/**
 * Loads a single evaluation suite by id within the given product.
 *
 * @param id Suite identifier.
 * @param productId Product the suite is looked up under.
 * @returns The suite document.
 * @throws NotFoundError when the lookup yields no document.
 */
export async function getSuite(id: string, productId: string): Promise<EvaluationSuiteDoc> {
  const found = await suiteCollection().findById(id, productId);
  if (found) {
    return found;
  }
  throw new NotFoundError(`Evaluation suite '${id}' not found`);
}
|
||||
|
||||
/**
 * Applies a partial update to an evaluation suite and stamps `updatedAt` with the current time.
 *
 * @param id Suite identifier.
 * @param productId Product the suite is looked up under.
 * @param updates Fields to change; `updatedAt` is always overwritten by this function.
 * @returns The document returned by the collection's `update` call.
 * @throws NotFoundError when the update call yields no document.
 */
export async function updateSuite(
  id: string,
  productId: string,
  updates: Partial<EvaluationSuiteDoc>
): Promise<EvaluationSuiteDoc> {
  // The timestamp is spread last so it wins over any `updatedAt` supplied by the caller.
  const patch = { ...updates, updatedAt: new Date().toISOString() };
  const saved = await suiteCollection().update(id, productId, patch);
  if (saved) {
    return saved;
  }
  throw new NotFoundError(`Evaluation suite '${id}' not found`);
}
|
||||
|
||||
/**
 * Persists a new evaluation case document.
 *
 * @param doc Fully populated case document to store.
 * @returns Whatever the collection's `create` call resolves to for that document.
 */
export async function createCase(doc: EvaluationCaseDoc): Promise<EvaluationCaseDoc> {
  const cases = caseCollection();
  return cases.create(doc);
}
|
||||
|
||||
/**
 * Lists the cases of one suite, sorted by `createdAt` with direction `1`.
 * At most 500 cases are requested from the datastore.
 *
 * @param suiteId Suite whose cases are listed.
 * @returns The matching case documents.
 */
export async function listCases(suiteId: string): Promise<EvaluationCaseDoc[]> {
  const cases = caseCollection();
  return cases.findMany({
    filter: { suiteId },
    sort: { createdAt: 1 },
    limit: 500,
  });
}
|
||||
|
||||
/**
 * Persists a new evaluation run document.
 *
 * @param doc Fully populated run document to store.
 * @returns Whatever the collection's `create` call resolves to for that document.
 */
export async function createRun(doc: EvaluationRunDoc): Promise<EvaluationRunDoc> {
  const runs = runCollection();
  return runs.create(doc);
}
|
||||
|
||||
/**
 * Loads a single evaluation run by id within the given product.
 *
 * @param id Run identifier.
 * @param productId Product the run is looked up under.
 * @returns The run document.
 * @throws NotFoundError when the lookup yields no document.
 */
export async function getRun(id: string, productId: string): Promise<EvaluationRunDoc> {
  const found = await runCollection().findById(id, productId);
  if (found) {
    return found;
  }
  throw new NotFoundError(`Evaluation run '${id}' not found`);
}
|
||||
|
||||
/**
 * Applies a partial update to an evaluation run.
 *
 * Unlike `updateSuite`, no timestamp is added here; callers pass every field they want changed.
 *
 * @param id Run identifier.
 * @param productId Product the run is looked up under.
 * @param updates Fields to change on the run document.
 * @returns The document returned by the collection's `update` call.
 * @throws NotFoundError when the update call yields no document.
 */
export async function updateRun(
  id: string,
  productId: string,
  updates: Partial<EvaluationRunDoc>
): Promise<EvaluationRunDoc> {
  const updated = await runCollection().update(id, productId, updates);
  if (!updated) throw new NotFoundError(`Evaluation run '${id}' not found`);
  return updated;
}
|
||||
|
||||
/**
 * Persists a batch of evaluation result documents concurrently.
 *
 * All writes are started at once and awaited with `Promise.all`, so the returned promise
 * rejects as soon as any single write rejects; writes that already succeeded are not undone here.
 *
 * @param docs Result documents to store; an empty array resolves to an empty array.
 * @returns The created documents, in the same order as `docs`.
 */
export async function createResults(docs: EvaluationResultDoc[]): Promise<EvaluationResultDoc[]> {
  // Resolve the collection handle once instead of once per document.
  const results = resultCollection();
  return Promise.all(docs.map(doc => results.create(doc)));
}
|
||||
|
||||
/**
 * Lists the results recorded for one run, sorted by `createdAt` with direction `1`.
 * At most 1000 results are requested from the datastore.
 *
 * @param runId Run whose results are listed.
 * @returns The matching result documents.
 */
export async function listResults(runId: string): Promise<EvaluationResultDoc[]> {
  const results = resultCollection();
  return results.findMany({
    filter: { runId },
    sort: { createdAt: 1 },
    limit: 1000,
  });
}
|
||||
143
services/platform-service/src/modules/agent-evals/routes.test.ts
Normal file
143
services/platform-service/src/modules/agent-evals/routes.test.ts
Normal file
@ -0,0 +1,143 @@
|
||||
import Fastify from 'fastify';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
// Stand-in for ./repository.js: one vi.fn() per function the routes module may call.
const repoMock = {
  // suites
  createSuite: vi.fn(),
  getSuite: vi.fn(),
  listSuites: vi.fn(),
  updateSuite: vi.fn(),
  // cases
  createCase: vi.fn(),
  listCases: vi.fn(),
  // runs
  createRun: vi.fn(),
  getRun: vi.fn(),
  updateRun: vi.fn(),
  // results
  createResults: vi.fn(),
  listResults: vi.fn(),
};

// Stand-in for ../agents/repository.js.
const agentRepoMock = {
  getAgent: vi.fn(),
  getAgentVersion: vi.fn(),
};

// The factories hand back the objects above, so tests can program the mocks directly.
vi.mock('./repository.js', () => {
  return repoMock;
});
vi.mock('../agents/repository.js', () => {
  return agentRepoMock;
});
|
||||
|
||||
/**
 * Builds a Fastify instance with the agent-eval routes mounted under `/api`.
 *
 * The routes module is imported dynamically inside this function rather than at the top of the file.
 *
 * @param payload When given, it is attached to every request as `req.jwtPayload` via an
 *   `onRequest` hook; when omitted, requests carry no JWT payload.
 * @returns The configured Fastify app (not listening; intended for `app.inject`).
 */
async function buildApp(payload?: { sub: string; productId: string; role?: string }) {
  const routesModule = await import('./routes.js');
  const app = Fastify({ logger: false });

  if (payload) {
    app.addHook('onRequest', async request => {
      request.jwtPayload = payload;
    });
  }

  await app.register(routesModule.agentEvalRoutes, { prefix: '/api' });
  return app;
}
|
||||
|
||||
// Route-level tests for agentEvalRoutes; both repository modules are replaced by mocks.
describe('agentEvalRoutes', () => {
  // JWT claims attached to every request issued by these tests.
  const adminClaims = { sub: 'admin_1', productId: 'lysnrai', role: 'admin' };

  beforeEach(() => {
    vi.clearAllMocks();
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  it('POST /agent-evals/suites creates a suite for an existing agent', async () => {
    agentRepoMock.getAgent.mockResolvedValue({ id: 'agt_1' });
    repoMock.createSuite.mockResolvedValue({ id: 'evals_1', name: 'Release Gate' });

    const app = await buildApp(adminClaims);
    const body = { agentId: 'agt_1', name: 'Release Gate', passThreshold: 0.9 };
    const res = await app.inject({
      method: 'POST',
      url: '/api/agent-evals/suites',
      payload: body,
    });

    expect(res.statusCode).toBe(200);
    // The agent is looked up under the caller's product before the suite is written.
    expect(agentRepoMock.getAgent).toHaveBeenCalledWith('agt_1', 'lysnrai');
    const expectedSuite = expect.objectContaining({ agentId: 'agt_1', passThreshold: 0.9 });
    expect(repoMock.createSuite).toHaveBeenCalledWith(expectedSuite);
  });

  it('POST /agent-evals/suites/:id/runs creates a version-aware run', async () => {
    // The suite pins version 2 while the agent is currently at version 3.
    repoMock.getSuite.mockResolvedValue({
      id: 'evals_1',
      productId: 'lysnrai',
      agentId: 'agt_1',
      passThreshold: 0.85,
      targetVersion: 2,
    });
    agentRepoMock.getAgent.mockResolvedValue({ id: 'agt_1', currentVersion: 3 });
    agentRepoMock.getAgentVersion.mockResolvedValue({ id: 'agt_1:v2', version: 2 });
    repoMock.listCases.mockResolvedValue([{ id: 'case_1' }, { id: 'case_2' }]);
    repoMock.createRun.mockResolvedValue({ id: 'evalrun_1' });

    const app = await buildApp(adminClaims);
    const res = await app.inject({
      method: 'POST',
      url: '/api/agent-evals/suites/evals_1/runs',
      payload: { releaseGate: true },
    });

    expect(res.statusCode).toBe(200);
    expect(agentRepoMock.getAgentVersion).toHaveBeenCalledWith('agt_1:v2', 'agt_1');
    const expectedRun = expect.objectContaining({
      releaseGate: true,
      totalCases: 2,
      agentVersion: 2,
    });
    expect(repoMock.createRun).toHaveBeenCalledWith(expectedRun);
  });

  it('POST /agent-evals/runs/:id/results finalizes the run verdict', async () => {
    repoMock.getRun.mockResolvedValue({
      id: 'evalrun_1',
      productId: 'lysnrai',
      suiteId: 'evals_1',
      passThreshold: 0.8,
    });
    repoMock.listCases.mockResolvedValue([
      { id: 'case_1', critical: true },
      { id: 'case_2', critical: false },
    ]);
    repoMock.createResults.mockResolvedValue([]);
    repoMock.updateRun.mockResolvedValue({ id: 'evalrun_1', verdict: 'needs_review' });

    const app = await buildApp(adminClaims);
    // The critical case fails, the non-critical one passes.
    const submission = {
      summary: 'Critical regression detected',
      results: [
        { caseId: 'case_1', passed: false, score: 0.2 },
        { caseId: 'case_2', passed: true, score: 1 },
      ],
    };
    const res = await app.inject({
      method: 'POST',
      url: '/api/agent-evals/runs/evalrun_1/results',
      payload: submission,
    });

    expect(res.statusCode).toBe(200);
    expect(repoMock.createResults).toHaveBeenCalled();
    const expectedUpdate = expect.objectContaining({
      verdict: 'needs_review',
      reviewRequired: true,
    });
    expect(repoMock.updateRun).toHaveBeenCalledWith('evalrun_1', 'lysnrai', expectedUpdate);
  });
});
|
||||
232
services/platform-service/src/modules/agent-evals/routes.ts
Normal file
232
services/platform-service/src/modules/agent-evals/routes.ts
Normal file
@ -0,0 +1,232 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import { BadRequestError, ForbiddenError } from '../../lib/errors.js';
|
||||
import * as agentRepo from '../agents/repository.js';
|
||||
import {
|
||||
CreateEvaluationCaseSchema,
|
||||
CreateEvaluationRunSchema,
|
||||
CreateEvaluationSuiteSchema,
|
||||
EvaluationCaseDoc,
|
||||
EvaluationResultDoc,
|
||||
EvaluationRunDoc,
|
||||
EvaluationSuiteDoc,
|
||||
ListEvaluationSuitesQuerySchema,
|
||||
RecordEvaluationResultsSchema,
|
||||
UpdateEvaluationSuiteSchema,
|
||||
} from './types.js';
|
||||
import * as repo from './repository.js';
|
||||
|
||||
/**
 * Extracts the caller's identity from the request's JWT payload and enforces admin access.
 *
 * @param req Request-like object carrying an optional `jwtPayload`.
 * @returns The caller's user id (`sub`) and the product to act on. The product is taken from the
 *   payload, then from the `DEFAULT_PRODUCT_ID` environment variable, then falls back to `'lysnrai'`.
 * @throws ForbiddenError when there is no payload or `sub`, or when `role` is neither
 *   `super_admin` nor `admin`.
 */
function requireAdmin(req: { jwtPayload?: { sub?: string; role?: string; productId?: string } }): {
  userId: string;
  productId: string;
} {
  const claims = req.jwtPayload;
  const userId = claims?.sub;
  if (!claims || !userId) {
    throw new ForbiddenError('Authentication required');
  }

  const isAdmin = claims.role === 'super_admin' || claims.role === 'admin';
  if (!isAdmin) {
    throw new ForbiddenError('Admin access required');
  }

  const productId = claims.productId ?? process.env.DEFAULT_PRODUCT_ID ?? 'lysnrai';
  return { userId, productId };
}
|
||||
|
||||
/**
 * Raises a `BadRequestError` carrying the given message.
 *
 * The `never` return type lets the compiler treat code after a call as unreachable, which is
 * what narrows `safeParse` results at the call sites.
 *
 * @param message Description of the validation failure.
 * @throws BadRequestError always.
 */
function validationError(message: string): never {
  const failure = new BadRequestError(message);
  throw failure;
}
|
||||
|
||||
/** Joins the messages of schema validation issues into one client-facing string. */
function describeIssues(issues: ReadonlyArray<{ message: string }>): string {
  return issues.map(issue => issue.message).join('; ');
}

/**
 * Registers the agent evaluation governance routes on the given Fastify instance.
 *
 * Every handler calls `requireAdmin` first, so all routes need an admin JWT payload, and every
 * handler resolves the addressed suite or run under the caller's product before touching its
 * cases or results.
 *
 * Routes:
 * - `GET   /agent-evals/suites`            list suites of the caller's product
 * - `POST  /agent-evals/suites`            create a suite for an existing agent
 * - `GET   /agent-evals/suites/:id`        fetch one suite
 * - `PATCH /agent-evals/suites/:id`        update one suite
 * - `GET   /agent-evals/suites/:id/cases`  list the cases of a suite
 * - `POST  /agent-evals/suites/:id/cases`  add a case to a suite
 * - `POST  /agent-evals/suites/:id/runs`   queue a run against an agent version
 * - `GET   /agent-evals/runs/:id`          fetch one run
 * - `GET   /agent-evals/runs/:id/results`  list the results of a run
 * - `POST  /agent-evals/runs/:id/results`  record results and finalize the run verdict
 *
 * @param app Fastify instance (or encapsulated plugin scope) to attach the routes to.
 */
export async function agentEvalRoutes(app: FastifyInstance): Promise<void> {
  app.get('/agent-evals/suites', async req => {
    const access = requireAdmin(req);
    const parsed = ListEvaluationSuitesQuerySchema.safeParse(req.query);
    if (!parsed.success) {
      validationError(describeIssues(parsed.error.issues));
    }
    return repo.listSuites(access.productId, parsed.data);
  });

  app.post('/agent-evals/suites', async req => {
    const access = requireAdmin(req);
    const parsed = CreateEvaluationSuiteSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(describeIssues(parsed.error.issues));
    }

    // Called only for its failure mode: the agent must exist under the caller's product.
    await agentRepo.getAgent(parsed.data.agentId, access.productId);

    const now = new Date().toISOString();
    const suite: EvaluationSuiteDoc = {
      id: `evals_${randomUUID()}`,
      productId: access.productId,
      agentId: parsed.data.agentId,
      name: parsed.data.name,
      description: parsed.data.description,
      status: 'draft',
      passThreshold: parsed.data.passThreshold,
      targetVersion: parsed.data.targetVersion,
      tags: parsed.data.tags,
      metadata: parsed.data.metadata,
      createdBy: access.userId,
      createdAt: now,
      updatedAt: now,
    };
    return repo.createSuite(suite);
  });

  app.get('/agent-evals/suites/:id', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    return repo.getSuite(id, access.productId);
  });

  app.patch('/agent-evals/suites/:id', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    const parsed = UpdateEvaluationSuiteSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(describeIssues(parsed.error.issues));
    }
    return repo.updateSuite(id, access.productId, parsed.data);
  });

  app.get('/agent-evals/suites/:id/cases', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    // Cases are queried by suite id alone, so first confirm the suite belongs to the caller's
    // product; otherwise an admin of one product could read another product's cases by id.
    await repo.getSuite(id, access.productId);
    return repo.listCases(id);
  });

  app.post('/agent-evals/suites/:id/cases', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    // Ensures the suite exists under the caller's product before a case is attached to it.
    await repo.getSuite(id, access.productId);
    const parsed = CreateEvaluationCaseSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(describeIssues(parsed.error.issues));
    }

    const doc: EvaluationCaseDoc = {
      id: `${id}:case:${randomUUID()}`,
      suiteId: id,
      productId: access.productId,
      name: parsed.data.name,
      input: parsed.data.input,
      expectedOutput: parsed.data.expectedOutput,
      rubric: parsed.data.rubric,
      tags: parsed.data.tags,
      critical: parsed.data.critical,
      createdAt: new Date().toISOString(),
    };
    return repo.createCase(doc);
  });

  app.post('/agent-evals/suites/:id/runs', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    const suite = await repo.getSuite(id, access.productId);
    // The body is optional for this route; a missing body is validated as an empty object.
    const parsed = CreateEvaluationRunSchema.safeParse(req.body ?? {});
    if (!parsed.success) {
      validationError(describeIssues(parsed.error.issues));
    }

    const agent = await agentRepo.getAgent(suite.agentId, access.productId);
    // Version precedence: explicit request value, then the suite's pinned target version,
    // then the agent's current version.
    const versionId =
      parsed.data.agentVersionId ??
      `${suite.agentId}:v${suite.targetVersion ?? agent.currentVersion}`;
    const version = await agentRepo.getAgentVersion(versionId, suite.agentId);
    const cases = await repo.listCases(id);

    const run: EvaluationRunDoc = {
      id: `evalrun_${randomUUID()}`,
      suiteId: id,
      agentId: suite.agentId,
      productId: access.productId,
      agentVersionId: version.id,
      agentVersion: version.version,
      status: 'queued',
      // The threshold is copied onto the run when it is created.
      passThreshold: suite.passThreshold,
      releaseGate: parsed.data.releaseGate,
      totalCases: cases.length,
      passedCases: 0,
      failedCases: 0,
      triggeredBy: access.userId,
      startedAt: new Date().toISOString(),
      reviewRequired: false,
    };

    return repo.createRun(run);
  });

  app.get('/agent-evals/runs/:id', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    return repo.getRun(id, access.productId);
  });

  app.get('/agent-evals/runs/:id/results', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    // Results are queried by run id alone, so first confirm the run belongs to the caller's
    // product; otherwise an admin of one product could read another product's results by id.
    await repo.getRun(id, access.productId);
    return repo.listResults(id);
  });

  app.post('/agent-evals/runs/:id/results', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    const run = await repo.getRun(id, access.productId);
    const parsed = RecordEvaluationResultsSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(describeIssues(parsed.error.issues));
    }

    const cases = await repo.listCases(run.suiteId);
    const knownCaseIds = new Set(cases.map(item => item.id));
    const submittedCaseIds = new Set<string>();
    for (const result of parsed.data.results) {
      if (!knownCaseIds.has(result.caseId)) {
        validationError(`Unknown evaluation case '${result.caseId}' for suite '${run.suiteId}'`);
      }
      // Result ids are derived from run id + case id, so a repeated case would produce two
      // documents with the same id and would also be counted twice in the score below.
      if (submittedCaseIds.has(result.caseId)) {
        validationError(`Duplicate result for evaluation case '${result.caseId}'`);
      }
      submittedCaseIds.add(result.caseId);
    }

    const now = new Date().toISOString();
    const docs: EvaluationResultDoc[] = parsed.data.results.map(result => ({
      id: `${id}:${result.caseId}`,
      runId: id,
      caseId: result.caseId,
      productId: access.productId,
      passed: result.passed,
      score: result.score,
      actualOutput: result.actualOutput,
      notes: result.notes,
      createdAt: now,
    }));
    await repo.createResults(docs);

    // NOTE(review): the aggregates below cover only the submitted results. A submission that
    // omits cases (including critical ones) is scored as if those cases did not exist —
    // confirm whether partial submissions should be allowed to produce a 'pass' verdict.
    const passedCases = parsed.data.results.filter(result => result.passed).length;
    const failedCases = parsed.data.results.length - passedCases;
    // A result without an explicit score counts as 1 when it passed and 0 when it failed.
    // The schema guarantees at least one result, so the division is safe.
    const averageScore =
      parsed.data.results.reduce(
        (sum, result) => sum + (result.score ?? (result.passed ? 1 : 0)),
        0
      ) / parsed.data.results.length;
    // True when any case flagged `critical` has a failing result in this submission.
    const criticalFailures = cases
      .filter(item => item.critical)
      .some(item =>
        parsed.data.results.some(result => result.caseId === item.id && !result.passed)
      );
    const reviewRequired = criticalFailures || averageScore < run.passThreshold;

    // A critical failure always forces manual review, regardless of the average score.
    const verdict = criticalFailures
      ? 'needs_review'
      : averageScore >= run.passThreshold
        ? 'pass'
        : 'fail';

    return repo.updateRun(id, access.productId, {
      status: 'completed',
      passedCases,
      failedCases,
      score: averageScore,
      verdict,
      completedAt: now,
      summary: parsed.data.summary,
      reviewRequired,
    });
  });
}
|
||||
145
services/platform-service/src/modules/agent-evals/types.ts
Normal file
145
services/platform-service/src/modules/agent-evals/types.ts
Normal file
@ -0,0 +1,145 @@
|
||||
import { z } from 'zod';
|
||||
|
||||
// Allowed values are declared as literal tuples so the inferred enum types stay narrow.
const SUITE_STATUS_VALUES = ['draft', 'active', 'archived'] as const;
const RUN_STATUS_VALUES = ['queued', 'running', 'completed', 'failed'] as const;
const VERDICT_VALUES = ['pass', 'fail', 'needs_review'] as const;

/** Lifecycle states of an evaluation suite. */
export const EvaluationSuiteStatusSchema = z.enum(SUITE_STATUS_VALUES);
/** Execution states of an evaluation run. */
export const EvaluationRunStatusSchema = z.enum(RUN_STATUS_VALUES);
/** Possible outcomes of a finalized evaluation run. */
export const EvaluationVerdictSchema = z.enum(VERDICT_VALUES);
|
||||
|
||||
/** Stored shape of an evaluation suite: a named set of cases tied to one agent. */
export const EvaluationSuiteSchema = z.object({
  // Identity and ownership
  id: z.string().min(1),
  productId: z.string().min(1),
  agentId: z.string().min(1),

  // Descriptive fields
  name: z.string().min(1),
  description: z.string().optional(),
  status: EvaluationSuiteStatusSchema,

  // Evaluation settings: threshold is a fraction in [0, 1]; target version is a positive integer
  passThreshold: z.number().min(0).max(1),
  targetVersion: z.number().int().positive().optional(),

  // Free-form labelling
  tags: z.array(z.string()).default([]),
  metadata: z.record(z.unknown()).optional(),

  // Audit fields
  createdBy: z.string().min(1),
  createdAt: z.string(),
  updatedAt: z.string(),
});

/** Suite shape after schema parsing. */
type EvaluationSuite = z.infer<typeof EvaluationSuiteSchema>;

/** Suite document as stored, with optional `_ts` / `_etag` fields. */
export type EvaluationSuiteDoc = EvaluationSuite & {
  _ts?: number;
  _etag?: string;
};
|
||||
|
||||
/** Stored shape of a single evaluation case belonging to a suite. */
export const EvaluationCaseSchema = z.object({
  // Identity and ownership
  id: z.string().min(1),
  suiteId: z.string().min(1),
  productId: z.string().min(1),

  // What is evaluated and what is expected back
  name: z.string().min(1),
  input: z.record(z.unknown()),
  expectedOutput: z.record(z.unknown()).optional(),
  rubric: z.string().optional(),

  // Labelling; `critical` defaults to false
  tags: z.array(z.string()).default([]),
  critical: z.boolean().default(false),

  createdAt: z.string(),
});

/** Case shape after schema parsing. */
type EvaluationCase = z.infer<typeof EvaluationCaseSchema>;

/** Case document as stored, with optional `_ts` / `_etag` fields. */
export type EvaluationCaseDoc = EvaluationCase & {
  _ts?: number;
  _etag?: string;
};
|
||||
|
||||
/** Stored shape of one execution of a suite against a specific agent version. */
export const EvaluationRunSchema = z.object({
  // Identity and ownership
  id: z.string().min(1),
  suiteId: z.string().min(1),
  agentId: z.string().min(1),
  productId: z.string().min(1),

  // Agent version under evaluation
  agentVersionId: z.string().min(1),
  agentVersion: z.number().int().positive(),

  // Progress and outcome; `verdict` and `score` are optional
  status: EvaluationRunStatusSchema,
  verdict: EvaluationVerdictSchema.optional(),
  passThreshold: z.number().min(0).max(1),
  releaseGate: z.boolean().default(false),

  // Case counters (non-negative integers)
  totalCases: z.number().int().min(0),
  passedCases: z.number().int().min(0).default(0),
  failedCases: z.number().int().min(0).default(0),
  score: z.number().min(0).max(1).optional(),

  // Audit fields
  triggeredBy: z.string().min(1),
  startedAt: z.string(),
  completedAt: z.string().optional(),
  summary: z.string().optional(),
  reviewRequired: z.boolean().default(false),
});

/** Run shape after schema parsing. */
type EvaluationRun = z.infer<typeof EvaluationRunSchema>;

/** Run document as stored, with optional `_ts` / `_etag` fields. */
export type EvaluationRunDoc = EvaluationRun & {
  _ts?: number;
  _etag?: string;
};
|
||||
|
||||
/** Stored shape of the outcome of one case within one run. */
export const EvaluationResultSchema = z.object({
  // Identity and ownership
  id: z.string().min(1),
  runId: z.string().min(1),
  caseId: z.string().min(1),
  productId: z.string().min(1),

  // Outcome; `score`, when present, is a fraction in [0, 1]
  passed: z.boolean(),
  score: z.number().min(0).max(1).optional(),
  actualOutput: z.record(z.unknown()).optional(),
  notes: z.string().optional(),

  createdAt: z.string(),
});

/** Result shape after schema parsing. */
type EvaluationResult = z.infer<typeof EvaluationResultSchema>;

/** Result document as stored, with optional `_ts` / `_etag` fields. */
export type EvaluationResultDoc = EvaluationResult & {
  _ts?: number;
  _etag?: string;
};
|
||||
|
||||
/**
 * Request body for creating a suite.
 * `passThreshold` defaults to 0.8 and `tags` to an empty list when omitted.
 */
export const CreateEvaluationSuiteSchema = z.object({
  // Required
  agentId: z.string().min(1),
  name: z.string().min(1),

  // Optional or defaulted
  description: z.string().optional(),
  passThreshold: z.number().min(0).max(1).default(0.8),
  targetVersion: z.number().int().positive().optional(),
  tags: z.array(z.string()).default([]),
  metadata: z.record(z.unknown()).optional(),
});
|
||||
|
||||
/**
 * Request body for patching a suite. Every field is optional; no defaults are applied,
 * so an omitted field is simply absent from the parsed value.
 */
export const UpdateEvaluationSuiteSchema = z.object({
  // Descriptive fields
  name: z.string().min(1).optional(),
  description: z.string().optional(),
  status: EvaluationSuiteStatusSchema.optional(),

  // Evaluation settings
  passThreshold: z.number().min(0).max(1).optional(),
  targetVersion: z.number().int().positive().optional(),

  // Free-form labelling
  tags: z.array(z.string()).optional(),
  metadata: z.record(z.unknown()).optional(),
});
|
||||
|
||||
/**
 * Request body for adding a case to a suite.
 * `tags` defaults to an empty list and `critical` to false when omitted.
 */
export const CreateEvaluationCaseSchema = z.object({
  // Required
  name: z.string().min(1),
  input: z.record(z.unknown()),

  // Optional or defaulted
  expectedOutput: z.record(z.unknown()).optional(),
  rubric: z.string().optional(),
  tags: z.array(z.string()).default([]),
  critical: z.boolean().default(false),
});
|
||||
|
||||
/**
 * Request body for starting a run.
 * `agentVersionId` is optional; `releaseGate` defaults to false when omitted.
 */
export const CreateEvaluationRunSchema = z.object({
  agentVersionId: z.string().min(1).optional(),
  releaseGate: z.boolean().default(false),
});
|
||||
|
||||
/** One submitted per-case outcome inside a results request body. */
const SubmittedResultSchema = z.object({
  caseId: z.string().min(1),
  passed: z.boolean(),
  score: z.number().min(0).max(1).optional(),
  actualOutput: z.record(z.unknown()).optional(),
  notes: z.string().optional(),
});

/**
 * Request body for recording the results of a run.
 * At least one result is required; `summary` is optional.
 */
export const RecordEvaluationResultsSchema = z.object({
  summary: z.string().optional(),
  results: z.array(SubmittedResultSchema).min(1),
});
|
||||
|
||||
/**
 * Query-string parameters accepted when listing evaluation suites.
 *
 * `agentId` and `status` are optional filters. `limit` is coerced to a number, must be a whole
 * number between 1 and 100, and defaults to 20 when omitted.
 */
export const ListEvaluationSuitesQuerySchema = z.object({
  agentId: z.string().min(1).optional(),
  status: EvaluationSuiteStatusSchema.optional(),
  // `.int()` rejects fractional page sizes such as `limit=1.5`, which would otherwise pass
  // the range check and be forwarded to the datastore query as-is.
  limit: z.coerce.number().int().min(1).max(100).default(20),
});

/** Parsed form of {@link ListEvaluationSuitesQuerySchema}; `limit` is always present. */
export type ListEvaluationSuitesQuery = z.infer<typeof ListEvaluationSuitesQuerySchema>;
|
||||
@ -36,6 +36,7 @@ import { enterpriseRoutes } from './modules/auth/enterprise/routes.js';
|
||||
import { magicLinkRoutes } from './modules/auth/magic-link/routes.js';
|
||||
import { auditRoutes } from './modules/audit/routes.js';
|
||||
import { agentRoutes } from './modules/agents/routes.js';
|
||||
import { agentEvalRoutes } from './modules/agent-evals/routes.js';
|
||||
import { notificationRoutes } from './modules/notifications/routes.js';
|
||||
import { flagRoutes } from './modules/flags/routes.js';
|
||||
import { rateLimitRoutes } from './modules/ratelimit/routes.js';
|
||||
@ -141,6 +142,7 @@ await app.register(enterpriseRoutes, { prefix: '/api' });
|
||||
await app.register(magicLinkRoutes, { prefix: '/api' });
|
||||
await app.register(auditRoutes, { prefix: '/api' });
|
||||
await app.register(agentRoutes, { prefix: '/api' });
|
||||
await app.register(agentEvalRoutes, { prefix: '/api' });
|
||||
await app.register(notificationRoutes, { prefix: '/api' });
|
||||
await app.register(flagRoutes, { prefix: '/api' });
|
||||
await app.register(rateLimitRoutes, { prefix: '/api' });
|
||||
|
||||
Loading…
Reference in New Issue
Block a user