feat(platform-service): add agent evaluation governance

This commit is contained in:
root 2026-03-15 09:26:40 +00:00
parent 8d78b6ce59
commit 3f06427038
7 changed files with 711 additions and 0 deletions

View File

@ -74,6 +74,11 @@ const CONTAINER_DEFS: Record<string, ContainerConfig> = {
// Agent registry and versioned prompt/config definitions
agent_registry: { partitionKeyPath: '/productId' },
agent_versions: { partitionKeyPath: '/agentId' },
// Agent governance / evaluations
agent_evaluation_suites: { partitionKeyPath: '/productId' },
agent_evaluation_cases: { partitionKeyPath: '/suiteId' },
agent_evaluation_runs: { partitionKeyPath: '/productId', defaultTtl: 30 * 86400 },
agent_evaluation_results: { partitionKeyPath: '/runId', defaultTtl: 30 * 86400 },
// Telemetry (client diagnostics — see docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md)
telemetry_events: { partitionKeyPath: '/pk', defaultTtl: 30 * 86400 },
telemetry_error_clusters: { partitionKeyPath: '/pk', defaultTtl: 90 * 86400 },

View File

@ -0,0 +1,80 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { MemoryDatastoreProvider } from '@bytelyst/datastore';
import { _resetDatastoreProvider, setProvider } from '../../lib/datastore.js';
import * as repo from './repository.js';
// Round-trip test for the evaluation repository against the in-memory datastore provider.
describe('agent eval repository', () => {
// A fresh MemoryDatastoreProvider is installed before each test.
beforeEach(() => {
setProvider(new MemoryDatastoreProvider());
});
// The provider is reset after each test.
afterEach(() => {
_resetDatastoreProvider();
});
// Writes one suite, one case, one run and one result, then reads each back through the list/get helpers.
it('stores suites, cases, runs, and results', async () => {
await repo.createSuite({
id: 'evals_1',
productId: 'lysnrai',
agentId: 'agt_1',
name: 'Release Gate',
status: 'draft',
passThreshold: 0.85,
tags: ['release'],
createdBy: 'admin_1',
createdAt: '2026-03-15T00:00:00.000Z',
updatedAt: '2026-03-15T00:00:00.000Z',
});
// Case ids follow the `<suiteId>:case:<n>` shape that the routes also generate.
await repo.createCase({
id: 'evals_1:case:1',
suiteId: 'evals_1',
productId: 'lysnrai',
name: 'Handles incident prompt',
input: { prompt: 'Investigate outage' },
critical: true,
tags: ['critical'],
createdAt: '2026-03-15T00:00:00.000Z',
});
await repo.createRun({
id: 'evalrun_1',
suiteId: 'evals_1',
agentId: 'agt_1',
productId: 'lysnrai',
agentVersionId: 'agt_1:v2',
agentVersion: 2,
status: 'queued',
passThreshold: 0.85,
releaseGate: true,
totalCases: 1,
passedCases: 0,
failedCases: 0,
triggeredBy: 'admin_1',
startedAt: '2026-03-15T00:00:00.000Z',
reviewRequired: false,
});
// Result ids are `<runId>:<caseId>`.
await repo.createResults([
{
id: 'evalrun_1:evals_1:case:1',
runId: 'evalrun_1',
caseId: 'evals_1:case:1',
productId: 'lysnrai',
passed: true,
score: 1,
createdAt: '2026-03-15T00:01:00.000Z',
},
]);
// Read everything back using the key each helper expects (product, suite, or run id).
const suites = await repo.listSuites('lysnrai', { limit: 20 });
const cases = await repo.listCases('evals_1');
const run = await repo.getRun('evalrun_1', 'lysnrai');
const results = await repo.listResults('evalrun_1');
expect(suites).toHaveLength(1);
expect(cases[0].critical).toBe(true);
expect(run.agentVersion).toBe(2);
expect(results[0].passed).toBe(true);
});
});

View File

@ -0,0 +1,104 @@
import { NotFoundError } from '../../lib/errors.js';
import { getCollection } from '../../lib/datastore.js';
import type {
EvaluationCaseDoc,
EvaluationResultDoc,
EvaluationRunDoc,
EvaluationSuiteDoc,
ListEvaluationSuitesQuery,
} from './types.js';
/** Returns the typed handle for the `agent_evaluation_suites` collection (partition key path `/productId`). */
function suiteCollection() {
  const suites = getCollection<EvaluationSuiteDoc>('agent_evaluation_suites', '/productId');
  return suites;
}
/** Returns the typed handle for the `agent_evaluation_cases` collection (partition key path `/suiteId`). */
function caseCollection() {
  const cases = getCollection<EvaluationCaseDoc>('agent_evaluation_cases', '/suiteId');
  return cases;
}
/** Returns the typed handle for the `agent_evaluation_runs` collection (partition key path `/productId`). */
function runCollection() {
  const runs = getCollection<EvaluationRunDoc>('agent_evaluation_runs', '/productId');
  return runs;
}
/** Returns the typed handle for the `agent_evaluation_results` collection (partition key path `/runId`). */
function resultCollection() {
  const results = getCollection<EvaluationResultDoc>('agent_evaluation_results', '/runId');
  return results;
}
/**
 * Persists a new evaluation suite document.
 *
 * @param doc - Fully populated suite document, including `id` and `productId`.
 * @returns Whatever the collection's `create` resolves with for the stored suite.
 */
export async function createSuite(doc: EvaluationSuiteDoc): Promise<EvaluationSuiteDoc> {
  const suites = suiteCollection();
  return suites.create(doc);
}
/**
 * Lists evaluation suites for one product, newest first.
 *
 * @param productId - Product whose suites are listed.
 * @param query - Optional `agentId` / `status` filters plus the page `limit`.
 * @returns Matching suites sorted by `createdAt` descending.
 */
export async function listSuites(
  productId: string,
  query: ListEvaluationSuitesQuery
): Promise<EvaluationSuiteDoc[]> {
  const { agentId, status, limit } = query;
  return suiteCollection().findMany({
    // Optional filters are only added when the caller supplied a truthy value.
    filter: {
      productId,
      ...(agentId ? { agentId } : {}),
      ...(status ? { status } : {}),
    },
    sort: { createdAt: -1 },
    limit,
  });
}
/**
 * Loads a single suite by id within a product.
 *
 * @param id - Suite id.
 * @param productId - Product the suite is looked up under.
 * @returns The stored suite.
 * @throws NotFoundError when the lookup yields nothing.
 */
export async function getSuite(id: string, productId: string): Promise<EvaluationSuiteDoc> {
  const found = await suiteCollection().findById(id, productId);
  if (found) {
    return found;
  }
  throw new NotFoundError(`Evaluation suite '${id}' not found`);
}
/**
 * Applies a partial update to a suite and stamps `updatedAt` with the current time.
 *
 * @param id - Suite id.
 * @param productId - Product the suite is looked up under.
 * @param updates - Fields to overwrite; any `updatedAt` supplied here is replaced.
 * @returns The updated suite.
 * @throws NotFoundError when the update yields nothing.
 */
export async function updateSuite(
  id: string,
  productId: string,
  updates: Partial<EvaluationSuiteDoc>
): Promise<EvaluationSuiteDoc> {
  const suites = suiteCollection();
  const saved = await suites.update(id, productId, {
    ...updates,
    updatedAt: new Date().toISOString(),
  });
  if (saved) {
    return saved;
  }
  throw new NotFoundError(`Evaluation suite '${id}' not found`);
}
/**
 * Persists a new evaluation case document.
 *
 * @param doc - Fully populated case document, including `id` and `suiteId`.
 * @returns Whatever the collection's `create` resolves with for the stored case.
 */
export async function createCase(doc: EvaluationCaseDoc): Promise<EvaluationCaseDoc> {
  const cases = caseCollection();
  return cases.create(doc);
}
/**
 * Lists the cases of a suite, oldest first.
 *
 * @param suiteId - Suite whose cases are listed.
 * @returns At most 500 cases sorted by `createdAt` ascending.
 */
export async function listCases(suiteId: string): Promise<EvaluationCaseDoc[]> {
  const cases = caseCollection();
  return cases.findMany({ filter: { suiteId }, sort: { createdAt: 1 }, limit: 500 });
}
/**
 * Persists a new evaluation run document.
 *
 * @param doc - Fully populated run document, including `id` and `productId`.
 * @returns Whatever the collection's `create` resolves with for the stored run.
 */
export async function createRun(doc: EvaluationRunDoc): Promise<EvaluationRunDoc> {
  const runs = runCollection();
  return runs.create(doc);
}
/**
 * Loads a single run by id within a product.
 *
 * @param id - Run id.
 * @param productId - Product the run is looked up under.
 * @returns The stored run.
 * @throws NotFoundError when the lookup yields nothing.
 */
export async function getRun(id: string, productId: string): Promise<EvaluationRunDoc> {
  const found = await runCollection().findById(id, productId);
  if (found) {
    return found;
  }
  throw new NotFoundError(`Evaluation run '${id}' not found`);
}
/**
 * Applies a partial update to a run.
 *
 * Unlike `updateSuite`, no timestamp is stamped here; callers pass `completedAt` etc. themselves.
 *
 * @param id - Run id.
 * @param productId - Product the run is looked up under.
 * @param updates - Fields to overwrite on the stored run.
 * @returns The updated run.
 * @throws NotFoundError when the update yields nothing.
 */
export async function updateRun(
  id: string,
  productId: string,
  updates: Partial<EvaluationRunDoc>
): Promise<EvaluationRunDoc> {
  const updated = await runCollection().update(id, productId, updates);
  if (!updated) throw new NotFoundError(`Evaluation run '${id}' not found`);
  return updated;
}
/**
 * Persists a batch of evaluation results, issuing all writes concurrently.
 *
 * NOTE(review): the writes are independent, so if one rejects the others may already have been
 * stored — confirm whether callers need all-or-nothing semantics.
 *
 * @param docs - Result documents to store.
 * @returns The stored documents, in the same order as `docs`.
 */
export async function createResults(docs: EvaluationResultDoc[]): Promise<EvaluationResultDoc[]> {
  const pending = Array.from(docs, doc => resultCollection().create(doc));
  return Promise.all(pending);
}
/**
 * Lists the results recorded for a run, oldest first.
 *
 * @param runId - Run whose results are listed.
 * @returns At most 1000 results sorted by `createdAt` ascending.
 */
export async function listResults(runId: string): Promise<EvaluationResultDoc[]> {
  const results = resultCollection();
  return results.findMany({ filter: { runId }, sort: { createdAt: 1 }, limit: 1000 });
}

View File

@ -0,0 +1,143 @@
import Fastify from 'fastify';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
// Mock of the evaluation repository module: one vi.fn() per function listed here.
const repoMock = {
listSuites: vi.fn(),
createSuite: vi.fn(),
getSuite: vi.fn(),
updateSuite: vi.fn(),
listCases: vi.fn(),
createCase: vi.fn(),
createRun: vi.fn(),
getRun: vi.fn(),
createResults: vi.fn(),
listResults: vi.fn(),
updateRun: vi.fn(),
};
// Mock of the agent repository lookups used by the tests below.
const agentRepoMock = {
getAgent: vi.fn(),
getAgentVersion: vi.fn(),
};
// NOTE(review): vitest hoists vi.mock calls, and these factories read the consts above. That is
// presumably safe only because routes.js is imported lazily inside buildApp, after the consts are
// initialised — confirm before switching to a static import (or move the mocks into vi.hoisted).
vi.mock('./repository.js', () => repoMock);
vi.mock('../agents/repository.js', () => agentRepoMock);
/**
 * Builds a Fastify instance with the agent-eval routes mounted under `/api`.
 *
 * @param payload - Optional JWT claims; when given, an `onRequest` hook copies them onto
 *   `req.jwtPayload` for every request.
 * @returns The Fastify app with the routes registered.
 */
async function buildApp(payload?: { sub: string; productId: string; role?: string }) {
  // Imported here, not at module top, so the routes module is loaded at call time.
  const routesModule = await import('./routes.js');
  const server = Fastify({ logger: false });
  if (payload) {
    server.addHook('onRequest', async request => {
      request.jwtPayload = payload;
    });
  }
  await server.register(routesModule.agentEvalRoutes, { prefix: '/api' });
  return server;
}
// Route-level tests: both repositories are mocked, requests are injected through Fastify.
describe('agentEvalRoutes', () => {
// Clear recorded calls before each test so assertions only see that test's calls.
beforeEach(() => {
vi.clearAllMocks();
});
afterEach(() => {
vi.restoreAllMocks();
});
// Suite creation must look the agent up under the caller's product and persist the requested threshold.
it('POST /agent-evals/suites creates a suite for an existing agent', async () => {
agentRepoMock.getAgent.mockResolvedValue({ id: 'agt_1' });
repoMock.createSuite.mockResolvedValue({ id: 'evals_1', name: 'Release Gate' });
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({
method: 'POST',
url: '/api/agent-evals/suites',
payload: {
agentId: 'agt_1',
name: 'Release Gate',
passThreshold: 0.9,
},
});
expect(res.statusCode).toBe(200);
expect(agentRepoMock.getAgent).toHaveBeenCalledWith('agt_1', 'lysnrai');
expect(repoMock.createSuite).toHaveBeenCalledWith(
expect.objectContaining({
agentId: 'agt_1',
passThreshold: 0.9,
})
);
});
// The suite pins targetVersion 2 while the agent's currentVersion is 3: the run must use v2.
it('POST /agent-evals/suites/:id/runs creates a version-aware run', async () => {
repoMock.getSuite.mockResolvedValue({
id: 'evals_1',
productId: 'lysnrai',
agentId: 'agt_1',
passThreshold: 0.85,
targetVersion: 2,
});
agentRepoMock.getAgent.mockResolvedValue({ id: 'agt_1', currentVersion: 3 });
agentRepoMock.getAgentVersion.mockResolvedValue({ id: 'agt_1:v2', version: 2 });
repoMock.listCases.mockResolvedValue([{ id: 'case_1' }, { id: 'case_2' }]);
repoMock.createRun.mockResolvedValue({ id: 'evalrun_1' });
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({
method: 'POST',
url: '/api/agent-evals/suites/evals_1/runs',
payload: {
releaseGate: true,
},
});
expect(res.statusCode).toBe(200);
expect(agentRepoMock.getAgentVersion).toHaveBeenCalledWith('agt_1:v2', 'agt_1');
// totalCases mirrors the two cases returned by listCases.
expect(repoMock.createRun).toHaveBeenCalledWith(
expect.objectContaining({
releaseGate: true,
totalCases: 2,
agentVersion: 2,
})
);
});
// case_1 is critical and is reported as failed, so the run must be flagged for review.
it('POST /agent-evals/runs/:id/results finalizes the run verdict', async () => {
repoMock.getRun.mockResolvedValue({
id: 'evalrun_1',
productId: 'lysnrai',
suiteId: 'evals_1',
passThreshold: 0.8,
});
repoMock.listCases.mockResolvedValue([
{ id: 'case_1', critical: true },
{ id: 'case_2', critical: false },
]);
repoMock.createResults.mockResolvedValue([]);
repoMock.updateRun.mockResolvedValue({ id: 'evalrun_1', verdict: 'needs_review' });
const app = await buildApp({ sub: 'admin_1', productId: 'lysnrai', role: 'admin' });
const res = await app.inject({
method: 'POST',
url: '/api/agent-evals/runs/evalrun_1/results',
payload: {
summary: 'Critical regression detected',
results: [
{ caseId: 'case_1', passed: false, score: 0.2 },
{ caseId: 'case_2', passed: true, score: 1 },
],
},
});
expect(res.statusCode).toBe(200);
expect(repoMock.createResults).toHaveBeenCalled();
expect(repoMock.updateRun).toHaveBeenCalledWith(
'evalrun_1',
'lysnrai',
expect.objectContaining({
verdict: 'needs_review',
reviewRequired: true,
})
);
});
});

View File

@ -0,0 +1,232 @@
import { randomUUID } from 'node:crypto';
import type { FastifyInstance } from 'fastify';
import { BadRequestError, ForbiddenError } from '../../lib/errors.js';
import * as agentRepo from '../agents/repository.js';
import {
CreateEvaluationCaseSchema,
CreateEvaluationRunSchema,
CreateEvaluationSuiteSchema,
EvaluationCaseDoc,
EvaluationResultDoc,
EvaluationRunDoc,
EvaluationSuiteDoc,
ListEvaluationSuitesQuerySchema,
RecordEvaluationResultsSchema,
UpdateEvaluationSuiteSchema,
} from './types.js';
import * as repo from './repository.js';
/**
 * Extracts the caller identity from the request's JWT claims and enforces admin access.
 *
 * @param req - Request-like object carrying optional `jwtPayload` claims.
 * @returns The caller's user id and the product id to scope data access to. The product id falls
 *   back to `process.env.DEFAULT_PRODUCT_ID`, then to `'lysnrai'`, when the claim is absent.
 * @throws ForbiddenError when there is no `sub` claim or the role is neither `super_admin` nor `admin`.
 */
function requireAdmin(req: { jwtPayload?: { sub?: string; role?: string; productId?: string } }): {
  userId: string;
  productId: string;
} {
  const claims = req.jwtPayload;
  const userId = claims?.sub;
  if (!claims || !userId) {
    throw new ForbiddenError('Authentication required');
  }
  const isAdmin = claims.role === 'super_admin' || claims.role === 'admin';
  if (!isAdmin) {
    throw new ForbiddenError('Admin access required');
  }
  const productId = claims.productId ?? process.env.DEFAULT_PRODUCT_ID ?? 'lysnrai';
  return { userId, productId };
}
/**
 * Aborts the current handler with a `BadRequestError`.
 *
 * Declared as returning `never` so TypeScript narrows the code that follows a call.
 *
 * @param message - Text for the error.
 * @throws BadRequestError always.
 */
function validationError(message: string): never {
  const failure = new BadRequestError(message);
  throw failure;
}
/** Joins schema validation issue messages into the single string reported to the client. */
function formatIssues(issues: ReadonlyArray<{ message: string }>): string {
  return issues.map(issue => issue.message).join('; ');
}

/**
 * Registers the agent evaluation governance endpoints (suites, cases, runs, results).
 *
 * Every handler calls `requireAdmin`. Handlers that address a suite or a run by id first load
 * that parent document with the caller's `productId`: cases and results are keyed by
 * `suiteId` / `runId` only, so the parent lookup is what keeps one product's admins from
 * reading or writing another product's evaluation data.
 *
 * @param app - Fastify instance (or encapsulated plugin scope) to attach the routes to.
 */
export async function agentEvalRoutes(app: FastifyInstance): Promise<void> {
  // List suites for the caller's product, optionally filtered by agent and status.
  app.get('/agent-evals/suites', async req => {
    const access = requireAdmin(req);
    const parsed = ListEvaluationSuitesQuerySchema.safeParse(req.query);
    if (!parsed.success) {
      validationError(formatIssues(parsed.error.issues));
    }
    return repo.listSuites(access.productId, parsed.data);
  });

  // Create a suite in `draft` status for an agent of the caller's product.
  app.post('/agent-evals/suites', async req => {
    const access = requireAdmin(req);
    const parsed = CreateEvaluationSuiteSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(formatIssues(parsed.error.issues));
    }
    // Result is unused; the lookup is presumably here to reject unknown agents — confirm that
    // agentRepo.getAgent throws when the agent does not exist.
    await agentRepo.getAgent(parsed.data.agentId, access.productId);
    const now = new Date().toISOString();
    const suite: EvaluationSuiteDoc = {
      id: `evals_${randomUUID()}`,
      productId: access.productId,
      agentId: parsed.data.agentId,
      name: parsed.data.name,
      description: parsed.data.description,
      status: 'draft',
      passThreshold: parsed.data.passThreshold,
      targetVersion: parsed.data.targetVersion,
      tags: parsed.data.tags,
      metadata: parsed.data.metadata,
      createdBy: access.userId,
      createdAt: now,
      updatedAt: now,
    };
    return repo.createSuite(suite);
  });

  // Fetch one suite of the caller's product.
  app.get('/agent-evals/suites/:id', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    return repo.getSuite(id, access.productId);
  });

  // Partially update a suite; only fields accepted by UpdateEvaluationSuiteSchema are passed on.
  app.patch('/agent-evals/suites/:id', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    const parsed = UpdateEvaluationSuiteSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(formatIssues(parsed.error.issues));
    }
    return repo.updateSuite(id, access.productId, parsed.data);
  });

  // List the cases of a suite.
  app.get('/agent-evals/suites/:id/cases', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    // Cases are looked up by suite id alone, so verify the suite belongs to the caller's product
    // first; getSuite throws NotFoundError otherwise.
    await repo.getSuite(id, access.productId);
    return repo.listCases(id);
  });

  // Add a case to a suite of the caller's product.
  app.post('/agent-evals/suites/:id/cases', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    await repo.getSuite(id, access.productId);
    const parsed = CreateEvaluationCaseSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(formatIssues(parsed.error.issues));
    }
    const doc: EvaluationCaseDoc = {
      id: `${id}:case:${randomUUID()}`,
      suiteId: id,
      productId: access.productId,
      name: parsed.data.name,
      input: parsed.data.input,
      expectedOutput: parsed.data.expectedOutput,
      rubric: parsed.data.rubric,
      tags: parsed.data.tags,
      critical: parsed.data.critical,
      createdAt: new Date().toISOString(),
    };
    return repo.createCase(doc);
  });

  // Queue a run of a suite against a specific agent version.
  app.post('/agent-evals/suites/:id/runs', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    const suite = await repo.getSuite(id, access.productId);
    // The body is optional for this endpoint; an absent body is validated as `{}`.
    const parsed = CreateEvaluationRunSchema.safeParse(req.body ?? {});
    if (!parsed.success) {
      validationError(formatIssues(parsed.error.issues));
    }
    const agent = await agentRepo.getAgent(suite.agentId, access.productId);
    // Version precedence: explicit request value, then the suite's pinned targetVersion,
    // then the agent's current version.
    const versionId =
      parsed.data.agentVersionId ??
      `${suite.agentId}:v${suite.targetVersion ?? agent.currentVersion}`;
    const version = await agentRepo.getAgentVersion(versionId, suite.agentId);
    const cases = await repo.listCases(id);
    const run: EvaluationRunDoc = {
      id: `evalrun_${randomUUID()}`,
      suiteId: id,
      agentId: suite.agentId,
      productId: access.productId,
      agentVersionId: version.id,
      agentVersion: version.version,
      status: 'queued',
      // Threshold is copied from the suite as it is at creation time.
      passThreshold: suite.passThreshold,
      releaseGate: parsed.data.releaseGate,
      totalCases: cases.length,
      passedCases: 0,
      failedCases: 0,
      triggeredBy: access.userId,
      startedAt: new Date().toISOString(),
      reviewRequired: false,
    };
    return repo.createRun(run);
  });

  // Fetch one run of the caller's product.
  app.get('/agent-evals/runs/:id', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    return repo.getRun(id, access.productId);
  });

  // List the results recorded for a run.
  app.get('/agent-evals/runs/:id/results', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    // Results are looked up by run id alone, so verify the run belongs to the caller's product
    // first; getRun throws NotFoundError otherwise.
    await repo.getRun(id, access.productId);
    return repo.listResults(id);
  });

  // Record per-case results for a run and finalize its score and verdict.
  app.post('/agent-evals/runs/:id/results', async req => {
    const access = requireAdmin(req);
    const { id } = req.params as { id: string };
    const run = await repo.getRun(id, access.productId);
    const parsed = RecordEvaluationResultsSchema.safeParse(req.body);
    if (!parsed.success) {
      validationError(formatIssues(parsed.error.issues));
    }
    const submitted = parsed.data.results;
    const cases = await repo.listCases(run.suiteId);
    const knownCaseIds = new Set(cases.map(item => item.id));
    const seenCaseIds = new Set<string>();
    for (const result of submitted) {
      if (!knownCaseIds.has(result.caseId)) {
        validationError(`Unknown evaluation case '${result.caseId}' for suite '${run.suiteId}'`);
      }
      // Result ids are derived from (runId, caseId): a repeated caseId would collide on id and
      // be counted twice in the score.
      if (seenCaseIds.has(result.caseId)) {
        validationError(`Duplicate result for evaluation case '${result.caseId}'`);
      }
      seenCaseIds.add(result.caseId);
    }
    const now = new Date().toISOString();
    const docs: EvaluationResultDoc[] = submitted.map(result => ({
      id: `${id}:${result.caseId}`,
      runId: id,
      caseId: result.caseId,
      productId: access.productId,
      passed: result.passed,
      score: result.score,
      actualOutput: result.actualOutput,
      notes: result.notes,
      createdAt: now,
    }));
    await repo.createResults(docs);

    // NOTE(review): counts and score cover only the submitted results, not every case in the
    // suite — confirm whether a partial submission should be allowed to complete the run.
    const passedCases = submitted.filter(result => result.passed).length;
    const failedCases = submitted.length - passedCases;
    // A result without an explicit score contributes 1 when passed and 0 when failed.
    // `submitted` is non-empty (schema enforces min(1)), so the division is safe.
    const averageScore =
      submitted.reduce((sum, result) => sum + (result.score ?? (result.passed ? 1 : 0)), 0) /
      submitted.length;
    const failedCaseIds = new Set(
      submitted.filter(result => !result.passed).map(result => result.caseId)
    );
    const criticalFailures = cases.some(item => item.critical && failedCaseIds.has(item.id));
    // Any failed critical case forces manual review regardless of the average score.
    const reviewRequired = criticalFailures || averageScore < run.passThreshold;
    const verdict = criticalFailures
      ? 'needs_review'
      : averageScore >= run.passThreshold
        ? 'pass'
        : 'fail';
    return repo.updateRun(id, access.productId, {
      status: 'completed',
      passedCases,
      failedCases,
      score: averageScore,
      verdict,
      completedAt: now,
      summary: parsed.data.summary,
      reviewRequired,
    });
  });
}

View File

@ -0,0 +1,145 @@
import { z } from 'zod';
// Allowed values for a suite's `status` field.
export const EvaluationSuiteStatusSchema = z.enum(['draft', 'active', 'archived']);
// Allowed values for a run's `status` field.
export const EvaluationRunStatusSchema = z.enum(['queued', 'running', 'completed', 'failed']);
// Allowed values for a run's `verdict` field.
export const EvaluationVerdictSchema = z.enum(['pass', 'fail', 'needs_review']);
// Stored shape of an evaluation suite: a named set of cases attached to one agent of one product.
export const EvaluationSuiteSchema = z.object({
id: z.string().min(1),
productId: z.string().min(1),
agentId: z.string().min(1),
name: z.string().min(1),
description: z.string().optional(),
status: EvaluationSuiteStatusSchema,
// Must lie in the inclusive range 0..1.
passThreshold: z.number().min(0).max(1),
// Optional positive integer version number.
targetVersion: z.number().int().positive().optional(),
tags: z.array(z.string()).default([]),
metadata: z.record(z.unknown()).optional(),
createdBy: z.string().min(1),
// NOTE(review): timestamps are validated as plain strings only; the format is not enforced here.
createdAt: z.string(),
updatedAt: z.string(),
});
// Suite document type: the schema's output plus optional `_ts` / `_etag` fields
// (presumably datastore-managed metadata — confirm against the datastore provider).
export type EvaluationSuiteDoc = z.infer<typeof EvaluationSuiteSchema> & {
_ts?: number;
_etag?: string;
};
// Stored shape of a single evaluation case belonging to a suite.
export const EvaluationCaseSchema = z.object({
id: z.string().min(1),
suiteId: z.string().min(1),
productId: z.string().min(1),
name: z.string().min(1),
// Free-form key/value payloads; their contents are not validated here.
input: z.record(z.unknown()),
expectedOutput: z.record(z.unknown()).optional(),
rubric: z.string().optional(),
tags: z.array(z.string()).default([]),
// Defaults to false when omitted.
critical: z.boolean().default(false),
createdAt: z.string(),
});
// Case document type: the schema's output plus optional `_ts` / `_etag` fields
// (presumably datastore-managed metadata — confirm against the datastore provider).
export type EvaluationCaseDoc = z.infer<typeof EvaluationCaseSchema> & {
_ts?: number;
_etag?: string;
};
// Stored shape of one execution of a suite against a specific agent version.
export const EvaluationRunSchema = z.object({
id: z.string().min(1),
suiteId: z.string().min(1),
agentId: z.string().min(1),
productId: z.string().min(1),
agentVersionId: z.string().min(1),
agentVersion: z.number().int().positive(),
status: EvaluationRunStatusSchema,
// Optional: a run may have no verdict.
verdict: EvaluationVerdictSchema.optional(),
// Must lie in the inclusive range 0..1.
passThreshold: z.number().min(0).max(1),
releaseGate: z.boolean().default(false),
totalCases: z.number().int().min(0),
passedCases: z.number().int().min(0).default(0),
failedCases: z.number().int().min(0).default(0),
// Optional score in the inclusive range 0..1.
score: z.number().min(0).max(1).optional(),
triggeredBy: z.string().min(1),
startedAt: z.string(),
completedAt: z.string().optional(),
summary: z.string().optional(),
reviewRequired: z.boolean().default(false),
});
// Run document type: the schema's output plus optional `_ts` / `_etag` fields
// (presumably datastore-managed metadata — confirm against the datastore provider).
export type EvaluationRunDoc = z.infer<typeof EvaluationRunSchema> & {
_ts?: number;
_etag?: string;
};
// Stored shape of the outcome of one case within one run.
export const EvaluationResultSchema = z.object({
id: z.string().min(1),
runId: z.string().min(1),
caseId: z.string().min(1),
productId: z.string().min(1),
passed: z.boolean(),
// Optional score in the inclusive range 0..1.
score: z.number().min(0).max(1).optional(),
actualOutput: z.record(z.unknown()).optional(),
notes: z.string().optional(),
createdAt: z.string(),
});
// Result document type: the schema's output plus optional `_ts` / `_etag` fields
// (presumably datastore-managed metadata — confirm against the datastore provider).
export type EvaluationResultDoc = z.infer<typeof EvaluationResultSchema> & {
_ts?: number;
_etag?: string;
};
// Request-body schema for creating a suite. It has no id, product, status or audit fields.
export const CreateEvaluationSuiteSchema = z.object({
agentId: z.string().min(1),
name: z.string().min(1),
description: z.string().optional(),
// Defaults to 0.8 when the client omits it.
passThreshold: z.number().min(0).max(1).default(0.8),
targetVersion: z.number().int().positive().optional(),
tags: z.array(z.string()).default([]),
metadata: z.record(z.unknown()).optional(),
});
// Request-body schema for patching a suite: every field is optional.
// `agentId`, `productId` and `id` are not part of this schema.
export const UpdateEvaluationSuiteSchema = z.object({
name: z.string().min(1).optional(),
description: z.string().optional(),
status: EvaluationSuiteStatusSchema.optional(),
passThreshold: z.number().min(0).max(1).optional(),
targetVersion: z.number().int().positive().optional(),
tags: z.array(z.string()).optional(),
metadata: z.record(z.unknown()).optional(),
});
// Request-body schema for adding a case to a suite.
export const CreateEvaluationCaseSchema = z.object({
name: z.string().min(1),
input: z.record(z.unknown()),
expectedOutput: z.record(z.unknown()).optional(),
rubric: z.string().optional(),
tags: z.array(z.string()).default([]),
// Cases are non-critical unless the client says otherwise.
critical: z.boolean().default(false),
});
// Request-body schema for starting a run; both fields may be omitted.
export const CreateEvaluationRunSchema = z.object({
// Optional explicit agent version id.
agentVersionId: z.string().min(1).optional(),
releaseGate: z.boolean().default(false),
});
// Request-body schema for submitting a run's results.
export const RecordEvaluationResultsSchema = z.object({
summary: z.string().optional(),
// At least one per-case result is required; no upper bound is enforced here.
results: z
.array(
z.object({
caseId: z.string().min(1),
passed: z.boolean(),
// Optional score in the inclusive range 0..1.
score: z.number().min(0).max(1).optional(),
actualOutput: z.record(z.unknown()).optional(),
notes: z.string().optional(),
})
)
.min(1),
});
/**
 * Query-string schema for listing suites.
 *
 * `limit` is coerced from its string form and must be a whole number between 1 and 100
 * (default 20). Fractional values such as `1.5` are rejected instead of reaching the datastore.
 */
export const ListEvaluationSuitesQuerySchema = z.object({
  agentId: z.string().min(1).optional(),
  status: EvaluationSuiteStatusSchema.optional(),
  limit: z.coerce.number().int().min(1).max(100).default(20),
});

/** Parsed (post-default) form of the suite list query. */
export type ListEvaluationSuitesQuery = z.infer<typeof ListEvaluationSuitesQuerySchema>;

View File

@ -36,6 +36,7 @@ import { enterpriseRoutes } from './modules/auth/enterprise/routes.js';
import { magicLinkRoutes } from './modules/auth/magic-link/routes.js';
import { auditRoutes } from './modules/audit/routes.js';
import { agentRoutes } from './modules/agents/routes.js';
import { agentEvalRoutes } from './modules/agent-evals/routes.js';
import { notificationRoutes } from './modules/notifications/routes.js';
import { flagRoutes } from './modules/flags/routes.js';
import { rateLimitRoutes } from './modules/ratelimit/routes.js';
@ -141,6 +142,7 @@ await app.register(enterpriseRoutes, { prefix: '/api' });
await app.register(magicLinkRoutes, { prefix: '/api' });
await app.register(auditRoutes, { prefix: '/api' });
await app.register(agentRoutes, { prefix: '/api' });
// Agent evaluation governance endpoints, mounted under the shared /api prefix.
await app.register(agentEvalRoutes, { prefix: '/api' });
await app.register(notificationRoutes, { prefix: '/api' });
await app.register(flagRoutes, { prefix: '/api' });
await app.register(rateLimitRoutes, { prefix: '/api' });