feat(platform-service): add agent evaluation governance

2026-03-15 09:26:40 +00:00 · 2026-03-15 09:26:40 +00:00 · 3f06427038
commit 3f06427038
parent 8d78b6ce59
7 changed files with 711 additions and 0 deletions
--- a/services/platform-service/src/lib/cosmos-init.ts
+++ b/services/platform-service/src/lib/cosmos-init.ts
@ -74,6 +74,11 @@ const CONTAINER_DEFS: Record<string, ContainerConfig> = {
  // Agent registry and versioned prompt/config definitions
  agent_registry: { partitionKeyPath: '/productId' },
  agent_versions: { partitionKeyPath: '/agentId' },
+  // Agent governance / evaluations
+  agent_evaluation_suites: { partitionKeyPath: '/productId' },
+  agent_evaluation_cases: { partitionKeyPath: '/suiteId' },
+  agent_evaluation_runs: { partitionKeyPath: '/productId', defaultTtl: 30 * 86400 },
+  agent_evaluation_results: { partitionKeyPath: '/runId', defaultTtl: 30 * 86400 },
  // Telemetry (client diagnostics — see docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md)
  telemetry_events: { partitionKeyPath: '/pk', defaultTtl: 30 * 86400 },
  telemetry_error_clusters: { partitionKeyPath: '/pk', defaultTtl: 90 * 86400 },
--- a/services/platform-service/src/modules/agent-evals/repository.test.ts
+++ b/services/platform-service/src/modules/agent-evals/repository.test.ts
@ -0,0 +1,80 @@
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+import { MemoryDatastoreProvider } from '@bytelyst/datastore';
+import { _resetDatastoreProvider, setProvider } from '../../lib/datastore.js';
+import * as repo from './repository.js';
+
+describe('agent eval repository', () => {
+  // Identifiers shared by the seeded documents so their relationships are explicit.
+  const productId = 'lysnrai';
+  const suiteId = 'evals_1';
+  const caseId = `${suiteId}:case:1`;
+  const runId = 'evalrun_1';
+  const seededAt = '2026-03-15T00:00:00.000Z';
+
+  // Install a fresh in-memory provider before each test and reset it afterwards.
+  beforeEach(() => {
+    const provider = new MemoryDatastoreProvider();
+    setProvider(provider);
+  });
+
+  afterEach(() => {
+    _resetDatastoreProvider();
+  });
+
+  it('stores suites, cases, runs, and results', async () => {
+    // Seed one document through each create function of the repository.
+    await repo.createSuite({
+      id: suiteId,
+      productId,
+      agentId: 'agt_1',
+      name: 'Release Gate',
+      status: 'draft',
+      passThreshold: 0.85,
+      tags: ['release'],
+      createdBy: 'admin_1',
+      createdAt: seededAt,
+      updatedAt: seededAt,
+    });
+
+    await repo.createCase({
+      id: caseId,
+      suiteId,
+      productId,
+      name: 'Handles incident prompt',
+      input: { prompt: 'Investigate outage' },
+      critical: true,
+      tags: ['critical'],
+      createdAt: seededAt,
+    });
+
+    await repo.createRun({
+      id: runId,
+      suiteId,
+      agentId: 'agt_1',
+      productId,
+      agentVersionId: 'agt_1:v2',
+      agentVersion: 2,
+      status: 'queued',
+      passThreshold: 0.85,
+      releaseGate: true,
+      totalCases: 1,
+      passedCases: 0,
+      failedCases: 0,
+      triggeredBy: 'admin_1',
+      startedAt: seededAt,
+      reviewRequired: false,
+    });
+
+    await repo.createResults([
+      {
+        id: `${runId}:${caseId}`,
+        runId,
+        caseId,
+        productId,
+        passed: true,
+        score: 1,
+        createdAt: '2026-03-15T00:01:00.000Z',
+      },
+    ]);
+
+    // Read everything back through the matching list/get functions.
+    const storedSuites = await repo.listSuites(productId, { limit: 20 });
+    const storedCases = await repo.listCases(suiteId);
+    const storedRun = await repo.getRun(runId, productId);
+    const storedResults = await repo.listResults(runId);
+
+    expect(storedSuites).toHaveLength(1);
+    expect(storedCases[0].critical).toBe(true);
+    expect(storedRun.agentVersion).toBe(2);
+    expect(storedResults[0].passed).toBe(true);
+  });
+});
--- a/services/platform-service/src/modules/agent-evals/repository.ts
+++ b/services/platform-service/src/modules/agent-evals/repository.ts
@ -0,0 +1,104 @@
+import { NotFoundError } from '../../lib/errors.js';
+import { getCollection } from '../../lib/datastore.js';
+import type {
+  EvaluationCaseDoc,
+  EvaluationResultDoc,
+  EvaluationRunDoc,
+  EvaluationSuiteDoc,
+  ListEvaluationSuitesQuery,
+} from './types.js';
+
+/** Returns the `agent_evaluation_suites` collection handle, keyed by `/productId`. */
+function suiteCollection() {
+  const suites = getCollection<EvaluationSuiteDoc>('agent_evaluation_suites', '/productId');
+  return suites;
+}
+
+/** Returns the `agent_evaluation_cases` collection handle, keyed by `/suiteId`. */
+function caseCollection() {
+  const cases = getCollection<EvaluationCaseDoc>('agent_evaluation_cases', '/suiteId');
+  return cases;
+}
+
+/** Returns the `agent_evaluation_runs` collection handle, keyed by `/productId`. */
+function runCollection() {
+  const runs = getCollection<EvaluationRunDoc>('agent_evaluation_runs', '/productId');
+  return runs;
+}
+
+/** Returns the `agent_evaluation_results` collection handle, keyed by `/runId`. */
+function resultCollection() {
+  const results = getCollection<EvaluationResultDoc>('agent_evaluation_results', '/runId');
+  return results;
+}
+
+/**
+ * Persists a new evaluation suite document.
+ *
+ * @param doc Fully populated suite document to store.
+ * @returns The document as returned by the datastore.
+ */
+export async function createSuite(doc: EvaluationSuiteDoc): Promise<EvaluationSuiteDoc> {
+  const suites = suiteCollection();
+  const created = await suites.create(doc);
+  return created;
+}
+
+/**
+ * Lists the evaluation suites of one product, newest first.
+ *
+ * @param productId Product whose suites are returned.
+ * @param query Optional `agentId` / `status` filters (applied only when truthy) and the page size.
+ * @returns Matching suites sorted by `createdAt` descending.
+ */
+export async function listSuites(
+  productId: string,
+  query: ListEvaluationSuitesQuery
+): Promise<EvaluationSuiteDoc[]> {
+  // Each optional filter contributes either one key or nothing at all.
+  const byAgent = query.agentId ? { agentId: query.agentId } : {};
+  const byStatus = query.status ? { status: query.status } : {};
+
+  const suites = suiteCollection();
+  return suites.findMany({
+    filter: { productId, ...byAgent, ...byStatus },
+    sort: { createdAt: -1 },
+    limit: query.limit,
+  });
+}
+
+/**
+ * Loads a single suite by id within a product.
+ *
+ * @param id Suite identifier.
+ * @param productId Product the suite must belong to.
+ * @throws NotFoundError when no suite is found for that id and product.
+ */
+export async function getSuite(id: string, productId: string): Promise<EvaluationSuiteDoc> {
+  const found = await suiteCollection().findById(id, productId);
+  if (found) {
+    return found;
+  }
+  throw new NotFoundError(`Evaluation suite '${id}' not found`);
+}
+
+/**
+ * Applies a partial update to a suite and stamps `updatedAt` with the current time.
+ *
+ * @param id Suite identifier.
+ * @param productId Product the suite must belong to.
+ * @param updates Fields to overwrite; any `updatedAt` supplied here is replaced.
+ * @throws NotFoundError when the datastore reports nothing was updated.
+ */
+export async function updateSuite(
+  id: string,
+  productId: string,
+  updates: Partial<EvaluationSuiteDoc>
+): Promise<EvaluationSuiteDoc> {
+  const suites = suiteCollection();
+  const patch = { ...updates, updatedAt: new Date().toISOString() };
+  const saved = await suites.update(id, productId, patch);
+  if (saved) {
+    return saved;
+  }
+  throw new NotFoundError(`Evaluation suite '${id}' not found`);
+}
+
+/**
+ * Persists a new evaluation case document.
+ *
+ * @param doc Fully populated case document to store.
+ * @returns The document as returned by the datastore.
+ */
+export async function createCase(doc: EvaluationCaseDoc): Promise<EvaluationCaseDoc> {
+  const cases = caseCollection();
+  const created = await cases.create(doc);
+  return created;
+}
+
+/**
+ * Lists the cases of a suite, oldest first, capped at 500 documents.
+ *
+ * Note: the lookup filters by suite id only; callers are responsible for any product scoping.
+ *
+ * @param suiteId Suite whose cases are returned.
+ */
+export async function listCases(suiteId: string): Promise<EvaluationCaseDoc[]> {
+  const cases = caseCollection();
+  const matches = await cases.findMany({
+    filter: { suiteId },
+    sort: { createdAt: 1 },
+    limit: 500,
+  });
+  return matches;
+}
+
+/**
+ * Persists a new evaluation run document.
+ *
+ * @param doc Fully populated run document to store.
+ * @returns The document as returned by the datastore.
+ */
+export async function createRun(doc: EvaluationRunDoc): Promise<EvaluationRunDoc> {
+  const runs = runCollection();
+  const created = await runs.create(doc);
+  return created;
+}
+
+/**
+ * Loads a single run by id within a product.
+ *
+ * @param id Run identifier.
+ * @param productId Product the run must belong to.
+ * @throws NotFoundError when no run is found for that id and product.
+ */
+export async function getRun(id: string, productId: string): Promise<EvaluationRunDoc> {
+  const found = await runCollection().findById(id, productId);
+  if (found) {
+    return found;
+  }
+  throw new NotFoundError(`Evaluation run '${id}' not found`);
+}
+
+/**
+ * Applies a partial update to a run.
+ *
+ * Unlike `updateSuite`, no timestamp is added here; callers pass every field they want changed.
+ *
+ * @param id Run identifier.
+ * @param productId Product the run must belong to.
+ * @param updates Fields to overwrite on the stored run.
+ * @returns The updated run document.
+ * @throws NotFoundError when the datastore reports nothing was updated.
+ */
+export async function updateRun(
+  id: string,
+  productId: string,
+  updates: Partial<EvaluationRunDoc>
+): Promise<EvaluationRunDoc> {
+  const updated = await runCollection().update(id, productId, updates);
+  if (!updated) throw new NotFoundError(`Evaluation run '${id}' not found`);
+  return updated;
+}
+
+/**
+ * Persists a batch of evaluation results, issuing all writes concurrently.
+ *
+ * The promise rejects as soon as any single write rejects; writes that already
+ * completed are not rolled back by this function.
+ *
+ * @param docs Result documents to store.
+ * @returns The stored documents, in the same order as `docs`.
+ */
+export async function createResults(docs: EvaluationResultDoc[]): Promise<EvaluationResultDoc[]> {
+  const stored = await Promise.all(docs.map(doc => resultCollection().create(doc)));
+  return stored;
+}
+
+/**
+ * Lists the results recorded for a run, oldest first, capped at 1000 documents.
+ *
+ * Note: the lookup filters by run id only; callers are responsible for any product scoping.
+ *
+ * @param runId Run whose results are returned.
+ */
+export async function listResults(runId: string): Promise<EvaluationResultDoc[]> {
+  const results = resultCollection();
+  const matches = await results.findMany({
+    filter: { runId },
+    sort: { createdAt: 1 },
+    limit: 1000,
+  });
+  return matches;
+}
--- a/services/platform-service/src/modules/agent-evals/routes.test.ts
+++ b/services/platform-service/src/modules/agent-evals/routes.test.ts
@ -0,0 +1,143 @@
+import Fastify from 'fastify';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Stubs for the ./repository.js functions used by the routes under test; individual
+// tests configure return values on these vi.fn() instances.
+const repoMock = {
+  listSuites: vi.fn(),
+  createSuite: vi.fn(),
+  getSuite: vi.fn(),
+  updateSuite: vi.fn(),
+  listCases: vi.fn(),
+  createCase: vi.fn(),
+  createRun: vi.fn(),
+  getRun: vi.fn(),
+  createResults: vi.fn(),
+  listResults: vi.fn(),
+  updateRun: vi.fn(),
+};
+
+// Stubs for the two ../agents/repository.js lookups the evaluation routes rely on.
+const agentRepoMock = {
+  getAgent: vi.fn(),
+  getAgentVersion: vi.fn(),
+};
+
+// Replace both repository modules with the stub objects above.
+// NOTE(review): vitest hoists vi.mock calls; these factories read the consts declared above,
+// which presumably only works because './routes.js' is loaded lazily via the dynamic import in
+// buildApp — confirm before adding a static import of the routes module to this file.
+vi.mock('./repository.js', () => repoMock);
+vi.mock('../agents/repository.js', () => agentRepoMock);
+
+/**
+ * Creates a Fastify instance with the evaluation routes mounted under `/api`.
+ *
+ * @param payload When provided, an `onRequest` hook copies it onto `req.jwtPayload` for every
+ *   request; when omitted, requests carry no JWT payload.
+ */
+async function buildApp(payload?: { sub: string; productId: string; role?: string }) {
+  // Imported lazily so the module mocks are in place when the routes load their repositories.
+  const routesModule = await import('./routes.js');
+  const server = Fastify({ logger: false });
+
+  if (payload) {
+    server.addHook('onRequest', async request => {
+      request.jwtPayload = payload;
+    });
+  }
+
+  await server.register(routesModule.agentEvalRoutes, { prefix: '/api' });
+  return server;
+}
+
+describe('agentEvalRoutes', () => {
+  // JWT payload used by every request in this suite.
+  const adminJwt = { sub: 'admin_1', productId: 'lysnrai', role: 'admin' };
+
+  /** Builds an app authenticated with `adminJwt` and injects a POST request into it. */
+  async function postAsAdmin(url: string, payload: Record<string, unknown>) {
+    const app = await buildApp(adminJwt);
+    return app.inject({ method: 'POST', url, payload });
+  }
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('POST /agent-evals/suites creates a suite for an existing agent', async () => {
+    agentRepoMock.getAgent.mockResolvedValue({ id: 'agt_1' });
+    repoMock.createSuite.mockResolvedValue({ id: 'evals_1', name: 'Release Gate' });
+
+    const response = await postAsAdmin('/api/agent-evals/suites', {
+      agentId: 'agt_1',
+      name: 'Release Gate',
+      passThreshold: 0.9,
+    });
+
+    expect(response.statusCode).toBe(200);
+    // The agent lookup must be scoped to the caller's product.
+    expect(agentRepoMock.getAgent).toHaveBeenCalledWith('agt_1', 'lysnrai');
+    const expectedSuite = expect.objectContaining({ agentId: 'agt_1', passThreshold: 0.9 });
+    expect(repoMock.createSuite).toHaveBeenCalledWith(expectedSuite);
+  });
+
+  it('POST /agent-evals/suites/:id/runs creates a version-aware run', async () => {
+    // The suite pins version 2 while the agent is currently on version 3.
+    repoMock.getSuite.mockResolvedValue({
+      id: 'evals_1',
+      productId: 'lysnrai',
+      agentId: 'agt_1',
+      passThreshold: 0.85,
+      targetVersion: 2,
+    });
+    agentRepoMock.getAgent.mockResolvedValue({ id: 'agt_1', currentVersion: 3 });
+    agentRepoMock.getAgentVersion.mockResolvedValue({ id: 'agt_1:v2', version: 2 });
+    repoMock.listCases.mockResolvedValue([{ id: 'case_1' }, { id: 'case_2' }]);
+    repoMock.createRun.mockResolvedValue({ id: 'evalrun_1' });
+
+    const response = await postAsAdmin('/api/agent-evals/suites/evals_1/runs', {
+      releaseGate: true,
+    });
+
+    expect(response.statusCode).toBe(200);
+    expect(agentRepoMock.getAgentVersion).toHaveBeenCalledWith('agt_1:v2', 'agt_1');
+    const expectedRun = expect.objectContaining({
+      releaseGate: true,
+      totalCases: 2,
+      agentVersion: 2,
+    });
+    expect(repoMock.createRun).toHaveBeenCalledWith(expectedRun);
+  });
+
+  it('POST /agent-evals/runs/:id/results finalizes the run verdict', async () => {
+    repoMock.getRun.mockResolvedValue({
+      id: 'evalrun_1',
+      productId: 'lysnrai',
+      suiteId: 'evals_1',
+      passThreshold: 0.8,
+    });
+    repoMock.listCases.mockResolvedValue([
+      { id: 'case_1', critical: true },
+      { id: 'case_2', critical: false },
+    ]);
+    repoMock.createResults.mockResolvedValue([]);
+    repoMock.updateRun.mockResolvedValue({ id: 'evalrun_1', verdict: 'needs_review' });
+
+    // The critical case fails, so the verdict must be escalated to review.
+    const response = await postAsAdmin('/api/agent-evals/runs/evalrun_1/results', {
+      summary: 'Critical regression detected',
+      results: [
+        { caseId: 'case_1', passed: false, score: 0.2 },
+        { caseId: 'case_2', passed: true, score: 1 },
+      ],
+    });
+
+    expect(response.statusCode).toBe(200);
+    expect(repoMock.createResults).toHaveBeenCalled();
+    const expectedUpdate = expect.objectContaining({
+      verdict: 'needs_review',
+      reviewRequired: true,
+    });
+    expect(repoMock.updateRun).toHaveBeenCalledWith('evalrun_1', 'lysnrai', expectedUpdate);
+  });
+});
--- a/services/platform-service/src/modules/agent-evals/routes.ts
+++ b/services/platform-service/src/modules/agent-evals/routes.ts
@ -0,0 +1,232 @@
+import { randomUUID } from 'node:crypto';
+import type { FastifyInstance } from 'fastify';
+import { BadRequestError, ForbiddenError } from '../../lib/errors.js';
+import * as agentRepo from '../agents/repository.js';
+import {
+  CreateEvaluationCaseSchema,
+  CreateEvaluationRunSchema,
+  CreateEvaluationSuiteSchema,
+  EvaluationCaseDoc,
+  EvaluationResultDoc,
+  EvaluationRunDoc,
+  EvaluationSuiteDoc,
+  ListEvaluationSuitesQuerySchema,
+  RecordEvaluationResultsSchema,
+  UpdateEvaluationSuiteSchema,
+} from './types.js';
+import * as repo from './repository.js';
+
+/**
+ * Authorizes a request as an admin and resolves the product it may act on.
+ *
+ * @param req Request-like object carrying the decoded JWT payload, if any.
+ * @returns The caller's user id (`sub`) and product id. The product id falls back to
+ *   `process.env.DEFAULT_PRODUCT_ID` and then to `'lysnrai'` when the payload has none.
+ * @throws ForbiddenError when the payload has no `sub`, or when `role` is neither
+ *   `super_admin` nor `admin`.
+ */
+function requireAdmin(req: { jwtPayload?: { sub?: string; role?: string; productId?: string } }): {
+  userId: string;
+  productId: string;
+} {
+  const jwt = req.jwtPayload;
+  const userId = jwt?.sub;
+  if (!userId) {
+    throw new ForbiddenError('Authentication required');
+  }
+
+  const role = jwt?.role;
+  const isAdmin = role === 'super_admin' || role === 'admin';
+  if (!isAdmin) {
+    throw new ForbiddenError('Admin access required');
+  }
+
+  const productId = jwt?.productId ?? process.env.DEFAULT_PRODUCT_ID ?? 'lysnrai';
+  return { userId, productId };
+}
+
+/**
+ * Raises a `BadRequestError` carrying the given message.
+ *
+ * Declared as returning `never` so that callers' control-flow narrowing continues after the call.
+ */
+function validationError(message: string): never {
+  const failure = new BadRequestError(message);
+  throw failure;
+}
+
+/**
+ * Registers the agent evaluation governance endpoints (suites, cases, runs, results).
+ *
+ * Every handler calls `requireAdmin` first and scopes its reads and writes to the product id
+ * that call returns. Request validation failures are raised through `validationError` as a
+ * `BadRequestError`; missing suites or runs surface whatever the repository throws.
+ *
+ * @param app Fastify instance (or plugin scope) the routes are attached to.
+ */
+export async function agentEvalRoutes(app: FastifyInstance) {
+  // List suites of the caller's product, optionally filtered by agent and status.
+  app.get('/agent-evals/suites', async req => {
+    const access = requireAdmin(req);
+    const parsed = ListEvaluationSuitesQuerySchema.safeParse(req.query);
+    if (!parsed.success) {
+      validationError(parsed.error.issues.map(issue => issue.message).join('; '));
+    }
+    return repo.listSuites(access.productId, parsed.data);
+  });
+
+  // Create a draft suite for an agent that exists in the caller's product.
+  app.post('/agent-evals/suites', async req => {
+    const access = requireAdmin(req);
+    const parsed = CreateEvaluationSuiteSchema.safeParse(req.body);
+    if (!parsed.success) {
+      validationError(parsed.error.issues.map(issue => issue.message).join('; '));
+    }
+
+    // Called for its failure mode only: the lookup rejects when the agent is not in this product.
+    await agentRepo.getAgent(parsed.data.agentId, access.productId);
+    const now = new Date().toISOString();
+    const suite: EvaluationSuiteDoc = {
+      id: `evals_${randomUUID()}`,
+      productId: access.productId,
+      agentId: parsed.data.agentId,
+      name: parsed.data.name,
+      description: parsed.data.description,
+      status: 'draft',
+      passThreshold: parsed.data.passThreshold,
+      targetVersion: parsed.data.targetVersion,
+      tags: parsed.data.tags,
+      metadata: parsed.data.metadata,
+      createdBy: access.userId,
+      createdAt: now,
+      updatedAt: now,
+    };
+    return repo.createSuite(suite);
+  });
+
+  // Fetch one suite of the caller's product.
+  app.get('/agent-evals/suites/:id', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    return repo.getSuite(id, access.productId);
+  });
+
+  // Patch the mutable fields of a suite.
+  app.patch('/agent-evals/suites/:id', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    const parsed = UpdateEvaluationSuiteSchema.safeParse(req.body);
+    if (!parsed.success) {
+      validationError(parsed.error.issues.map(issue => issue.message).join('; '));
+    }
+    return repo.updateSuite(id, access.productId, parsed.data);
+  });
+
+  // List the cases of a suite.
+  app.get('/agent-evals/suites/:id/cases', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    // Cases are looked up by suite id alone, so first prove the suite belongs to the caller's
+    // product; otherwise an admin of one product could read another product's cases.
+    await repo.getSuite(id, access.productId);
+    return repo.listCases(id);
+  });
+
+  // Add a case to an existing suite of the caller's product.
+  app.post('/agent-evals/suites/:id/cases', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    await repo.getSuite(id, access.productId);
+    const parsed = CreateEvaluationCaseSchema.safeParse(req.body);
+    if (!parsed.success) {
+      validationError(parsed.error.issues.map(issue => issue.message).join('; '));
+    }
+
+    const doc: EvaluationCaseDoc = {
+      id: `${id}:case:${randomUUID()}`,
+      suiteId: id,
+      productId: access.productId,
+      name: parsed.data.name,
+      input: parsed.data.input,
+      expectedOutput: parsed.data.expectedOutput,
+      rubric: parsed.data.rubric,
+      tags: parsed.data.tags,
+      critical: parsed.data.critical,
+      createdAt: new Date().toISOString(),
+    };
+    return repo.createCase(doc);
+  });
+
+  // Queue a run of a suite against a specific agent version.
+  app.post('/agent-evals/suites/:id/runs', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    const suite = await repo.getSuite(id, access.productId);
+    // The body is optional for this endpoint, hence the `{}` fallback.
+    const parsed = CreateEvaluationRunSchema.safeParse(req.body ?? {});
+    if (!parsed.success) {
+      validationError(parsed.error.issues.map(issue => issue.message).join('; '));
+    }
+
+    const agent = await agentRepo.getAgent(suite.agentId, access.productId);
+    // Version precedence: explicit request value, then the suite's pinned target version,
+    // then the agent's current version.
+    const versionId =
+      parsed.data.agentVersionId ??
+      `${suite.agentId}:v${suite.targetVersion ?? agent.currentVersion}`;
+    const version = await agentRepo.getAgentVersion(versionId, suite.agentId);
+    const cases = await repo.listCases(id);
+
+    const run: EvaluationRunDoc = {
+      id: `evalrun_${randomUUID()}`,
+      suiteId: id,
+      agentId: suite.agentId,
+      productId: access.productId,
+      agentVersionId: version.id,
+      agentVersion: version.version,
+      status: 'queued',
+      // Threshold is copied onto the run so later suite edits do not change this run's verdict.
+      passThreshold: suite.passThreshold,
+      releaseGate: parsed.data.releaseGate,
+      totalCases: cases.length,
+      passedCases: 0,
+      failedCases: 0,
+      triggeredBy: access.userId,
+      startedAt: new Date().toISOString(),
+      reviewRequired: false,
+    };
+
+    return repo.createRun(run);
+  });
+
+  // Fetch one run of the caller's product.
+  app.get('/agent-evals/runs/:id', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    return repo.getRun(id, access.productId);
+  });
+
+  // List the results recorded for a run.
+  app.get('/agent-evals/runs/:id/results', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    // Results are looked up by run id alone, so first prove the run belongs to the caller's
+    // product; otherwise an admin of one product could read another product's results.
+    await repo.getRun(id, access.productId);
+    return repo.listResults(id);
+  });
+
+  // Record the per-case results of a run and finalize its verdict.
+  app.post('/agent-evals/runs/:id/results', async req => {
+    const access = requireAdmin(req);
+    const { id } = req.params as { id: string };
+    const run = await repo.getRun(id, access.productId);
+    // A completed run already carries a verdict; accepting another submission would let a
+    // caller overwrite it (for example by posting a different subset of cases).
+    if (run.status === 'completed') {
+      validationError(`Evaluation run '${id}' already has recorded results`);
+    }
+    const parsed = RecordEvaluationResultsSchema.safeParse(req.body);
+    if (!parsed.success) {
+      validationError(parsed.error.issues.map(issue => issue.message).join('; '));
+    }
+
+    const submitted = parsed.data.results;
+    const cases = await repo.listCases(run.suiteId);
+    const knownCaseIds = new Set(cases.map(item => item.id));
+    const seenCaseIds = new Set<string>();
+    const passedCaseIds = new Set<string>();
+    for (const result of submitted) {
+      if (!knownCaseIds.has(result.caseId)) {
+        validationError(`Unknown evaluation case '${result.caseId}' for suite '${run.suiteId}'`);
+      }
+      // Result ids are derived from run id + case id, so a repeated case would produce two
+      // documents with the same id and a partially written batch.
+      if (seenCaseIds.has(result.caseId)) {
+        validationError(`Duplicate result for evaluation case '${result.caseId}'`);
+      }
+      seenCaseIds.add(result.caseId);
+      if (result.passed) passedCaseIds.add(result.caseId);
+    }
+
+    const now = new Date().toISOString();
+    const docs: EvaluationResultDoc[] = submitted.map(result => ({
+      id: `${id}:${result.caseId}`,
+      runId: id,
+      caseId: result.caseId,
+      productId: access.productId,
+      passed: result.passed,
+      score: result.score,
+      actualOutput: result.actualOutput,
+      notes: result.notes,
+      createdAt: now,
+    }));
+    await repo.createResults(docs);
+
+    const passedCases = passedCaseIds.size;
+    const failedCases = submitted.length - passedCases;
+    // A result without an explicit score counts as 1 when passed and 0 when failed.
+    // `submitted` is never empty (the schema requires at least one result).
+    const averageScore =
+      submitted.reduce((sum, result) => sum + (result.score ?? (result.passed ? 1 : 0)), 0) /
+      submitted.length;
+    // A critical case counts as failed unless a passing result was submitted for it, so
+    // omitting a critical case from the payload can no longer slip past the gate.
+    const criticalFailures = cases.some(item => item.critical && !passedCaseIds.has(item.id));
+    const reviewRequired = criticalFailures || averageScore < run.passThreshold;
+
+    const verdict = criticalFailures
+      ? 'needs_review'
+      : averageScore >= run.passThreshold
+        ? 'pass'
+        : 'fail';
+
+    return repo.updateRun(id, access.productId, {
+      status: 'completed',
+      passedCases,
+      failedCases,
+      score: averageScore,
+      verdict,
+      completedAt: now,
+      summary: parsed.data.summary,
+      reviewRequired,
+    });
+  });
+}
--- a/services/platform-service/src/modules/agent-evals/types.ts
+++ b/services/platform-service/src/modules/agent-evals/types.ts
@ -0,0 +1,145 @@
+import { z } from 'zod';
+
+// Allowed literal values, declared once and fed to the enum schemas below.
+const SUITE_STATUS_VALUES = ['draft', 'active', 'archived'] as const;
+const RUN_STATUS_VALUES = ['queued', 'running', 'completed', 'failed'] as const;
+const VERDICT_VALUES = ['pass', 'fail', 'needs_review'] as const;
+
+/** Lifecycle state of an evaluation suite. */
+export const EvaluationSuiteStatusSchema = z.enum(SUITE_STATUS_VALUES);
+/** Execution state of an evaluation run. */
+export const EvaluationRunStatusSchema = z.enum(RUN_STATUS_VALUES);
+/** Final outcome assigned to a run once results are recorded. */
+export const EvaluationVerdictSchema = z.enum(VERDICT_VALUES);
+
+// Validators reused by several suite fields.
+const suiteRequiredText = z.string().min(1);
+const suiteTimestamp = z.string();
+
+/** Shape of a persisted evaluation suite. */
+export const EvaluationSuiteSchema = z.object({
+  id: suiteRequiredText,
+  productId: suiteRequiredText,
+  agentId: suiteRequiredText,
+  name: suiteRequiredText,
+  description: z.string().optional(),
+  status: EvaluationSuiteStatusSchema,
+  // Fraction in [0, 1] that a run's average score is compared against.
+  passThreshold: z.number().min(0).max(1),
+  targetVersion: z.number().int().positive().optional(),
+  tags: z.array(z.string()).default([]),
+  metadata: z.record(z.unknown()).optional(),
+  createdBy: suiteRequiredText,
+  createdAt: suiteTimestamp,
+  updatedAt: suiteTimestamp,
+});
+
+/** Suite document as stored; `_ts` / `_etag` are presumably datastore system fields. */
+export type EvaluationSuiteDoc = z.infer<typeof EvaluationSuiteSchema> & {
+  _ts?: number;
+  _etag?: string;
+};
+
+// Validators reused by several case fields.
+const caseRequiredText = z.string().min(1);
+const casePayload = z.record(z.unknown());
+
+/** Shape of a persisted evaluation case belonging to one suite. */
+export const EvaluationCaseSchema = z.object({
+  id: caseRequiredText,
+  suiteId: caseRequiredText,
+  productId: caseRequiredText,
+  name: caseRequiredText,
+  input: casePayload,
+  expectedOutput: casePayload.optional(),
+  rubric: z.string().optional(),
+  tags: z.array(z.string()).default([]),
+  critical: z.boolean().default(false),
+  createdAt: z.string(),
+});
+
+/** Case document as stored; `_ts` / `_etag` are presumably datastore system fields. */
+export type EvaluationCaseDoc = z.infer<typeof EvaluationCaseSchema> & {
+  _ts?: number;
+  _etag?: string;
+};
+
+// Validators reused by several run fields.
+const runRequiredText = z.string().min(1);
+const runUnitInterval = z.number().min(0).max(1);
+const runCounter = z.number().int().min(0);
+
+/** Shape of a persisted evaluation run of a suite against one agent version. */
+export const EvaluationRunSchema = z.object({
+  id: runRequiredText,
+  suiteId: runRequiredText,
+  agentId: runRequiredText,
+  productId: runRequiredText,
+  agentVersionId: runRequiredText,
+  agentVersion: z.number().int().positive(),
+  status: EvaluationRunStatusSchema,
+  verdict: EvaluationVerdictSchema.optional(),
+  passThreshold: runUnitInterval,
+  releaseGate: z.boolean().default(false),
+  totalCases: runCounter,
+  passedCases: runCounter.default(0),
+  failedCases: runCounter.default(0),
+  score: runUnitInterval.optional(),
+  triggeredBy: runRequiredText,
+  startedAt: z.string(),
+  completedAt: z.string().optional(),
+  summary: z.string().optional(),
+  reviewRequired: z.boolean().default(false),
+});
+
+/** Run document as stored; `_ts` / `_etag` are presumably datastore system fields. */
+export type EvaluationRunDoc = z.infer<typeof EvaluationRunSchema> & {
+  _ts?: number;
+  _etag?: string;
+};
+
+// Validator reused by the identifier fields of a result.
+const resultRequiredText = z.string().min(1);
+
+/** Shape of a persisted per-case result of a run. */
+export const EvaluationResultSchema = z.object({
+  id: resultRequiredText,
+  runId: resultRequiredText,
+  caseId: resultRequiredText,
+  productId: resultRequiredText,
+  passed: z.boolean(),
+  score: z.number().min(0).max(1).optional(),
+  actualOutput: z.record(z.unknown()).optional(),
+  notes: z.string().optional(),
+  createdAt: z.string(),
+});
+
+/** Result document as stored; `_ts` / `_etag` are presumably datastore system fields. */
+export type EvaluationResultDoc = z.infer<typeof EvaluationResultSchema> & {
+  _ts?: number;
+  _etag?: string;
+};
+
+// Field validators of the persisted suite, reused so the create payload cannot drift from them.
+const suiteFieldsForCreate = EvaluationSuiteSchema.shape;
+
+/** Request body for creating a suite; `passThreshold` defaults to 0.8 when omitted. */
+export const CreateEvaluationSuiteSchema = z.object({
+  agentId: suiteFieldsForCreate.agentId,
+  name: suiteFieldsForCreate.name,
+  description: suiteFieldsForCreate.description,
+  passThreshold: suiteFieldsForCreate.passThreshold.default(0.8),
+  targetVersion: suiteFieldsForCreate.targetVersion,
+  tags: suiteFieldsForCreate.tags,
+  metadata: suiteFieldsForCreate.metadata,
+});
+
+// Field validators of the persisted suite, reused for the patch payload.
+const suiteFieldsForUpdate = EvaluationSuiteSchema.shape;
+
+/** Request body for patching a suite; every field is optional and none has a default. */
+export const UpdateEvaluationSuiteSchema = z.object({
+  name: suiteFieldsForUpdate.name.optional(),
+  description: suiteFieldsForUpdate.description,
+  status: EvaluationSuiteStatusSchema.optional(),
+  passThreshold: suiteFieldsForUpdate.passThreshold.optional(),
+  targetVersion: suiteFieldsForUpdate.targetVersion,
+  // Declared afresh: the persisted `tags` validator carries a `[]` default that a patch must not apply.
+  tags: z.array(z.string()).optional(),
+  metadata: suiteFieldsForUpdate.metadata,
+});
+
+/** Request body for creating a case: the caller-supplied subset of the persisted case fields. */
+export const CreateEvaluationCaseSchema = EvaluationCaseSchema.pick({
+  name: true,
+  input: true,
+  expectedOutput: true,
+  rubric: true,
+  tags: true,
+  critical: true,
+});
+
+// Field validators of the persisted run, reused for the run-creation payload.
+const runFieldsForCreate = EvaluationRunSchema.shape;
+
+/** Request body for queuing a run; both fields may be omitted. */
+export const CreateEvaluationRunSchema = z.object({
+  agentVersionId: runFieldsForCreate.agentVersionId.optional(),
+  releaseGate: runFieldsForCreate.releaseGate,
+});
+
+// One submitted result: the caller-supplied subset of the persisted result fields.
+const submittedResultSchema = EvaluationResultSchema.pick({
+  caseId: true,
+  passed: true,
+  score: true,
+  actualOutput: true,
+  notes: true,
+});
+
+/** Request body for recording a run's results; at least one result is required. */
+export const RecordEvaluationResultsSchema = z.object({
+  summary: z.string().optional(),
+  results: z.array(submittedResultSchema).min(1),
+});
+
+/**
+ * Query string accepted when listing suites.
+ *
+ * `limit` is coerced from its string form and must be a whole number between 1 and 100
+ * (default 20); fractional values such as `2.5` are rejected rather than passed to the datastore.
+ */
+export const ListEvaluationSuitesQuerySchema = z.object({
+  agentId: z.string().min(1).optional(),
+  status: EvaluationSuiteStatusSchema.optional(),
+  limit: z.coerce.number().int().min(1).max(100).default(20),
+});
+
+/** Parsed form of the suite listing query. */
+export type ListEvaluationSuitesQuery = z.infer<typeof ListEvaluationSuitesQuerySchema>;
--- a/services/platform-service/src/server.ts
+++ b/services/platform-service/src/server.ts
@ -36,6 +36,7 @@ import { enterpriseRoutes } from './modules/auth/enterprise/routes.js';
 import { magicLinkRoutes } from './modules/auth/magic-link/routes.js';
 import { auditRoutes } from './modules/audit/routes.js';
 import { agentRoutes } from './modules/agents/routes.js';
+import { agentEvalRoutes } from './modules/agent-evals/routes.js';
 import { notificationRoutes } from './modules/notifications/routes.js';
 import { flagRoutes } from './modules/flags/routes.js';
 import { rateLimitRoutes } from './modules/ratelimit/routes.js';
@ -141,6 +142,7 @@ await app.register(enterpriseRoutes, { prefix: '/api' });
 await app.register(magicLinkRoutes, { prefix: '/api' });
 await app.register(auditRoutes, { prefix: '/api' });
 await app.register(agentRoutes, { prefix: '/api' });
+await app.register(agentEvalRoutes, { prefix: '/api' });
 await app.register(notificationRoutes, { prefix: '/api' });
 await app.register(flagRoutes, { prefix: '/api' });
 await app.register(rateLimitRoutes, { prefix: '/api' });