diff --git a/dashboards/admin-web/src/app/(dashboard)/agent-evals/page.tsx b/dashboards/admin-web/src/app/(dashboard)/agent-evals/page.tsx
new file mode 100644
index 00000000..1f177992
--- /dev/null
+++ b/dashboards/admin-web/src/app/(dashboard)/agent-evals/page.tsx
@@ -0,0 +1,307 @@
+'use client';
+
+import { useState, useEffect, useCallback } from 'react';
+import {
+  FlaskConical,
+  Plus,
+  MoreHorizontal,
+  Play,
+  CheckCircle2,
+  XCircle,
+  Clock,
+  Trash2,
+} from 'lucide-react';
+import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
+import { Badge } from '@/components/ui/badge';
+import { Button } from '@/components/ui/button';
+import { Input } from '@/components/ui/input';
+import { Label } from '@/components/ui/label';
+import { Textarea } from '@/components/ui/textarea';
+import {
+  Table,
+  TableBody,
+  TableCell,
+  TableHead,
+  TableHeader,
+  TableRow,
+} from '@/components/ui/table';
+import {
+  DropdownMenu,
+  DropdownMenuContent,
+  DropdownMenuItem,
+  DropdownMenuTrigger,
+} from '@/components/ui/dropdown-menu';
+import {
+  Dialog,
+  DialogContent,
+  DialogDescription,
+  DialogFooter,
+  DialogHeader,
+  DialogTitle,
+} from '@/components/ui/dialog';
+
+/** One evaluation suite as held in the page's `suites` state. */
+interface EvalSuite {
+  id: string;
+  name: string;
+  description?: string;
+  agentId?: string;
+  testCaseCount: number;
+  lastRunStatus?: string;
+  lastRunScore?: number;
+  lastRunAt?: string;
+  createdAt: string;
+}
+
+/**
+ * CSS classes and icon component for each known run status.
+ * Looked up by `lastRunStatus`; callers fall back to the `pending` entry
+ * for a missing or unrecognised status.
+ */
+const statusConfig: Record<string, { color: string; icon: typeof CheckCircle2 }> = {
+  passed: { color: 'bg-emerald-50 text-emerald-700', icon: CheckCircle2 },
+  failed: { color: 'bg-red-50 text-red-700', icon: XCircle },
+  running: { color: 'bg-blue-50 text-blue-700', icon: Clock },
+  pending: { color: 'bg-gray-50 text-gray-600', icon: Clock },
+};
+
+/**
+ * Formats a date string for display in the `en-US` locale with a short
+ * month, numeric day and numeric year (for example "Jan 5, 2024").
+ *
+ * @param iso - A string accepted by the `Date` constructor.
+ * @returns The formatted date.
+ */
+function formatDate(iso: string) {
+  return new Date(iso).toLocaleDateString('en-US', {
+    month: 'short',
+    day: 'numeric',
+    year: 'numeric',
+  });
+}
+
+/**
+ * Sends a request to `/api/agent-evals/<path>` and returns the parsed JSON body.
+ *
+ * A JSON `Content-Type` header is applied unless the caller already set one
+ * in `opts.headers`; every other option in `opts` is passed through to `fetch`.
+ *
+ * @param path - Path appended to `/api/agent-evals/`.
+ * @param opts - Optional `fetch` init (method, body, headers, ...).
+ * @returns The parsed JSON payload, or `null` when the response body is empty.
+ * @throws Error when the response status is not in the 2xx range.
+ * @throws SyntaxError when a non-empty response body is not valid JSON.
+ */
+async function apiFetch(path: string, opts?: RequestInit) {
+  // Merge rather than replace: spreading `opts` over a literal `headers`
+  // object would drop the default Content-Type whenever a caller adds headers.
+  const headers = new Headers(opts?.headers);
+  if (!headers.has('Content-Type')) {
+    headers.set('Content-Type', 'application/json');
+  }
+
+  const res = await fetch(`/api/agent-evals/${path}`, { ...opts, headers });
+  if (!res.ok) {
+    throw new Error(
+      `${opts?.method ?? 'GET'} /api/agent-evals/${path} failed with status ${res.status}`,
+    );
+  }
+
+  // Read as text first so an empty body (e.g. 204 No Content) yields null
+  // instead of making JSON parsing throw.
+  const text = await res.text();
+  return text ? JSON.parse(text) : null;
+}
+
+/**
+ * Page component for agent evaluation suites: loads the suite list on mount
+ * and provides handlers to create, run and delete suites.
+ */
+export default function AgentEvalsPage() {
+  const [suites, setSuites] = useState<EvalSuite[]>([]);
+  const [loading, setLoading] = useState(true);
+  const [showCreate, setShowCreate] = useState(false);
+  const [creating, setCreating] = useState(false);
+  const [newName, setNewName] = useState('');
+  const [newDesc, setNewDesc] = useState('');
+
+  // Fetches the suite list. Accepts either `{ suites: [...] }` or a bare array.
+  // Never rejects: on failure the previous list is kept and the error is logged,
+  // and `loading` is always cleared.
+  const loadData = useCallback(async () => {
+    setLoading(true);
+    try {
+      const data = await apiFetch('suites');
+      setSuites(Array.isArray(data?.suites) ? data.suites : Array.isArray(data) ? data : []);
+    } catch (err: unknown) {
+      console.error('Failed to load evaluation suites', err);
+    } finally {
+      setLoading(false);
+    }
+  }, []);
+
+  useEffect(() => {
+    void loadData();
+  }, [loadData]);
+
+  // Creates a suite from the form state. The dialog is closed and the form
+  // reset only when the request succeeds, so a failed create keeps the input.
+  async function handleCreate() {
+    setCreating(true);
+    try {
+      await apiFetch('suites', {
+        method: 'POST',
+        body: JSON.stringify({ name: newName, description: newDesc }),
+      });
+      setShowCreate(false);
+      setNewName('');
+      setNewDesc('');
+      void loadData();
+    } catch (err: unknown) {
+      console.error('Failed to create evaluation suite', err);
+    } finally {
+      setCreating(false);
+    }
+  }
+
+  // Starts a run for the given suite, then refreshes the list.
+  async function handleRun(id: string) {
+    try {
+      await apiFetch(`suites/${id}/run`, { method: 'POST' });
+    } catch (err: unknown) {
+      console.error(`Failed to run evaluation suite ${id}`, err);
+      return;
+    }
+    void loadData();
+  }
+
+  // Deletes the given suite after user confirmation, then refreshes the list.
+  async function handleDelete(id: string) {
+    if (!confirm('Delete this evaluation suite?')) return;
+    try {
+      await apiFetch(`suites/${id}`, { method: 'DELETE' });
+    } catch (err: unknown) {
+      console.error(`Failed to delete evaluation suite ${id}`, err);
+      return;
+    }
+    void loadData();
+  }
+
+  const passedCount = suites.filter(s => s.lastRunStatus === 'passed').length;
+  const failedCount = suites.filter(s => s.lastRunStatus === 'failed').length;
+
+  return (
+
+
+
+

Agent Evaluations

+

+ Create and run evaluation suites for AI agents +

+
+ +
+ +
+ + + + Total Suites + + + +
{suites.length}
+
+
+ + + Passed + + +
{passedCount}
+
+
+ + + Failed + + +
{failedCount}
+
+
+ + + + Total Test Cases + + + +
+ {suites.reduce((sum, s) => sum + (s.testCaseCount ?? 0), 0)} +
+
+
+
+ + + + {loading ? ( +
Loading...
+ ) : suites.length === 0 ? ( +
+ + No evaluation suites yet. +
+ ) : ( + + + + Suite + Test Cases + Last Run + Score + Created + + + + + {suites.map(s => { + const cfg = statusConfig[s.lastRunStatus || 'pending'] || statusConfig.pending; + const StatusIcon = cfg.icon; + return ( + + +
{s.name}
+ {s.description && ( +

+ {s.description} +

+ )} +
+ {s.testCaseCount} + + {s.lastRunStatus ? ( + + + {s.lastRunStatus} + + ) : ( + Never run + )} + + + {s.lastRunScore != null ? ( + = 80 ? 'text-emerald-600' : (s.lastRunScore ?? 0) >= 50 ? 'text-amber-600' : 'text-red-600'}`} + > + {s.lastRunScore}% + + ) : ( + '—' + )} + + + {formatDate(s.createdAt)} + + + + + + + + handleRun(s.id)}> + + Run + + handleDelete(s.id)} + className="text-destructive" + > + + Delete + + + + +
+ ); + })} +
+
+ )} +
+
+ + + + + New Evaluation Suite + Create a test suite to evaluate AI agent quality. + +
+
+ + setNewName(e.target.value)} + /> +
+
+ +