feat: surface scoring explainability in fleet control plane

Adds 'why does this job route here?' to the §7 scheduler:
- coordinator.explainJob() re-runs scoreCandidate against every live factory,
  returning per-factory weighted breakdown, eligibility + reasons, deps state,
  and the best eligible factory (read-only, side-effect free)
- GET /fleet/jobs/:id/explain route (404 when job missing)
- fleet-client.getJobExplain() + JobExplain/ScoreBreakdown types
- ExplainPanel on the job detail page: score table per factory with the six
  weighted terms, eligibility, and unmet-deps note; degrades gracefully
- Tests: +2 coordinator, +1 routes, +2 fleet-client (fleet 144 green,
  tracker-web 214 green)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Saravanakumar D 2026-05-30 18:21:14 -07:00
parent 69f553d432
commit 2d5f9be642
7 changed files with 311 additions and 2 deletions

View File

@ -20,6 +20,7 @@ import {
getJobEvents,
getJobArtifacts,
getJobDag,
getJobExplain,
listFactories,
getBudget,
upsertBudget,
@ -136,6 +137,29 @@ describe('fleet-client', () => {
});
});
// getJobExplain: passes the explain payload through on success and yields
// null when the underlying fetch rejects.
describe('getJobExplain', () => {
  it('returns score breakdown on success', async () => {
    // Minimal explain payload: a single eligible factory that is also the best one.
    fetchSpy.mockResolvedValue({
      jobId: 'j1',
      stage: 'queued',
      weights: {},
      depsSatisfied: true,
      unmetDeps: [],
      factories: [{ factoryId: 'f1', eligible: true, ineligibleReasons: [], score: 3.2 }],
      bestFactoryId: 'f1',
    });

    const explain = await getJobExplain('j1');

    expect(explain?.bestFactoryId).toBe('f1');
    expect(fetchSpy).toHaveBeenCalledWith('/jobs/j1/explain', expect.anything());
  });

  it('returns null on 404', async () => {
    const notFound = new Error('404 Not Found');
    fetchSpy.mockRejectedValue(notFound);

    const explain = await getJobExplain('missing');

    expect(explain).toBeNull();
  });
});
describe('listFactories', () => {
it('returns factories on success', async () => {
fetchSpy.mockResolvedValue({ factories: [{ id: 'f1' }] });

View File

@ -12,6 +12,7 @@ import {
getJobEvents,
getJobArtifacts,
getJobDag,
getJobExplain,
patchJob,
operatorAction,
type OperatorAction,
@ -20,6 +21,7 @@ import {
type FleetEvent,
type FleetArtifact,
type DagNode,
type JobExplain,
} from '@/lib/fleet-client';
export default function FleetJobDetailPage() {
@ -32,24 +34,27 @@ export default function FleetJobDetailPage() {
const [events, setEvents] = useState<FleetEvent[]>([]);
const [artifacts, setArtifacts] = useState<FleetArtifact[]>([]);
const [dag, setDag] = useState<DagNode | null>(null);
const [explain, setExplain] = useState<JobExplain | null>(null);
const [loading, setLoading] = useState(true);
const [shipping, setShipping] = useState(false);
const [acting, setActing] = useState<OperatorAction | null>(null);
const refresh = useCallback(async () => {
try {
const [j, r, e, a, d] = await Promise.all([
const [j, r, e, a, d, x] = await Promise.all([
getJob(jobId),
getJobRuns(jobId),
getJobEvents(jobId),
getJobArtifacts(jobId),
getJobDag(jobId),
getJobExplain(jobId),
]);
setJob(j);
setRuns(r.runs);
setEvents(e.events);
setArtifacts(a.artifacts);
setDag(d?.dag ?? null);
setExplain(x);
} catch {
/* degrade */
} finally {
@ -169,6 +174,9 @@ export default function FleetJobDetailPage() {
</section>
)}
{/* Routing explainability (§7) */}
{explain && <ExplainPanel explain={explain} />}
{/* Event timeline */}
<section>
<h2 className="text-lg font-semibold mb-2">Event Timeline</h2>
@ -277,3 +285,73 @@ function DagTree({ node, depth = 0 }: { node: DagNode; depth?: number }) {
</div>
);
}
/** Name of one weighted term on a factory's score breakdown. */
type ScoreTermKey = keyof JobExplain['factories'][number]['breakdown'];

/** Breakdown field → column header, listed in the order the columns are rendered. */
const SCORE_TERM_LABELS: ReadonlyArray<readonly [ScoreTermKey, string]> = [
  ['capabilityFit', 'Capability'],
  ['affinity', 'Affinity'],
  ['load', 'Load'],
  ['costFit', 'Cost fit'],
  ['health', 'Health'],
  ['starvation', 'Starvation'],
];

/**
 * Column definitions for the routing score table: one `{ key, label }` entry
 * per breakdown term, used for both the header cells and the per-factory cells.
 */
const SCORE_TERMS: { key: ScoreTermKey; label: string }[] = SCORE_TERM_LABELS.map(
  ([key, label]) => ({ key, label }),
);
/**
 * Routing explainability panel (§7) for the job detail page.
 *
 * Renders a summary line (the best eligible factory, or a note that none is
 * eligible, plus any unmet dependencies) followed by one table row per factory:
 * total score, one column per SCORE_TERMS entry, and an eligibility marker.
 *
 * @param explain - Explain payload for the job being viewed.
 */
function ExplainPanel({ explain }: { explain: JobExplain }) {
  return (
    <section>
      <h2 className="text-lg font-semibold mb-1">Routing Explainability</h2>
      <p className="text-muted-foreground text-xs mb-3">
        Why this job routes where it does — the §7 weighted score per factory.{' '}
        {explain.bestFactoryId ? (
          <>
            Best factory: <span className="font-mono">{explain.bestFactoryId}</span>.
          </>
        ) : (
          <>No eligible factory right now.</>
        )}
        {!explain.depsSatisfied && (
          <span className="text-amber-600"> Blocked on deps: {explain.unmetDeps.join(', ')}.</span>
        )}
      </p>
      {explain.factories.length === 0 ? (
        <p className="text-muted-foreground text-sm">No factories have reported in.</p>
      ) : (
        <table className="w-full text-sm" aria-label="Routing score breakdown">
          <thead>
            <tr className="border-b text-left text-muted-foreground">
              <th className="pb-2 pr-4">Factory</th>
              <th className="pb-2 pr-4">Score</th>
              {SCORE_TERMS.map(t => (
                <th key={t.key} className="pb-2 pr-4 text-right">
                  {t.label}
                </th>
              ))}
              <th className="pb-2">Eligible</th>
            </tr>
          </thead>
          <tbody>
            {explain.factories.map(f => {
              // Joined once: used for both the hover tooltip and the accessible name.
              const reasons = f.ineligibleReasons.join('; ');
              return (
                <tr key={f.factoryId} className="border-b last:border-0">
                  <td className="py-2 pr-4 font-mono text-xs">{f.factoryId}</td>
                  <td className="py-2 pr-4 font-medium">{f.score.toFixed(2)}</td>
                  {SCORE_TERMS.map(t => (
                    <td key={t.key} className="py-2 pr-4 text-right font-mono text-xs">
                      {f.breakdown[t.key].toFixed(2)}
                    </td>
                  ))}
                  <td className="py-2">
                    {/* The marker needs visible content: an empty span renders a blank
                        cell and leaves nothing to hover for the reasons tooltip. */}
                    {f.eligible ? (
                      <span className="text-green-600" role="img" aria-label="Eligible">
                        ✓
                      </span>
                    ) : (
                      <span
                        className="text-muted-foreground"
                        role="img"
                        title={reasons}
                        aria-label={reasons ? `Not eligible: ${reasons}` : 'Not eligible'}
                      >
                        ✗
                      </span>
                    )}
                  </td>
                </tr>
              );
            })}
          </tbody>
        </table>
      )}
    </section>
  );
}

View File

@ -87,6 +87,33 @@ export interface DagNode {
children: DagNode[];
}
/**
 * The six terms that make up one factory's routing score for a job.
 * The coordinator describes the breakdown as already weighted, and its tests
 * assert that these six values sum to the factory's reported `score`.
 */
export interface ScoreBreakdown {
capabilityFit: number;
affinity: number;
load: number;
costFit: number;
health: number;
starvation: number;
}
/** One factory's entry in a job's routing explanation. */
export interface FactoryScoreExplain {
// Identifier of the factory this entry describes.
factoryId: string;
// Whether the job could currently route to this factory.
eligible: boolean;
// Human-readable reasons the factory is not eligible; shown as a tooltip in the UI.
ineligibleReasons: string[];
// Total routing score for this job on this factory.
score: number;
// Per-term contributions to `score`.
breakdown: ScoreBreakdown;
}
/**
 * Response shape of the job explain endpoint (`/jobs/:id/explain`):
 * how a job scores against each factory and which factory, if any, is best.
 */
export interface JobExplain {
// Id of the job being explained.
jobId: string;
// The job's current stage, as a plain string on the client side.
stage: string;
// Scheduler weights that were applied when scoring.
weights: Record<string, number>;
// False when the job still has unmet dependencies.
depsSatisfied: boolean;
// The unmet dependencies; rendered as a comma-separated list in the UI.
unmetDeps: string[];
// One entry per factory that was scored.
factories: FactoryScoreExplain[];
// Best eligible factory, or null when no factory is eligible.
bestFactoryId: string | null;
}
// ── Client ──────────────────────────────────────────────────────────────────
const fleetApi = createApiClient({
@ -167,6 +194,10 @@ export async function getJobDag(jobId: string): Promise<{ dag: DagNode } | null>
return apiFetchOptional(`/jobs/${jobId}/dag`);
}
/**
 * Fetch the routing explanation (per-factory score breakdown) for a job.
 *
 * @param jobId - Id of the job to explain.
 * @returns The explain payload, or `null` when `apiFetchOptional` yields no
 *   result (the client tests cover the rejected-fetch / 404 case).
 */
export async function getJobExplain(jobId: string): Promise<JobExplain | null> {
  // Encode the id so reserved URL characters ('/', '?', '#', …) cannot alter the
  // request path; ordinary ids are left unchanged by encodeURIComponent.
  return apiFetchOptional(`/jobs/${encodeURIComponent(jobId)}/explain`);
}
// ── Factories ───────────────────────────────────────────────────────────────
export async function listFactories(): Promise<{ factories: FleetFactory[] }> {

View File

@ -837,4 +837,67 @@ describe('fleet coordinator — Phase 3 per-product budgets', () => {
const rejectEvents = (await repo.listEvents(job.id)).filter(e => e.type === 'operator_action');
expect(rejectEvents).toHaveLength(1);
});
// ── Phase 3: SCORING EXPLAINABILITY ──
it('explainJob: returns per-factory score breakdowns, eligibility, and the best factory', async () => {
  const { job } = await coord.submitJob(PID, input({ capabilities: ['os:mac'] }));

  // Fleet under test: `cap` advertises the required capability, `nocap` does not.
  await coord.heartbeat({
    productId: PID,
    factoryId: 'cap',
    capabilities: ['os:mac', 'has:git'],
    health: 'ok',
    load: 0,
  });
  await coord.heartbeat({
    productId: PID,
    factoryId: 'nocap',
    capabilities: ['os:linux'],
    health: 'ok',
    load: 5,
  });

  const explain = await coord.explainJob(job.id, PID);
  expect(explain).not.toBeNull();
  const { jobId, depsSatisfied, factories, bestFactoryId } = explain!;
  expect(jobId).toBe(job.id);
  expect(depsSatisfied).toBe(true);
  expect(factories).toHaveLength(2);

  // The reported score must equal the sum of the six weighted terms.
  factories.forEach(({ score, breakdown }) => {
    const { capabilityFit, affinity, load, costFit, health, starvation } = breakdown;
    const total = capabilityFit + affinity + load + costFit + health + starvation;
    expect(score).toBeCloseTo(total, 9);
  });

  const entryFor = (id: string) => factories.find(entry => entry.factoryId === id);
  const cap = entryFor('cap');
  const nocap = entryFor('nocap');
  expect(cap?.eligible).toBe(true);
  expect(nocap?.eligible).toBe(false);
  expect(nocap?.ineligibleReasons).toContain('missing required capabilities');
  expect(bestFactoryId).toBe('cap');
});
it('explainJob: reports unmet deps and no eligible factory; unknown job is null', async () => {
  // Submit the prerequisite job, then a child job that lists 'dep' in its deps.
  await coord.submitJob(PID, input({ idempotencyKey: 'dep' }));
  const child = await coord.submitJob(PID, input({ idempotencyKey: 'child', deps: ['dep'] }));

  await coord.heartbeat({
    productId: PID,
    factoryId: 'f1',
    capabilities: [],
    health: 'ok',
    load: 0,
  });

  const explain = await coord.explainJob(child.job.id, PID);
  const { depsSatisfied, unmetDeps, bestFactoryId, factories } = explain!;
  expect(depsSatisfied).toBe(false);
  expect(unmetDeps).toContain('dep');
  // With deps unmet, no factory counts as eligible, so there is no best factory.
  expect(bestFactoryId).toBeNull();
  const mentionsUnmetDeps = factories[0].ineligibleReasons.some(reason =>
    reason.includes('unmet deps'),
  );
  expect(mentionsUnmetDeps).toBe(true);

  const unknownJob = await coord.explainJob('missing', PID);
  expect(unknownJob).toBeNull();
});
});

View File

@ -23,7 +23,10 @@ import * as repo from './repository.js';
import {
selectJob,
selectPreemptionVictim,
scoreCandidate,
capabilitiesSubset,
type RunningJobView,
type ScoreBreakdown,
type SchedulerContext,
type SchedulerFactory,
type SchedulerWeights,
@ -909,7 +912,83 @@ export async function operatorAction(
return { ok: true, doc: res.doc };
}
// ── Heartbeat (§8) ────────────────────────────────────────────────────────────
// ── Scoring explainability (§7 / Phase 3 — "why does this job route here?") ────
/**
 * One factory's scored explanation for a job (already-weighted breakdown).
 * `explainJob` builds one of these for every factory listed for the product.
 */
export interface FactoryScoreExplain {
// Identifier of the factory this entry describes.
factoryId: string;
// True exactly when `ineligibleReasons` is empty.
eligible: boolean;
// Why the job cannot route here: factory health down, missing required
// capabilities, and/or unmet deps.
ineligibleReasons: string[];
// Score returned by `scoreCandidate` for this job/factory pair.
score: number;
// Breakdown returned by `scoreCandidate` alongside `score`.
breakdown: ScoreBreakdown;
}
/** Full explainability payload for a job against the current fleet. */
export interface JobExplain {
// Id of the job that was explained.
jobId: string;
// The job's stage at the time of the call.
stage: FleetStage;
// Scheduler weights resolved for the product and used for scoring.
weights: SchedulerWeights;
// True when `unmetDeps` is empty.
depsSatisfied: boolean;
// Dependencies of the job that are not yet satisfied.
unmetDeps: string[];
// Per-factory explanations, sorted by score from highest to lowest.
factories: FactoryScoreExplain[];
// Highest-scoring eligible factory, or null when none is eligible.
bestFactoryId: string | null;
}
/**
 * Answer "why does this job route here?" for the control plane (§7).
 *
 * Scores the job against every factory stored for its product using the same
 * `scoreCandidate` the scheduler uses, and records for each factory why it is
 * or is not eligible. Only reads persisted state; nothing is written.
 *
 * Affinity (prefers-engine) and cost-fit depend on claim-time hints that are
 * not persisted on the factory doc, so they score as neutral here — the
 * breakdown reflects the structural signals available from stored state.
 *
 * @param jobId - Job to explain.
 * @param productId - Product the job and its factories belong to.
 * @returns The explanation with factories sorted by descending score, or
 *   `null` when the job does not exist.
 */
export async function explainJob(jobId: string, productId: string): Promise<JobExplain | null> {
  const job = await repo.getJob(jobId, productId);
  if (!job) {
    return null;
  }

  const weights = resolveWeights(weightRegistry, productId);
  const blockingDeps = await unmetDeps(job);
  const depsSatisfied = blockingDeps.length === 0;
  const fleet = await repo.listFactories(productId);
  const ctx: SchedulerContext = { now: Date.now() };

  const explanations: FactoryScoreExplain[] = [];
  for (const factory of fleet) {
    // Project the stored factory doc onto the view the scheduler scores against.
    const candidate: SchedulerFactory = {
      capabilities: factory.capabilities,
      health: factory.health,
      load: factory.load,
      seatLimit: factory.seatLimit,
    };
    const { score, breakdown } = scoreCandidate(job, candidate, ctx, weights);

    // Collect every blocker rather than stopping at the first one.
    const ineligibleReasons: string[] = [];
    const health = factory.health ?? 'ok';
    if (health === 'down') {
      ineligibleReasons.push('factory health is down');
    }
    const hasRequiredCapabilities = capabilitiesSubset(job.capabilities ?? [], factory.capabilities);
    if (!hasRequiredCapabilities) {
      ineligibleReasons.push('missing required capabilities');
    }
    if (!depsSatisfied) {
      ineligibleReasons.push(`unmet deps: ${blockingDeps.join(', ')}`);
    }

    explanations.push({
      factoryId: factory.factoryId,
      eligible: ineligibleReasons.length === 0,
      ineligibleReasons,
      score,
      breakdown,
    });
  }

  // Highest score first, so the first eligible entry is the best factory.
  explanations.sort((left, right) => right.score - left.score);
  const best = explanations.find(entry => entry.eligible);

  return {
    jobId,
    stage: job.stage,
    weights,
    depsSatisfied,
    unmetDeps: blockingDeps,
    factories: explanations,
    bestFactoryId: best?.factoryId ?? null,
  };
}
export interface HeartbeatContext {
productId: string;

View File

@ -165,4 +165,28 @@ describe('fleetRoutes', () => {
});
expect(bad.statusCode).toBe(400);
});
it('GET /fleet/jobs/:id/explain returns a per-factory score breakdown', async () => {
  const app = await buildApp();

  // Arrange: one submitted job and one factory that has sent a heartbeat.
  const submitted = await submit(app, { idempotencyKey: 'k1', bodyMd: '# task' });
  const jobId = JSON.parse(submitted.body).job.id as string;
  await app.inject({
    method: 'POST',
    url: '/api/fleet/factories/heartbeat',
    payload: { factoryId: 'fac_1', capabilities: [], health: 'ok' },
  });

  // Known job: 200 with the explain payload.
  const found = await app.inject({
    method: 'GET',
    url: `/api/fleet/jobs/${jobId}/explain`,
  });
  expect(found.statusCode).toBe(200);
  const payload = JSON.parse(found.body);
  expect(payload.jobId).toBe(jobId);
  expect(payload.factories).toHaveLength(1);
  expect(payload.bestFactoryId).toBe('fac_1');

  // Unknown job: 404.
  const notFound = await app.inject({ method: 'GET', url: '/api/fleet/jobs/nope/explain' });
  expect(notFound.statusCode).toBe(404);
});
});

View File

@ -237,6 +237,16 @@ export async function fleetRoutes(app: FastifyInstance) {
return { events };
});
// ── Scoring explainability (§7): why does this job route where it does? ──
// Responds with the coordinator's explain payload; an unknown job raises NotFoundError.
app.get('/fleet/jobs/:id/explain', async req => {
  await extractAuth(req);
  const jobId = (req.params as { id: string }).id;
  const productId = getRequestProductId(req);

  const result = await coordinator.explainJob(jobId, productId);
  if (result) {
    return result;
  }
  throw new NotFoundError('Job not found');
});
// ── Artifacts: upload (base64 body → blob + pointer) ──
app.post('/fleet/jobs/:id/artifacts', async (req, reply) => {
await extractAuth(req);