diff --git a/dashboards/tracker-web/src/__tests__/fleet-client.test.ts b/dashboards/tracker-web/src/__tests__/fleet-client.test.ts index 53c067cf..d14d3344 100644 --- a/dashboards/tracker-web/src/__tests__/fleet-client.test.ts +++ b/dashboards/tracker-web/src/__tests__/fleet-client.test.ts @@ -20,6 +20,7 @@ import { getJobEvents, getJobArtifacts, getJobDag, + getJobExplain, listFactories, getBudget, upsertBudget, @@ -136,6 +137,29 @@ describe('fleet-client', () => { }); }); + describe('getJobExplain', () => { + it('returns score breakdown on success', async () => { + fetchSpy.mockResolvedValue({ + jobId: 'j1', + stage: 'queued', + weights: {}, + depsSatisfied: true, + unmetDeps: [], + factories: [{ factoryId: 'f1', eligible: true, ineligibleReasons: [], score: 3.2 }], + bestFactoryId: 'f1', + }); + const res = await getJobExplain('j1'); + expect(res?.bestFactoryId).toBe('f1'); + expect(fetchSpy).toHaveBeenCalledWith('/jobs/j1/explain', expect.anything()); + }); + + it('returns null on 404', async () => { + fetchSpy.mockRejectedValue(new Error('404 Not Found')); + const res = await getJobExplain('missing'); + expect(res).toBeNull(); + }); + }); + describe('listFactories', () => { it('returns factories on success', async () => { fetchSpy.mockResolvedValue({ factories: [{ id: 'f1' }] }); diff --git a/dashboards/tracker-web/src/app/dashboard/fleet/jobs/[id]/page.tsx b/dashboards/tracker-web/src/app/dashboard/fleet/jobs/[id]/page.tsx index b6f8603b..e387df87 100644 --- a/dashboards/tracker-web/src/app/dashboard/fleet/jobs/[id]/page.tsx +++ b/dashboards/tracker-web/src/app/dashboard/fleet/jobs/[id]/page.tsx @@ -12,6 +12,7 @@ import { getJobEvents, getJobArtifacts, getJobDag, + getJobExplain, patchJob, operatorAction, type OperatorAction, @@ -20,6 +21,7 @@ import { type FleetEvent, type FleetArtifact, type DagNode, + type JobExplain, } from '@/lib/fleet-client'; export default function FleetJobDetailPage() { @@ -32,24 +34,27 @@ export default function FleetJobDetailPage() { const [events, setEvents] = useState([]); const [artifacts, setArtifacts] = useState([]); const [dag, setDag] = useState(null); + const [explain, setExplain] = useState(null); const [loading, setLoading] = useState(true); const [shipping, setShipping] = useState(false); const [acting, setActing] = useState(null); const refresh = useCallback(async () => { try { - const [j, r, e, a, d] = await Promise.all([ + const [j, r, e, a, d, x] = await Promise.all([ getJob(jobId), getJobRuns(jobId), getJobEvents(jobId), getJobArtifacts(jobId), getJobDag(jobId), + getJobExplain(jobId), ]); setJob(j); setRuns(r.runs); setEvents(e.events); setArtifacts(a.artifacts); setDag(d?.dag ?? null); + setExplain(x); } catch { /* degrade */ } finally { @@ -169,6 +174,9 @@ export default function FleetJobDetailPage() { )} + {/* Routing explainability (§7) */} + {explain && } + {/* Event timeline */}

Event Timeline

@@ -277,3 +285,73 @@ function DagTree({ node, depth = 0 }: { node: DagNode; depth?: number }) { ); } + +const SCORE_TERMS: { key: keyof JobExplain['factories'][number]['breakdown']; label: string }[] = [ + { key: 'capabilityFit', label: 'Capability' }, + { key: 'affinity', label: 'Affinity' }, + { key: 'load', label: 'Load' }, + { key: 'costFit', label: 'Cost fit' }, + { key: 'health', label: 'Health' }, + { key: 'starvation', label: 'Starvation' }, +]; + +function ExplainPanel({ explain }: { explain: JobExplain }) { + return ( +
+

Routing Explainability

+

+ Why this job routes where it does — the §7 weighted score per factory.{' '} + {explain.bestFactoryId ? ( + <> + Best factory: {explain.bestFactoryId}. + + ) : ( + <>No eligible factory right now. + )} + {!explain.depsSatisfied && ( + Blocked on deps: {explain.unmetDeps.join(', ')}. + )} +

+ {explain.factories.length === 0 ? ( +

No factories have reported in.

+ ) : ( + + + + + + {SCORE_TERMS.map(t => ( + + ))} + + + + + {explain.factories.map(f => ( + + + + {SCORE_TERMS.map(t => ( + + ))} + + + ))} + +
FactoryScore + {t.label} + Eligible
{f.factoryId}{f.score.toFixed(2)} + {f.breakdown[t.key].toFixed(2)} + + {f.eligible ? ( + + ) : ( + + ✗ + + )} +
+ )} +
+ ); +} diff --git a/dashboards/tracker-web/src/lib/fleet-client.ts b/dashboards/tracker-web/src/lib/fleet-client.ts index 46f512d9..46e6a439 100644 --- a/dashboards/tracker-web/src/lib/fleet-client.ts +++ b/dashboards/tracker-web/src/lib/fleet-client.ts @@ -87,6 +87,33 @@ export interface DagNode { children: DagNode[]; } +export interface ScoreBreakdown { + capabilityFit: number; + affinity: number; + load: number; + costFit: number; + health: number; + starvation: number; +} + +export interface FactoryScoreExplain { + factoryId: string; + eligible: boolean; + ineligibleReasons: string[]; + score: number; + breakdown: ScoreBreakdown; +} + +export interface JobExplain { + jobId: string; + stage: string; + weights: Record; + depsSatisfied: boolean; + unmetDeps: string[]; + factories: FactoryScoreExplain[]; + bestFactoryId: string | null; +} + // ── Client ────────────────────────────────────────────────────────────────── const fleetApi = createApiClient({ @@ -167,6 +194,10 @@ export async function getJobDag(jobId: string): Promise<{ dag: DagNode } | null> return apiFetchOptional(`/jobs/${jobId}/dag`); } +export async function getJobExplain(jobId: string): Promise { + return apiFetchOptional(`/jobs/${jobId}/explain`); +} + // ── Factories ─────────────────────────────────────────────────────────────── export async function listFactories(): Promise<{ factories: FleetFactory[] }> { diff --git a/services/platform-service/src/modules/fleet/coordinator.test.ts b/services/platform-service/src/modules/fleet/coordinator.test.ts index de927df2..d4a3abc5 100644 --- a/services/platform-service/src/modules/fleet/coordinator.test.ts +++ b/services/platform-service/src/modules/fleet/coordinator.test.ts @@ -837,4 +837,67 @@ describe('fleet coordinator — Phase 3 per-product budgets', () => { const rejectEvents = (await repo.listEvents(job.id)).filter(e => e.type === 'operator_action'); expect(rejectEvents).toHaveLength(1); }); + + // ── Phase 3: SCORING EXPLAINABILITY ── + it('explainJob: returns per-factory score breakdowns, eligibility, and the best factory', async () => { + const { job } = await coord.submitJob(PID, input({ capabilities: ['os:mac'] })); + // a capable factory and an incapable one both heartbeat in + await coord.heartbeat({ + productId: PID, + factoryId: 'cap', + capabilities: ['os:mac', 'has:git'], + health: 'ok', + load: 0, + }); + await coord.heartbeat({ + productId: PID, + factoryId: 'nocap', + capabilities: ['os:linux'], + health: 'ok', + load: 5, + }); + + const explain = await coord.explainJob(job.id, PID); + expect(explain).not.toBeNull(); + expect(explain!.jobId).toBe(job.id); + expect(explain!.depsSatisfied).toBe(true); + expect(explain!.factories).toHaveLength(2); + // each breakdown's six weighted terms sum to the reported score + for (const f of explain!.factories) { + const sum = + f.breakdown.capabilityFit + + f.breakdown.affinity + + f.breakdown.load + + f.breakdown.costFit + + f.breakdown.health + + f.breakdown.starvation; + expect(f.score).toBeCloseTo(sum, 9); + } + const cap = explain!.factories.find(f => f.factoryId === 'cap'); + const nocap = explain!.factories.find(f => f.factoryId === 'nocap'); + expect(cap?.eligible).toBe(true); + expect(nocap?.eligible).toBe(false); + expect(nocap?.ineligibleReasons).toContain('missing required capabilities'); + expect(explain!.bestFactoryId).toBe('cap'); + }); + + it('explainJob: reports unmet deps and no eligible factory; unknown job is null', async () => { + await coord.submitJob(PID, input({ idempotencyKey: 'dep' })); + const { job } = await coord.submitJob(PID, input({ idempotencyKey: 'child', deps: ['dep'] })); + await coord.heartbeat({ + productId: PID, + factoryId: 'f1', + capabilities: [], + health: 'ok', + load: 0, + }); + + const explain = await coord.explainJob(job.id, PID); + expect(explain!.depsSatisfied).toBe(false); + expect(explain!.unmetDeps).toContain('dep'); + expect(explain!.bestFactoryId).toBeNull(); // deps unmet ⇒ nothing eligible + expect(explain!.factories[0].ineligibleReasons.some(r => r.includes('unmet deps'))).toBe(true); + + expect(await coord.explainJob('missing', PID)).toBeNull(); + }); }); diff --git a/services/platform-service/src/modules/fleet/coordinator.ts b/services/platform-service/src/modules/fleet/coordinator.ts index 304e2fb8..89403c32 100644 --- a/services/platform-service/src/modules/fleet/coordinator.ts +++ b/services/platform-service/src/modules/fleet/coordinator.ts @@ -23,7 +23,10 @@ import * as repo from './repository.js'; import { selectJob, selectPreemptionVictim, + scoreCandidate, + capabilitiesSubset, type RunningJobView, + type ScoreBreakdown, type SchedulerContext, type SchedulerFactory, type SchedulerWeights, @@ -909,7 +912,83 @@ export async function operatorAction( return { ok: true, doc: res.doc }; } -// ── Heartbeat (§8) ──────────────────────────────────────────────────────────── +// ── Scoring explainability (§7 / Phase 3 — "why does this job route here?") ──── + +/** One factory's scored explanation for a job (already-weighted breakdown). */ +export interface FactoryScoreExplain { + factoryId: string; + eligible: boolean; + ineligibleReasons: string[]; + score: number; + breakdown: ScoreBreakdown; +} + +/** Full explainability payload for a job against the current fleet. */ +export interface JobExplain { + jobId: string; + stage: FleetStage; + weights: SchedulerWeights; + depsSatisfied: boolean; + unmetDeps: string[]; + factories: FactoryScoreExplain[]; + bestFactoryId: string | null; +} + +/** + * Explain how a job would be scored against every live factory for its product + * (§7 scoring surfaced for the control plane). Read-only and side-effect free: + * it re-runs the same `scoreCandidate` the scheduler uses, against persisted + * factory state, so operators can see WHY a job routes (or fails to route). + * + * Affinity (prefers-engine) and cost-fit depend on claim-time hints that are not + * persisted on the factory doc, so they score as neutral here — the breakdown + * reflects the structural signals available from stored state. + */ +export async function explainJob(jobId: string, productId: string): Promise { + const job = await repo.getJob(jobId, productId); + if (!job) return null; + + const weights = resolveWeights(weightRegistry, productId); + const unmet = await unmetDeps(job); + const depsSatisfied = unmet.length === 0; + const factories = await repo.listFactories(productId); + const ctx: SchedulerContext = { now: Date.now() }; + + const scored: FactoryScoreExplain[] = factories.map(f => { + const sf: SchedulerFactory = { + capabilities: f.capabilities, + health: f.health, + load: f.load, + seatLimit: f.seatLimit, + }; + const { score, breakdown } = scoreCandidate(job, sf, ctx, weights); + const reasons: string[] = []; + if ((f.health ?? 'ok') === 'down') reasons.push('factory health is down'); + if (!capabilitiesSubset(job.capabilities ?? [], f.capabilities)) { + reasons.push('missing required capabilities'); + } + if (!depsSatisfied) reasons.push(`unmet deps: ${unmet.join(', ')}`); + return { + factoryId: f.factoryId, + eligible: reasons.length === 0, + ineligibleReasons: reasons, + score, + breakdown, + }; + }); + scored.sort((a, b) => b.score - a.score); + const best = scored.find(s => s.eligible) ?? null; + + return { + jobId, + stage: job.stage, + weights, + depsSatisfied, + unmetDeps: unmet, + factories: scored, + bestFactoryId: best?.factoryId ?? null, + }; +} export interface HeartbeatContext { productId: string; diff --git a/services/platform-service/src/modules/fleet/routes.test.ts b/services/platform-service/src/modules/fleet/routes.test.ts index de2f7e87..bcd6ace4 100644 --- a/services/platform-service/src/modules/fleet/routes.test.ts +++ b/services/platform-service/src/modules/fleet/routes.test.ts @@ -165,4 +165,28 @@ describe('fleetRoutes', () => { }); expect(bad.statusCode).toBe(400); }); + + it('GET /fleet/jobs/:id/explain returns a per-factory score breakdown', async () => { + const app = await buildApp(); + const sub = await submit(app, { idempotencyKey: 'k1', bodyMd: '# task' }); + const jobId = JSON.parse(sub.body).job.id as string; + await app.inject({ + method: 'POST', + url: '/api/fleet/factories/heartbeat', + payload: { factoryId: 'fac_1', capabilities: [], health: 'ok' }, + }); + + const explain = await app.inject({ + method: 'GET', + url: `/api/fleet/jobs/${jobId}/explain`, + }); + expect(explain.statusCode).toBe(200); + const body = JSON.parse(explain.body); + expect(body.jobId).toBe(jobId); + expect(body.factories).toHaveLength(1); + expect(body.bestFactoryId).toBe('fac_1'); + + const missing = await app.inject({ method: 'GET', url: '/api/fleet/jobs/nope/explain' }); + expect(missing.statusCode).toBe(404); + }); }); diff --git a/services/platform-service/src/modules/fleet/routes.ts b/services/platform-service/src/modules/fleet/routes.ts index 0c608d03..c0f87d95 100644 --- a/services/platform-service/src/modules/fleet/routes.ts +++ b/services/platform-service/src/modules/fleet/routes.ts @@ -237,6 +237,16 @@ export async function fleetRoutes(app: FastifyInstance) { return { events }; }); + // ── Scoring explainability — why does this job route where it does? (§7) ── + app.get('/fleet/jobs/:id/explain', async req => { + await extractAuth(req); + const { id } = req.params as { id: string }; + const pid = getRequestProductId(req); + const explain = await coordinator.explainJob(id, pid); + if (!explain) throw new NotFoundError('Job not found'); + return explain; + }); + // ── Artifacts: upload (base64 body → blob + pointer) ── app.post('/fleet/jobs/:id/artifacts', async (req, reply) => { await extractAuth(req);