feat: surface scoring explainability in fleet control plane
Adds 'why does this job route here?' to the §7 scheduler:

- coordinator.explainJob() re-runs scoreCandidate against every live factory, returning a per-factory weighted breakdown, eligibility + reasons, deps state, and the best eligible factory (read-only, side-effect free)
- GET /fleet/jobs/:id/explain route (404 when the job is missing)
- fleet-client.getJobExplain() + JobExplain/ScoreBreakdown types
- ExplainPanel on the job detail page: score table per factory with the six weighted terms, eligibility, and unmet-deps note; degrades gracefully
- Tests: +2 coordinator, +1 routes, +2 fleet-client (fleet 144 green, tracker-web 214 green)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
parent
69f553d432
commit
2d5f9be642
@ -20,6 +20,7 @@ import {
|
||||
getJobEvents,
|
||||
getJobArtifacts,
|
||||
getJobDag,
|
||||
getJobExplain,
|
||||
listFactories,
|
||||
getBudget,
|
||||
upsertBudget,
|
||||
@ -136,6 +137,29 @@ describe('fleet-client', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('getJobExplain', () => {
  // Fixture mirrors the full JobExplain contract, including the per-factory
  // `breakdown`, so the test exercises the same shape the UI renders.
  // The six breakdown terms sum to `score` (1 + 0 + 1 + 0 + 1 + 0.2 = 3.2).
  const explainFixture = {
    jobId: 'j1',
    stage: 'queued',
    weights: {},
    depsSatisfied: true,
    unmetDeps: [],
    factories: [
      {
        factoryId: 'f1',
        eligible: true,
        ineligibleReasons: [],
        score: 3.2,
        breakdown: {
          capabilityFit: 1,
          affinity: 0,
          load: 1,
          costFit: 0,
          health: 1,
          starvation: 0.2,
        },
      },
    ],
    bestFactoryId: 'f1',
  };

  it('returns score breakdown on success', async () => {
    fetchSpy.mockResolvedValue(explainFixture);
    const res = await getJobExplain('j1');
    expect(res?.bestFactoryId).toBe('f1');
    // The test name promises a breakdown, so assert it actually comes back.
    expect(res?.factories).toEqual(explainFixture.factories);
    expect(fetchSpy).toHaveBeenCalledWith('/jobs/j1/explain', expect.anything());
  });

  it('returns null on 404', async () => {
    fetchSpy.mockRejectedValue(new Error('404 Not Found'));
    const res = await getJobExplain('missing');
    expect(res).toBeNull();
  });
});
|
||||
|
||||
describe('listFactories', () => {
|
||||
it('returns factories on success', async () => {
|
||||
fetchSpy.mockResolvedValue({ factories: [{ id: 'f1' }] });
|
||||
|
||||
@ -12,6 +12,7 @@ import {
|
||||
getJobEvents,
|
||||
getJobArtifacts,
|
||||
getJobDag,
|
||||
getJobExplain,
|
||||
patchJob,
|
||||
operatorAction,
|
||||
type OperatorAction,
|
||||
@ -20,6 +21,7 @@ import {
|
||||
type FleetEvent,
|
||||
type FleetArtifact,
|
||||
type DagNode,
|
||||
type JobExplain,
|
||||
} from '@/lib/fleet-client';
|
||||
|
||||
export default function FleetJobDetailPage() {
|
||||
@ -32,24 +34,27 @@ export default function FleetJobDetailPage() {
|
||||
const [events, setEvents] = useState<FleetEvent[]>([]);
|
||||
const [artifacts, setArtifacts] = useState<FleetArtifact[]>([]);
|
||||
const [dag, setDag] = useState<DagNode | null>(null);
|
||||
const [explain, setExplain] = useState<JobExplain | null>(null);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [shipping, setShipping] = useState(false);
|
||||
const [acting, setActing] = useState<OperatorAction | null>(null);
|
||||
|
||||
const refresh = useCallback(async () => {
|
||||
try {
|
||||
const [j, r, e, a, d] = await Promise.all([
|
||||
const [j, r, e, a, d, x] = await Promise.all([
|
||||
getJob(jobId),
|
||||
getJobRuns(jobId),
|
||||
getJobEvents(jobId),
|
||||
getJobArtifacts(jobId),
|
||||
getJobDag(jobId),
|
||||
getJobExplain(jobId),
|
||||
]);
|
||||
setJob(j);
|
||||
setRuns(r.runs);
|
||||
setEvents(e.events);
|
||||
setArtifacts(a.artifacts);
|
||||
setDag(d?.dag ?? null);
|
||||
setExplain(x);
|
||||
} catch {
|
||||
/* degrade */
|
||||
} finally {
|
||||
@ -169,6 +174,9 @@ export default function FleetJobDetailPage() {
|
||||
</section>
|
||||
)}
|
||||
|
||||
{/* Routing explainability (§7) */}
|
||||
{explain && <ExplainPanel explain={explain} />}
|
||||
|
||||
{/* Event timeline */}
|
||||
<section>
|
||||
<h2 className="text-lg font-semibold mb-2">Event Timeline</h2>
|
||||
@ -277,3 +285,73 @@ function DagTree({ node, depth = 0 }: { node: DagNode; depth?: number }) {
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Key of one weighted term inside a factory's score breakdown. */
type ScoreTermKey = keyof JobExplain['factories'][number]['breakdown'];

/**
 * Column definitions for the routing score table: one entry per weighted
 * term, in the order the columns are rendered.
 */
const SCORE_TERMS: Array<{ key: ScoreTermKey; label: string }> = [
  { key: 'capabilityFit', label: 'Capability' },
  { key: 'affinity', label: 'Affinity' },
  { key: 'load', label: 'Load' },
  { key: 'costFit', label: 'Cost fit' },
  { key: 'health', label: 'Health' },
  { key: 'starvation', label: 'Starvation' },
];
|
||||
|
||||
/**
 * Routing explainability panel for the job detail page.
 *
 * Renders a one-line summary (best factory, or a note that none is eligible,
 * plus any unmet dependencies) followed by a table with one row per factory:
 * total score, each weighted term listed in `SCORE_TERMS`, and eligibility.
 *
 * Eligibility is exposed to assistive technology through `role="img"` and an
 * `aria-label`, because the bare ✓ / ✗ glyphs and the hover-only `title`
 * tooltip are not reliably announced by screen readers.
 *
 * @param explain - The explain payload for the job being viewed.
 */
function ExplainPanel({ explain }: { explain: JobExplain }) {
  return (
    <section>
      <h2 className="text-lg font-semibold mb-1">Routing Explainability</h2>
      <p className="text-muted-foreground text-xs mb-3">
        Why this job routes where it does — the §7 weighted score per factory.{' '}
        {explain.bestFactoryId ? (
          <>
            Best factory: <span className="font-mono">{explain.bestFactoryId}</span>.
          </>
        ) : (
          <>No eligible factory right now.</>
        )}
        {!explain.depsSatisfied && (
          <span className="text-amber-600"> Blocked on deps: {explain.unmetDeps.join(', ')}.</span>
        )}
      </p>
      {explain.factories.length === 0 ? (
        <p className="text-muted-foreground text-sm">No factories have reported in.</p>
      ) : (
        <table className="w-full text-sm" aria-label="Routing score breakdown">
          <thead>
            <tr className="border-b text-left text-muted-foreground">
              <th scope="col" className="pb-2 pr-4">
                Factory
              </th>
              <th scope="col" className="pb-2 pr-4">
                Score
              </th>
              {SCORE_TERMS.map(t => (
                <th key={t.key} scope="col" className="pb-2 pr-4 text-right">
                  {t.label}
                </th>
              ))}
              <th scope="col" className="pb-2">
                Eligible
              </th>
            </tr>
          </thead>
          <tbody>
            {explain.factories.map(f => {
              // Same text feeds the hover tooltip and the accessible label.
              const reasons = f.ineligibleReasons.join('; ');
              return (
                <tr key={f.factoryId} className="border-b last:border-0">
                  <td className="py-2 pr-4 font-mono text-xs">{f.factoryId}</td>
                  <td className="py-2 pr-4 font-medium">{f.score.toFixed(2)}</td>
                  {SCORE_TERMS.map(t => (
                    <td key={t.key} className="py-2 pr-4 text-right font-mono text-xs">
                      {f.breakdown[t.key].toFixed(2)}
                    </td>
                  ))}
                  <td className="py-2">
                    {f.eligible ? (
                      <span className="text-green-600" role="img" aria-label="Eligible">
                        ✓
                      </span>
                    ) : (
                      <span
                        className="text-muted-foreground"
                        role="img"
                        title={reasons}
                        aria-label={reasons ? `Not eligible: ${reasons}` : 'Not eligible'}
                      >
                        ✗
                      </span>
                    )}
                  </td>
                </tr>
              );
            })}
          </tbody>
        </table>
      )}
    </section>
  );
}
|
||||
|
||||
@ -87,6 +87,33 @@ export interface DagNode {
|
||||
children: DagNode[];
|
||||
}
|
||||
|
||||
/**
 * The six weighted terms that make up one factory's routing score for a job.
 * Each value is that term's contribution to the factory's total score.
 */
export interface ScoreBreakdown {
  /** Contribution of the capability-fit term. */
  capabilityFit: number;
  /** Contribution of the affinity term. */
  affinity: number;
  /** Contribution of the load term. */
  load: number;
  /** Contribution of the cost-fit term. */
  costFit: number;
  /** Contribution of the health term. */
  health: number;
  /** Contribution of the starvation term. */
  starvation: number;
}
|
||||
|
||||
/** One factory's row in a job's routing explanation. */
export interface FactoryScoreExplain {
  /** Identifier of the factory this row describes. */
  factoryId: string;
  /** Whether the factory is currently eligible for the job. */
  eligible: boolean;
  /** Human-readable reasons the factory is not eligible; empty when eligible. */
  ineligibleReasons: string[];
  /** Total routing score for this factory. */
  score: number;
  /** Per-term contributions behind `score`. */
  breakdown: ScoreBreakdown;
}
|
||||
|
||||
/** Payload of the job explain endpoint: how a job scores against each factory. */
export interface JobExplain {
  /** Identifier of the explained job. */
  jobId: string;
  /** The job's stage at the time of the explanation. */
  stage: string;
  /** Scheduler weights used for the scoring, keyed by name. */
  weights: Record<string, number>;
  /** True when the job has no unmet dependencies. */
  depsSatisfied: boolean;
  /** Dependencies that are still unmet; empty when `depsSatisfied` is true. */
  unmetDeps: string[];
  /** One scored entry per factory. */
  factories: FactoryScoreExplain[];
  /** The best eligible factory, or `null` when none is eligible. */
  bestFactoryId: string | null;
}
|
||||
|
||||
// ── Client ──────────────────────────────────────────────────────────────────
|
||||
|
||||
const fleetApi = createApiClient({
|
||||
@ -167,6 +194,10 @@ export async function getJobDag(jobId: string): Promise<{ dag: DagNode } | null>
|
||||
return apiFetchOptional(`/jobs/${jobId}/dag`);
|
||||
}
|
||||
|
||||
/**
 * Fetch the routing explanation for a job.
 *
 * @param jobId - Identifier of the job to explain.
 * @returns The explain payload, or `null` when the optional fetch yields
 *   nothing (the client tests cover the 404 case).
 */
export async function getJobExplain(jobId: string): Promise<JobExplain | null> {
  const explainPath = `/jobs/${jobId}/explain`;
  return apiFetchOptional(explainPath);
}
|
||||
|
||||
// ── Factories ───────────────────────────────────────────────────────────────
|
||||
|
||||
export async function listFactories(): Promise<{ factories: FleetFactory[] }> {
|
||||
|
||||
@ -837,4 +837,67 @@ describe('fleet coordinator — Phase 3 per-product budgets', () => {
|
||||
const rejectEvents = (await repo.listEvents(job.id)).filter(e => e.type === 'operator_action');
|
||||
expect(rejectEvents).toHaveLength(1);
|
||||
});
|
||||
|
||||
// ── Phase 3: SCORING EXPLAINABILITY ──
|
||||
it('explainJob: returns per-factory score breakdowns, eligibility, and the best factory', async () => {
  const submitted = await coord.submitJob(PID, input({ capabilities: ['os:mac'] }));
  const jobId = submitted.job.id;

  // Two factories report in: one satisfies the job's capabilities, one does not.
  await coord.heartbeat({
    productId: PID,
    factoryId: 'cap',
    capabilities: ['os:mac', 'has:git'],
    health: 'ok',
    load: 0,
  });
  await coord.heartbeat({
    productId: PID,
    factoryId: 'nocap',
    capabilities: ['os:linux'],
    health: 'ok',
    load: 5,
  });

  const result = await coord.explainJob(jobId, PID);
  expect(result).not.toBeNull();
  expect(result!.jobId).toBe(jobId);
  expect(result!.depsSatisfied).toBe(true);
  expect(result!.factories).toHaveLength(2);

  // The reported score must equal the sum of the six weighted terms.
  const termKeys = ['capabilityFit', 'affinity', 'load', 'costFit', 'health', 'starvation'] as const;
  for (const entry of result!.factories) {
    const total = termKeys.reduce((acc, key) => acc + entry.breakdown[key], 0);
    expect(entry.score).toBeCloseTo(total, 9);
  }

  const capable = result!.factories.find(entry => entry.factoryId === 'cap');
  const incapable = result!.factories.find(entry => entry.factoryId === 'nocap');
  expect(capable?.eligible).toBe(true);
  expect(incapable?.eligible).toBe(false);
  expect(incapable?.ineligibleReasons).toContain('missing required capabilities');
  expect(result!.bestFactoryId).toBe('cap');
});
|
||||
|
||||
it('explainJob: reports unmet deps and no eligible factory; unknown job is null', async () => {
  await coord.submitJob(PID, input({ idempotencyKey: 'dep' }));
  const { job } = await coord.submitJob(PID, input({ idempotencyKey: 'child', deps: ['dep'] }));
  await coord.heartbeat({
    productId: PID,
    factoryId: 'f1',
    capabilities: [],
    health: 'ok',
    load: 0,
  });

  const explain = await coord.explainJob(job.id, PID);
  // Narrow explicitly so a regression fails with a clear message instead of
  // a TypeError from dereferencing null.
  expect(explain).not.toBeNull();
  if (explain === null) throw new Error('expected an explanation for a known job');

  expect(explain.depsSatisfied).toBe(false);
  expect(explain.unmetDeps).toContain('dep');
  expect(explain.bestFactoryId).toBeNull(); // deps unmet ⇒ nothing eligible

  // Unmet deps block every factory, so check all rows rather than only the
  // first one (and make sure there is at least one row to check).
  expect(explain.factories.length).toBeGreaterThan(0);
  expect(explain.factories.every(f => !f.eligible)).toBe(true);
  expect(
    explain.factories.every(f => f.ineligibleReasons.some(r => r.includes('unmet deps'))),
  ).toBe(true);

  expect(await coord.explainJob('missing', PID)).toBeNull();
});
|
||||
});
|
||||
|
||||
@ -23,7 +23,10 @@ import * as repo from './repository.js';
|
||||
import {
|
||||
selectJob,
|
||||
selectPreemptionVictim,
|
||||
scoreCandidate,
|
||||
capabilitiesSubset,
|
||||
type RunningJobView,
|
||||
type ScoreBreakdown,
|
||||
type SchedulerContext,
|
||||
type SchedulerFactory,
|
||||
type SchedulerWeights,
|
||||
@ -909,7 +912,83 @@ export async function operatorAction(
|
||||
return { ok: true, doc: res.doc };
|
||||
}
|
||||
|
||||
// ── Heartbeat (§8) ────────────────────────────────────────────────────────────
|
||||
// ── Scoring explainability (§7 / Phase 3 — "why does this job route here?") ────
|
||||
|
||||
/**
 * Scored explanation of a single factory for a job. The `breakdown` terms
 * are already weighted.
 */
export interface FactoryScoreExplain {
  /** Identifier of the factory being explained. */
  factoryId: string;
  /** True when no ineligibility reason applies to this factory. */
  eligible: boolean;
  /** Reasons the factory cannot take the job; empty when `eligible`. */
  ineligibleReasons: string[];
  /** Total score for this factory. */
  score: number;
  /** Weighted per-term contributions behind `score`. */
  breakdown: ScoreBreakdown;
}
|
||||
|
||||
/** Complete explainability payload: one job scored against the current fleet. */
export interface JobExplain {
  /** Identifier of the explained job. */
  jobId: string;
  /** The job's current stage. */
  stage: FleetStage;
  /** Scheduler weights used for the scoring. */
  weights: SchedulerWeights;
  /** True when the job has no unmet dependencies. */
  depsSatisfied: boolean;
  /** Dependencies still unmet; empty when `depsSatisfied` is true. */
  unmetDeps: string[];
  /** One scored entry per factory. */
  factories: FactoryScoreExplain[];
  /** The best eligible factory, or `null` when none is eligible. */
  bestFactoryId: string | null;
}
|
||||
|
||||
/**
 * Explain how a job would be scored against every live factory for its product
 * (§7 scoring surfaced for the control plane). Read-only and side-effect free:
 * it re-runs the same `scoreCandidate` the scheduler uses, against persisted
 * factory state, so operators can see WHY a job routes (or fails to route).
 *
 * Affinity (prefers-engine) and cost-fit depend on claim-time hints that are not
 * persisted on the factory doc, so they score as neutral here — the breakdown
 * reflects the structural signals available from stored state.
 *
 * @param jobId - Job to explain.
 * @param productId - Product the job and its factories belong to.
 * @returns The explanation, with `factories` sorted by score (highest first,
 *   ties broken by `factoryId`), or `null` when the job does not exist.
 */
export async function explainJob(jobId: string, productId: string): Promise<JobExplain | null> {
  const job = await repo.getJob(jobId, productId);
  if (!job) return null;

  const weights = resolveWeights(weightRegistry, productId);
  // The dependency check and the factory listing are independent reads, so
  // issue them together instead of one after the other.
  const [unmet, factories] = await Promise.all([unmetDeps(job), repo.listFactories(productId)]);
  const depsSatisfied = unmet.length === 0;
  const ctx: SchedulerContext = { now: Date.now() };

  const scored: FactoryScoreExplain[] = factories.map(f => {
    const sf: SchedulerFactory = {
      capabilities: f.capabilities,
      health: f.health,
      load: f.load,
      seatLimit: f.seatLimit,
    };
    const { score, breakdown } = scoreCandidate(job, sf, ctx, weights);

    const reasons: string[] = [];
    // A factory with no recorded health is treated as 'ok'.
    if ((f.health ?? 'ok') === 'down') reasons.push('factory health is down');
    if (!capabilitiesSubset(job.capabilities ?? [], f.capabilities)) {
      reasons.push('missing required capabilities');
    }
    // Unmet deps block the job on every factory, so the reason repeats per row.
    if (!depsSatisfied) reasons.push(`unmet deps: ${unmet.join(', ')}`);

    return {
      factoryId: f.factoryId,
      eligible: reasons.length === 0,
      ineligibleReasons: reasons,
      score,
      breakdown,
    };
  });

  // Highest score first. Equal scores fall back to factoryId so the ordering,
  // and therefore `bestFactoryId`, does not depend on the order in which the
  // repository happened to list the factories.
  scored.sort((a, b) => {
    const byScore = b.score - a.score;
    if (byScore) return byScore;
    if (a.factoryId < b.factoryId) return -1;
    if (a.factoryId > b.factoryId) return 1;
    return 0;
  });
  const best = scored.find(s => s.eligible) ?? null;

  return {
    jobId,
    stage: job.stage,
    weights,
    depsSatisfied,
    unmetDeps: unmet,
    factories: scored,
    bestFactoryId: best?.factoryId ?? null,
  };
}
|
||||
|
||||
export interface HeartbeatContext {
|
||||
productId: string;
|
||||
|
||||
@ -165,4 +165,28 @@ describe('fleetRoutes', () => {
|
||||
});
|
||||
expect(bad.statusCode).toBe(400);
|
||||
});
|
||||
|
||||
it('GET /fleet/jobs/:id/explain returns a per-factory score breakdown', async () => {
  const app = await buildApp();

  // Seed one job and one factory with no capabilities, reporting health 'ok'.
  const submitRes = await submit(app, { idempotencyKey: 'k1', bodyMd: '# task' });
  const jobId = JSON.parse(submitRes.body).job.id as string;
  await app.inject({
    method: 'POST',
    url: '/api/fleet/factories/heartbeat',
    payload: { factoryId: 'fac_1', capabilities: [], health: 'ok' },
  });

  // Known job: 200 with the single factory picked as best.
  const found = await app.inject({
    method: 'GET',
    url: `/api/fleet/jobs/${jobId}/explain`,
  });
  expect(found.statusCode).toBe(200);
  const payload = JSON.parse(found.body);
  expect(payload.jobId).toBe(jobId);
  expect(payload.factories).toHaveLength(1);
  expect(payload.bestFactoryId).toBe('fac_1');

  // Unknown job: 404.
  const notFound = await app.inject({ method: 'GET', url: '/api/fleet/jobs/nope/explain' });
  expect(notFound.statusCode).toBe(404);
});
|
||||
});
|
||||
|
||||
@ -237,6 +237,16 @@ export async function fleetRoutes(app: FastifyInstance) {
|
||||
return { events };
|
||||
});
|
||||
|
||||
// ── Scoring explainability — why does this job route where it does? (§7) ──
// Authenticates the request, then returns the coordinator's explanation for
// the job under the request's product; throws NotFoundError when there is none.
app.get('/fleet/jobs/:id/explain', async req => {
  await extractAuth(req);

  const params = req.params as { id: string };
  const productId = getRequestProductId(req);

  const result = await coordinator.explainJob(params.id, productId);
  if (!result) {
    throw new NotFoundError('Job not found');
  }
  return result;
});
|
||||
|
||||
// ── Artifacts: upload (base64 body → blob + pointer) ──
|
||||
app.post('/fleet/jobs/:id/artifacts', async (req, reply) => {
|
||||
await extractAuth(req);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user