feat: surface scoring explainability in fleet control plane

Adds 'why does this job route here?' to the §7 scheduler:

- coordinator.explainJob() re-runs scoreCandidate against every live factory, returning a per-factory weighted breakdown, eligibility + reasons, deps state, and the best eligible factory (read-only, side-effect free)
- GET /fleet/jobs/:id/explain route (404 when the job is missing)
- fleet-client.getJobExplain() + JobExplain/ScoreBreakdown types
- ExplainPanel on the job detail page: score table per factory with the six weighted terms, eligibility, and unmet-deps note; degrades gracefully
- Tests: +2 coordinator, +1 routes, +2 fleet-client (fleet 144 green, tracker-web 214 green)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-30 18:21:14 -07:00 · 2026-05-30 18:21:14 -07:00 · 2d5f9be642
commit 2d5f9be642
parent 69f553d432
7 changed files with 311 additions and 2 deletions
--- a/dashboards/tracker-web/src/tests/fleet-client.test.ts
+++ b/dashboards/tracker-web/src/tests/fleet-client.test.ts
@ -20,6 +20,7 @@ import {
  getJobEvents,
  getJobArtifacts,
  getJobDag,
+  getJobExplain,
  listFactories,
  getBudget,
  upsertBudget,
@ -136,6 +137,29 @@ describe('fleet-client', () => {
    });
  });

+  describe('getJobExplain', () => {
+    it('returns score breakdown on success', async () => {
+      fetchSpy.mockResolvedValue({
+        jobId: 'j1',
+        stage: 'queued',
+        weights: {},
+        depsSatisfied: true,
+        unmetDeps: [],
+        factories: [{ factoryId: 'f1', eligible: true, ineligibleReasons: [], score: 3.2 }],
+        bestFactoryId: 'f1',
+      });
+      const res = await getJobExplain('j1');
+      expect(res?.bestFactoryId).toBe('f1');
+      expect(fetchSpy).toHaveBeenCalledWith('/jobs/j1/explain', expect.anything());
+    });
+
+    it('returns null on 404', async () => {
+      fetchSpy.mockRejectedValue(new Error('404 Not Found'));
+      const res = await getJobExplain('missing');
+      expect(res).toBeNull();
+    });
+  });
+
  describe('listFactories', () => {
    it('returns factories on success', async () => {
      fetchSpy.mockResolvedValue({ factories: [{ id: 'f1' }] });
--- a/dashboards/tracker-web/src/app/dashboard/fleet/jobs/[id]/page.tsx
+++ b/dashboards/tracker-web/src/app/dashboard/fleet/jobs/[id]/page.tsx
@ -12,6 +12,7 @@ import {
  getJobEvents,
  getJobArtifacts,
  getJobDag,
+  getJobExplain,
  patchJob,
  operatorAction,
  type OperatorAction,
@ -20,6 +21,7 @@ import {
  type FleetEvent,
  type FleetArtifact,
  type DagNode,
+  type JobExplain,
 } from '@/lib/fleet-client';

 export default function FleetJobDetailPage() {
@ -32,24 +34,27 @@ export default function FleetJobDetailPage() {
  const [events, setEvents] = useState<FleetEvent[]>([]);
  const [artifacts, setArtifacts] = useState<FleetArtifact[]>([]);
  const [dag, setDag] = useState<DagNode | null>(null);
+  const [explain, setExplain] = useState<JobExplain | null>(null);
  const [loading, setLoading] = useState(true);
  const [shipping, setShipping] = useState(false);
  const [acting, setActing] = useState<OperatorAction | null>(null);

  const refresh = useCallback(async () => {
    try {
-      const [j, r, e, a, d] = await Promise.all([
+      const [j, r, e, a, d, x] = await Promise.all([
        getJob(jobId),
        getJobRuns(jobId),
        getJobEvents(jobId),
        getJobArtifacts(jobId),
        getJobDag(jobId),
+        getJobExplain(jobId),
      ]);
      setJob(j);
      setRuns(r.runs);
      setEvents(e.events);
      setArtifacts(a.artifacts);
      setDag(d?.dag ?? null);
+      setExplain(x);
    } catch {
      /* degrade */
    } finally {
@ -169,6 +174,9 @@ export default function FleetJobDetailPage() {
        </section>
      )}

+      {/* Routing explainability (§7) */}
+      {explain && <ExplainPanel explain={explain} />}
+
      {/* Event timeline */}
      <section>
        <h2 className="text-lg font-semibold mb-2">Event Timeline</h2>
@ -277,3 +285,73 @@ function DagTree({ node, depth = 0 }: { node: DagNode; depth?: number }) {
    </div>
  );
 }
+
+const SCORE_TERMS: { key: keyof JobExplain['factories'][number]['breakdown']; label: string }[] = [
+  { key: 'capabilityFit', label: 'Capability' },
+  { key: 'affinity', label: 'Affinity' },
+  { key: 'load', label: 'Load' },
+  { key: 'costFit', label: 'Cost fit' },
+  { key: 'health', label: 'Health' },
+  { key: 'starvation', label: 'Starvation' },
+];
+
+function ExplainPanel({ explain }: { explain: JobExplain }) {
+  return (
+    <section>
+      <h2 className="text-lg font-semibold mb-1">Routing Explainability</h2>
+      <p className="text-muted-foreground text-xs mb-3">
+        Why this job routes where it does — the §7 weighted score per factory.{' '}
+        {explain.bestFactoryId ? (
+          <>
+            Best factory: <span className="font-mono">{explain.bestFactoryId}</span>.
+          </>
+        ) : (
+          <>No eligible factory right now.</>
+        )}
+        {!explain.depsSatisfied && (
+          <span className="text-amber-600"> Blocked on deps: {explain.unmetDeps.join(', ')}.</span>
+        )}
+      </p>
+      {explain.factories.length === 0 ? (
+        <p className="text-muted-foreground text-sm">No factories have reported in.</p>
+      ) : (
+        <table className="w-full text-sm" aria-label="Routing score breakdown">
+          <thead>
+            <tr className="border-b text-left text-muted-foreground">
+              <th className="pb-2 pr-4">Factory</th>
+              <th className="pb-2 pr-4">Score</th>
+              {SCORE_TERMS.map(t => (
+                <th key={t.key} className="pb-2 pr-4 text-right">
+                  {t.label}
+                </th>
+              ))}
+              <th className="pb-2">Eligible</th>
+            </tr>
+          </thead>
+          <tbody>
+            {explain.factories.map(f => (
+              <tr key={f.factoryId} className="border-b last:border-0">
+                <td className="py-2 pr-4 font-mono text-xs">{f.factoryId}</td>
+                <td className="py-2 pr-4 font-medium">{f.score.toFixed(2)}</td>
+                {SCORE_TERMS.map(t => (
+                  <td key={t.key} className="py-2 pr-4 text-right font-mono text-xs">
+                    {f.breakdown[t.key].toFixed(2)}
+                  </td>
+                ))}
+                <td className="py-2">
+                  {f.eligible ? (
+                    <span className="text-green-600">✓</span>
+                  ) : (
+                    <span className="text-muted-foreground" title={f.ineligibleReasons.join('; ')}>
+                      ✗
+                    </span>
+                  )}
+                </td>
+              </tr>
+            ))}
+          </tbody>
+        </table>
+      )}
+    </section>
+  );
+}
--- a/dashboards/tracker-web/src/lib/fleet-client.ts
+++ b/dashboards/tracker-web/src/lib/fleet-client.ts
@ -87,6 +87,33 @@ export interface DagNode {
  children: DagNode[];
 }

+export interface ScoreBreakdown {
+  capabilityFit: number;
+  affinity: number;
+  load: number;
+  costFit: number;
+  health: number;
+  starvation: number;
+}
+
+export interface FactoryScoreExplain {
+  factoryId: string;
+  eligible: boolean;
+  ineligibleReasons: string[];
+  score: number;
+  breakdown: ScoreBreakdown;
+}
+
+export interface JobExplain {
+  jobId: string;
+  stage: string;
+  weights: Record<string, number>;
+  depsSatisfied: boolean;
+  unmetDeps: string[];
+  factories: FactoryScoreExplain[];
+  bestFactoryId: string | null;
+}
+
 // ── Client ──────────────────────────────────────────────────────────────────

 const fleetApi = createApiClient({
@ -167,6 +194,10 @@ export async function getJobDag(jobId: string): Promise<{ dag: DagNode } | null>
  return apiFetchOptional(`/jobs/${jobId}/dag`);
 }

+export async function getJobExplain(jobId: string): Promise<JobExplain | null> {
+  return apiFetchOptional(`/jobs/${jobId}/explain`);
+}
+
 // ── Factories ───────────────────────────────────────────────────────────────

 export async function listFactories(): Promise<{ factories: FleetFactory[] }> {
--- a/services/platform-service/src/modules/fleet/coordinator.test.ts
+++ b/services/platform-service/src/modules/fleet/coordinator.test.ts
@ -837,4 +837,67 @@ describe('fleet coordinator — Phase 3 per-product budgets', () => {
    const rejectEvents = (await repo.listEvents(job.id)).filter(e => e.type === 'operator_action');
    expect(rejectEvents).toHaveLength(1);
  });
+
+  // ── Phase 3: SCORING EXPLAINABILITY ──
+  it('explainJob: returns per-factory score breakdowns, eligibility, and the best factory', async () => {
+    const { job } = await coord.submitJob(PID, input({ capabilities: ['os:mac'] }));
+    // a capable factory and an incapable one both heartbeat in
+    await coord.heartbeat({
+      productId: PID,
+      factoryId: 'cap',
+      capabilities: ['os:mac', 'has:git'],
+      health: 'ok',
+      load: 0,
+    });
+    await coord.heartbeat({
+      productId: PID,
+      factoryId: 'nocap',
+      capabilities: ['os:linux'],
+      health: 'ok',
+      load: 5,
+    });
+
+    const explain = await coord.explainJob(job.id, PID);
+    expect(explain).not.toBeNull();
+    expect(explain!.jobId).toBe(job.id);
+    expect(explain!.depsSatisfied).toBe(true);
+    expect(explain!.factories).toHaveLength(2);
+    // each breakdown's six weighted terms sum to the reported score
+    for (const f of explain!.factories) {
+      const sum =
+        f.breakdown.capabilityFit +
+        f.breakdown.affinity +
+        f.breakdown.load +
+        f.breakdown.costFit +
+        f.breakdown.health +
+        f.breakdown.starvation;
+      expect(f.score).toBeCloseTo(sum, 9);
+    }
+    const cap = explain!.factories.find(f => f.factoryId === 'cap');
+    const nocap = explain!.factories.find(f => f.factoryId === 'nocap');
+    expect(cap?.eligible).toBe(true);
+    expect(nocap?.eligible).toBe(false);
+    expect(nocap?.ineligibleReasons).toContain('missing required capabilities');
+    expect(explain!.bestFactoryId).toBe('cap');
+  });
+
+  it('explainJob: reports unmet deps and no eligible factory; unknown job is null', async () => {
+    await coord.submitJob(PID, input({ idempotencyKey: 'dep' }));
+    const { job } = await coord.submitJob(PID, input({ idempotencyKey: 'child', deps: ['dep'] }));
+    await coord.heartbeat({
+      productId: PID,
+      factoryId: 'f1',
+      capabilities: [],
+      health: 'ok',
+      load: 0,
+    });
+
+    const explain = await coord.explainJob(job.id, PID);
+    expect(explain!.depsSatisfied).toBe(false);
+    expect(explain!.unmetDeps).toContain('dep');
+    expect(explain!.bestFactoryId).toBeNull(); // deps unmet ⇒ nothing eligible
+    expect(explain!.factories[0].ineligibleReasons.some(r => r.includes('unmet deps'))).toBe(true);
+
+    expect(await coord.explainJob('missing', PID)).toBeNull();
+  });
 });
--- a/services/platform-service/src/modules/fleet/coordinator.ts
+++ b/services/platform-service/src/modules/fleet/coordinator.ts
@ -23,7 +23,10 @@ import * as repo from './repository.js';
 import {
  selectJob,
  selectPreemptionVictim,
+  scoreCandidate,
+  capabilitiesSubset,
  type RunningJobView,
+  type ScoreBreakdown,
  type SchedulerContext,
  type SchedulerFactory,
  type SchedulerWeights,
@ -909,7 +912,83 @@ export async function operatorAction(
  return { ok: true, doc: res.doc };
 }

-// ── Heartbeat (§8) ────────────────────────────────────────────────────────────
+// ── Scoring explainability (§7 / Phase 3 — "why does this job route here?") ────
+
+/** One factory's scored explanation for a job (already-weighted breakdown). */
+export interface FactoryScoreExplain {
+  factoryId: string;
+  eligible: boolean;
+  ineligibleReasons: string[];
+  score: number;
+  breakdown: ScoreBreakdown;
+}
+
+/** Full explainability payload for a job against the current fleet. */
+export interface JobExplain {
+  jobId: string;
+  stage: FleetStage;
+  weights: SchedulerWeights;
+  depsSatisfied: boolean;
+  unmetDeps: string[];
+  factories: FactoryScoreExplain[];
+  bestFactoryId: string | null;
+}
+
+/**
+ * Explain how a job would be scored against every live factory for its product
+ * (§7 scoring surfaced for the control plane). Read-only and side-effect free:
+ * it re-runs the same `scoreCandidate` the scheduler uses, against persisted
+ * factory state, so operators can see WHY a job routes (or fails to route).
+ *
+ * Affinity (prefers-engine) and cost-fit depend on claim-time hints that are not
+ * persisted on the factory doc, so they score as neutral here — the breakdown
+ * reflects the structural signals available from stored state.
+ */
+export async function explainJob(jobId: string, productId: string): Promise<JobExplain | null> {
+  const job = await repo.getJob(jobId, productId);
+  if (!job) return null;
+
+  const weights = resolveWeights(weightRegistry, productId);
+  const unmet = await unmetDeps(job);
+  const depsSatisfied = unmet.length === 0;
+  const factories = await repo.listFactories(productId);
+  const ctx: SchedulerContext = { now: Date.now() };
+
+  const scored: FactoryScoreExplain[] = factories.map(f => {
+    const sf: SchedulerFactory = {
+      capabilities: f.capabilities,
+      health: f.health,
+      load: f.load,
+      seatLimit: f.seatLimit,
+    };
+    const { score, breakdown } = scoreCandidate(job, sf, ctx, weights);
+    const reasons: string[] = [];
+    if ((f.health ?? 'ok') === 'down') reasons.push('factory health is down');
+    if (!capabilitiesSubset(job.capabilities ?? [], f.capabilities)) {
+      reasons.push('missing required capabilities');
+    }
+    if (!depsSatisfied) reasons.push(`unmet deps: ${unmet.join(', ')}`);
+    return {
+      factoryId: f.factoryId,
+      eligible: reasons.length === 0,
+      ineligibleReasons: reasons,
+      score,
+      breakdown,
+    };
+  });
+  scored.sort((a, b) => b.score - a.score);
+  const best = scored.find(s => s.eligible) ?? null;
+
+  return {
+    jobId,
+    stage: job.stage,
+    weights,
+    depsSatisfied,
+    unmetDeps: unmet,
+    factories: scored,
+    bestFactoryId: best?.factoryId ?? null,
+  };
+}

+// ── Heartbeat (§8) ────────────────────────────────────────────────────────────
+
 export interface HeartbeatContext {
  productId: string;
--- a/services/platform-service/src/modules/fleet/routes.test.ts
+++ b/services/platform-service/src/modules/fleet/routes.test.ts
@ -165,4 +165,28 @@ describe('fleetRoutes', () => {
    });
    expect(bad.statusCode).toBe(400);
  });
+
+  it('GET /fleet/jobs/:id/explain returns a per-factory score breakdown', async () => {
+    const app = await buildApp();
+    const sub = await submit(app, { idempotencyKey: 'k1', bodyMd: '# task' });
+    const jobId = JSON.parse(sub.body).job.id as string;
+    await app.inject({
+      method: 'POST',
+      url: '/api/fleet/factories/heartbeat',
+      payload: { factoryId: 'fac_1', capabilities: [], health: 'ok' },
+    });
+
+    const explain = await app.inject({
+      method: 'GET',
+      url: `/api/fleet/jobs/${jobId}/explain`,
+    });
+    expect(explain.statusCode).toBe(200);
+    const body = JSON.parse(explain.body);
+    expect(body.jobId).toBe(jobId);
+    expect(body.factories).toHaveLength(1);
+    expect(body.bestFactoryId).toBe('fac_1');
+
+    const missing = await app.inject({ method: 'GET', url: '/api/fleet/jobs/nope/explain' });
+    expect(missing.statusCode).toBe(404);
+  });
 });
--- a/services/platform-service/src/modules/fleet/routes.ts
+++ b/services/platform-service/src/modules/fleet/routes.ts
@ -237,6 +237,16 @@ export async function fleetRoutes(app: FastifyInstance) {
    return { events };
  });

+  // ── Scoring explainability — why does this job route where it does? (§7) ──
+  app.get('/fleet/jobs/:id/explain', async req => {
+    await extractAuth(req);
+    const { id } = req.params as { id: string };
+    const pid = getRequestProductId(req);
+    const explain = await coordinator.explainJob(id, pid);
+    if (!explain) throw new NotFoundError('Job not found');
+    return explain;
+  });
+
  // ── Artifacts: upload (base64 body → blob + pointer) ──
  app.post('/fleet/jobs/:id/artifacts', async (req, reply) => {
    await extractAuth(req);