feat(fleet): Prometheus metrics export + Grafana dashboard (ops #4)

Exports fleet observability to Prometheus/Grafana (previously JSON-only).

- GET /api/fleet/metrics/prom: global, product-labelled Prometheus exposition
  (queue depth, blocked/active, per-stage histogram, factory health/seats/
  utilization, active alerts, budget spent/ceiling/projected) plus process-wide
  reaper/GC counters and engine circuit-breaker state. Pure renderer
  (renderFleetMetricsProm) is unit-tested; route auth accepts a FLEET_METRICS_TOKEN
  bearer (scrape path) or an admin JWT — never world-readable by default.
- Infra: add a prometheus container to docker-compose + a platform-service-fleet
  scrape job; pin the Prometheus Grafana datasource uid; add a provisioned
  "Fleet Overview" dashboard (breakers, dead-letter, stale factories, alerts,
  queue depth, utilization, budget burn, reaper rate) with a product template var.
- Document FLEET_METRICS_TOKEN + the fleet feature flags in .env.example.

No default behavior change: the endpoint is additive and the new container is
opt-in via the compose stack.

Generated with [Devin](https://cli.devin.ai/docs)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
saravanakumardb1 2026-06-01 22:24:03 -07:00
parent acf7c36cda
commit 93d1caf4a2
9 changed files with 652 additions and 0 deletions

View File

@ -99,3 +99,15 @@ RUST_RUNTIME_TIMEOUT_MS=300000
OLLAMA_URL=http://localhost:11434/v1
OLLAMA_MODELS=
FEATURE_FLAGS_ENABLED=true
# ── Fleet ops/observability ───────────────────────────────────
# Bearer token Prometheus uses to scrape GET /api/fleet/metrics/prom. Must match
# the `credentials` in services/monitoring/prometheus/prometheus.yml. When unset,
# the endpoint requires an admin JWT instead (so it is never world-readable).
FLEET_METRICS_TOKEN=changeme-fleet-metrics-token
# Fleet feature flags (default OFF): cost/latency routing, per-engine breaker,
# per-product/-engine budget enforcement, and multi-tenant access enforcement.
FLEET_COST_ROUTING=
FLEET_ENGINE_BREAKER=
FLEET_BUDGETS=
FLEET_TENANT_ENFORCEMENT=

View File

@ -96,6 +96,23 @@ services:
timeout: 5s
retries: 3
# ── Prometheus (fleet + infra metrics scrape) ─────────────────
prometheus:
image: prom/prometheus:v3.1.0
ports:
- '9090:9090'
volumes:
- ./services/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus-data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
restart: unless-stopped
healthcheck:
test: ['CMD', 'wget', '-q', '--spider', 'http://127.0.0.1:9090/-/ready']
interval: 15s
timeout: 5s
retries: 3
# ── API Gateway (Traefik) ───────────────────────────────────
gateway:
image: traefik:v3.3
@ -282,3 +299,4 @@ volumes:
azurite-data:
loki-data:
grafana-data:
prometheus-data:

View File

@ -0,0 +1,184 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"title": "Fleet Overview",
"tags": ["fleet", "gigafactory"],
"uid": "fleet-overview",
"schemaVersion": 38,
"version": 1,
"refresh": "30s",
"time": { "from": "now-6h", "to": "now" },
"timepicker": { "refresh_intervals": ["15s", "30s", "1m", "5m", "15m", "1h"] },
"templating": {
"list": [
{
"name": "product",
"type": "query",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"query": "label_values(fleet_queue_depth, product)",
"refresh": 2,
"includeAll": true,
"multi": true,
"current": { "text": "All", "value": "$__all" }
}
]
},
"panels": [
{
"title": "Open circuit breakers",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
}
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [{ "refId": "A", "expr": "fleet_engine_breaker_open_count" }]
},
{
"title": "Dead-letter jobs",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 }
]
}
}
},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"refId": "A",
"expr": "sum(fleet_jobs_by_stage{product=~\"$product\",stage=\"dead_letter\"})"
}
]
},
{
"title": "Stale factories",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [{ "refId": "A", "expr": "sum(fleet_factories_stale{product=~\"$product\"})" }]
},
{
"title": "Active alerts",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{ "refId": "A", "expr": "count(fleet_alert_active{product=~\"$product\"}) or vector(0)" }
]
},
{
"title": "Queue depth",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
"targets": [
{
"refId": "A",
"expr": "fleet_queue_depth{product=~\"$product\"}",
"legendFormat": "{{product}}"
}
]
},
{
"title": "Seat utilization %",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"fieldConfig": {
"defaults": { "unit": "percent", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
},
"targets": [
{
"refId": "A",
"expr": "fleet_utilization_pct{product=~\"$product\"}",
"legendFormat": "{{product}}"
}
]
},
{
"title": "Budget: spent vs ceiling (USD)",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"fieldConfig": {
"defaults": { "unit": "currencyUSD", "custom": { "drawStyle": "line", "fillOpacity": 10 } }
},
"targets": [
{
"refId": "A",
"expr": "fleet_budget_spent_usd{product=~\"$product\"}",
"legendFormat": "{{product}} spent"
},
{
"refId": "B",
"expr": "fleet_budget_ceiling_usd{product=~\"$product\"}",
"legendFormat": "{{product}} ceiling"
},
{
"refId": "C",
"expr": "fleet_budget_projected_usd{product=~\"$product\"}",
"legendFormat": "{{product}} projected"
}
]
},
{
"title": "Reaper reclaims (rate/5m)",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 10 } } },
"targets": [
{
"refId": "A",
"expr": "rate(fleet_reaper_expired_reclaimed_total[5m])",
"legendFormat": "expired"
},
{
"refId": "B",
"expr": "rate(fleet_reaper_stale_reclaimed_total[5m])",
"legendFormat": "stale"
}
]
}
]
}

View File

@ -3,6 +3,7 @@ apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
access: proxy
url: http://prometheus:9090
editable: false

View File

@ -8,6 +8,21 @@ scrape_configs:
- targets:
- prometheus:9090
# Fleet coordinator metrics (queue depth, factory health, reaper, breakers,
# budgets). The endpoint requires a bearer token — set `credentials` below to
# the same value as platform-service's FLEET_METRICS_TOKEN (.env). The default
# here is a non-secret placeholder for the local prototype; change it for any
# shared/remote deployment.
- job_name: platform-service-fleet
metrics_path: /api/fleet/metrics/prom
scheme: http
authorization:
type: Bearer
credentials: changeme-fleet-metrics-token
static_configs:
- targets:
- platform-service:4003
- job_name: node-exporter
static_configs:
- targets:

View File

@ -0,0 +1,179 @@
/**
* Fleet Prometheus exposition (§4) pure renderer unit tests.
*/
import { describe, it, expect, afterEach } from 'vitest';
import { renderFleetMetricsProm, scrapeTokenMatches, type FleetPromInput } from './prometheus.js';
import type { FleetMetrics } from './coordinator.js';
import type { ReaperStats } from './reaper.js';
function metrics(over: Partial<FleetMetrics> = {}): FleetMetrics {
return {
productId: 'lysnrai',
generatedAt: '2026-06-01T00:00:00.000Z',
jobs: {
total: 3,
byStage: { queued: 2, shipped: 1 },
queueDepth: 2,
blocked: 0,
active: 0,
oldestQueuedAgeMs: 5000,
},
factories: {
total: 2,
live: 1,
stale: 1,
byHealth: { ok: 1, degraded: 0, down: 0 },
seatsUsed: 1,
seatsTotal: 4,
utilizationPct: 25,
},
budget: null,
alerts: [],
...over,
} as FleetMetrics;
}
const reaper: ReaperStats = {
running: true,
startedAt: null,
lastReclaimAt: null,
lastSweepAt: null,
totals: {
expiredReclaimed: 5,
staleReclaimed: 2,
leasesDeleted: 10,
factoriesDeleted: 0,
tokensDeleted: 0,
jobsDeleted: 3,
runsDeleted: 0,
eventsDeleted: 0,
artifactsDeleted: 0,
},
};
const base: FleetPromInput = {
products: [{ productId: 'lysnrai', metrics: metrics() }],
reaper,
breakers: [],
};
describe('renderFleetMetricsProm', () => {
it('emits HELP/TYPE headers and product-labelled gauges', () => {
const out = renderFleetMetricsProm(base);
expect(out).toContain('# TYPE fleet_queue_depth gauge');
expect(out).toContain('fleet_queue_depth{product="lysnrai"} 2');
expect(out).toContain('fleet_utilization_pct{product="lysnrai"} 25');
expect(out).toContain('fleet_jobs_by_stage{product="lysnrai",stage="queued"} 2');
});
it('emits global reaper counters and a running gauge', () => {
const out = renderFleetMetricsProm(base);
expect(out).toContain('# TYPE fleet_reaper_expired_reclaimed_total counter');
expect(out).toContain('fleet_reaper_expired_reclaimed_total 5');
expect(out).toContain('fleet_reaper_jobs_deleted_total 3');
expect(out).toContain('fleet_reaper_running 1');
});
it('emits one alert series per active alert, labelled by code + level', () => {
const out = renderFleetMetricsProm({
...base,
products: [
{
productId: 'lysnrai',
metrics: metrics({
alerts: [{ level: 'warning', code: 'dead_letter', message: 'x' }],
}),
},
],
});
expect(out).toContain(
'fleet_alert_active{product="lysnrai",code="dead_letter",level="warning"} 1'
);
});
it('emits budget gauges only when a budget is configured', () => {
const withBudget = renderFleetMetricsProm({
...base,
products: [
{
productId: 'lysnrai',
metrics: metrics({
budget: {
ceilingUsd: 100,
spentUsd: 40,
status: 'active',
window: 'monthly',
projectedUsd: 300,
engines: [],
},
}),
},
],
});
expect(withBudget).toContain('fleet_budget_spent_usd{product="lysnrai"} 40');
expect(withBudget).toContain('fleet_budget_projected_usd{product="lysnrai"} 300');
// no budget ⇒ no budget series
expect(renderFleetMetricsProm(base)).not.toContain('fleet_budget_spent_usd');
});
it('counts and labels only tripped breakers', () => {
const out = renderFleetMetricsProm({
...base,
breakers: [
{
factoryId: 'fac_1',
engine: 'codex',
state: 'OPEN',
failureCount: 3,
lastFailureAt: null,
},
{
factoryId: 'fac_1',
engine: 'devin',
state: 'CLOSED',
failureCount: 0,
lastFailureAt: null,
},
],
});
expect(out).toContain(
'fleet_engine_breaker_open{factory="fac_1",engine="codex",state="OPEN"} 1'
);
expect(out).not.toContain('engine="devin"');
expect(out).toContain('fleet_engine_breaker_open_count 1');
});
it('escapes special characters in label values', () => {
const out = renderFleetMetricsProm({
...base,
breakers: [
{
factoryId: 'fac"x',
engine: 'a\\b',
state: 'HALF_OPEN',
failureCount: 1,
lastFailureAt: null,
},
],
});
expect(out).toContain('factory="fac\\"x",engine="a\\\\b"');
});
});
describe('scrapeTokenMatches', () => {
afterEach(() => {
delete process.env.FLEET_METRICS_TOKEN;
});
it('is false when no token is configured (forces admin-auth fallback)', () => {
expect(scrapeTokenMatches('Bearer anything')).toBe(false);
});
it('matches a correct bearer token and rejects a wrong / missing one', () => {
process.env.FLEET_METRICS_TOKEN = 'secret';
expect(scrapeTokenMatches('Bearer secret')).toBe(true);
expect(scrapeTokenMatches('Bearer nope')).toBe(false);
expect(scrapeTokenMatches(undefined)).toBe(false);
});
});

View File

@ -0,0 +1,203 @@
/**
* Prometheus exposition for fleet metrics (§4 ops export).
*
* Renders the per-product `FleetMetrics` + the process-wide reaper/GC counters +
* the engine circuit-breaker snapshot into the Prometheus text format
* (`text/plain; version=0.0.4`). PURE + synchronous the route does the I/O
* (scan products, compute metrics) and hands the snapshot here, which keeps the
* formatting fully unit-testable.
*/
import type { FleetMetrics } from './coordinator.js';
import type { ReaperStats } from './reaper.js';
import type { EngineBreakerEntry } from './engine-breaker.js';
/** One product's computed metrics, paired with its id for labelling. */
export interface ProductMetrics {
productId: string;
metrics: FleetMetrics;
}
export interface FleetPromInput {
products: ProductMetrics[];
reaper: ReaperStats;
breakers: EngineBreakerEntry[];
}
/** Escape a Prometheus label value (backslash, double-quote, newline). */
function esc(v: string): string {
return v.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n');
}
/** Render one labelled series line. `labels` is an ordered [key,value][]. */
function line(name: string, labels: [string, string][], value: number): string {
if (labels.length === 0) return `${name} ${value}`;
const inner = labels.map(([k, v]) => `${k}="${esc(v)}"`).join(',');
return `${name}{${inner}} ${value}`;
}
interface Metric {
name: string;
help: string;
type: 'gauge' | 'counter';
rows: string[];
}
/**
* Render the full fleet exposition. Deterministic ordering (products as given,
* then global counters) so output diffs are stable and testable.
*/
export function renderFleetMetricsProm(input: FleetPromInput): string {
const m: Record<string, Metric> = {};
const def = (name: string, type: Metric['type'], help: string): Metric =>
(m[name] ??= { name, help, type, rows: [] });
for (const { productId, metrics } of input.products) {
const p: [string, string][] = [['product', productId]];
def('fleet_jobs_total', 'gauge', 'Total jobs for the product').rows.push(
line('fleet_jobs_total', p, metrics.jobs.total)
);
def('fleet_queue_depth', 'gauge', 'Jobs currently queued').rows.push(
line('fleet_queue_depth', p, metrics.jobs.queueDepth)
);
def('fleet_jobs_blocked', 'gauge', 'Jobs blocked on unmet dependencies').rows.push(
line('fleet_jobs_blocked', p, metrics.jobs.blocked)
);
def('fleet_jobs_active', 'gauge', 'Jobs in an active (claimed/running) stage').rows.push(
line('fleet_jobs_active', p, metrics.jobs.active)
);
def('fleet_oldest_queued_age_ms', 'gauge', 'Age of the oldest queued job (ms)').rows.push(
line('fleet_oldest_queued_age_ms', p, metrics.jobs.oldestQueuedAgeMs ?? 0)
);
// Per-stage histogram of the job lifecycle.
const stage = def('fleet_jobs_by_stage', 'gauge', 'Jobs by lifecycle stage');
for (const [s, n] of Object.entries(metrics.jobs.byStage)) {
stage.rows.push(
line(
'fleet_jobs_by_stage',
[
['product', productId],
['stage', s],
],
n
)
);
}
def('fleet_factories_total', 'gauge', 'Registered factories').rows.push(
line('fleet_factories_total', p, metrics.factories.total)
);
def('fleet_factories_live', 'gauge', 'Factories seen within the stale window').rows.push(
line('fleet_factories_live', p, metrics.factories.live)
);
def('fleet_factories_stale', 'gauge', 'Factories past the heartbeat stale window').rows.push(
line('fleet_factories_stale', p, metrics.factories.stale)
);
def('fleet_seats_used', 'gauge', 'Occupied factory seats').rows.push(
line('fleet_seats_used', p, metrics.factories.seatsUsed)
);
def('fleet_seats_total', 'gauge', 'Total advertised factory seats').rows.push(
line('fleet_seats_total', p, metrics.factories.seatsTotal)
);
def('fleet_utilization_pct', 'gauge', 'Seat utilization percentage').rows.push(
line('fleet_utilization_pct', p, metrics.factories.utilizationPct)
);
// One active-alert series per alert code (value 1), labelled by level.
const alert = def('fleet_alert_active', 'gauge', 'Active fleet alert (1 = firing)');
for (const a of metrics.alerts) {
alert.rows.push(
line(
'fleet_alert_active',
[
['product', productId],
['code', a.code],
['level', a.level],
],
1
)
);
}
if (metrics.budget) {
def('fleet_budget_spent_usd', 'gauge', 'Budget spent this window (USD)').rows.push(
line('fleet_budget_spent_usd', p, metrics.budget.spentUsd)
);
def('fleet_budget_ceiling_usd', 'gauge', 'Budget ceiling (USD)').rows.push(
line('fleet_budget_ceiling_usd', p, metrics.budget.ceilingUsd)
);
if (metrics.budget.projectedUsd !== null) {
def('fleet_budget_projected_usd', 'gauge', 'Projected end-of-window spend (USD)').rows.push(
line('fleet_budget_projected_usd', p, metrics.budget.projectedUsd)
);
}
}
}
// ── Process-wide reaper / GC counters ──
const r = input.reaper.totals;
const rc = (name: string, help: string, value: number): void => {
def(name, 'counter', help).rows.push(line(name, [], value));
};
rc(
'fleet_reaper_expired_reclaimed_total',
'Jobs reclaimed from expired leases',
r.expiredReclaimed
);
rc('fleet_reaper_stale_reclaimed_total', 'Jobs reclaimed from stale factories', r.staleReclaimed);
rc('fleet_reaper_leases_deleted_total', 'Finished leases garbage-collected', r.leasesDeleted);
rc('fleet_reaper_jobs_deleted_total', 'Terminal jobs garbage-collected', r.jobsDeleted);
def('fleet_reaper_running', 'gauge', 'Reaper loop running (1/0)').rows.push(
line('fleet_reaper_running', [], input.reaper.running ? 1 : 0)
);
// ── Engine circuit breakers (process-wide) ──
const open = def(
'fleet_engine_breaker_open',
'gauge',
'Engine breaker tripped (1 = OPEN/HALF_OPEN)'
);
let openCount = 0;
for (const b of input.breakers) {
if (b.state === 'CLOSED') continue;
openCount += 1;
open.rows.push(
line(
'fleet_engine_breaker_open',
[
['factory', b.factoryId],
['engine', b.engine],
['state', b.state],
],
1
)
);
}
def(
'fleet_engine_breaker_open_count',
'gauge',
'Number of tripped (factory,engine) breakers'
).rows.push(line('fleet_engine_breaker_open_count', [], openCount));
const out: string[] = [];
for (const metric of Object.values(m)) {
out.push(`# HELP ${metric.name} ${metric.help}`);
out.push(`# TYPE ${metric.name} ${metric.type}`);
out.push(...metric.rows);
}
out.push('');
return out.join('\n');
}
/**
* Authorize a metrics scrape. When `FLEET_METRICS_TOKEN` is set, a matching
* `Authorization: Bearer <token>` is accepted (the Prometheus scrape path).
* Returns false when the token is unset (caller falls back to admin auth) or
* the bearer does not match so the endpoint is never world-readable by default.
*/
export function scrapeTokenMatches(authHeader: string | undefined): boolean {
const token = (process.env.FLEET_METRICS_TOKEN ?? '').trim();
if (!token) return false;
if (!authHeader) return false;
const m = authHeader.match(/^Bearer\s+(.+)$/i);
return m?.[1] === token;
}

View File

@ -116,6 +116,17 @@ describe('fleetRoutes', () => {
expect(JSON.parse(metrics.body).jobs.byStage.shipped).toBe(1);
});
it('GET /fleet/metrics/prom renders Prometheus exposition (admin)', async () => {
const app = await buildApp();
await submit(app, { idempotencyKey: 'prom-1', bodyMd: '# task' });
const res = await app.inject({ method: 'GET', url: '/api/fleet/metrics/prom' });
expect(res.statusCode).toBe(200);
expect(res.headers['content-type']).toContain('text/plain; version=0.0.4');
expect(res.body).toContain('# TYPE fleet_queue_depth gauge');
expect(res.body).toContain('fleet_reaper_running');
expect(res.body).toContain('fleet_engine_breaker_open_count');
});
it('release with insights records run cost/tokens (factory reporting)', async () => {
const app = await buildApp();
const sub = await submit(app, { idempotencyKey: 'ins-1', bodyMd: '# task' });

View File

@ -35,6 +35,8 @@ import * as enrollment from './enrollment.js';
import * as trackerBridge from './tracker-bridge.js';
import { getReaperStats } from './reaper.js';
import { getEngineBreakerSnapshot } from './engine-breaker.js';
import { renderFleetMetricsProm, scrapeTokenMatches } from './prometheus.js';
import { getAllProducts } from '../products/cache.js';
import {
SubmitJobSchema,
ListJobsQuerySchema,
@ -455,6 +457,33 @@ export async function fleetRoutes(app: FastifyInstance) {
return { ...metrics, reaper: getReaperStats(), engineBreakers: getEngineBreakerSnapshot() };
});
// ── Prometheus exposition for fleet metrics (§4 ops export) ──
// GLOBAL (all products, labelled) so a single scrape target covers the fleet.
// Auth: a matching `FLEET_METRICS_TOKEN` bearer (the Prometheus scrape path),
// else an admin/super_admin JWT — never world-readable by default.
app.get('/fleet/metrics/prom', async (req, reply) => {
if (!scrapeTokenMatches(req.headers['authorization'])) {
const auth = await extractAuth(req);
if (auth.role !== 'admin' && auth.role !== 'super_admin') {
throw new ForbiddenError('admin role or a metrics scrape token is required');
}
}
const products = getAllProducts();
const perProduct = await Promise.all(
products.map(async p => ({
productId: p.productId,
metrics: await coordinator.fleetMetrics(p.productId),
}))
);
const body = renderFleetMetricsProm({
products: perProduct,
reaper: getReaperStats(),
breakers: getEngineBreakerSnapshot(),
});
reply.type('text/plain; version=0.0.4; charset=utf-8');
return body;
});
// ── M0 RU gate: per-product queue version (cheap ~1 RU point read) ──
// A polling factory reads this each tick and only runs the expensive claim when
// `version` changed since its last attempt. See