fix(dashboards): restore runtime and expose internal ops health

This commit is contained in:
root 2026-03-31 07:26:43 +00:00
parent b8661392c6
commit 534395bb5e
6 changed files with 3241 additions and 8575 deletions

View File

@ -50,4 +50,4 @@ USER nextjs
EXPOSE 3001
CMD ["node", "server.js"]
CMD ["node", "dashboards/admin-web/server.js"]

View File

@ -20,6 +20,8 @@ import {
interface ServiceCheck {
id: string;
name: string;
group: string;
target: string;
status: 'healthy' | 'degraded' | 'down' | 'maintenance';
latency: number;
version?: string;
@ -146,6 +148,13 @@ export default function OpsPage() {
</div>
<div className="space-y-1 mt-3">
<div className="flex justify-between text-xs">
<span className="text-muted-foreground">Group</span>
<span className="font-medium">{svc.group}</span>
</div>
<div className="text-xs text-muted-foreground font-mono break-all">
{svc.target}
</div>
<div className="flex justify-between text-xs">
<span className="text-muted-foreground">Uptime (30d)</span>
<span className="font-medium">99.9%</span>

View File

@ -1,15 +1,19 @@
import net from 'node:net';
import { NextResponse } from 'next/server';
export const dynamic = 'force-dynamic'; // No caching
export const dynamic = 'force-dynamic';
type ServiceStatus = 'healthy' | 'degraded' | 'down' | 'maintenance';
type CheckKind = 'http-json' | 'http-status' | 'tcp';
interface ServiceCheck {
id: string;
name: string;
url: string;
status: 'healthy' | 'degraded' | 'down' | 'maintenance';
group: string;
target: string;
status: ServiceStatus;
latency: number;
version?: string;
uptime?: number;
message?: string;
lastChecked: string;
}
@ -20,86 +24,230 @@ interface OpsStatus {
services: ServiceCheck[];
}
const SERVICES = [
/**
 * Describes a service whose health is probed with an HTTP GET request.
 *
 * The probe URL is the resolved base URL followed by `path`. A non-OK HTTP
 * response or a failed/timed-out request is reported as `down`.
 */
interface HttpServiceDefinition {
// Identifier copied verbatim into the resulting ServiceCheck.
id: string;
// Human-readable label copied into the resulting ServiceCheck.
name: string;
// Logical grouping label copied into the resulting ServiceCheck (shown as "Group" on the ops page).
group: string;
// 'http-json': the response body is parsed as JSON and its status fields decide healthy vs degraded.
// 'http-status': any OK HTTP response counts as healthy; the body is not inspected.
kind: 'http-json' | 'http-status';
// Optional name of an environment variable; when it is set to a non-empty value it replaces `default`.
env?: string;
// Base URL used when `env` is absent, unset, or empty.
default: string;
// Path appended directly to the base URL to form the probe target (no separator is inserted).
path: string;
}
/**
 * Describes a service whose health is probed by opening a plain TCP connection.
 *
 * The service counts as healthy as soon as the connection is established;
 * no protocol-level exchange is performed on the socket.
 */
interface TcpServiceDefinition {
// Identifier copied verbatim into the resulting ServiceCheck.
id: string;
// Human-readable label copied into the resulting ServiceCheck.
name: string;
// Logical grouping label copied into the resulting ServiceCheck.
group: string;
// Literal tag that distinguishes this variant from the HTTP-based definitions.
kind: 'tcp';
// Host passed to net.createConnection; also forms the `host:port` target string.
host: string;
// Port passed to net.createConnection.
port: number;
}
// Union of all probe definitions; the `kind` field is the discriminant
// (GET routes 'tcp' entries to checkTcpService and everything else to checkHttpService).
type ServiceDefinition = HttpServiceDefinition | TcpServiceDefinition;
const SERVICES: ServiceDefinition[] = [
{
id: 'backend',
name: 'Backend API',
env: 'API_BASE_URL',
default: 'http://localhost:8000',
path: '/health',
id: 'admin-web',
name: 'Admin Dashboard',
group: 'Dashboards',
kind: 'http-status',
default: 'http://admin-web:3001',
path: '/api/health',
},
{
id: 'tracker-web',
name: 'Tracker Dashboard',
group: 'Dashboards',
kind: 'http-status',
default: 'http://tracker-web:3003',
path: '/api/health',
},
{
id: 'platform',
name: 'Platform Service',
group: 'Core Services',
env: 'PLATFORM_SERVICE_URL',
default: 'http://localhost:4003',
kind: 'http-json',
default: 'http://platform-service:4003',
path: '/health',
},
{
id: 'extraction',
name: 'Extraction Service',
group: 'Core Services',
env: 'EXTRACTION_SERVICE_URL',
default: 'http://localhost:4005',
kind: 'http-json',
default: 'http://extraction-service:4005',
path: '/health',
},
{
id: 'mcp',
name: 'MCP Server',
group: 'Core Services',
env: 'MCP_SERVER_URL',
kind: 'http-json',
default: 'http://mcp-server:4007',
path: '/health',
},
{
id: 'grafana',
name: 'Grafana',
group: 'Observability',
kind: 'http-json',
default: 'http://grafana:3000',
path: '/api/health',
},
{
id: 'loki',
name: 'Loki',
group: 'Observability',
kind: 'http-status',
default: 'http://loki:3100',
path: '/ready',
},
{
id: 'prometheus',
name: 'Prometheus',
group: 'Observability',
kind: 'http-status',
default: 'http://prometheus:9090',
path: '/-/healthy',
},
{
id: 'node-exporter',
name: 'Node Exporter',
group: 'Observability',
kind: 'http-status',
default: 'http://node-exporter:9100',
path: '/metrics',
},
{
id: 'cadvisor',
name: 'cAdvisor',
group: 'Observability',
kind: 'http-status',
default: 'http://cadvisor:8080',
path: '/healthz',
},
{
id: 'valkey',
name: 'Valkey',
group: 'Shared Infrastructure',
kind: 'tcp',
host: 'valkey',
port: 6379,
},
];
export async function GET() {
const checks = await Promise.all(
SERVICES.map(async svc => {
const baseUrl = process.env[svc.env] || svc.default;
const url = `${baseUrl}${svc.path}`;
async function checkHttpService(service: HttpServiceDefinition): Promise<ServiceCheck> {
const baseUrl = (service.env && process.env[service.env]) || service.default;
const target = `${baseUrl}${service.path}`;
const start = Date.now();
try {
const res = await fetch(url, {
const res = await fetch(target, {
method: 'GET',
headers: { 'Content-Type': 'application/json' },
next: { revalidate: 0 },
signal: AbortSignal.timeout(3000), // 3s timeout
signal: AbortSignal.timeout(3000),
});
const latency = Date.now() - start;
if (!res.ok) {
return {
id: svc.id,
name: svc.name,
url,
id: service.id,
name: service.name,
group: service.group,
target,
status: 'down',
latency,
message: `HTTP ${res.status}`,
lastChecked: new Date().toISOString(),
} as ServiceCheck;
};
}
const json = await res.json();
// Assuming standard health response: { status: "ok", version: "0.1.0" }
// Fastify services return { status: "ok" }
const isOk = json.status === 'ok';
if (service.kind === 'http-json') {
const payload = await res.json().catch(() => null);
const rawStatus = payload?.status;
const isOk =
rawStatus === 'ok' ||
rawStatus === 'healthy' ||
payload?.database === 'ok' ||
payload?.commit === 'ok';
return {
id: svc.id,
name: svc.name,
url,
id: service.id,
name: service.name,
group: service.group,
target,
status: isOk ? 'healthy' : 'degraded',
latency,
version: json.version,
message: isOk ? undefined : JSON.stringify(json),
version: payload?.version,
message: isOk ? undefined : JSON.stringify(payload),
lastChecked: new Date().toISOString(),
} as ServiceCheck;
};
}
return {
id: service.id,
name: service.name,
group: service.group,
target,
status: 'healthy',
latency,
lastChecked: new Date().toISOString(),
};
} catch (err) {
return {
id: svc.id,
name: svc.name,
url,
id: service.id,
name: service.name,
group: service.group,
target,
status: 'down',
latency: Date.now() - start,
message: err instanceof Error ? err.message : String(err),
lastChecked: new Date().toISOString(),
} as ServiceCheck;
};
}
})
}
/**
 * Probes a TCP service by trying to open a connection to its host and port.
 *
 * The result is `healthy` once the socket connects. A socket error or a
 * 3000 ms socket timeout yields `down` with an explanatory message. Only the
 * first of these events is reported; later ones are ignored.
 *
 * @param service - TCP service definition supplying identity, host and port.
 * @returns The check result, with `target` set to `host:port` and `latency`
 *   measured in milliseconds from just before the socket is created until
 *   the outcome is known.
 */
async function checkTcpService(service: TcpServiceDefinition): Promise<ServiceCheck> {
  const startedAt = Date.now();
  const { host, port } = service;
  const target = `${host}:${port}`;

  return new Promise<ServiceCheck>(resolve => {
    const socket = net.createConnection({ host, port });
    let reported = false;

    // Resolves the promise exactly once and releases the socket.
    function report(status: ServiceStatus, message?: string): void {
      if (reported) {
        return;
      }
      reported = true;
      socket.destroy();

      const result: ServiceCheck = {
        id: service.id,
        name: service.name,
        group: service.group,
        target,
        status,
        latency: Date.now() - startedAt,
        message,
        lastChecked: new Date().toISOString(),
      };
      resolve(result);
    }

    socket.setTimeout(3000);
    socket.once('connect', () => {
      report('healthy');
    });
    socket.once('timeout', () => {
      report('down', 'Connection timed out');
    });
    socket.once('error', error => {
      report('down', error.message);
    });
  });
}
export async function GET() {
const checks = await Promise.all(
SERVICES.map(service =>
service.kind === 'tcp' ? checkTcpService(service) : checkHttpService(service)
)
);
const downCount = checks.filter(c => c.status === 'down').length;
@ -109,11 +257,9 @@ export async function GET() {
if (downCount > 0) overall = 'critical';
else if (degradedCount > 0) overall = 'degraded';
const response: OpsStatus = {
return NextResponse.json({
overall,
timestamp: new Date().toISOString(),
services: checks,
};
return NextResponse.json(response);
} satisfies OpsStatus);
}

View File

@ -46,4 +46,4 @@ USER nextjs
EXPOSE 3003
CMD ["node", "server.js"]
CMD ["node", "dashboards/tracker-web/server.js"]

View File

@ -345,6 +345,7 @@ services:
- PORT=3001
- PLATFORM_SERVICE_URL=http://platform-service:4003
- EXTRACTION_SERVICE_URL=http://extraction-service:4005
- SEED_SECRET=${SEED_SECRET:-dev-seed-secret}
depends_on:
platform-service:
condition: service_healthy
@ -366,6 +367,7 @@ services:
environment:
- PORT=3003
- PLATFORM_SERVICE_URL=http://platform-service:4003
- PLATFORM_API_URL=http://platform-service:4003
depends_on:
platform-service:
condition: service_healthy

9425
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff