fix(dashboards): restore runtime and expose internal ops health
This commit is contained in:
parent
b8661392c6
commit
534395bb5e
@ -50,4 +50,4 @@ USER nextjs
|
||||
|
||||
EXPOSE 3001
|
||||
|
||||
CMD ["node", "server.js"]
|
||||
CMD ["node", "dashboards/admin-web/server.js"]
|
||||
|
||||
@ -20,6 +20,8 @@ import {
|
||||
interface ServiceCheck {
|
||||
id: string;
|
||||
name: string;
|
||||
group: string;
|
||||
target: string;
|
||||
status: 'healthy' | 'degraded' | 'down' | 'maintenance';
|
||||
latency: number;
|
||||
version?: string;
|
||||
@ -146,6 +148,13 @@ export default function OpsPage() {
|
||||
</div>
|
||||
|
||||
<div className="space-y-1 mt-3">
|
||||
<div className="flex justify-between text-xs">
|
||||
<span className="text-muted-foreground">Group</span>
|
||||
<span className="font-medium">{svc.group}</span>
|
||||
</div>
|
||||
<div className="text-xs text-muted-foreground font-mono break-all">
|
||||
{svc.target}
|
||||
</div>
|
||||
<div className="flex justify-between text-xs">
|
||||
<span className="text-muted-foreground">Uptime (30d)</span>
|
||||
<span className="font-medium">99.9%</span>
|
||||
|
||||
@ -1,15 +1,19 @@
|
||||
import net from 'node:net';
|
||||
import { NextResponse } from 'next/server';
|
||||
|
||||
export const dynamic = 'force-dynamic'; // No caching
|
||||
export const dynamic = 'force-dynamic';
|
||||
|
||||
type ServiceStatus = 'healthy' | 'degraded' | 'down' | 'maintenance';
|
||||
type CheckKind = 'http-json' | 'http-status' | 'tcp';
|
||||
|
||||
/**
 * Result of probing a single service, as returned in the ops status payload.
 *
 * Both the HTTP and the TCP checker in this file produce this shape.
 */
interface ServiceCheck {
  /** Stable identifier copied from the service definition. */
  id: string;
  /** Human-readable name copied from the service definition. */
  name: string;
  /** Display group copied from the service definition. */
  group: string;
  /** What was probed: a full URL for HTTP checks, `host:port` for TCP checks. */
  target: string;
  /** Outcome of the probe. */
  status: ServiceStatus;
  /** Elapsed time of the probe in milliseconds (difference of two `Date.now()` readings). */
  latency: number;
  /** Version reported by the service's health payload, when it reports one. */
  version?: string;
  /** NOTE(review): not populated by the checkers visible in this file — confirm whether it is still needed. */
  uptime?: number;
  /** Failure or degradation detail (error message, `HTTP <code>`, or raw payload). */
  message?: string;
  /** ISO-8601 timestamp of when the result was produced. */
  lastChecked: string;
}
|
||||
@ -20,86 +24,230 @@ interface OpsStatus {
|
||||
services: ServiceCheck[];
|
||||
}
|
||||
|
||||
const SERVICES = [
|
||||
/** Static description of a service that is probed with an HTTP GET request. */
interface HttpServiceDefinition {
  /** Stable identifier, echoed into the check result. */
  id: string;
  /** Human-readable name, echoed into the check result. */
  name: string;
  /** Display group, echoed into the check result. */
  group: string;
  /**
   * How a 2xx response is interpreted:
   * - `http-json`: the body is parsed as JSON and its fields decide healthy vs. degraded;
   * - `http-status`: any 2xx response counts as healthy.
   */
  kind: 'http-json' | 'http-status';
  /** Name of an environment variable that, when set and non-empty, overrides `default`. */
  env?: string;
  /** Base URL used when no environment override applies. */
  default: string;
  /** Path appended to the base URL to form the probed target. */
  path: string;
}
|
||||
|
||||
/** Static description of a service that is probed by opening a TCP connection. */
interface TcpServiceDefinition {
  /** Stable identifier, echoed into the check result. */
  id: string;
  /** Human-readable name, echoed into the check result. */
  name: string;
  /** Display group, echoed into the check result. */
  group: string;
  /** Discriminant that selects the TCP checker. */
  kind: 'tcp';
  /** Host name or address to connect to. */
  host: string;
  /** TCP port to connect to. */
  port: number;
}
|
||||
|
||||
type ServiceDefinition = HttpServiceDefinition | TcpServiceDefinition;
|
||||
|
||||
/**
 * Every service probed by the ops status endpoint, in display order.
 *
 * HTTP entries without an `env` key are always probed at their `default`
 * base URL; entries with `env` can be redirected through that environment
 * variable.
 */
const SERVICES: ServiceDefinition[] = [
  // Dashboards
  {
    id: 'admin-web',
    name: 'Admin Dashboard',
    group: 'Dashboards',
    kind: 'http-status',
    default: 'http://admin-web:3001',
    path: '/api/health',
  },
  {
    id: 'tracker-web',
    name: 'Tracker Dashboard',
    group: 'Dashboards',
    kind: 'http-status',
    default: 'http://tracker-web:3003',
    path: '/api/health',
  },

  // Core services
  {
    id: 'platform',
    name: 'Platform Service',
    group: 'Core Services',
    env: 'PLATFORM_SERVICE_URL',
    kind: 'http-json',
    default: 'http://platform-service:4003',
    path: '/health',
  },
  {
    id: 'extraction',
    name: 'Extraction Service',
    group: 'Core Services',
    env: 'EXTRACTION_SERVICE_URL',
    kind: 'http-json',
    default: 'http://extraction-service:4005',
    path: '/health',
  },
  {
    id: 'mcp',
    name: 'MCP Server',
    group: 'Core Services',
    env: 'MCP_SERVER_URL',
    kind: 'http-json',
    default: 'http://mcp-server:4007',
    path: '/health',
  },

  // Observability
  {
    id: 'grafana',
    name: 'Grafana',
    group: 'Observability',
    kind: 'http-json',
    default: 'http://grafana:3000',
    path: '/api/health',
  },
  {
    id: 'loki',
    name: 'Loki',
    group: 'Observability',
    kind: 'http-status',
    default: 'http://loki:3100',
    path: '/ready',
  },
  {
    id: 'prometheus',
    name: 'Prometheus',
    group: 'Observability',
    kind: 'http-status',
    default: 'http://prometheus:9090',
    path: '/-/healthy',
  },
  {
    id: 'node-exporter',
    name: 'Node Exporter',
    group: 'Observability',
    kind: 'http-status',
    default: 'http://node-exporter:9100',
    path: '/metrics',
  },
  {
    id: 'cadvisor',
    name: 'cAdvisor',
    group: 'Observability',
    kind: 'http-status',
    default: 'http://cadvisor:8080',
    path: '/healthz',
  },

  // Shared infrastructure
  {
    id: 'valkey',
    name: 'Valkey',
    group: 'Shared Infrastructure',
    kind: 'tcp',
    host: 'valkey',
    port: 6379,
  },
];
|
||||
|
||||
/**
 * Probes one HTTP health endpoint and maps the outcome onto a `ServiceCheck`.
 *
 * The base URL comes from the environment variable named by `service.env`
 * when that variable is set to a non-empty value, otherwise from
 * `service.default`. Trailing slashes on the base URL are dropped before
 * `service.path` is appended.
 *
 * Status mapping:
 * - the request throws (network error, timeout): `down`, with the error message;
 * - non-2xx response: `down`, with `HTTP <code>`;
 * - `http-status` kind and 2xx: `healthy`;
 * - `http-json` kind and 2xx: `healthy` when the payload reports itself OK,
 *   otherwise `degraded` with the raw payload (or a parse-failure note) as message.
 *
 * This function never rejects; every failure is folded into the returned result.
 *
 * @param service   Definition of the endpoint to probe.
 * @param timeoutMs Abort the request after this many milliseconds.
 * @returns The check result. `latency` is measured up to the point where
 *          `fetch` resolved (or failed), not including body parsing.
 */
async function checkHttpService(
  service: HttpServiceDefinition,
  timeoutMs = 3000
): Promise<ServiceCheck> {
  const override = service.env ? process.env[service.env] : undefined;
  // Drop trailing slashes so "http://host/" + "/health" does not become "http://host//health".
  const baseUrl = (override || service.default).replace(/\/+$/, '');
  const target = `${baseUrl}${service.path}`;
  const start = Date.now();

  // Single place where the fields shared by every outcome are filled in.
  const report = (
    status: ServiceCheck['status'],
    latency: number,
    details: { version?: string; message?: string } = {}
  ): ServiceCheck => ({
    id: service.id,
    name: service.name,
    group: service.group,
    target,
    status,
    latency,
    ...details,
    lastChecked: new Date().toISOString(),
  });

  try {
    const res = await fetch(target, {
      method: 'GET',
      // A GET carries no body, so advertise what we accept instead of a Content-Type.
      headers: { Accept: 'application/json, */*;q=0.8' },
      next: { revalidate: 0 },
      signal: AbortSignal.timeout(timeoutMs),
    });
    const latency = Date.now() - start;

    if (!res.ok || service.kind === 'http-status') {
      // The body is not needed on these paths (and can be large, e.g. a metrics
      // dump); cancel it so the underlying connection is released promptly.
      await res.body?.cancel().catch(() => undefined);
      return res.ok
        ? report('healthy', latency)
        : report('down', latency, { message: `HTTP ${res.status}` });
    }

    // http-json: the endpoint answered 2xx, now inspect what it says about itself.
    const unparseable = Symbol('unparseable');
    const payload: unknown = await res.json().catch(() => unparseable);
    if (payload === unparseable) {
      return report('degraded', latency, { message: 'Response body is not valid JSON' });
    }

    const fields: Record<string, unknown> =
      payload !== null && typeof payload === 'object'
        ? (payload as Record<string, unknown>)
        : {};
    const rawStatus = fields.status;
    // An explicit `status` is authoritative. Only payloads without one fall
    // back to the `database` / `commit` markers.
    const isOk =
      typeof rawStatus === 'string'
        ? rawStatus === 'ok' || rawStatus === 'healthy'
        : fields.database === 'ok' || fields.commit === 'ok';

    return report(isOk ? 'healthy' : 'degraded', latency, {
      version: typeof fields.version === 'string' ? fields.version : undefined,
      message: isOk ? undefined : JSON.stringify(payload),
    });
  } catch (err) {
    return report('down', Date.now() - start, {
      message: err instanceof Error ? err.message : String(err),
    });
  }
}
|
||||
|
||||
/**
 * Probes a service by opening a TCP connection to `service.host:service.port`.
 *
 * The service is `healthy` as soon as the connection is established; the
 * socket is destroyed immediately afterwards and no data is exchanged. A
 * connection error, an idle timeout, or invalid connection parameters yield
 * `down` with an explanatory message.
 *
 * This function never rejects; every failure is folded into the returned result.
 *
 * @param service   Definition of the TCP endpoint to probe.
 * @param timeoutMs Give up after this many milliseconds without socket activity.
 * @returns The check result, with `latency` measured up to the deciding event.
 */
async function checkTcpService(
  service: TcpServiceDefinition,
  timeoutMs = 3000
): Promise<ServiceCheck> {
  const start = Date.now();
  const target = `${service.host}:${service.port}`;

  return new Promise<ServiceCheck>(resolve => {
    let socket: net.Socket | undefined;
    let settled = false;

    // Resolves exactly once; later events on the same socket are ignored.
    const finish = (status: ServiceStatus, message?: string) => {
      if (settled) return;
      settled = true;
      socket?.destroy();
      resolve({
        id: service.id,
        name: service.name,
        group: service.group,
        target,
        status,
        latency: Date.now() - start,
        message,
        lastChecked: new Date().toISOString(),
      });
    };

    try {
      socket = net.createConnection({ host: service.host, port: service.port });
    } catch (err) {
      // Invalid options (e.g. an out-of-range port) throw synchronously. Report
      // them as a failed check instead of rejecting the caller's Promise.all.
      finish('down', err instanceof Error ? err.message : String(err));
      return;
    }

    socket.setTimeout(timeoutMs);
    socket.once('connect', () => finish('healthy'));
    socket.once('timeout', () => finish('down', 'Connection timed out'));
    // Persistent listener: an error emitted after the first one must not
    // surface as an unhandled 'error' event.
    socket.on('error', err => finish('down', err.message));
  });
}
|
||||
|
||||
export async function GET() {
|
||||
const checks = await Promise.all(
|
||||
SERVICES.map(async svc => {
|
||||
const baseUrl = process.env[svc.env] || svc.default;
|
||||
const url = `${baseUrl}${svc.path}`;
|
||||
const start = Date.now();
|
||||
|
||||
try {
|
||||
const res = await fetch(url, {
|
||||
method: 'GET',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
next: { revalidate: 0 },
|
||||
signal: AbortSignal.timeout(3000), // 3s timeout
|
||||
});
|
||||
|
||||
const latency = Date.now() - start;
|
||||
|
||||
if (!res.ok) {
|
||||
return {
|
||||
id: svc.id,
|
||||
name: svc.name,
|
||||
url,
|
||||
status: 'down',
|
||||
latency,
|
||||
message: `HTTP ${res.status}`,
|
||||
lastChecked: new Date().toISOString(),
|
||||
} as ServiceCheck;
|
||||
}
|
||||
|
||||
const json = await res.json();
|
||||
// Assuming standard health response: { status: "ok", version: "0.1.0" }
|
||||
// Fastify services return { status: "ok" }
|
||||
const isOk = json.status === 'ok';
|
||||
|
||||
return {
|
||||
id: svc.id,
|
||||
name: svc.name,
|
||||
url,
|
||||
status: isOk ? 'healthy' : 'degraded',
|
||||
latency,
|
||||
version: json.version,
|
||||
message: isOk ? undefined : JSON.stringify(json),
|
||||
lastChecked: new Date().toISOString(),
|
||||
} as ServiceCheck;
|
||||
} catch (err) {
|
||||
return {
|
||||
id: svc.id,
|
||||
name: svc.name,
|
||||
url,
|
||||
status: 'down',
|
||||
latency: Date.now() - start,
|
||||
message: err instanceof Error ? err.message : String(err),
|
||||
lastChecked: new Date().toISOString(),
|
||||
} as ServiceCheck;
|
||||
}
|
||||
})
|
||||
SERVICES.map(service =>
|
||||
service.kind === 'tcp' ? checkTcpService(service) : checkHttpService(service)
|
||||
)
|
||||
);
|
||||
|
||||
const downCount = checks.filter(c => c.status === 'down').length;
|
||||
@ -109,11 +257,9 @@ export async function GET() {
|
||||
if (downCount > 0) overall = 'critical';
|
||||
else if (degradedCount > 0) overall = 'degraded';
|
||||
|
||||
const response: OpsStatus = {
|
||||
return NextResponse.json({
|
||||
overall,
|
||||
timestamp: new Date().toISOString(),
|
||||
services: checks,
|
||||
};
|
||||
|
||||
return NextResponse.json(response);
|
||||
} satisfies OpsStatus);
|
||||
}
|
||||
|
||||
@ -46,4 +46,4 @@ USER nextjs
|
||||
|
||||
EXPOSE 3003
|
||||
|
||||
CMD ["node", "server.js"]
|
||||
CMD ["node", "dashboards/tracker-web/server.js"]
|
||||
|
||||
@ -345,6 +345,7 @@ services:
|
||||
- PORT=3001
|
||||
- PLATFORM_SERVICE_URL=http://platform-service:4003
|
||||
- EXTRACTION_SERVICE_URL=http://extraction-service:4005
|
||||
- SEED_SECRET=${SEED_SECRET:-dev-seed-secret}
|
||||
depends_on:
|
||||
platform-service:
|
||||
condition: service_healthy
|
||||
@ -366,6 +367,7 @@ services:
|
||||
environment:
|
||||
- PORT=3003
|
||||
- PLATFORM_SERVICE_URL=http://platform-service:4003
|
||||
- PLATFORM_API_URL=http://platform-service:4003
|
||||
depends_on:
|
||||
platform-service:
|
||||
condition: service_healthy
|
||||
|
||||
11515
pnpm-lock.yaml
generated
11515
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user