fix(dashboards): restore runtime and expose internal ops health
This commit is contained in:
parent
b8661392c6
commit
534395bb5e
@ -50,4 +50,4 @@ USER nextjs
|
|||||||
|
|
||||||
EXPOSE 3001
|
EXPOSE 3001
|
||||||
|
|
||||||
CMD ["node", "server.js"]
|
CMD ["node", "dashboards/admin-web/server.js"]
|
||||||
|
|||||||
@ -20,6 +20,8 @@ import {
|
|||||||
interface ServiceCheck {
|
interface ServiceCheck {
|
||||||
id: string;
|
id: string;
|
||||||
name: string;
|
name: string;
|
||||||
|
group: string;
|
||||||
|
target: string;
|
||||||
status: 'healthy' | 'degraded' | 'down' | 'maintenance';
|
status: 'healthy' | 'degraded' | 'down' | 'maintenance';
|
||||||
latency: number;
|
latency: number;
|
||||||
version?: string;
|
version?: string;
|
||||||
@ -146,6 +148,13 @@ export default function OpsPage() {
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="space-y-1 mt-3">
|
<div className="space-y-1 mt-3">
|
||||||
|
<div className="flex justify-between text-xs">
|
||||||
|
<span className="text-muted-foreground">Group</span>
|
||||||
|
<span className="font-medium">{svc.group}</span>
|
||||||
|
</div>
|
||||||
|
<div className="text-xs text-muted-foreground font-mono break-all">
|
||||||
|
{svc.target}
|
||||||
|
</div>
|
||||||
<div className="flex justify-between text-xs">
|
<div className="flex justify-between text-xs">
|
||||||
<span className="text-muted-foreground">Uptime (30d)</span>
|
<span className="text-muted-foreground">Uptime (30d)</span>
|
||||||
<span className="font-medium">99.9%</span>
|
<span className="font-medium">99.9%</span>
|
||||||
|
|||||||
@ -1,15 +1,19 @@
|
|||||||
|
import net from 'node:net';
|
||||||
import { NextResponse } from 'next/server';
|
import { NextResponse } from 'next/server';
|
||||||
|
|
||||||
export const dynamic = 'force-dynamic'; // No caching
|
export const dynamic = 'force-dynamic';
|
||||||
|
|
||||||
|
type ServiceStatus = 'healthy' | 'degraded' | 'down' | 'maintenance';
|
||||||
|
type CheckKind = 'http-json' | 'http-status' | 'tcp';
|
||||||
|
|
||||||
interface ServiceCheck {
|
interface ServiceCheck {
|
||||||
id: string;
|
id: string;
|
||||||
name: string;
|
name: string;
|
||||||
url: string;
|
group: string;
|
||||||
status: 'healthy' | 'degraded' | 'down' | 'maintenance';
|
target: string;
|
||||||
|
status: ServiceStatus;
|
||||||
latency: number;
|
latency: number;
|
||||||
version?: string;
|
version?: string;
|
||||||
uptime?: number;
|
|
||||||
message?: string;
|
message?: string;
|
||||||
lastChecked: string;
|
lastChecked: string;
|
||||||
}
|
}
|
||||||
@ -20,86 +24,230 @@ interface OpsStatus {
|
|||||||
services: ServiceCheck[];
|
services: ServiceCheck[];
|
||||||
}
|
}
|
||||||
|
|
||||||
const SERVICES = [
|
/**
 * Static description of a service that is probed with an HTTP GET.
 *
 * The probed URL is the base URL (taken from the environment variable named
 * by `env` when that variable is set, otherwise `default`) followed by `path`.
 */
interface HttpServiceDefinition {
  /**
   * Discriminant of the service-definition union.
   * `http-json` checks inspect the JSON body; `http-status` checks rely on
   * the HTTP status alone.
   */
  kind: 'http-json' | 'http-status';
  /** Identifier copied into the resulting check. */
  id: string;
  /** Human-readable label copied into the resulting check. */
  name: string;
  /** Label used to group related services. */
  group: string;
  /** Name of the environment variable that may override `default`. */
  env?: string;
  /** Base URL used when `env` is absent or the variable is unset. */
  default: string;
  /** Path of the health endpoint, appended to the base URL. */
  path: string;
}
|
||||||
|
|
||||||
|
/**
 * Static description of a service that is probed by opening a TCP
 * connection to `host:port`.
 */
interface TcpServiceDefinition {
  /** Discriminant of the service-definition union. */
  kind: 'tcp';
  /** Identifier copied into the resulting check. */
  id: string;
  /** Human-readable label copied into the resulting check. */
  name: string;
  /** Label used to group related services. */
  group: string;
  /** Host name or address to connect to. */
  host: string;
  /** TCP port to connect to. */
  port: number;
}
|
||||||
|
|
||||||
|
type ServiceDefinition = HttpServiceDefinition | TcpServiceDefinition;
|
||||||
|
|
||||||
|
const SERVICES: ServiceDefinition[] = [
|
||||||
{
|
{
|
||||||
id: 'backend',
|
id: 'admin-web',
|
||||||
name: 'Backend API',
|
name: 'Admin Dashboard',
|
||||||
env: 'API_BASE_URL',
|
group: 'Dashboards',
|
||||||
default: 'http://localhost:8000',
|
kind: 'http-status',
|
||||||
path: '/health',
|
default: 'http://admin-web:3001',
|
||||||
|
path: '/api/health',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'tracker-web',
|
||||||
|
name: 'Tracker Dashboard',
|
||||||
|
group: 'Dashboards',
|
||||||
|
kind: 'http-status',
|
||||||
|
default: 'http://tracker-web:3003',
|
||||||
|
path: '/api/health',
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
id: 'platform',
|
id: 'platform',
|
||||||
name: 'Platform Service',
|
name: 'Platform Service',
|
||||||
|
group: 'Core Services',
|
||||||
env: 'PLATFORM_SERVICE_URL',
|
env: 'PLATFORM_SERVICE_URL',
|
||||||
default: 'http://localhost:4003',
|
kind: 'http-json',
|
||||||
|
default: 'http://platform-service:4003',
|
||||||
path: '/health',
|
path: '/health',
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
id: 'extraction',
|
id: 'extraction',
|
||||||
name: 'Extraction Service',
|
name: 'Extraction Service',
|
||||||
|
group: 'Core Services',
|
||||||
env: 'EXTRACTION_SERVICE_URL',
|
env: 'EXTRACTION_SERVICE_URL',
|
||||||
default: 'http://localhost:4005',
|
kind: 'http-json',
|
||||||
|
default: 'http://extraction-service:4005',
|
||||||
path: '/health',
|
path: '/health',
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
id: 'mcp',
|
||||||
|
name: 'MCP Server',
|
||||||
|
group: 'Core Services',
|
||||||
|
env: 'MCP_SERVER_URL',
|
||||||
|
kind: 'http-json',
|
||||||
|
default: 'http://mcp-server:4007',
|
||||||
|
path: '/health',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'grafana',
|
||||||
|
name: 'Grafana',
|
||||||
|
group: 'Observability',
|
||||||
|
kind: 'http-json',
|
||||||
|
default: 'http://grafana:3000',
|
||||||
|
path: '/api/health',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'loki',
|
||||||
|
name: 'Loki',
|
||||||
|
group: 'Observability',
|
||||||
|
kind: 'http-status',
|
||||||
|
default: 'http://loki:3100',
|
||||||
|
path: '/ready',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'prometheus',
|
||||||
|
name: 'Prometheus',
|
||||||
|
group: 'Observability',
|
||||||
|
kind: 'http-status',
|
||||||
|
default: 'http://prometheus:9090',
|
||||||
|
path: '/-/healthy',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'node-exporter',
|
||||||
|
name: 'Node Exporter',
|
||||||
|
group: 'Observability',
|
||||||
|
kind: 'http-status',
|
||||||
|
default: 'http://node-exporter:9100',
|
||||||
|
path: '/metrics',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'cadvisor',
|
||||||
|
name: 'cAdvisor',
|
||||||
|
group: 'Observability',
|
||||||
|
kind: 'http-status',
|
||||||
|
default: 'http://cadvisor:8080',
|
||||||
|
path: '/healthz',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'valkey',
|
||||||
|
name: 'Valkey',
|
||||||
|
group: 'Shared Infrastructure',
|
||||||
|
kind: 'tcp',
|
||||||
|
host: 'valkey',
|
||||||
|
port: 6379,
|
||||||
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
|
/**
 * Probes one HTTP health endpoint and maps the outcome onto a `ServiceCheck`.
 *
 * Outcomes:
 * - request throws (network error, 3 s timeout) -> `down`, error message attached
 * - non-2xx response                            -> `down`, message `HTTP <status>`
 * - `http-status` service with a 2xx response   -> `healthy`
 * - `http-json` service with a 2xx response     -> `healthy` when the body has
 *   `status` of `ok`/`healthy`, or `database`/`commit` equal to `ok`;
 *   otherwise `degraded` with the serialized body (or the parse error) as message
 *
 * The function never rejects; every failure is reported in the returned check.
 *
 * @param service Definition of the HTTP service to probe.
 * @returns The check result, including the probed URL and the time in
 *          milliseconds until response headers arrived.
 */
async function checkHttpService(service: HttpServiceDefinition): Promise<ServiceCheck> {
  // `||` rather than `??` on purpose: a variable that is set but empty falls
  // back to the built-in default.
  const baseUrl = (service.env && process.env[service.env]) || service.default;
  // Trailing slashes are dropped so "http://host:1/" + "/health" does not
  // turn into "http://host:1//health".
  const target = `${baseUrl.replace(/\/+$/, '')}${service.path}`;
  const start = Date.now();

  // Single place where the result shape is assembled.
  const report = (
    status: ServiceCheck['status'],
    latency: number,
    extra: { version?: string; message?: string } = {},
  ): ServiceCheck => ({
    id: service.id,
    name: service.name,
    group: service.group,
    target,
    status,
    latency,
    ...extra,
    lastChecked: new Date().toISOString(),
  });

  // An unread body keeps the underlying connection busy until it is garbage
  // collected; cancel it explicitly when the content is not needed.
  const discardBody = async (res: Response): Promise<void> => {
    try {
      await res.body?.cancel();
    } catch {
      // The stream is already closed or locked; nothing left to release.
    }
  };

  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  // `next` is the Next.js fetch extension; it is typed here so the literal
  // is also valid against the plain DOM `RequestInit`.
  const init: RequestInit & { next?: { revalidate: number } } = {
    method: 'GET',
    headers: { 'Content-Type': 'application/json' },
    next: { revalidate: 0 },
    signal: AbortSignal.timeout(3000),
  };

  try {
    const res = await fetch(target, init);
    const latency = Date.now() - start;

    if (!res.ok) {
      await discardBody(res);
      return report('down', latency, { message: `HTTP ${res.status}` });
    }

    if (service.kind !== 'http-json') {
      await discardBody(res);
      return report('healthy', latency);
    }

    let payload: unknown;
    try {
      payload = await res.json();
    } catch (err) {
      // Reachable but not speaking JSON (or the body read was aborted).
      return report('degraded', latency, { message: `Unreadable JSON body: ${describe(err)}` });
    }

    const fields: Record<string, unknown> =
      typeof payload === 'object' && payload !== null ? (payload as Record<string, unknown>) : {};

    const isOk =
      fields.status === 'ok' ||
      fields.status === 'healthy' ||
      fields.database === 'ok' ||
      fields.commit === 'ok';

    return report(isOk ? 'healthy' : 'degraded', latency, {
      // Only forward a version that matches the declared `string` type.
      version: typeof fields.version === 'string' ? fields.version : undefined,
      message: isOk ? undefined : JSON.stringify(payload),
    });
  } catch (err) {
    return report('down', Date.now() - start, { message: describe(err) });
  }
}
|
||||||
|
|
||||||
|
/**
 * Probes a TCP service by opening a connection to `host:port`.
 *
 * Outcomes:
 * - connection established                  -> `healthy`
 * - no activity within 3 s                  -> `down`, "Connection timed out"
 * - socket error (refused, DNS failure, …)  -> `down`, error message attached
 * - connection cannot even be started (for example a port outside the valid
 *   range, which makes `net.createConnection` throw synchronously)
 *                                           -> `down`, error message attached
 *
 * The returned promise never rejects, so one misconfigured service cannot
 * fail a caller that awaits several checks together.
 *
 * @param service Definition of the TCP service to probe.
 * @returns The check result; `latency` is the time in milliseconds until the
 *          outcome was known.
 */
async function checkTcpService(service: TcpServiceDefinition): Promise<ServiceCheck> {
  const start = Date.now();
  const target = `${service.host}:${service.port}`;

  return new Promise<ServiceCheck>(resolve => {
    let socket: net.Socket | undefined;
    let settled = false;

    // First outcome wins; later events on the same socket are ignored.
    const finish = (status: ServiceStatus, message?: string) => {
      if (settled) return;
      settled = true;
      // Always release the socket, including after a successful connect.
      socket?.destroy();
      resolve({
        id: service.id,
        name: service.name,
        group: service.group,
        target,
        status,
        latency: Date.now() - start,
        message,
        lastChecked: new Date().toISOString(),
      });
    };

    try {
      socket = net.createConnection({ host: service.host, port: service.port });
    } catch (err) {
      // Argument validation failures are thrown synchronously; report them
      // as a failed check instead of rejecting the promise.
      finish('down', err instanceof Error ? err.message : String(err));
      return;
    }

    socket.setTimeout(3000);
    socket.once('connect', () => finish('healthy'));
    socket.once('timeout', () => finish('down', 'Connection timed out'));
    socket.once('error', err => finish('down', err.message));
  });
}
|
||||||
|
|
||||||
export async function GET() {
|
export async function GET() {
|
||||||
const checks = await Promise.all(
|
const checks = await Promise.all(
|
||||||
SERVICES.map(async svc => {
|
SERVICES.map(service =>
|
||||||
const baseUrl = process.env[svc.env] || svc.default;
|
service.kind === 'tcp' ? checkTcpService(service) : checkHttpService(service)
|
||||||
const url = `${baseUrl}${svc.path}`;
|
)
|
||||||
const start = Date.now();
|
|
||||||
|
|
||||||
try {
|
|
||||||
const res = await fetch(url, {
|
|
||||||
method: 'GET',
|
|
||||||
headers: { 'Content-Type': 'application/json' },
|
|
||||||
next: { revalidate: 0 },
|
|
||||||
signal: AbortSignal.timeout(3000), // 3s timeout
|
|
||||||
});
|
|
||||||
|
|
||||||
const latency = Date.now() - start;
|
|
||||||
|
|
||||||
if (!res.ok) {
|
|
||||||
return {
|
|
||||||
id: svc.id,
|
|
||||||
name: svc.name,
|
|
||||||
url,
|
|
||||||
status: 'down',
|
|
||||||
latency,
|
|
||||||
message: `HTTP ${res.status}`,
|
|
||||||
lastChecked: new Date().toISOString(),
|
|
||||||
} as ServiceCheck;
|
|
||||||
}
|
|
||||||
|
|
||||||
const json = await res.json();
|
|
||||||
// Assuming standard health response: { status: "ok", version: "0.1.0" }
|
|
||||||
// Fastify services return { status: "ok" }
|
|
||||||
const isOk = json.status === 'ok';
|
|
||||||
|
|
||||||
return {
|
|
||||||
id: svc.id,
|
|
||||||
name: svc.name,
|
|
||||||
url,
|
|
||||||
status: isOk ? 'healthy' : 'degraded',
|
|
||||||
latency,
|
|
||||||
version: json.version,
|
|
||||||
message: isOk ? undefined : JSON.stringify(json),
|
|
||||||
lastChecked: new Date().toISOString(),
|
|
||||||
} as ServiceCheck;
|
|
||||||
} catch (err) {
|
|
||||||
return {
|
|
||||||
id: svc.id,
|
|
||||||
name: svc.name,
|
|
||||||
url,
|
|
||||||
status: 'down',
|
|
||||||
latency: Date.now() - start,
|
|
||||||
message: err instanceof Error ? err.message : String(err),
|
|
||||||
lastChecked: new Date().toISOString(),
|
|
||||||
} as ServiceCheck;
|
|
||||||
}
|
|
||||||
})
|
|
||||||
);
|
);
|
||||||
|
|
||||||
const downCount = checks.filter(c => c.status === 'down').length;
|
const downCount = checks.filter(c => c.status === 'down').length;
|
||||||
@ -109,11 +257,9 @@ export async function GET() {
|
|||||||
if (downCount > 0) overall = 'critical';
|
if (downCount > 0) overall = 'critical';
|
||||||
else if (degradedCount > 0) overall = 'degraded';
|
else if (degradedCount > 0) overall = 'degraded';
|
||||||
|
|
||||||
const response: OpsStatus = {
|
return NextResponse.json({
|
||||||
overall,
|
overall,
|
||||||
timestamp: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
services: checks,
|
services: checks,
|
||||||
};
|
} satisfies OpsStatus);
|
||||||
|
|
||||||
return NextResponse.json(response);
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -46,4 +46,4 @@ USER nextjs
|
|||||||
|
|
||||||
EXPOSE 3003
|
EXPOSE 3003
|
||||||
|
|
||||||
CMD ["node", "server.js"]
|
CMD ["node", "dashboards/tracker-web/server.js"]
|
||||||
|
|||||||
@ -345,6 +345,7 @@ services:
|
|||||||
- PORT=3001
|
- PORT=3001
|
||||||
- PLATFORM_SERVICE_URL=http://platform-service:4003
|
- PLATFORM_SERVICE_URL=http://platform-service:4003
|
||||||
- EXTRACTION_SERVICE_URL=http://extraction-service:4005
|
- EXTRACTION_SERVICE_URL=http://extraction-service:4005
|
||||||
|
- SEED_SECRET=${SEED_SECRET:-dev-seed-secret}
|
||||||
depends_on:
|
depends_on:
|
||||||
platform-service:
|
platform-service:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
@ -366,6 +367,7 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- PORT=3003
|
- PORT=3003
|
||||||
- PLATFORM_SERVICE_URL=http://platform-service:4003
|
- PLATFORM_SERVICE_URL=http://platform-service:4003
|
||||||
|
- PLATFORM_API_URL=http://platform-service:4003
|
||||||
depends_on:
|
depends_on:
|
||||||
platform-service:
|
platform-service:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|||||||
11515
pnpm-lock.yaml
generated
11515
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user