// Listing metadata: 643 lines, 17 KiB, TypeScript.
import net from 'node:net';
/**
 * Health states a service check can carry.
 *
 * The probes visible in this file only ever produce `healthy`, `degraded`
 * or `down`; `maintenance` is presumably set elsewhere — confirm with callers.
 */
export type ServiceStatus =
  | 'healthy'
  | 'degraded'
  | 'down'
  | 'maintenance';
/** Probe strategies available to a service definition. */
export type CheckKind =
  | 'http-json'
  | 'http-status'
  | 'tcp';
/** Where an inventory item lives: a Docker container or the VM itself. */
export type InventorySource =
  | 'docker'
  | 'vm';
/** Result of probing one service once. */
export interface ServiceCheck {
  /** Identifier copied from the service definition. */
  id: string;
  /** Display name copied from the service definition. */
  name: string;
  /** Grouping label copied from the service definition. */
  group: string;
  /** What was probed: the full URL for HTTP checks, `host:port` for TCP checks. */
  target: string;
  /** Outcome of the probe. */
  status: ServiceStatus;
  /** Elapsed wall-clock time of the probe, in milliseconds. */
  latency: number;
  /** Version reported by the service's JSON health payload, when present. */
  version?: string;
  /** Failure or degradation detail (e.g. `HTTP 503`, an error message, or the raw JSON payload). */
  message?: string;
  /** ISO-8601 timestamp taken when the result was produced. */
  lastChecked: string;
}
/** Aggregated health snapshot across every probed service. */
export interface OpsStatus {
  /**
   * Roll-up of the individual results: `critical` when any service is
   * `down`, otherwise `degraded` when any service is `degraded`, otherwise
   * `healthy`.
   */
  overall: 'healthy' | 'degraded' | 'critical';
  /** ISO-8601 timestamp taken when the snapshot was assembled. */
  timestamp: string;
  /** Individual probe results the roll-up was derived from. */
  services: ServiceCheck[];
}
/** Fields shared by every service definition, regardless of probe kind. */
interface BaseDefinition {
  /** Stable identifier; also the key used to look up restartable containers. */
  id: string;
  /** Human-readable display name. */
  name: string;
  /** Grouping label (e.g. `Dashboards`, `Observability`). */
  group: string;
  /** One-line description surfaced in the inventory. */
  description: string;
  /** How the service is hosted. */
  management: InventorySource;
  /** Whether the service is reachable only internally or also publicly. */
  exposure: 'internal' | 'public';
  /**
   * Port surfaced in the inventory. HTTP probes do not read this field;
   * they use the URL from the definition instead.
   */
  port?: number;
}
/** Definition of a service probed with an HTTP GET request. */
interface HttpServiceDefinition extends BaseDefinition {
  /**
   * `http-status` treats any 2xx response as healthy; `http-json` additionally
   * inspects the JSON body of a 2xx response.
   */
  kind: 'http-json' | 'http-status';
  /** Name of an environment variable that, when set, overrides `default`. */
  env?: string;
  /** Base URL used when no environment override applies. */
  default: string;
  /** Path appended to the base URL to form the probed URL. */
  path: string;
}
/** Definition of a service probed by opening a TCP connection. */
interface TcpServiceDefinition extends BaseDefinition {
  kind: 'tcp';
  /** Host name the probe connects to. */
  host: string;
  /** Port the probe connects to (required here, unlike the optional base field). */
  port: number;
}
/** Any probe-able service; discriminate on `kind` to tell HTTP from TCP. */
export type ServiceDefinition =
  | HttpServiceDefinition
  | TcpServiceDefinition;
/** A probe result enriched with static metadata from the service definition. */
export interface InventoryService extends ServiceCheck {
  /** One-line description from the service definition. */
  description: string;
  /** How the service is hosted. */
  management: InventorySource;
  /** Whether the service is reachable only internally or also publicly. */
  exposure: 'internal' | 'public';
  /** Port from the service definition, when one is declared. */
  port?: number;
  /** True when the service id has an entry in `RESTARTABLE_SERVICE_CONTAINERS`. */
  restartable: boolean;
}
/** A tool or configuration item installed on the host rather than probed as a service. */
export interface HostTool {
  /** Stable identifier. */
  id: string;
  /** Human-readable display name. */
  name: string;
  /** Grouping label. */
  group: string;
  /** Where the tool lives. */
  source: InventorySource;
  /** Free-text note on how the tool is provisioned (e.g. `VM bootstrap`, `Manual install`). */
  management: string;
  /** Whether provisioning is automated (`managed`) or done by hand (`manual`). */
  status: 'managed' | 'manual';
  /** One-line description of what the tool is used for. */
  description: string;
}
/**
 * Every service that gets probed, in the order results are reported.
 *
 * HTTP entries are probed at `default` + `path` (or at the URL held in the
 * `env` variable when it is set); TCP entries are probed at `host:port`.
 * For HTTP entries `port` is informational only and may differ from the port
 * in `default` (see `gitea-registry`).
 */
export const STACK_SERVICES: ServiceDefinition[] = [
  {
    id: 'admin-web',
    name: 'Admin Dashboard',
    group: 'Dashboards',
    description: 'Internal admin portal for platform review and ops workflows.',
    management: 'docker',
    exposure: 'internal',
    port: 3001,
    kind: 'http-status',
    default: 'http://admin-web:3001',
    path: '/api/health',
  },
  {
    id: 'tracker-web',
    name: 'Tracker Dashboard',
    group: 'Dashboards',
    description: 'Internal tracker UI for issue and delivery review.',
    management: 'docker',
    exposure: 'internal',
    port: 3003,
    kind: 'http-status',
    default: 'http://tracker-web:3003',
    path: '/api/health',
  },
  {
    id: 'lysnrai-dashboard',
    name: 'LysnrAI Dashboard',
    group: 'Product Web Apps',
    description: 'Voice AI dashboard hosted on the VM for internal product review.',
    management: 'docker',
    exposure: 'internal',
    port: 3002,
    kind: 'http-status',
    default: 'http://lysnrai-dashboard:3002',
    path: '/',
  },
  {
    id: 'chronomind-web',
    name: 'ChronoMind Web',
    group: 'Product Web Apps',
    description: 'ChronoMind web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3030,
    kind: 'http-status',
    default: 'http://chronomind-web:3030',
    path: '/',
  },
  {
    id: 'jarvisjr-web',
    name: 'JarvisJr Web',
    group: 'Product Web Apps',
    description: 'JarvisJr web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3035,
    kind: 'http-status',
    default: 'http://jarvisjr-web:3035',
    path: '/',
  },
  {
    id: 'flowmonk-web',
    name: 'FlowMonk Web',
    group: 'Product Web Apps',
    description: 'FlowMonk web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3040,
    kind: 'http-status',
    default: 'http://flowmonk-web:3040',
    path: '/',
  },
  {
    id: 'notelett-web',
    name: 'NoteLett Web',
    group: 'Product Web Apps',
    description: 'NoteLett web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3045,
    kind: 'http-status',
    default: 'http://notelett-web:3045',
    path: '/',
  },
  {
    id: 'mindlyst-web',
    name: 'MindLyst Web',
    group: 'Product Web Apps',
    description: 'MindLyst web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3050,
    kind: 'http-status',
    default: 'http://mindlyst-web:3050',
    path: '/',
  },
  {
    id: 'nomgap-web',
    name: 'NomGap Web',
    group: 'Product Web Apps',
    description: 'NomGap web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3055,
    kind: 'http-status',
    default: 'http://nomgap-web:3055',
    path: '/',
  },
  {
    id: 'actiontrail-web',
    name: 'ActionTrail Web',
    group: 'Product Web Apps',
    description: 'ActionTrail web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3060,
    kind: 'http-status',
    default: 'http://actiontrail-web:3060',
    path: '/',
  },
  {
    id: 'llmlab-dashboard',
    name: 'LLM Lab Dashboard',
    group: 'Internal Tooling',
    description: 'Internal LLM lab dashboard hosted on the VM for operator and developer use.',
    management: 'docker',
    exposure: 'internal',
    port: 3075,
    kind: 'http-status',
    default: 'http://llmlab-dashboard:3075',
    path: '/',
  },
  {
    id: 'efforise-web',
    name: 'Efforise Web',
    group: 'Product Web Apps',
    description: 'Efforise web client hosted on the VM.',
    management: 'docker',
    exposure: 'internal',
    port: 3080,
    kind: 'http-status',
    default: 'http://efforise-web:3080',
    path: '/',
  },
  {
    id: 'platform',
    name: 'Platform Service',
    group: 'Core Services',
    description: 'Core API and auth platform service.',
    management: 'docker',
    exposure: 'internal',
    port: 4003,
    env: 'PLATFORM_SERVICE_URL',
    kind: 'http-json',
    default: 'http://platform-service:4003',
    path: '/health',
  },
  {
    id: 'extraction',
    name: 'Extraction Service',
    group: 'Core Services',
    description: 'Structured extraction service with product-aware throttling.',
    management: 'docker',
    exposure: 'internal',
    port: 4005,
    env: 'EXTRACTION_SERVICE_URL',
    kind: 'http-json',
    default: 'http://extraction-service:4005',
    path: '/health',
  },
  {
    id: 'mcp',
    name: 'MCP Server',
    group: 'Core Services',
    description: 'Internal MCP integration surface.',
    management: 'docker',
    exposure: 'internal',
    port: 4007,
    env: 'MCP_SERVER_URL',
    kind: 'http-json',
    default: 'http://mcp-server:4007',
    path: '/health',
  },
  {
    id: 'grafana',
    name: 'Grafana',
    group: 'Observability',
    description: 'Metrics and logs visualization.',
    management: 'docker',
    exposure: 'internal',
    port: 3000,
    kind: 'http-json',
    default: 'http://grafana:3000',
    path: '/api/health',
  },
  {
    id: 'loki',
    name: 'Loki',
    group: 'Observability',
    description: 'Centralized log aggregation.',
    management: 'docker',
    exposure: 'internal',
    port: 3100,
    kind: 'http-status',
    default: 'http://loki:3100',
    path: '/ready',
  },
  {
    id: 'prometheus',
    name: 'Prometheus',
    group: 'Observability',
    description: 'Internal metrics scraping and query engine.',
    management: 'docker',
    exposure: 'internal',
    port: 9090,
    kind: 'http-status',
    default: 'http://prometheus:9090',
    path: '/-/healthy',
  },
  {
    id: 'node-exporter',
    name: 'Node Exporter',
    group: 'Observability',
    description: 'Host-level VM metrics exporter.',
    management: 'docker',
    exposure: 'internal',
    port: 9100,
    kind: 'http-status',
    default: 'http://node-exporter:9100',
    path: '/metrics',
  },
  {
    id: 'cadvisor',
    name: 'cAdvisor',
    group: 'Observability',
    description: 'Container-level metrics exporter.',
    management: 'docker',
    exposure: 'internal',
    port: 8080,
    kind: 'http-status',
    default: 'http://cadvisor:8080',
    path: '/healthz',
  },
  {
    id: 'valkey',
    name: 'Valkey',
    group: 'Shared Infrastructure',
    description: 'Shared cache and rate-limit backing store.',
    management: 'docker',
    exposure: 'internal',
    kind: 'tcp',
    host: 'valkey',
    port: 6379,
  },
  {
    id: 'gitea-registry',
    name: 'Gitea Registry',
    group: 'Shared Infrastructure',
    description: 'Private npm package registry and source control service.',
    management: 'docker',
    exposure: 'internal',
    port: 3300,
    kind: 'http-json',
    default: 'http://gitea-npm-registry:3000',
    path: '/api/v1/version',
  },
  {
    id: 'mailpit',
    name: 'Mailpit',
    group: 'Shared Infrastructure',
    description: 'SMTP sink and email inspection UI.',
    management: 'docker',
    exposure: 'internal',
    port: 8025,
    kind: 'http-status',
    default: 'http://mailpit:8025',
    path: '/',
  },
  {
    id: 'azurite',
    name: 'Azurite',
    group: 'Shared Infrastructure',
    description: 'Local Azure Blob Storage emulator.',
    management: 'docker',
    exposure: 'internal',
    kind: 'tcp',
    host: 'azurite',
    port: 10000,
  },
  {
    id: 'cosmos-emulator',
    name: 'Cosmos Emulator',
    group: 'Shared Infrastructure',
    description: 'Local Azure Cosmos DB emulator.',
    management: 'docker',
    exposure: 'internal',
    port: 8080,
    kind: 'http-status',
    default: 'http://cosmos-emulator:8080',
    path: '/ready',
  },
  {
    id: 'gateway',
    name: 'Traefik Gateway',
    group: 'Ingress',
    description: 'Legacy internal gateway and routing layer.',
    management: 'docker',
    exposure: 'internal',
    port: 8080,
    kind: 'http-status',
    default: 'http://gateway:8080',
    path: '/',
  },
  {
    id: 'caddy',
    name: 'Caddy',
    group: 'Ingress',
    description: 'HTTPS ingress and reverse proxy for internal and backend domains.',
    management: 'docker',
    exposure: 'public',
    kind: 'tcp',
    host: 'caddy',
    port: 80,
  },
];
/** Static list of tools and configuration present on the VM host. */
export const HOST_TOOLS: HostTool[] = [
  {
    id: 'docker-ce',
    name: 'Docker CE',
    group: 'Host Tooling',
    source: 'vm',
    management: 'VM bootstrap',
    status: 'managed',
    description: 'Container runtime for the internal stack.',
  },
  {
    id: 'docker-compose',
    name: 'Docker Compose',
    group: 'Host Tooling',
    source: 'vm',
    management: 'VM bootstrap',
    status: 'managed',
    description: 'Multi-service orchestration for the VM stack.',
  },
  {
    id: 'azure-cli',
    name: 'Azure CLI',
    group: 'Host Tooling',
    source: 'vm',
    management: 'Manual install',
    status: 'manual',
    description: 'Azure subscription and NSG management from the VM.',
  },
  {
    id: 'nodejs',
    name: 'Node.js 22',
    group: 'Host Tooling',
    source: 'vm',
    management: 'VM bootstrap',
    status: 'managed',
    description: 'Build/runtime toolchain for workspace services.',
  },
  {
    id: 'pnpm',
    name: 'pnpm',
    group: 'Host Tooling',
    source: 'vm',
    management: 'VM bootstrap',
    status: 'managed',
    description: 'Workspace package manager.',
  },
  {
    id: 'git',
    name: 'git',
    group: 'Host Tooling',
    source: 'vm',
    management: 'VM bootstrap',
    status: 'managed',
    description: 'Repo sync and deployment workflow tooling.',
  },
  {
    id: 'jq',
    name: 'jq',
    group: 'Host Tooling',
    source: 'vm',
    management: 'VM bootstrap',
    status: 'managed',
    description: 'CLI JSON inspection used in ops and setup scripts.',
  },
  {
    id: 'caddy-host-config',
    name: 'Caddy Config',
    group: 'Host Tooling',
    source: 'vm',
    management: 'VM file mount',
    status: 'managed',
    description: 'Host-mounted Caddy configuration at /opt/bytelyst/Caddyfile.',
  },
];
/**
 * Service id -> container name for services that may be restarted.
 *
 * Within this file the map is only used to derive the `restartable` flag of
 * inventory entries; the values are presumably Docker container names consumed
 * by a restart action elsewhere — confirm with callers. `gateway` and `caddy`
 * have no entry and are therefore reported as not restartable.
 */
export const RESTARTABLE_SERVICE_CONTAINERS: Record<string, string> = {
  'admin-web': 'learning_ai_common_plat-admin-web-1',
  'tracker-web': 'learning_ai_common_plat-tracker-web-1',
  'lysnrai-dashboard': 'learning_ai_common_plat-lysnrai-dashboard-1',
  'chronomind-web': 'learning_ai_common_plat-chronomind-web-1',
  'jarvisjr-web': 'learning_ai_common_plat-jarvisjr-web-1',
  'flowmonk-web': 'learning_ai_common_plat-flowmonk-web-1',
  'notelett-web': 'learning_ai_common_plat-notelett-web-1',
  'mindlyst-web': 'learning_ai_common_plat-mindlyst-web-1',
  'nomgap-web': 'learning_ai_common_plat-nomgap-web-1',
  'actiontrail-web': 'learning_ai_common_plat-actiontrail-web-1',
  'llmlab-dashboard': 'learning_ai_common_plat-llmlab-dashboard-1',
  'efforise-web': 'learning_ai_common_plat-efforise-web-1',
  platform: 'learning_ai_common_plat-platform-service-1',
  extraction: 'learning_ai_common_plat-extraction-service-1',
  mcp: 'learning_ai_common_plat-mcp-server-1',
  grafana: 'learning_ai_common_plat-grafana-1',
  loki: 'learning_ai_common_plat-loki-1',
  prometheus: 'learning_ai_common_plat-prometheus-1',
  'node-exporter': 'learning_ai_common_plat-node-exporter-1',
  cadvisor: 'learning_ai_common_plat-cadvisor-1',
  valkey: 'learning_ai_common_plat-valkey-1',
  'gitea-registry': 'gitea-npm-registry',
  mailpit: 'learning_ai_common_plat-mailpit-1',
  azurite: 'learning_ai_common_plat-azurite-1',
  'cosmos-emulator': 'learning_ai_common_plat-cosmos-emulator-1',
};
/**
 * Probes one HTTP-backed service with a GET request and summarises the outcome.
 *
 * The base URL is the value of the environment variable named by
 * `service.env` when that variable holds a non-empty value, otherwise
 * `service.default`. Trailing slashes on the base URL are dropped before
 * `service.path` is appended, so an override such as `http://host:4003/`
 * does not produce `//health`.
 *
 * Outcomes:
 * - the request throws (network failure, 3 s timeout): `down`, with the error message;
 * - non-2xx response: `down`, with `HTTP <status>`;
 * - 2xx response, `http-status` kind: `healthy`;
 * - 2xx response, `http-json` kind: `healthy` when the body has `status` of
 *   `ok`/`healthy`, `database: 'ok'`, `commit: 'ok'`, or a truthy `version`;
 *   otherwise `degraded`, with the serialised body as the message.
 *
 * Failures are reported in the returned check rather than thrown.
 *
 * @param service Definition of the service to probe.
 * @returns The probe result, including the probed URL and elapsed milliseconds.
 */
async function checkHttpService(service: HttpServiceDefinition): Promise<ServiceCheck> {
  const override = service.env ? process.env[service.env] : undefined;
  // `||` rather than `??`: an empty override falls back to the default URL.
  const baseUrl = (override || service.default).replace(/\/+$/, '');
  const target = `${baseUrl}${service.path}`;
  const start = Date.now();

  // Single place that assembles the result so every branch reports the same shape.
  const report = (
    status: ServiceCheck['status'],
    latency: number,
    details: { version?: string; message?: string } = {},
  ): ServiceCheck => ({
    id: service.id,
    name: service.name,
    group: service.group,
    target,
    status,
    latency,
    ...details,
    lastChecked: new Date().toISOString(),
  });

  try {
    const res = await fetch(target, {
      method: 'GET',
      headers: { 'Content-Type': 'application/json' },
      // Next.js-style fetch option; presumably opts the probe out of response caching.
      next: { revalidate: 0 },
      signal: AbortSignal.timeout(3000),
    });

    // Measured when headers arrive; body parsing below is not included.
    const latency = Date.now() - start;

    if (!res.ok) {
      // The body is never read on this path; cancel it so the connection is released.
      void res.body?.cancel().catch(() => undefined);
      return report('down', latency, { message: `HTTP ${res.status}` });
    }

    if (service.kind === 'http-json') {
      // A body that is not valid JSON becomes `null` and is reported as degraded.
      const payload = await res.json().catch(() => null);
      const rawStatus = payload?.status;
      const rawVersion = payload?.version;
      const isOk = Boolean(
        rawStatus === 'ok' ||
          rawStatus === 'healthy' ||
          payload?.database === 'ok' ||
          payload?.commit === 'ok' ||
          rawVersion,
      );
      // Only surface a version that can be honestly represented as a string.
      const version =
        typeof rawVersion === 'string' || typeof rawVersion === 'number'
          ? String(rawVersion)
          : undefined;

      return report(isOk ? 'healthy' : 'degraded', latency, {
        version,
        message: isOk ? undefined : JSON.stringify(payload),
      });
    }

    // Status-only probe: the body (e.g. a metrics dump) is not needed.
    void res.body?.cancel().catch(() => undefined);
    return report('healthy', latency);
  } catch (err) {
    return report('down', Date.now() - start, {
      message: err instanceof Error ? err.message : String(err),
    });
  }
}
/**
 * Probes a TCP service by attempting to open a connection to `host:port`.
 *
 * The result is `healthy` as soon as the connection is established, and
 * `down` when the socket reports an error or stays silent for 3 seconds.
 * Whichever event fires first decides the result; the socket is destroyed
 * at that point and later events are ignored.
 *
 * @param service Definition carrying the host and port to connect to.
 * @returns The probe result; the returned promise never rejects.
 */
async function checkTcpService(service: TcpServiceDefinition): Promise<ServiceCheck> {
  const { host, port } = service;
  const start = Date.now();
  const target = `${host}:${port}`;

  return new Promise<ServiceCheck>(resolve => {
    const socket = net.createConnection({ host, port });
    let settled = false;

    // First caller wins; later events on the same socket are no-ops.
    const finish = (status: ServiceStatus, message?: string): void => {
      if (settled) {
        return;
      }
      settled = true;
      socket.destroy();

      resolve({
        id: service.id,
        name: service.name,
        group: service.group,
        target,
        status,
        latency: Date.now() - start,
        message,
        lastChecked: new Date().toISOString(),
      });
    };

    socket.setTimeout(3000);
    socket.once('connect', () => finish('healthy'));
    socket.once('timeout', () => finish('down', 'Connection timed out'));
    socket.once('error', err => finish('down', err.message));
  });
}
/**
 * Probes every entry of `STACK_SERVICES` concurrently.
 *
 * TCP definitions go through `checkTcpService`, everything else through
 * `checkHttpService`. Results come back in the same order as the
 * definitions.
 *
 * @returns One probe result per configured service.
 */
export async function collectOpsChecks(): Promise<ServiceCheck[]> {
  const probes = STACK_SERVICES.map(service =>
    service.kind === 'tcp' ? checkTcpService(service) : checkHttpService(service),
  );

  return Promise.all(probes);
}
/**
 * Probes all services and rolls the results up into one overall status.
 *
 * The roll-up is `critical` when at least one service is `down`, otherwise
 * `degraded` when at least one is `degraded`, otherwise `healthy`. Services
 * in any other state do not affect the roll-up.
 *
 * @returns The overall status, the time it was computed, and every probe result.
 */
export async function collectOpsStatus(): Promise<OpsStatus> {
  const services = await collectOpsChecks();

  // Only presence matters, so stop scanning at the first match.
  let overall: OpsStatus['overall'] = 'healthy';
  if (services.some(check => check.status === 'down')) {
    overall = 'critical';
  } else if (services.some(check => check.status === 'degraded')) {
    overall = 'degraded';
  }

  return {
    overall,
    timestamp: new Date().toISOString(),
    services,
  };
}
/**
 * Probes all services and merges each result with the static metadata of
 * its definition (description, hosting, exposure, port) plus a
 * `restartable` flag derived from `RESTARTABLE_SERVICE_CONTAINERS`.
 *
 * @returns One inventory entry per entry of `STACK_SERVICES`, in the same order.
 */
export async function collectInventoryServices(): Promise<InventoryService[]> {
  const checks = await collectOpsChecks();
  const checkById = new Map<string, ServiceCheck>(checks.map(check => [check.id, check]));

  return STACK_SERVICES.map(service => {
    // Every definition is expected to have a result. Should one ever be
    // missing, report it as down instead of emitting an entry without the
    // required check fields.
    const check: ServiceCheck = checkById.get(service.id) ?? {
      id: service.id,
      name: service.name,
      group: service.group,
      target: 'unknown',
      status: 'down',
      latency: 0,
      message: 'No health check result available',
      lastChecked: new Date().toISOString(),
    };

    return {
      ...check,
      description: service.description,
      management: service.management,
      exposure: service.exposure,
      port: service.port,
      restartable: Boolean(RESTARTABLE_SERVICE_CONTAINERS[service.id]),
    };
  });
}