feat(services): add monitoring (Loki + Grafana config, health-check)
- Copied as-is from learning_voice_ai_agent/services/monitoring - Grafana dashboards + provisioning for Loki datasource - health-check.ts for service health polling - Updated pnpm-workspace.yaml to include services/*
This commit is contained in:
parent
2738124ab9
commit
c97e697097
976
pnpm-lock.yaml
generated
976
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
@ -1,2 +1,3 @@
|
||||
packages:
|
||||
- "packages/*"
|
||||
- "services/*"
|
||||
|
||||
128
services/monitoring/grafana/dashboards/lysnrai-services.json
Normal file
128
services/monitoring/grafana/dashboards/lysnrai-services.json
Normal file
@ -0,0 +1,128 @@
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "All Service Logs",
|
||||
"type": "logs",
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 0 },
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{compose_service=~\".+\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": true,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending",
|
||||
"dedupStrategy": "none"
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Logs by Service",
|
||||
"type": "logs",
|
||||
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 12 },
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{compose_service=\"backend\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "backend"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Billing Service Logs",
|
||||
"type": "logs",
|
||||
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 12 },
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{compose_service=\"billing-service\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Error Rate (all services)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 },
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (compose_service) (rate({compose_service=~\".+\"} |= \"error\" [5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{compose_service}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"pointSize": 5
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Log Volume by Service",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 },
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (compose_service) (rate({compose_service=~\".+\"} [5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{compose_service}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 50,
|
||||
"stacking": { "mode": "normal" }
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["lysnrai", "logs", "monitoring"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "LysnrAI — Service Logs",
|
||||
"uid": "lysnrai-service-logs",
|
||||
"version": 1
|
||||
}
|
||||
@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: default
|
||||
orgId: 1
|
||||
folder: ""
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: false
|
||||
@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki:3100
|
||||
isDefault: true
|
||||
editable: false
|
||||
119
services/monitoring/health-check.ts
Normal file
119
services/monitoring/health-check.ts
Normal file
@ -0,0 +1,119 @@
|
||||
/**
|
||||
* Monitoring & Health Check — aggregates health from all services.
|
||||
*
|
||||
* Standalone script that polls each service's /health endpoint and
|
||||
* reports combined status. Can be run as a cron job, GitHub Action,
|
||||
* or standalone HTTP endpoint.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx services/monitoring/health-check.ts # one-shot check
|
||||
* npx tsx services/monitoring/health-check.ts --serve # HTTP server on :4004
|
||||
*
|
||||
* Environment:
|
||||
* BACKEND_URL (default: http://localhost:8000)
|
||||
* GROWTH_SERVICE_URL (default: http://localhost:4001)
|
||||
* BILLING_SERVICE_URL (default: http://localhost:4002)
|
||||
* PLATFORM_SERVICE_URL (default: http://localhost:4003)
|
||||
* ADMIN_DASHBOARD_URL (default: http://localhost:3001)
|
||||
* USER_DASHBOARD_URL (default: http://localhost:3002)
* MONITOR_PORT (default: 4004, used only with --serve)
|
||||
*/
|
||||
|
||||
export {};
|
||||
|
||||
/** Result of probing a single service's health endpoint. */
interface ServiceCheck {
  /** Human-readable service name. */
  name: string;
  /** Base URL the service was configured with (the health path is not included). */
  url: string;
  /**
   * - `healthy`: the endpoint answered with a 2xx status.
   * - `unhealthy`: the endpoint answered with a non-2xx status.
   * - `unreachable`: the request itself failed (network error or timeout).
   */
  status: "healthy" | "unhealthy" | "unreachable";
  /** Time until the response (or the failure), in whole milliseconds. */
  responseTimeMs: number;
  /** Parsed JSON body of a successful response, when one could be read. */
  details?: Record<string, unknown>;
  /** Failure description; set for `unhealthy` and `unreachable` results. */
  error?: string;
}
|
||||
|
||||
/** Combined health status across all monitored services. */
interface HealthReport {
  /** Rolled-up status derived from the individual service checks. */
  overall: "healthy" | "degraded" | "down";
  /** ISO-8601 timestamp of when the report was assembled. */
  timestamp: string;
  /** One entry per monitored service. */
  services: ServiceCheck[];
  /** Number of services in each state, plus the total number checked. */
  summary: {
    healthy: number;
    unhealthy: number;
    unreachable: number;
    total: number;
  };
}
|
||||
|
||||
/**
 * Services to probe. Each row is: display name, URL override taken from the
 * environment, fallback URL, and health-endpoint path. An unset or empty
 * environment variable falls back to the localhost default.
 */
const SERVICES = (
  [
    ["Backend API", process.env.BACKEND_URL, "http://localhost:8000", "/health"],
    ["Growth Service", process.env.GROWTH_SERVICE_URL, "http://localhost:4001", "/health"],
    ["Billing Service", process.env.BILLING_SERVICE_URL, "http://localhost:4002", "/health"],
    ["Platform Service", process.env.PLATFORM_SERVICE_URL, "http://localhost:4003", "/health"],
    ["Admin Dashboard", process.env.ADMIN_DASHBOARD_URL, "http://localhost:3001", "/api/health"],
    ["User Dashboard", process.env.USER_DASHBOARD_URL, "http://localhost:3002", "/api/health"],
  ] as const
).map(([name, envUrl, fallbackUrl, path]) => ({
  name,
  url: envUrl || fallbackUrl,
  path,
}));
|
||||
|
||||
/**
 * Renders a thrown value as a single-line message.
 *
 * Node's fetch reports network failures as a generic `TypeError: fetch failed`
 * and puts the actual reason (connection refused, DNS failure, ...) on
 * `cause`, so the cause is appended when one is present.
 */
function describeFetchError(err: unknown): string {
  const message = String(err);
  const cause =
    typeof err === "object" && err !== null && "cause" in err
      ? (err as { cause?: unknown }).cause
      : undefined;
  return cause === undefined || cause === null ? message : `${message} (cause: ${String(cause)})`;
}

/**
 * Probes one service's health endpoint and classifies the outcome.
 *
 * @param svc - Service name, base URL and health-endpoint path.
 * @returns `healthy` for a 2xx response (with the parsed JSON body as
 *   `details` when it can be read), `unhealthy` for any other HTTP status,
 *   and `unreachable` when the request fails or exceeds the 5 second timeout.
 *   Never rejects: every failure is reported through the returned object.
 */
async function checkService(svc: { name: string; url: string; path: string }): Promise<ServiceCheck> {
  // Tolerate a trailing slash on the configured base URL so the request does
  // not go to "//health".
  const fullUrl = `${svc.url.replace(/\/+$/, "")}${svc.path}`;
  const start = performance.now();

  try {
    const res = await fetch(fullUrl, { signal: AbortSignal.timeout(5_000) });
    const elapsed = Math.round(performance.now() - start);

    if (res.ok) {
      let details: Record<string, unknown> | undefined;
      try {
        details = (await res.json()) as Record<string, unknown>;
      } catch {
        // A JSON body is optional; a 2xx status alone counts as healthy.
      }
      return { name: svc.name, url: svc.url, status: "healthy", responseTimeMs: elapsed, details };
    }

    // Discard the unread body so the underlying connection is released
    // promptly instead of waiting for garbage collection.
    await res.body?.cancel().catch(() => undefined);
    return { name: svc.name, url: svc.url, status: "unhealthy", responseTimeMs: elapsed, error: `HTTP ${res.status}` };
  } catch (err) {
    const elapsed = Math.round(performance.now() - start);
    return {
      name: svc.name,
      url: svc.url,
      status: "unreachable",
      responseTimeMs: elapsed,
      error: describeFetchError(err),
    };
  }
}
|
||||
|
||||
/**
 * Probes every configured service concurrently and rolls the results up.
 *
 * Overall status is `down` only when every check came back unreachable,
 * `degraded` when at least one check is unhealthy or unreachable, and
 * `healthy` otherwise.
 */
async function generateReport(): Promise<HealthReport> {
  const checks = await Promise.all(SERVICES.map(checkService));

  // Count results per status in a single pass.
  const tally = { healthy: 0, unhealthy: 0, unreachable: 0 };
  for (const check of checks) {
    tally[check.status] += 1;
  }

  const everythingUnreachable = tally.unreachable === checks.length;
  const anyFailure = tally.unhealthy > 0 || tally.unreachable > 0;
  const overall: HealthReport["overall"] = everythingUnreachable
    ? "down"
    : anyFailure
      ? "degraded"
      : "healthy";

  return {
    overall,
    timestamp: new Date().toISOString(),
    services: checks,
    summary: { ...tally, total: checks.length },
  };
}
|
||||
|
||||
// ── CLI / HTTP server mode ──
|
||||
|
||||
const args = process.argv.slice(2);

if (args.includes("--serve")) {
  // HTTP server mode: every incoming request triggers a fresh round of checks.
  const { createServer } = await import("http");
  const PORT = Number(process.env.MONITOR_PORT || 4004);

  const server = createServer(async (_req, res) => {
    try {
      const report = await generateReport();
      // 503 signals "not fully healthy" to load balancers and uptime probes.
      res.writeHead(report.overall === "healthy" ? 200 : 503, { "Content-Type": "application/json" });
      res.end(JSON.stringify(report, null, 2));
    } catch (err) {
      // http.Server does not await an async listener, so a rejection here
      // would become an unhandled rejection and leave the request hanging.
      if (!res.headersSent) {
        res.writeHead(500, { "Content-Type": "application/json" });
      }
      res.end(JSON.stringify({ error: String(err) }, null, 2));
    }
  });

  server.listen(PORT, () => {
    console.log(`🩺 Monitoring dashboard running on http://localhost:${PORT}`);
    console.log(` Checking ${SERVICES.length} services every request`);
  });
} else {
  // One-shot mode: print a summary and exit non-zero unless everything is healthy.
  const report = await generateReport();
  const icon = { healthy: "✅", degraded: "⚠️", down: "❌" };
  // Built once instead of on every loop iteration.
  const sIcon = { healthy: "✅", unhealthy: "⚠️", unreachable: "❌" };
  console.log(`\n${icon[report.overall]} Overall: ${report.overall.toUpperCase()}\n`);

  for (const svc of report.services) {
    console.log(` ${sIcon[svc.status]} ${svc.name.padEnd(20)} ${svc.responseTimeMs}ms${svc.error ? ` — ${svc.error}` : ""}`);
  }

  console.log(`\nHealthy: ${report.summary.healthy}/${report.summary.total}`);
  process.exit(report.overall === "healthy" ? 0 : 1);
}
|
||||
34
services/monitoring/loki/loki-config.yml
Normal file
34
services/monitoring/loki/loki-config.yml
Normal file
@ -0,0 +1,34 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
|
||||
common:
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: "2024-01-01"
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
limits_config:
|
||||
reject_old_samples: true
|
||||
reject_old_samples_max_age: 168h # 7 days
|
||||
ingestion_rate_mb: 10
|
||||
ingestion_burst_size_mb: 20
|
||||
|
||||
analytics:
|
||||
reporting_enabled: false
|
||||
16
services/monitoring/package.json
Normal file
16
services/monitoring/package.json
Normal file
@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "@lysnrai/monitoring",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"description": "Health check and monitoring for all LysnrAI services",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"check": "tsx health-check.ts",
|
||||
"serve": "tsx health-check.ts --serve"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.12.0",
|
||||
"tsx": "^4.19.2",
|
||||
"typescript": "^5.7.3"
|
||||
}
|
||||
}
|
||||
11
services/monitoring/tsconfig.json
Normal file
11
services/monitoring/tsconfig.json
Normal file
@ -0,0 +1,11 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "bundler",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"types": ["node"]
|
||||
},
|
||||
"include": ["*.ts"]
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user