feat(services): add monitoring (Loki + Grafana config, health-check)
- Copied as-is from learning_voice_ai_agent/services/monitoring - Grafana dashboards + provisioning for Loki datasource - health-check.ts for service health polling - Updated pnpm-workspace.yaml to include services/*
This commit is contained in:
parent
2738124ab9
commit
c97e697097
976
pnpm-lock.yaml
generated
976
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
@ -1,2 +1,3 @@
|
|||||||
packages:
|
packages:
|
||||||
- "packages/*"
|
- "packages/*"
|
||||||
|
- "services/*"
|
||||||
|
|||||||
128
services/monitoring/grafana/dashboards/lysnrai-services.json
Normal file
128
services/monitoring/grafana/dashboards/lysnrai-services.json
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
{
|
||||||
|
"annotations": { "list": [] },
|
||||||
|
"editable": true,
|
||||||
|
"fiscalYearStartMonth": 0,
|
||||||
|
"graphTooltip": 1,
|
||||||
|
"id": null,
|
||||||
|
"links": [],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "All Service Logs",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 0 },
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{compose_service=~\".+\"}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"showCommonLabels": false,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": false,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending",
|
||||||
|
"dedupStrategy": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Logs by Service",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 12 },
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{compose_service=\"backend\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "backend"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Billing Service Logs",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 12 },
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{compose_service=\"billing-service\"}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Error Rate (all services)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 },
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (compose_service) (rate({compose_service=~\".+\"} |= \"error\" [5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{compose_service}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "line",
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 15,
|
||||||
|
"pointSize": 5
|
||||||
|
},
|
||||||
|
"unit": "reqps"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Log Volume by Service",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 },
|
||||||
|
"datasource": { "type": "loki", "uid": "loki" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (compose_service) (rate({compose_service=~\".+\"} [5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{compose_service}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "bars",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 50,
|
||||||
|
"stacking": { "mode": "normal" }
|
||||||
|
},
|
||||||
|
"unit": "reqps"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"tags": ["lysnrai", "logs", "monitoring"],
|
||||||
|
"templating": { "list": [] },
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"timepicker": {},
|
||||||
|
"timezone": "browser",
|
||||||
|
"title": "LysnrAI — Service Logs",
|
||||||
|
"uid": "lysnrai-service-logs",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
providers:
|
||||||
|
- name: default
|
||||||
|
orgId: 1
|
||||||
|
folder: ""
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
editable: true
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards
|
||||||
|
foldersFromFilesStructure: false
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
datasources:
|
||||||
|
- name: Loki
|
||||||
|
type: loki
|
||||||
|
access: proxy
|
||||||
|
url: http://loki:3100
|
||||||
|
isDefault: true
|
||||||
|
editable: false
|
||||||
119
services/monitoring/health-check.ts
Normal file
119
services/monitoring/health-check.ts
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
/**
 * Monitoring & Health Check — aggregates health from all services.
 *
 * Standalone script that polls each service's health endpoint
 * (/health, or /api/health for the dashboards) and reports combined
 * status. Can be run as a cron job, GitHub Action, or standalone
 * HTTP endpoint.
 *
 * Usage:
 *   npx tsx services/monitoring/health-check.ts          # one-shot check
 *   npx tsx services/monitoring/health-check.ts --serve  # HTTP server on :4004
 *
 * Environment:
 *   BACKEND_URL          (default: http://localhost:8000)
 *   GROWTH_SERVICE_URL   (default: http://localhost:4001)
 *   BILLING_SERVICE_URL  (default: http://localhost:4002)
 *   PLATFORM_SERVICE_URL (default: http://localhost:4003)
 *   ADMIN_DASHBOARD_URL  (default: http://localhost:3001)
 *   USER_DASHBOARD_URL   (default: http://localhost:3002)
 *   MONITOR_PORT         (default: 4004; listen port for --serve mode)
 */
|
||||||
|
|
||||||
|
export {};
|
||||||
|
|
||||||
|
/**
 * Result of probing a single service's health endpoint.
 */
interface ServiceCheck {
  /** Human-readable service name shown in reports. */
  name: string;
  /** Base URL of the probed service (without the health path). */
  url: string;
  /**
   * Probe outcome: `healthy` for an OK HTTP response, `unhealthy` for a
   * non-OK HTTP response, `unreachable` when the request itself failed.
   */
  status: "healthy" | "unhealthy" | "unreachable";
  /** Time spent waiting for the response (or the failure), in whole milliseconds. */
  responseTimeMs: number;
  /** Parsed JSON body of a healthy response, when one could be read. */
  details?: Record<string, unknown>;
  /** Failure description for non-healthy results (e.g. `HTTP 503` or a stringified error). */
  error?: string;
}
|
||||||
|
|
||||||
|
/**
 * Aggregated health of every probed service at one point in time.
 */
interface HealthReport {
  /**
   * Combined verdict: `healthy` when every service is healthy, `down` when
   * every service is unreachable, `degraded` for anything in between.
   */
  overall: "healthy" | "degraded" | "down";
  /** ISO-8601 timestamp of when the report was assembled. */
  timestamp: string;
  /** Individual probe results, one per service. */
  services: ServiceCheck[];
  /** Number of services per status, plus the total number probed. */
  summary: { healthy: number; unhealthy: number; unreachable: number; total: number };
}
|
||||||
|
|
||||||
|
/**
 * Services to probe. Each entry has a display name, a base URL taken from the
 * environment (with a localhost default) and the path of its health endpoint.
 *
 * `||` rather than `??` is intentional: an environment variable that is set
 * but empty falls back to the default instead of producing an unusable URL.
 */
const SERVICES = [
  { name: "Backend API", url: process.env.BACKEND_URL || "http://localhost:8000", path: "/health" },
  { name: "Growth Service", url: process.env.GROWTH_SERVICE_URL || "http://localhost:4001", path: "/health" },
  { name: "Billing Service", url: process.env.BILLING_SERVICE_URL || "http://localhost:4002", path: "/health" },
  { name: "Platform Service", url: process.env.PLATFORM_SERVICE_URL || "http://localhost:4003", path: "/health" },
  { name: "Admin Dashboard", url: process.env.ADMIN_DASHBOARD_URL || "http://localhost:3001", path: "/api/health" },
  { name: "User Dashboard", url: process.env.USER_DASHBOARD_URL || "http://localhost:3002", path: "/api/health" },
];
|
||||||
|
|
||||||
|
/**
 * Probes one service's health endpoint and classifies the outcome.
 *
 * - `healthy`: the endpoint answered with an OK (2xx) status; `details` holds
 *   the JSON body when it is a plain object.
 * - `unhealthy`: the endpoint answered with any other status; `error` is
 *   `HTTP <status>`.
 * - `unreachable`: the request itself failed (network error or the 5 s
 *   timeout); `error` is the stringified exception.
 *
 * Never rejects: every failure is folded into the returned result.
 *
 * @param svc - Service to probe: display name, base URL and health path.
 * @returns The probe result, including the elapsed time in milliseconds.
 */
async function checkService(svc: { name: string; url: string; path: string }): Promise<ServiceCheck> {
  // Join base and path with exactly one slash so a trailing slash in an
  // env-provided base URL does not produce "//health".
  const base = svc.url.replace(/\/+$/, "");
  const path = svc.path.replace(/^\/+/, "");
  const fullUrl = path ? `${base}/${path}` : base;
  const start = performance.now();

  try {
    const res = await fetch(fullUrl, { signal: AbortSignal.timeout(5_000) });
    // Measured once headers arrive; reading the body is not part of the timing.
    const elapsed = Math.round(performance.now() - start);

    if (!res.ok) {
      // The body is not needed; cancel it so the underlying connection is
      // released instead of waiting for an unread stream.
      try {
        await res.body?.cancel();
      } catch {
        /* best effort — the status code is all that matters here */
      }
      return { name: svc.name, url: svc.url, status: "unhealthy", responseTimeMs: elapsed, error: `HTTP ${res.status}` };
    }

    let details: Record<string, unknown> | undefined;
    try {
      const body: unknown = await res.json();
      // Only keep bodies that actually match the declared shape.
      if (typeof body === "object" && body !== null && !Array.isArray(body)) {
        details = body as Record<string, unknown>;
      }
    } catch {
      /* a healthy endpoint may legitimately return a non-JSON body */
    }
    return { name: svc.name, url: svc.url, status: "healthy", responseTimeMs: elapsed, details };
  } catch (err) {
    const elapsed = Math.round(performance.now() - start);
    return { name: svc.name, url: svc.url, status: "unreachable", responseTimeMs: elapsed, error: String(err) };
  }
}
|
||||||
|
|
||||||
|
/**
 * Probes every service concurrently and aggregates the results into a report.
 *
 * Overall status rules:
 * - `down` when every probed service is unreachable (this also holds for an
 *   empty service list, since no service was confirmed reachable),
 * - `degraded` when at least one service is unhealthy or unreachable,
 * - `healthy` otherwise.
 *
 * @param services - Services to probe; defaults to the module-level `SERVICES` list.
 * @param probe - Function that checks a single service; defaults to `checkService`.
 *   It is expected to resolve with a result rather than reject; a rejection
 *   rejects the whole report.
 * @returns The aggregated report with per-status counts and a timestamp.
 */
async function generateReport(
  services: ReadonlyArray<{ name: string; url: string; path: string }> = SERVICES,
  probe: (svc: { name: string; url: string; path: string }) => Promise<ServiceCheck> = checkService,
): Promise<HealthReport> {
  const checks = await Promise.all(services.map((svc) => probe(svc)));

  // Tally all three statuses in a single pass.
  const summary = { healthy: 0, unhealthy: 0, unreachable: 0, total: checks.length };
  for (const check of checks) {
    summary[check.status] += 1;
  }

  let overall: HealthReport["overall"] = "healthy";
  if (summary.unreachable === checks.length) overall = "down";
  else if (summary.unhealthy > 0 || summary.unreachable > 0) overall = "degraded";

  return {
    overall,
    timestamp: new Date().toISOString(),
    services: checks,
    summary,
  };
}
|
||||||
|
|
||||||
|
// ── CLI / HTTP server mode ──
//
// With `--serve` the script stays up as an HTTP server and runs a fresh round
// of health checks for every incoming request. Without it, the script runs
// one round, prints a human-readable summary and exits with 0 (all healthy)
// or 1 (anything else), which suits cron jobs and CI steps.

const args = process.argv.slice(2);

if (args.includes("--serve")) {
  // Run as HTTP server for continuous monitoring
  const { createServer } = await import("http");
  const PORT = Number(process.env.MONITOR_PORT || 4004);

  const server = createServer(async (_req, res) => {
    try {
      const report = await generateReport();
      // 503 lets load balancers and uptime probes treat degraded/down as failing.
      res.writeHead(report.overall === "healthy" ? 200 : 503, { "Content-Type": "application/json" });
      res.end(JSON.stringify(report, null, 2));
    } catch (err) {
      // http.Server does not observe the promise returned by an async handler,
      // so answer explicitly instead of leaving the request hanging.
      if (!res.headersSent) {
        res.writeHead(500, { "Content-Type": "application/json" });
      }
      res.end(JSON.stringify({ error: String(err) }, null, 2));
    }
  });

  server.listen(PORT, () => {
    console.log(`🩺 Monitoring dashboard running on http://localhost:${PORT}`);
    console.log(`   Checking ${SERVICES.length} services every request`);
  });
} else {
  // One-shot check
  const report = await generateReport();
  const icon = { healthy: "✅", degraded: "⚠️", down: "❌" };
  // Per-service icons, built once rather than on every loop iteration.
  const sIcon = { healthy: "✅", unhealthy: "⚠️", unreachable: "❌" };

  console.log(`\n${icon[report.overall]} Overall: ${report.overall.toUpperCase()}\n`);

  for (const svc of report.services) {
    console.log(`  ${sIcon[svc.status]} ${svc.name.padEnd(20)} ${svc.responseTimeMs}ms${svc.error ? ` — ${svc.error}` : ""}`);
  }

  console.log(`\nHealthy: ${report.summary.healthy}/${report.summary.total}`);
  process.exit(report.overall === "healthy" ? 0 : 1);
}
|
||||||
34
services/monitoring/loki/loki-config.yml
Normal file
34
services/monitoring/loki/loki-config.yml
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
auth_enabled: false
|
||||||
|
|
||||||
|
server:
|
||||||
|
http_listen_port: 3100
|
||||||
|
|
||||||
|
common:
|
||||||
|
path_prefix: /loki
|
||||||
|
storage:
|
||||||
|
filesystem:
|
||||||
|
chunks_directory: /loki/chunks
|
||||||
|
rules_directory: /loki/rules
|
||||||
|
replication_factor: 1
|
||||||
|
ring:
|
||||||
|
kvstore:
|
||||||
|
store: inmemory
|
||||||
|
|
||||||
|
schema_config:
|
||||||
|
configs:
|
||||||
|
- from: "2024-01-01"
|
||||||
|
store: tsdb
|
||||||
|
object_store: filesystem
|
||||||
|
schema: v13
|
||||||
|
index:
|
||||||
|
prefix: index_
|
||||||
|
period: 24h
|
||||||
|
|
||||||
|
limits_config:
|
||||||
|
reject_old_samples: true
|
||||||
|
reject_old_samples_max_age: 168h # 7 days
|
||||||
|
ingestion_rate_mb: 10
|
||||||
|
ingestion_burst_size_mb: 20
|
||||||
|
|
||||||
|
analytics:
|
||||||
|
reporting_enabled: false
|
||||||
16
services/monitoring/package.json
Normal file
16
services/monitoring/package.json
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"name": "@lysnrai/monitoring",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"private": true,
|
||||||
|
"description": "Health check and monitoring for all LysnrAI services",
|
||||||
|
"type": "module",
|
||||||
|
"scripts": {
|
||||||
|
"check": "tsx health-check.ts",
|
||||||
|
"serve": "tsx health-check.ts --serve"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^22.12.0",
|
||||||
|
"tsx": "^4.19.2",
|
||||||
|
"typescript": "^5.7.3"
|
||||||
|
}
|
||||||
|
}
|
||||||
11
services/monitoring/tsconfig.json
Normal file
11
services/monitoring/tsconfig.json
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2022",
|
||||||
|
"module": "ESNext",
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
"strict": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"types": ["node"]
|
||||||
|
},
|
||||||
|
"include": ["*.ts"]
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user