feat(services): add monitoring (Loki + Grafana config, health-check)

- Copied as-is from learning_voice_ai_agent/services/monitoring
- Grafana dashboards + provisioning for Loki datasource
- health-check.ts for service health polling
- Updated pnpm-workspace.yaml to include services/*
This commit is contained in:
saravanakumardb1 2026-02-12 11:39:24 -08:00
parent 2738124ab9
commit c97e697097
9 changed files with 1296 additions and 10 deletions

976
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,3 @@
packages:
- "packages/*"
- "services/*"

View File

@ -0,0 +1,128 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"title": "All Service Logs",
"type": "logs",
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 0 },
"datasource": { "type": "loki", "uid": "loki" },
"targets": [
{
"expr": "{compose_service=~\".+\"}",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"showCommonLabels": false,
"wrapLogMessage": true,
"prettifyLogMessage": false,
"enableLogDetails": true,
"sortOrder": "Descending",
"dedupStrategy": "none"
}
},
{
"title": "Logs by Service",
"type": "logs",
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 12 },
"datasource": { "type": "loki", "uid": "loki" },
"targets": [
{
"expr": "{compose_service=\"backend\"}",
"refId": "A",
"legendFormat": "backend"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"title": "Billing Service Logs",
"type": "logs",
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 12 },
"datasource": { "type": "loki", "uid": "loki" },
"targets": [
{
"expr": "{compose_service=\"billing-service\"}",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"title": "Error Rate (all services)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 },
"datasource": { "type": "loki", "uid": "loki" },
"targets": [
{
"expr": "sum by (compose_service) (rate({compose_service=~\".+\"} |= \"error\" [5m]))",
"refId": "A",
"legendFormat": "{{compose_service}}"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineWidth": 2,
"fillOpacity": 15,
"pointSize": 5
},
"unit": "reqps"
}
}
},
{
"title": "Log Volume by Service",
"type": "timeseries",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 },
"datasource": { "type": "loki", "uid": "loki" },
"targets": [
{
"expr": "sum by (compose_service) (rate({compose_service=~\".+\"} [5m]))",
"refId": "A",
"legendFormat": "{{compose_service}}"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "bars",
"lineWidth": 1,
"fillOpacity": 50,
"stacking": { "mode": "normal" }
},
"unit": "reqps"
}
}
}
],
"schemaVersion": 39,
"tags": ["lysnrai", "logs", "monitoring"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "LysnrAI — Service Logs",
"uid": "lysnrai-service-logs",
"version": 1
}

View File

@ -0,0 +1,12 @@
# Grafana dashboard provisioning: registers a single file-based provider
# named "default" for organisation 1.
apiVersion: 1
providers:
- name: default
orgId: 1
# Empty folder name.
folder: ""
type: file
disableDeletion: false
editable: true
options:
# Filesystem path handed to the file provider.
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: false

View File

@ -0,0 +1,9 @@
# Grafana datasource provisioning: registers Loki as the default datasource.
apiVersion: 1
datasources:
  - name: Loki
    type: loki
    # Fixed uid so that dashboard panels declaring
    # `"datasource": { "type": "loki", "uid": "loki" }` resolve to this
    # datasource instead of depending on an auto-generated identifier.
    uid: loki
    access: proxy
    # Same port as `http_listen_port` in the Loki server config.
    url: http://loki:3100
    isDefault: true
    editable: false

View File

@ -0,0 +1,119 @@
/**
* Monitoring & Health Check — aggregates health from all services.
*
* Standalone script that polls each service's /health endpoint and
* reports combined status. Can be run as a cron job, GitHub Action,
* or standalone HTTP endpoint.
*
* Usage:
* npx tsx services/monitoring/health-check.ts # one-shot check
* npx tsx services/monitoring/health-check.ts --serve # HTTP server on :4004
*
* Environment:
* BACKEND_URL (default: http://localhost:8000)
* GROWTH_SERVICE_URL (default: http://localhost:4001)
* BILLING_SERVICE_URL (default: http://localhost:4002)
* PLATFORM_SERVICE_URL (default: http://localhost:4003)
* ADMIN_DASHBOARD_URL (default: http://localhost:3001)
* USER_DASHBOARD_URL (default: http://localhost:3002)
* MONITOR_PORT (default: 4004; listening port in --serve mode)
*/
export {};
/** Outcome of polling one service's health endpoint. */
interface ServiceCheck {
/** Display name of the service. */
name: string;
/** Base URL of the service as configured (the health path is not included). */
url: string;
/**
 * "healthy" for an OK (2xx) response, "unhealthy" for any other HTTP status,
 * "unreachable" when the request itself failed or timed out.
 */
status: "healthy" | "unhealthy" | "unreachable";
/** Elapsed time of the request, rounded to whole milliseconds. */
responseTimeMs: number;
/** Parsed JSON body of a healthy response, when the body was valid JSON. */
details?: Record<string, unknown>;
/** Failure description: "HTTP <status>" or the stringified request error. */
error?: string;
}
/** Combined health status across all polled services. */
interface HealthReport {
/**
 * "down" when every service was unreachable, "degraded" when at least one
 * service was unhealthy or unreachable, otherwise "healthy".
 */
overall: "healthy" | "degraded" | "down";
/** ISO-8601 timestamp taken when the report was assembled. */
timestamp: string;
/** Individual results, one per polled service. */
services: ServiceCheck[];
/** Number of services in each status, plus the total number checked. */
summary: { healthy: number; unhealthy: number; unreachable: number; total: number };
}
/**
 * Health-check targets.
 *
 * Each row is [display name, env var holding the base URL, fallback base URL,
 * health path]. The env var wins when it is set to a non-empty value;
 * otherwise the localhost fallback is used.
 */
const SERVICES: { name: string; url: string; path: string }[] = (
  [
    ["Backend API", "BACKEND_URL", "http://localhost:8000", "/health"],
    ["Growth Service", "GROWTH_SERVICE_URL", "http://localhost:4001", "/health"],
    ["Billing Service", "BILLING_SERVICE_URL", "http://localhost:4002", "/health"],
    ["Platform Service", "PLATFORM_SERVICE_URL", "http://localhost:4003", "/health"],
    ["Admin Dashboard", "ADMIN_DASHBOARD_URL", "http://localhost:3001", "/api/health"],
    ["User Dashboard", "USER_DASHBOARD_URL", "http://localhost:3002", "/api/health"],
  ] as const
).map(([name, envVar, fallbackUrl, path]) => ({
  name,
  url: process.env[envVar] || fallbackUrl,
  path,
}));
/**
 * Polls one service's health endpoint and classifies the outcome.
 *
 * - OK (2xx) response: "healthy"; the JSON body is attached as `details`
 *   when it parses.
 * - Any other HTTP status: "unhealthy" with `error` set to "HTTP <status>".
 * - Request failure or 5-second timeout: "unreachable" with the stringified
 *   error (and its `cause`, when present) in `error`.
 *
 * Never rejects: every failure is folded into the returned result.
 *
 * @param svc - Service descriptor: display name, base URL, and health path.
 * @returns The check result, with elapsed time in whole milliseconds.
 */
async function checkService(svc: { name: string; url: string; path: string }): Promise<ServiceCheck> {
  // Tolerate a trailing slash on the configured base URL so that
  // "http://host/" + "/health" does not become "http://host//health".
  const fullUrl = `${svc.url.replace(/\/+$/, "")}${svc.path}`;
  const start = performance.now();
  const elapsedMs = (): number => Math.round(performance.now() - start);

  try {
    const res = await fetch(fullUrl, { signal: AbortSignal.timeout(5_000) });
    // Measured before the body is read, so it reflects time-to-headers.
    const responseTimeMs = elapsedMs();

    if (!res.ok) {
      // Release the unread body so the underlying connection is not kept
      // open; a failure to cancel must not change the reported status.
      await res.body?.cancel().catch(() => undefined);
      return { name: svc.name, url: svc.url, status: "unhealthy", responseTimeMs, error: `HTTP ${res.status}` };
    }

    let details: Record<string, unknown> | undefined;
    try {
      details = (await res.json()) as Record<string, unknown>;
    } catch {
      // A non-JSON body on an OK response still counts as healthy.
    }
    return { name: svc.name, url: svc.url, status: "healthy", responseTimeMs, details };
  } catch (err) {
    // Network failures can surface as a generic "TypeError: fetch failed"
    // with the actual reason (e.g. ECONNREFUSED) stored on `cause`.
    const cause =
      typeof err === "object" && err !== null && "cause" in err
        ? (err as { cause?: unknown }).cause
        : undefined;
    const error = cause === undefined ? String(err) : `${String(err)} (${String(cause)})`;
    return { name: svc.name, url: svc.url, status: "unreachable", responseTimeMs: elapsedMs(), error };
  }
}
/**
 * Polls every configured service concurrently and rolls the results up into
 * a single report.
 *
 * The overall status is "down" when the number of unreachable services equals
 * the number of services checked, "degraded" when at least one service is
 * unhealthy or unreachable, and "healthy" otherwise.
 *
 * @returns The aggregated report, stamped with the current time.
 */
async function generateReport(): Promise<HealthReport> {
  const services = await Promise.all(SERVICES.map(checkService));

  // Count each status in a single pass.
  const tally = { healthy: 0, unhealthy: 0, unreachable: 0 };
  for (const { status } of services) {
    tally[status] += 1;
  }

  const total = services.length;
  const overall: HealthReport["overall"] =
    tally.unreachable === total
      ? "down"
      : tally.unhealthy > 0 || tally.unreachable > 0
        ? "degraded"
        : "healthy";

  return {
    overall,
    timestamp: new Date().toISOString(),
    services,
    summary: { ...tally, total },
  };
}
// ── CLI / HTTP server mode ──
// With `--serve` the script stays up as an HTTP endpoint; otherwise it runs
// one check, prints a summary, and exits with 0 (healthy) or 1 (anything else).
const args = process.argv.slice(2);
if (args.includes("--serve")) {
  // Run as HTTP server for continuous monitoring: each request triggers a fresh poll.
  const { createServer } = await import("http");
  const PORT = Number(process.env.MONITOR_PORT || 4004);
  const server = createServer(async (_req, res) => {
    try {
      const report = await generateReport();
      // Anything other than fully healthy is reported as 503.
      res.writeHead(report.overall === "healthy" ? 200 : 503, { "Content-Type": "application/json" });
      res.end(JSON.stringify(report, null, 2));
    } catch (err) {
      // An async handler's rejection is not handled by the server; answer
      // explicitly so the request does not hang.
      if (!res.headersSent) {
        res.writeHead(500, { "Content-Type": "application/json" });
      }
      res.end(JSON.stringify({ error: String(err) }));
    }
  });
  server.listen(PORT, () => {
    console.log(`🩺 Monitoring dashboard running on http://localhost:${PORT}`);
    console.log(` Checking ${SERVICES.length} services every request`);
  });
} else {
  // One-shot check
  const report = await generateReport();
  const icon = { healthy: "✅", degraded: "⚠️", down: "❌" };
  // Built once rather than on every loop iteration.
  const sIcon = { healthy: "✅", unhealthy: "⚠️", unreachable: "❌" };
  console.log(`\n${icon[report.overall]} Overall: ${report.overall.toUpperCase()}\n`);
  for (const svc of report.services) {
    // Separate the error from the timing; previously they ran together ("12msHTTP 500").
    const errorSuffix = svc.error ? ` — ${svc.error}` : "";
    console.log(` ${sIcon[svc.status]} ${svc.name.padEnd(20)} ${svc.responseTimeMs}ms${errorSuffix}`);
  }
  console.log(`\nHealthy: ${report.summary.healthy}/${report.summary.total}`);
  process.exit(report.overall === "healthy" ? 0 : 1);
}

View File

@ -0,0 +1,34 @@
# Loki server configuration.
# Authentication is disabled.
auth_enabled: false
server:
# Same port the Grafana datasource URL (http://loki:3100) points at.
http_listen_port: 3100
common:
path_prefix: /loki
# Chunks and rules are stored on the local filesystem under /loki.
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
# NOTE(review): replication factor 1 with an in-memory ring store looks like a
# single-instance setup; confirm before running more than one Loki instance.
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
# One schema period starting 2024-01-01: tsdb index store, filesystem object
# store, schema v13, index period 24h.
- from: "2024-01-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
# NOTE(review): no retention settings are visible in this file; confirm
# whether stored logs are expected to be pruned.
limits_config:
reject_old_samples: true
reject_old_samples_max_age: 168h # 7 days
ingestion_rate_mb: 10
ingestion_burst_size_mb: 20
# Analytics reporting is switched off.
analytics:
reporting_enabled: false

View File

@ -0,0 +1,16 @@
{
"name": "@lysnrai/monitoring",
"version": "0.1.0",
"private": true,
"description": "Health check and monitoring for all LysnrAI services",
"type": "module",
"scripts": {
"check": "tsx health-check.ts",
"serve": "tsx health-check.ts --serve"
},
"devDependencies": {
"@types/node": "^22.12.0",
"tsx": "^4.19.2",
"typescript": "^5.7.3"
}
}

View File

@ -0,0 +1,11 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"moduleResolution": "bundler",
"strict": true,
"esModuleInterop": true,
"types": ["node"]
},
"include": ["*.ts"]
}