docs: update documentation
This commit is contained in:
parent
856788c386
commit
80a4459f81
@ -20,6 +20,10 @@
|
|||||||
5. [New Environment Variables](#5-new-environment-variables)
|
5. [New Environment Variables](#5-new-environment-variables)
|
||||||
6. [Quick Reference — Where Things Live](#6-quick-reference--where-things-live)
|
6. [Quick Reference — Where Things Live](#6-quick-reference--where-things-live)
|
||||||
|
|
||||||
|
- [Appendix A: Risks & Open Questions](#appendix-a-risks--open-questions)
|
||||||
|
- [Appendix B: Component Dependency Graph](#appendix-b-component-dependency-graph)
|
||||||
|
- [Appendix C: Review Findings](#appendix-c-review-findings)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 1. Current Inventory
|
## 1. Current Inventory
|
||||||
@ -29,23 +33,23 @@
|
|||||||
| Category | Module | Endpoints | Description |
|
| Category | Module | Endpoints | Description |
|
||||||
| ------------ | --------------- | --------- | --------------------------------------------------------------------------------------------------- |
|
| ------------ | --------------- | --------- | --------------------------------------------------------------------------------------------------- |
|
||||||
| **Identity** | `auth` | 11 routes | Login, register, refresh, SSO, profile, admin user CRUD |
|
| **Identity** | `auth` | 11 routes | Login, register, refresh, SSO, profile, admin user CRUD |
|
||||||
| **Identity** | `tokens` | CRUD | API token management |
|
| **Identity** | `tokens` | 5 routes | API token management (CRUD + validate) |
|
||||||
| **Identity** | `licenses` | CRUD | License key generation, activation, device binding |
|
| **Identity** | `licenses` | 6 routes | License key generation, activation, device binding, validate |
|
||||||
| **Billing** | `subscriptions` | CRUD | Plan management, trial tracking, period management |
|
| **Billing** | `subscriptions` | 5 routes | Plan management, trial tracking, period management |
|
||||||
| **Billing** | `stripe` | Webhooks | Inbound Stripe webhook processing |
|
| **Billing** | `stripe` | 2 routes | Inbound Stripe webhook + portal session |
|
||||||
| **Billing** | `plans` | CRUD | Plan definitions (free, pro, enterprise) |
|
| **Billing** | `plans` | 4 routes | Plan definitions (free, pro, enterprise) |
|
||||||
| **Billing** | `usage` | CRUD | Usage tracking and quota enforcement |
|
| **Billing** | `usage` | 4 routes | Usage tracking and quota enforcement |
|
||||||
| **Billing** | `promos` | CRUD | Promo code creation, validation, redemption |
|
| **Billing** | `promos` | 5 routes | Promo code creation, validation, redemption |
|
||||||
| **Growth** | `invitations` | CRUD | Invitation code generation, redemption, tracking |
|
| **Growth** | `invitations` | 5 routes | Invitation code generation, redemption, tracking |
|
||||||
| **Growth** | `referrals` | CRUD | Referral link tracking, status transitions |
|
| **Growth** | `referrals` | 5 routes | Referral link tracking, status transitions |
|
||||||
| **Growth** | `waitlist` | 12 routes | Pre-launch signups, position tracking, admin batch invite, CSV export |
|
| **Growth** | `waitlist` | 12 routes | Pre-launch signups, position tracking, admin batch invite, CSV export |
|
||||||
| **Growth** | `public` | 5 routes | Public roadmap, community voting, feature submissions |
|
| **Growth** | `public` | 5 routes | Public roadmap, community voting, feature submissions |
|
||||||
| **Content** | `items` | CRUD | Tracker items (bugs, features, tasks) |
|
| **Content** | `items` | 5 routes | Tracker items (bugs, features, tasks) |
|
||||||
| **Content** | `comments` | CRUD | Threaded comments on items |
|
| **Content** | `comments` | 4 routes | Threaded comments on items |
|
||||||
| **Content** | `votes` | CRUD | User votes on items and comments |
|
| **Content** | `votes` | 3 routes | User votes on items and comments |
|
||||||
| **Content** | `memory` | 5 routes | Memory items — create, reassign, patch, delete |
|
| **Content** | `memory` | 5 routes | Memory items — create, reassign, patch, delete |
|
||||||
| **Ops** | `audit` | Query | Audit log recording and admin queries |
|
| **Ops** | `audit` | Query | Audit log recording and admin queries |
|
||||||
| **Ops** | `flags` | CRUD | Feature flags with FNV-1a deterministic rollout |
|
| **Ops** | `flags` | 5 routes | Feature flags with FNV-1a deterministic rollout |
|
||||||
| **Ops** | `telemetry` | 9 routes | Client event ingestion, error clustering, collection policies, GDPR erasure |
|
| **Ops** | `telemetry` | 9 routes | Client event ingestion, error clustering, collection policies, GDPR erasure |
|
||||||
| **Ops** | `notifications` | 5 routes | Device registration, notification preferences |
|
| **Ops** | `notifications` | 5 routes | Device registration, notification preferences |
|
||||||
| **Ops** | `settings` | 6 routes | User/device settings, kill switch |
|
| **Ops** | `settings` | 6 routes | User/device settings, kill switch |
|
||||||
@ -75,8 +79,8 @@
|
|||||||
### 1.3 Services
|
### 1.3 Services
|
||||||
|
|
||||||
| Service | Port | Description |
|
| Service | Port | Description |
|
||||||
| ---------------------- | ---- | ----------------------------------------------------- |
|
| ---------------------- | ---- | ---------------------------------------------------- |
|
||||||
| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 158+ tests) |
|
| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 621 tests) |
|
||||||
| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar |
|
| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar |
|
||||||
| **monitoring** | 4004 | Health-check aggregator (all services) |
|
| **monitoring** | 4004 | Health-check aggregator (all services) |
|
||||||
|
|
||||||
@ -185,8 +189,8 @@ platform-service/src/modules/delivery/
|
|||||||
│ ├── push-apns.ts — Apple Push Notification Service
|
│ ├── push-apns.ts — Apple Push Notification Service
|
||||||
│ ├── push-fcm.ts — Firebase Cloud Messaging
|
│ ├── push-fcm.ts — Firebase Cloud Messaging
|
||||||
│ └── sms.ts — Twilio/Azure Communication Services (future)
|
│ └── sms.ts — Twilio/Azure Communication Services (future)
|
||||||
├── renderer.ts — Template rendering (Handlebars/Mustache for email)
|
├── renderer.ts — Template rendering (Handlebars for email bodies)
|
||||||
├── repository.ts — delivery_log container (track sent/failed/bounced)
|
├── repository.ts — delivery_log + email_templates containers
|
||||||
├── dispatcher.ts — Route delivery request to correct channel(s) based on prefs
|
├── dispatcher.ts — Route delivery request to correct channel(s) based on prefs
|
||||||
└── routes.ts — Admin: send test, view delivery log, manage templates
|
└── routes.ts — Admin: send test, view delivery log, manage templates
|
||||||
```
|
```
|
||||||
@ -245,9 +249,9 @@ platform-service/src/modules/webhooks/
|
|||||||
**Event catalog (subscribe to any combination):**
|
**Event catalog (subscribe to any combination):**
|
||||||
|
|
||||||
| Event | Payload | Source |
|
| Event | Payload | Source |
|
||||||
| ----------------------- | ---------------------------------------------- | --------------------------- |
|
| ----------------------- | ---------------------------------------------- | ------------------------------- |
|
||||||
| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` |
|
| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` |
|
||||||
| `user.deleted` | `{ userId }` | `auth.delete` |
|
| `user.deleted` | `{ userId }` | Admin: `DELETE /auth/users/:id` |
|
||||||
| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook |
|
| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook |
|
||||||
| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook |
|
| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook |
|
||||||
| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe |
|
| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe |
|
||||||
@ -334,6 +338,13 @@ const PlatformEvents = {
|
|||||||
} as const;
|
} as const;
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Migration from existing `lib/webhooks.ts`:**
|
||||||
|
|
||||||
|
- Existing `dispatchInvitationRedeemed()`, `dispatchReferralStatusChanged()`, `dispatchWaitlistJoined()` become event bus subscribers
|
||||||
|
- Phase 1: Register existing webhooks.ts functions as handlers on the bus
|
||||||
|
- Phase 2: Replace inline dispatch calls in routes with `bus.emit()`
|
||||||
|
- Phase 3: Remove `lib/webhooks.ts` once all callers migrated
|
||||||
|
|
||||||
**Benefits:**
|
**Benefits:**
|
||||||
|
|
||||||
- Audit logging becomes a subscriber, not inline code
|
- Audit logging becomes a subscriber, not inline code
|
||||||
@ -389,7 +400,7 @@ interface PasswordResetToken {
|
|||||||
|
|
||||||
- `password_reset_tokens` (pk: `/productId`) — short-lived, TTL 24h auto-expiry
|
- `password_reset_tokens` (pk: `/productId`) — short-lived, TTL 24h auto-expiry
|
||||||
|
|
||||||
**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with console-logged URLs for dev/testing.
|
**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with `req.log.info`-logged URLs for dev/testing (never `console.log`).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -556,9 +567,11 @@ platform-service/src/modules/exports/
|
|||||||
**Flow:**
|
**Flow:**
|
||||||
|
|
||||||
1. Admin POST `/api/exports` → `{ type: 'users', format: 'csv', filters: { plan: 'free' } }`
|
1. Admin POST `/api/exports` → `{ type: 'users', format: 'csv', filters: { plan: 'free' } }`
|
||||||
2. Background job runs query, writes result to blob storage
|
2. Background job runs query, writes result to blob storage (via existing `blob` module)
|
||||||
3. Job status updates: `pending` → `processing` → `ready` / `failed`
|
3. Job status updates: `pending` → `processing` → `ready` / `failed`
|
||||||
4. Admin downloads from signed blob URL
|
4. Admin downloads from signed blob URL (SAS token via `@bytelyst/blob`)
|
||||||
|
|
||||||
|
**Dependencies:** `blob` module (existing) for storage, `jobs` module (§2.1) for auto-cleanup of expired exports.
|
||||||
|
|
||||||
**Supported exports:**
|
**Supported exports:**
|
||||||
|
|
||||||
@ -629,6 +642,8 @@ interface MaintenanceConfig {
|
|||||||
- Schedule builder with start/end date pickers
|
- Schedule builder with start/end date pickers
|
||||||
- Bypass IP whitelist management
|
- Bypass IP whitelist management
|
||||||
|
|
||||||
|
**Storage:** Maintenance config is a single document per product in the existing `settings` container (field: `maintenanceConfig`). No new Cosmos container needed.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
#### 2.11 Rate Limit Dashboard & IP Allow/Deny Lists
|
#### 2.11 Rate Limit Dashboard & IP Allow/Deny Lists
|
||||||
@ -678,6 +693,11 @@ interface IPRule {
|
|||||||
- IP rules management (allow/deny with expiry)
|
- IP rules management (allow/deny with expiry)
|
||||||
- Per-user rate limit override
|
- Per-user rate limit override
|
||||||
|
|
||||||
|
**Cosmos container:**
|
||||||
|
|
||||||
|
- `ip_rules` (pk: `/productId`) — persistent IP allow/deny rules
|
||||||
|
- Rate limit stats remain in-memory (ephemeral); no persistence needed for counters
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### P2 — Product Intelligence
|
### P2 — Product Intelligence
|
||||||
@ -715,7 +735,7 @@ interface ExperimentDoc {
|
|||||||
hypothesis: string;
|
hypothesis: string;
|
||||||
status: 'draft' | 'running' | 'paused' | 'concluded';
|
status: 'draft' | 'running' | 'paused' | 'concluded';
|
||||||
variants: Variant[]; // [{id: 'control', weight: 50}, {id: 'treatment', weight: 50}]
|
variants: Variant[]; // [{id: 'control', weight: 50}, {id: 'treatment', weight: 50}]
|
||||||
targetingRules: {}; // Same as flag targeting
|
targetingRules: FlagTargetingRules; // Reuse from flags module (platforms, versions, percentage)
|
||||||
primaryMetric: string; // e.g., 'dictation_completed_rate'
|
primaryMetric: string; // e.g., 'dictation_completed_rate'
|
||||||
secondaryMetrics: string[];
|
secondaryMetrics: string[];
|
||||||
startedAt?: string;
|
startedAt?: string;
|
||||||
@ -1047,24 +1067,25 @@ This is a major architectural expansion. Defer until enterprise tier is validate
|
|||||||
Each new component introduces Cosmos containers. Cosmos DB Serverless charges per RU consumed + storage, so idle containers cost only storage (~$0.25/GB/month).
|
Each new component introduces Cosmos containers. Cosmos DB Serverless charges per RU consumed + storage, so idle containers cost only storage (~$0.25/GB/month).
|
||||||
|
|
||||||
| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU |
|
| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU |
|
||||||
| ---------------------- | --------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- |
|
| ---------------------- | ---------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- |
|
||||||
| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) |
|
| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) |
|
||||||
| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU |
|
| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU |
|
||||||
| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU |
|
| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU |
|
||||||
| **2.5 Password Reset** | `password_reset_tokens` | `/productId` | 24h auto | ~10 RU |
|
| **2.5 Password Reset** | `password_reset_tokens`, `email_verifications` | `/productId`, `/productId` | 24h auto | ~10 RU |
|
||||||
| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU |
|
| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU |
|
||||||
| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) |
|
| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) |
|
||||||
| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) |
|
| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) |
|
||||||
| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU |
|
| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU |
|
||||||
| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU |
|
| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU |
|
||||||
| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) |
|
| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) |
|
||||||
|
| **2.11 IP Rules** | `ip_rules` | `/productId` | None (manual) | ~10 RU |
|
||||||
| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU |
|
| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU |
|
||||||
| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU |
|
| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU |
|
||||||
| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) |
|
| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) |
|
||||||
| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU |
|
| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU |
|
||||||
|
|
||||||
**Total new containers:** ~17 (across all phases)
|
**Total new containers:** ~19 (across all phases)
|
||||||
**Existing containers:** ~25+ (across platform-service + dashboards)
|
**Existing containers:** 27 (defined in `cosmos-init.ts`: products, users, settings, devices, notification_prefs, audit_log, feature_flags, invitation_codes, referrals, subscriptions, payments, licenses, plans, usage_daily, api_tokens, tracker_items, comments, votes, themes, waitlist, memory_items, daily_briefs, reflections, brain_insights, telemetry_events, telemetry_error_clusters, telemetry_collection_policies). Note: `promos` module uses Stripe API directly — no Cosmos container.
|
||||||
**Cost impact:** Minimal for Serverless tier — idle containers only consume storage. Active containers during job runs add burst RU.
|
**Cost impact:** Minimal for Serverless tier — idle containers only consume storage. Active containers during job runs add burst RU.
|
||||||
|
|
||||||
**Recommendation:** Register all new containers in `cosmos-init.ts` alongside existing ones. Use TTL liberally for transient data (tokens, deliveries, job runs) to keep storage bounded.
|
**Recommendation:** Register all new containers in `cosmos-init.ts` alongside existing ones. Use TTL liberally for transient data (tokens, deliveries, job runs) to keep storage bounded.
|
||||||
@ -1076,7 +1097,7 @@ Each new component introduces Cosmos containers. Cosmos DB Serverless charges pe
|
|||||||
New components will require additional env vars. All should be added to `.env.example` files in both repos and documented.
|
New components will require additional env vars. All should be added to `.env.example` files in both repos and documented.
|
||||||
|
|
||||||
| Component | Variable | Example | Required |
|
| Component | Variable | Example | Required |
|
||||||
| -------------------- | -------------------------- | -------------------------------- | ------------------------- |
|
| -------------------- | ----------------------------- | -------------------------------- | ------------------------- |
|
||||||
| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) |
|
| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) |
|
||||||
| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) |
|
| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) |
|
||||||
| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) |
|
| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) |
|
||||||
@ -1090,6 +1111,10 @@ New components will require additional env vars. All should be added to `.env.ex
|
|||||||
| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes |
|
| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes |
|
||||||
| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) |
|
| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) |
|
||||||
| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No |
|
| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No |
|
||||||
|
| **2.3 Webhooks** | `WEBHOOK_DELIVERY_TIMEOUT_MS` | `5000` | No (default: 5s) |
|
||||||
|
| **2.3 Webhooks** | `WEBHOOK_MAX_RETRIES` | `3` | No (default: 3) |
|
||||||
|
| **2.7 Sessions** | `SESSION_TTL_DAYS` | `90` | No (default: 90) |
|
||||||
|
| **2.7 Sessions** | `SESSION_CACHE_TTL_MS` | `30000` | No (default: 30s) |
|
||||||
| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) |
|
| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) |
|
||||||
|
|
||||||
**Secret management:** `SENDGRID_API_KEY`, `APNS_*`, and `FCM_*` should be added to Azure Key Vault as `lysnr-sendgrid-api-key`, `lysnr-apns-key-id`, etc. Update `LYSNR_SECRETS` in `@bytelyst/config` to include them.
|
**Secret management:** `SENDGRID_API_KEY`, `APNS_*`, and `FCM_*` should be added to Azure Key Vault as `lysnr-sendgrid-api-key`, `lysnr-apns-key-id`, etc. Update `LYSNR_SECRETS` in `@bytelyst/config` to include them.
|
||||||
@ -1099,7 +1124,7 @@ New components will require additional env vars. All should be added to `.env.ex
|
|||||||
## 6. Quick Reference — Where Things Live
|
## 6. Quick Reference — Where Things Live
|
||||||
|
|
||||||
| Component | Repo | Path |
|
| Component | Repo | Path |
|
||||||
| ------------------------ | ------------------------- | ----------------------------------------------- |
|
| ------------------------ | ----------------------------------- | ------------------------------------------------------ |
|
||||||
| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` |
|
| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` |
|
||||||
| Shared packages | `learning_ai_common_plat` | `packages/` |
|
| Shared packages | `learning_ai_common_plat` | `packages/` |
|
||||||
| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` |
|
| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` |
|
||||||
@ -1108,20 +1133,40 @@ New components will require additional env vars. All should be added to `.env.ex
|
|||||||
| Docker Compose | both repos | `docker-compose.yml` |
|
| Docker Compose | both repos | `docker-compose.yml` |
|
||||||
| Monitoring | `learning_ai_common_plat` | `services/monitoring/` |
|
| Monitoring | `learning_ai_common_plat` | `services/monitoring/` |
|
||||||
| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` |
|
| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` |
|
||||||
|
| MindLyst native app | `learning_multimodal_memory_agents` | `mindlyst-native/` (KMP + SwiftUI + Compose + Next.js) |
|
||||||
|
| MindLyst web | `learning_multimodal_memory_agents` | `mindlyst-native/web/` |
|
||||||
| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` |
|
| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` |
|
||||||
|
| Cosmos container defs | `learning_ai_common_plat` | `services/platform-service/src/lib/cosmos-init.ts` |
|
||||||
| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` |
|
| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` |
|
||||||
| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` |
|
| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` |
|
||||||
| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` |
|
| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Appendix: Component Dependency Graph
|
## Appendix A: Risks & Open Questions
|
||||||
|
|
||||||
|
| # | Topic | Risk / Question | Mitigation |
|
||||||
|
| --- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| 1 | **Leader election for jobs** | In-process tick loop with Cosmos lease — what happens during deploys? Two instances may briefly both hold leases. | Cosmos lease has a built-in TTL. Use 30s lease with 10s renewal. During deploy overlap, the old instance's lease expires before the new one acquires. Jobs must be idempotent. |
|
||||||
|
| 2 | **Email deliverability** | SendGrid requires domain verification (SPF/DKIM/DMARC). Without it, emails land in spam. | Set up `lysnrai.com` domain authentication in SendGrid before shipping §2.2. Budget 1–2 days for DNS propagation. |
|
||||||
|
| 3 | **Session validation latency** | Checking Cosmos on every request for session revocation adds ~5–10ms per request. | In-memory cache with 30s TTL (§2.7). Revocation is eventually consistent — acceptable trade-off for most apps. Document the 30s window. |
|
||||||
|
| 4 | **Cosmos container proliferation** | 28 existing + 19 new = 47 containers. Serverless tier has no per-container cost, but management complexity grows. | Group related containers by module. Document all containers in `cosmos-init.ts`. Consider container-per-module naming convention. |
|
||||||
|
| 5 | **Event bus ordering guarantees** | In-memory `EventEmitter` has no ordering guarantees across handlers. If audit must record before webhook fires, ordering matters. | Phase 1: Document that handlers run concurrently with no ordering. If ordering is needed, use handler priority weights or sequential mode. |
|
||||||
|
| 6 | **Push notification certificates** | APNs requires yearly certificate renewal. If it expires, all iOS push silently stops. | Add `apns-cert-expiry-check` to scheduled jobs (§2.1). Alert admin 30 days before expiry. |
|
||||||
|
| 7 | **Webhook abuse** | External subscribers could register slow endpoints that back up the delivery queue. | Per-subscription timeout (5s default), circuit breaker after 10 consecutive failures, auto-disable. |
|
||||||
|
| 8 | **Migration rollback** | Cosmos is schemaless — some migrations (e.g., partition key changes) are irreversible. | Mark migrations as `reversible: true/false`. Require manual approval for irreversible migrations. Always back up before running. |
|
||||||
|
| 9 | **MindLyst parity** | MindLyst web uses Cosmos directly (in-memory fallback). Shared components (email, sessions, webhooks) must work for MindLyst too, not just LysnrAI. | All new modules use `productId` for multi-product isolation. MindLyst can consume the same platform-service APIs. |
|
||||||
|
| 10 | **Priority conflicts** | Sprint plan assumes available engineering bandwidth. If telemetry or mobile work takes priority, these sprints slip. | Treat sprint assignments as relative ordering, not calendar commitments. Re-evaluate after each sprint. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix B: Component Dependency Graph
|
||||||
|
|
||||||
```
|
```
|
||||||
┌─────────────────────┐
|
┌─────────────────────┐
|
||||||
│ Event Bus (2.4) │
|
│ Event Bus (2.4) │
|
||||||
└─────────┬───────────┘
|
└─────────┬───────────┘
|
||||||
│ emits events to all subscribers
|
│ emits to subscribers
|
||||||
┌───────────┼───────────┼───────────┐
|
┌───────────┼───────────┼───────────┐
|
||||||
│ │ │ │
|
│ │ │ │
|
||||||
▼ ▼ ▼ ▼
|
▼ ▼ ▼ ▼
|
||||||
@ -1130,31 +1175,60 @@ New components will require additional env vars. All should be added to `.env.ex
|
|||||||
│ (2.2) │ │ (2.3) │ │ (existing)│ │ (2.13) │
|
│ (2.2) │ │ (2.3) │ │ (existing)│ │ (2.13) │
|
||||||
└─────┬─────┘ └───────────┘ └───────────┘ └───────────┘
|
└─────┬─────┘ └───────────┘ └───────────┘ └───────────┘
|
||||||
│
|
│
|
||||||
│ triggers
|
│ sends
|
||||||
▼
|
▼
|
||||||
┌───────────┐
|
┌───────────┐
|
||||||
│ Password │
|
│ Password │
|
||||||
│ Reset(2.5)│
|
│ Reset(2.5)│
|
||||||
└───────────┘
|
└───────────┘
|
||||||
|
|
||||||
|
┌───────────────┐──▶┌─────────────────┐ ┌─────────────────┐
|
||||||
|
│ Scheduled │ │ Analytics │ │ Blob Storage │
|
||||||
|
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (existing) │
|
||||||
|
└───────┬───────┘ └─────────────────┘ └────────┬────────┘
|
||||||
|
│ │
|
||||||
|
│ triggers on schedule ▲ writes exports
|
||||||
|
▼ │
|
||||||
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||||
│ Scheduled │──▶│ Analytics │ │ Data Export │
|
│ Trial Expiry │ │ Usage Reset │ │ Data Export │
|
||||||
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (2.9) │
|
│ (2.1 job) │ │ (2.1 job) │ │ (2.9) │
|
||||||
└───────┬───────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
│
|
|
||||||
│ triggers on schedule
|
|
||||||
▼
|
|
||||||
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ Trial Expiry │ │ Usage Reset │ │ Retention │
|
|
||||||
│ Check │ │ │ │ Cleanup (2.23) │
|
|
||||||
└───────────────┘ └─────────────────┘ └─────────────────┘
|
└───────────────┘ └─────────────────┘ └─────────────────┘
|
||||||
|
|
||||||
┌───────────────┐ ┌─────────────────┐
|
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||||
│ Billing │──▶│ Email/Push │
|
│ Billing │──▶│ Email/Push │ │ Retention │
|
||||||
│ Dunning(2.25) │ │ Delivery (2.2) │
|
│ Dunning(2.25) │ │ Delivery (2.2) │ │ Cleanup (2.23) │
|
||||||
└───────────────┘ └─────────────────┘
|
└───────────────┘ └─────────────────┘ └─────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Appendix C: Review Findings
|
||||||
|
|
||||||
|
Systematic review performed 2026-02-17. All issues below have been fixed inline.
|
||||||
|
|
||||||
|
| # | Severity | Section | Finding | Fix |
|
||||||
|
| --- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| 1 | **Bug** | §1.3 | Test count stale: said "158+ tests" — actual count is **621** (verified via `grep -c 'it(' *.test.ts`). | Updated to 621. |
|
||||||
|
| 2 | **Bug** | §1.1 | Endpoint column inconsistent: some modules said "CRUD" (vague, could be 4–8 routes), others had exact counts. | Replaced all "CRUD" with actual route counts. |
|
||||||
|
| 3 | **Bug** | §2.5 | Said "console-logged URLs for dev/testing" — violates project rule: never `console.log` in production code. | Changed to `req.log.info`. |
|
||||||
|
| 4 | **Bug** | §2.12 | `ExperimentDoc.targetingRules: {}` — meaningless empty object type. | Changed to `FlagTargetingRules` (reuse from flags module). |
|
||||||
|
| 5 | **Bug** | §2.3 | Webhook event `user.deleted` source said `auth.delete` — no such endpoint name. Actual route is `DELETE /auth/users/:id` (admin action). | Fixed source column. |
|
||||||
|
| 6 | **Bug** | §4 | `email_verifications` container (from §2.5) missing from Cosmos table. Only `password_reset_tokens` was listed. | Added `email_verifications` to §2.5 row. |
|
||||||
|
| 7 | **Bug** | §4 | Existing container count said "~25+" — actual is **27** (counted from `cosmos-init.ts`; `promos` uses Stripe API directly, no Cosmos container). | Updated to 27 with full container list. |
|
||||||
|
| 8 | **Bug** | §4 | Total new containers said "~17" — after adding `email_verifications` and `ip_rules`, count is **19**. | Updated. |
|
||||||
|
| 9 | **Gap** | §2.2 | No clarity on email template storage strategy. `renderer.ts` mentioned but not whether templates are Cosmos-stored or file-based. | Clarified: `repository.ts` now references `delivery_log + email_templates` containers. |
|
||||||
|
| 10 | **Gap** | §2.4 | No migration strategy from existing `lib/webhooks.ts` to new event bus pattern. | Added "Migration from existing `lib/webhooks.ts`" subsection with 3-phase plan. |
|
||||||
|
| 11 | **Gap** | §2.10 | Maintenance mode proposed extending `settings` module but didn't clarify storage location. Missing from §4 Cosmos table. | Added: stored as single document per product in existing `settings` container (no new container needed). |
|
||||||
|
| 12 | **Gap** | §2.11 | IP rules need persistence but no container was mentioned. Missing from §4 table. | Added `ip_rules` container (pk: `/productId`) to both §2.11 and §4 table. |
|
||||||
|
| 13 | **Gap** | §2.9 | Data Export didn't mention blob module dependency (exports written to blob storage). | Added explicit dependency note on `blob` module and `jobs` module for cleanup. |
|
||||||
|
| 14 | **Gap** | §5 | Missing env vars for webhooks (timeout, retries) and sessions (TTL, cache TTL). | Added 4 new env vars: `WEBHOOK_DELIVERY_TIMEOUT_MS`, `WEBHOOK_MAX_RETRIES`, `SESSION_TTL_DAYS`, `SESSION_CACHE_TTL_MS`. |
|
||||||
|
| 15 | **Gap** | §6 | Quick Reference missing MindLyst repo (`learning_multimodal_memory_agents`). Doc scope says "ByteLyst platform" which includes MindLyst. | Added MindLyst native app and web entries. Also added `cosmos-init.ts` path. |
|
||||||
|
| 16 | **Gap** | Appendix | Dependency graph incomplete: missing Jobs → Data Export connection, missing Blob → Data Export dependency, downstream jobs not labeled with section numbers. | Rewrote graph with all connections and section labels. |
|
||||||
|
| 17 | **Gap** | Overall | No "Risks & Open Questions" section — design docs should call out unknowns. | Added Appendix A with 10 risk items and mitigations. |
|
||||||
|
| 18 | **Gap** | TOC | Table of Contents didn't include Appendix sections. | Added Appendix A, B, C to TOC. |
|
||||||
|
| 19 | **Gap** | §2.5 | Password reset cross-referenced "§2.6" for sessions but sessions was renumbered to §2.7 in previous edit pass. | Fixed to §2.7 (caught in prior pass). |
|
||||||
|
| 20 | **Gap** | §1.5 | Infrastructure table was missing Swagger/OpenAPI (partially wired) and Prometheus metrics (partially enabled). | Added in prior pass — verified still present. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
_This document is a living brainstorm. Items will be promoted to dedicated design docs (like `CLIENT_TELEMETRY_DESIGN.md`) as they move into implementation._
|
_This document is a living brainstorm. Items will be promoted to dedicated design docs (like `CLIENT_TELEMETRY_DESIGN.md`) as they move into implementation._
|
||||||
|
|||||||
@ -252,3 +252,21 @@ export async function getCluster(id: string, pk: string): Promise<TelemetryError
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export async function updateCluster(
|
||||||
|
id: string,
|
||||||
|
pk: string,
|
||||||
|
updates: Partial<TelemetryErrorCluster>
|
||||||
|
): Promise<TelemetryErrorCluster | null> {
|
||||||
|
try {
|
||||||
|
const { resource: existing } = await clustersContainer()
|
||||||
|
.item(id, pk)
|
||||||
|
.read<TelemetryErrorCluster>();
|
||||||
|
if (!existing) return null;
|
||||||
|
const merged = { ...existing, ...updates };
|
||||||
|
const { resource } = await clustersContainer().item(id, pk).replace(merged);
|
||||||
|
return resource as unknown as TelemetryErrorCluster;
|
||||||
|
} catch {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -5,11 +5,13 @@
|
|||||||
* GET /telemetry/config — collection config for clients
|
* GET /telemetry/config — collection config for clients
|
||||||
* GET /telemetry/query — admin query
|
* GET /telemetry/query — admin query
|
||||||
* GET /telemetry/clusters — admin error clusters
|
* GET /telemetry/clusters — admin error clusters
|
||||||
|
* PATCH /telemetry/clusters/:id — resolve/ignore cluster (admin)
|
||||||
* GET /telemetry/policies — list policies (admin)
|
* GET /telemetry/policies — list policies (admin)
|
||||||
* POST /telemetry/policies — create policy (admin)
|
* POST /telemetry/policies — create policy (admin)
|
||||||
* PUT /telemetry/policies/:id — update policy (admin)
|
* PUT /telemetry/policies/:id — update policy (admin)
|
||||||
* DELETE /telemetry/policies/:id — delete policy (admin)
|
* DELETE /telemetry/policies/:id — delete policy (admin)
|
||||||
* DELETE /telemetry/user/:userId — GDPR erasure (admin)
|
* DELETE /telemetry/user/:userId — GDPR erasure (admin)
|
||||||
|
* GET /telemetry/metrics — ingestion metrics (admin)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import type { FastifyInstance } from 'fastify';
|
import type { FastifyInstance } from 'fastify';
|
||||||
@ -23,12 +25,16 @@ import {
|
|||||||
TelemetryIngestRequestSchema,
|
TelemetryIngestRequestSchema,
|
||||||
CreatePolicySchema,
|
CreatePolicySchema,
|
||||||
UpdatePolicySchema,
|
UpdatePolicySchema,
|
||||||
|
UpdateClusterSchema,
|
||||||
TelemetryQuerySchema,
|
TelemetryQuerySchema,
|
||||||
type TelemetryEventDoc,
|
type TelemetryEventDoc,
|
||||||
type TelemetryCollectionPolicyDoc,
|
type TelemetryCollectionPolicyDoc,
|
||||||
type TelemetryCollectionConfig,
|
type TelemetryCollectionConfig,
|
||||||
type TelemetryErrorCluster,
|
type TelemetryErrorCluster,
|
||||||
|
type TelemetryMetrics,
|
||||||
} from './types.js';
|
} from './types.js';
|
||||||
|
import * as auditRepo from '../audit/repository.js';
|
||||||
|
import type { AuditDoc } from '../audit/types.js';
|
||||||
|
|
||||||
// ─── Helpers ────────────────────────────────────────────────────────
|
// ─── Helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@ -69,6 +75,96 @@ const _cleanupTimer = globalThis.setInterval(() => {
|
|||||||
}, 300_000);
|
}, 300_000);
|
||||||
if (typeof _cleanupTimer === 'object' && 'unref' in _cleanupTimer) _cleanupTimer.unref();
|
if (typeof _cleanupTimer === 'object' && 'unref' in _cleanupTimer) _cleanupTimer.unref();
|
||||||
|
|
||||||
|
// ─── Ingestion metrics (in-memory counters) ─────────────────────
|
||||||
|
|
||||||
|
const metrics: TelemetryMetrics = {
|
||||||
|
totalEventsIngested: 0,
|
||||||
|
totalEventsRejected: 0,
|
||||||
|
totalBatchRequests: 0,
|
||||||
|
totalRateLimited: 0,
|
||||||
|
totalPiiBlocked: 0,
|
||||||
|
totalDuplicatesDropped: 0,
|
||||||
|
uptimeSince: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// ─── Webhook alerting ───────────────────────────────────────────
|
||||||
|
|
||||||
|
const ALERT_WEBHOOK_URL = process.env.TELEMETRY_ALERT_WEBHOOK_URL ?? '';
|
||||||
|
const ALERT_SEVERITY_THRESHOLD = (process.env.TELEMETRY_ALERT_SEVERITY_THRESHOLD ?? 'error') as
|
||||||
|
| 'warn'
|
||||||
|
| 'error'
|
||||||
|
| 'fatal';
|
||||||
|
const ALERT_COUNT_THRESHOLD = parseInt(process.env.TELEMETRY_ALERT_COUNT_THRESHOLD ?? '10', 10);
|
||||||
|
|
||||||
|
async function sendClusterAlert(
|
||||||
|
cluster: TelemetryErrorCluster,
|
||||||
|
previousSeverity: string
|
||||||
|
): Promise<void> {
|
||||||
|
if (!ALERT_WEBHOOK_URL) return;
|
||||||
|
|
||||||
|
const severityOrder: Record<string, number> = { warn: 0, error: 1, fatal: 2 };
|
||||||
|
const thresholdNum = severityOrder[ALERT_SEVERITY_THRESHOLD] ?? 1;
|
||||||
|
const clusterNum = severityOrder[cluster.severity] ?? 0;
|
||||||
|
|
||||||
|
// Alert if severity meets threshold OR count exceeds threshold
|
||||||
|
if (clusterNum < thresholdNum && cluster.totalCount < ALERT_COUNT_THRESHOLD) return;
|
||||||
|
|
||||||
|
const payload = {
|
||||||
|
text: `Telemetry Alert: Cluster ${cluster.fingerprint} escalated`,
|
||||||
|
blocks: [
|
||||||
|
{
|
||||||
|
type: 'section',
|
||||||
|
text: {
|
||||||
|
type: 'mrkdwn',
|
||||||
|
value: [
|
||||||
|
`*Telemetry Error Cluster Escalated*`,
|
||||||
|
`*Event:* ${cluster.module}/${cluster.eventName}`,
|
||||||
|
`*Platform:* ${cluster.platform} (${cluster.channel})`,
|
||||||
|
`*Severity:* ${previousSeverity} → *${cluster.severity}*`,
|
||||||
|
`*Total Count:* ${cluster.totalCount}`,
|
||||||
|
`*Affected Users:* ${cluster.affectedUserIds.length + cluster.affectedInstallIds.length}`,
|
||||||
|
`*First Seen:* ${cluster.firstSeenAt}`,
|
||||||
|
`*Last Seen:* ${cluster.lastSeenAt}`,
|
||||||
|
cluster.sampleMessage ? `*Sample:* \`${cluster.sampleMessage.slice(0, 200)}\`` : '',
|
||||||
|
]
|
||||||
|
.filter(Boolean)
|
||||||
|
.join('\n'),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
await fetch(ALERT_WEBHOOK_URL, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify(payload),
|
||||||
|
});
|
||||||
|
} catch {
|
||||||
|
// Best-effort — don't fail ingestion on alert failure
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Audit helper ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
function emitAudit(
|
||||||
|
productId: string,
|
||||||
|
userId: string,
|
||||||
|
action: string,
|
||||||
|
details: Record<string, unknown>
|
||||||
|
): void {
|
||||||
|
const doc: AuditDoc = {
|
||||||
|
id: `aud_${randomUUID()}`,
|
||||||
|
productId,
|
||||||
|
userId,
|
||||||
|
action,
|
||||||
|
category: 'telemetry',
|
||||||
|
details,
|
||||||
|
createdAt: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
auditRepo.create(doc).catch(() => {});
|
||||||
|
}
|
||||||
|
|
||||||
/** PII patterns — reject events containing these. */
|
/** PII patterns — reject events containing these. */
|
||||||
const PII_PATTERNS = [
|
const PII_PATTERNS = [
|
||||||
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i, // email
|
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i, // email
|
||||||
@ -300,14 +396,23 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Escalate severity
|
// Escalate severity + alert on escalation
|
||||||
const severityOrder = { warn: 0, error: 1, fatal: 2 };
|
const severityOrder = { warn: 0, error: 1, fatal: 2 };
|
||||||
const eventSev = event.eventType as 'warn' | 'error' | 'fatal';
|
const eventSev = event.eventType as 'warn' | 'error' | 'fatal';
|
||||||
|
const previousSeverity = existing.severity;
|
||||||
if ((severityOrder[eventSev] ?? 0) > (severityOrder[existing.severity] ?? 0)) {
|
if ((severityOrder[eventSev] ?? 0) > (severityOrder[existing.severity] ?? 0)) {
|
||||||
existing.severity = eventSev;
|
existing.severity = eventSev;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default status for legacy clusters without status field
|
||||||
|
if (!existing.status) existing.status = 'open';
|
||||||
|
|
||||||
await repo.upsertCluster(existing);
|
await repo.upsertCluster(existing);
|
||||||
|
|
||||||
|
// Fire webhook alert on severity escalation
|
||||||
|
if (existing.severity !== previousSeverity) {
|
||||||
|
sendClusterAlert(existing, previousSeverity).catch(() => {});
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
const newCluster: TelemetryErrorCluster = {
|
const newCluster: TelemetryErrorCluster = {
|
||||||
id: clusterId,
|
id: clusterId,
|
||||||
@ -336,6 +441,7 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
|
|||||||
sampleErrorCode: event.errorCode,
|
sampleErrorCode: event.errorCode,
|
||||||
sampleMessage: event.message,
|
sampleMessage: event.message,
|
||||||
severity: event.eventType as 'warn' | 'error' | 'fatal',
|
severity: event.eventType as 'warn' | 'error' | 'fatal',
|
||||||
|
status: 'open',
|
||||||
ttl: DEFAULT_CLUSTER_TTL_DAYS * 86400,
|
ttl: DEFAULT_CLUSTER_TTL_DAYS * 86400,
|
||||||
};
|
};
|
||||||
await repo.upsertCluster(newCluster);
|
await repo.upsertCluster(newCluster);
|
||||||
@ -367,6 +473,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
// Rate limiting per installId
|
// Rate limiting per installId
|
||||||
const rateLimitKey = installToken || req.jwtPayload?.sub || 'unknown';
|
const rateLimitKey = installToken || req.jwtPayload?.sub || 'unknown';
|
||||||
if (!checkRateLimit(rateLimitKey, events.length)) {
|
if (!checkRateLimit(rateLimitKey, events.length)) {
|
||||||
|
metrics.totalRateLimited++;
|
||||||
reply.code(429);
|
reply.code(429);
|
||||||
return {
|
return {
|
||||||
accepted: 0,
|
accepted: 0,
|
||||||
@ -383,11 +490,15 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
seenIds.add(e.id);
|
seenIds.add(e.id);
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
|
const dupCount = events.length - dedupedEvents.length;
|
||||||
|
metrics.totalDuplicatesDropped += dupCount;
|
||||||
|
metrics.totalBatchRequests++;
|
||||||
|
|
||||||
const now = new Date().toISOString();
|
const now = new Date().toISOString();
|
||||||
const ttl = DEFAULT_EVENT_TTL_DAYS * 86400;
|
const ttl = DEFAULT_EVENT_TTL_DAYS * 86400;
|
||||||
|
|
||||||
let accepted = 0;
|
let accepted = 0;
|
||||||
let rejected = events.length - dedupedEvents.length; // duplicates
|
let rejected = dupCount; // duplicates
|
||||||
const errors: Array<{ index: number; reason: string }> = [];
|
const errors: Array<{ index: number; reason: string }> = [];
|
||||||
const docsToInsert: TelemetryEventDoc[] = [];
|
const docsToInsert: TelemetryEventDoc[] = [];
|
||||||
|
|
||||||
@ -406,6 +517,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
if (fieldsToScan.some(f => containsPII(f!))) {
|
if (fieldsToScan.some(f => containsPII(f!))) {
|
||||||
errors.push({ index: i, reason: 'PII detected' });
|
errors.push({ index: i, reason: 'PII detected' });
|
||||||
rejected++;
|
rejected++;
|
||||||
|
metrics.totalPiiBlocked++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -435,6 +547,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
metrics.totalEventsIngested += accepted;
|
||||||
|
metrics.totalEventsRejected += rejected;
|
||||||
|
|
||||||
reply.code(accepted > 0 ? 200 : 400);
|
reply.code(accepted > 0 ? 200 : 400);
|
||||||
return {
|
return {
|
||||||
accepted,
|
accepted,
|
||||||
@ -549,6 +664,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const created = await repo.createPolicy(doc);
|
const created = await repo.createPolicy(doc);
|
||||||
|
emitAudit(productId, doc.createdBy, 'telemetry.policy.created', {
|
||||||
|
policyId: doc.id,
|
||||||
|
name: doc.name,
|
||||||
|
});
|
||||||
reply.code(201);
|
reply.code(201);
|
||||||
return created;
|
return created;
|
||||||
});
|
});
|
||||||
@ -575,6 +694,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
updates as Partial<TelemetryCollectionPolicyDoc>
|
updates as Partial<TelemetryCollectionPolicyDoc>
|
||||||
);
|
);
|
||||||
if (!updated) throw new NotFoundError('Policy not found');
|
if (!updated) throw new NotFoundError('Policy not found');
|
||||||
|
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.updated', {
|
||||||
|
policyId: id,
|
||||||
|
updates: parsed.data,
|
||||||
|
});
|
||||||
return updated;
|
return updated;
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -585,6 +708,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
const productId = getRequestProductId(req);
|
const productId = getRequestProductId(req);
|
||||||
const deleted = await repo.deletePolicy(id, productId);
|
const deleted = await repo.deletePolicy(id, productId);
|
||||||
if (!deleted) throw new NotFoundError('Policy not found');
|
if (!deleted) throw new NotFoundError('Policy not found');
|
||||||
|
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.deleted', {
|
||||||
|
policyId: id,
|
||||||
|
});
|
||||||
return { success: true };
|
return { success: true };
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -594,6 +720,53 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
|||||||
const { userId } = req.params as { userId: string };
|
const { userId } = req.params as { userId: string };
|
||||||
const productId = getRequestProductId(req);
|
const productId = getRequestProductId(req);
|
||||||
const eventsDeleted = await repo.deleteEventsByUserId(productId, userId);
|
const eventsDeleted = await repo.deleteEventsByUserId(productId, userId);
|
||||||
|
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.gdpr.erasure', {
|
||||||
|
targetUserId: userId,
|
||||||
|
eventsDeleted,
|
||||||
|
});
|
||||||
return { userId, eventsDeleted, clustersUpdated: 0 };
|
return { userId, eventsDeleted, clustersUpdated: 0 };
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ── Admin: resolve/ignore cluster ───────────────────────────
|
||||||
|
app.patch('/telemetry/clusters/:id', async req => {
|
||||||
|
requireAdmin(req);
|
||||||
|
const { id } = req.params as { id: string };
|
||||||
|
const { pk } = req.query as { pk: string };
|
||||||
|
if (!pk) throw new BadRequestError('pk query parameter required');
|
||||||
|
|
||||||
|
const parsed = UpdateClusterSchema.safeParse(req.body);
|
||||||
|
if (!parsed.success) {
|
||||||
|
throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
|
||||||
|
}
|
||||||
|
|
||||||
|
const updates: Partial<TelemetryErrorCluster> = {
|
||||||
|
status: parsed.data.status,
|
||||||
|
};
|
||||||
|
if (parsed.data.status === 'resolved' || parsed.data.status === 'ignored') {
|
||||||
|
updates.resolvedBy = req.jwtPayload?.sub ?? 'unknown';
|
||||||
|
updates.resolvedAt = new Date().toISOString();
|
||||||
|
}
|
||||||
|
|
||||||
|
const updated = await repo.updateCluster(id, pk, updates);
|
||||||
|
if (!updated) throw new NotFoundError('Cluster not found');
|
||||||
|
|
||||||
|
const productId = getRequestProductId(req);
|
||||||
|
emitAudit(
|
||||||
|
productId,
|
||||||
|
req.jwtPayload?.sub ?? 'unknown',
|
||||||
|
`telemetry.cluster.${parsed.data.status}`,
|
||||||
|
{
|
||||||
|
clusterId: id,
|
||||||
|
pk,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
return updated;
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── Admin: ingestion metrics ──────────────────────────────
|
||||||
|
app.get('/telemetry/metrics', async req => {
|
||||||
|
requireAdmin(req);
|
||||||
|
return metrics;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@ -201,6 +201,8 @@ export interface TelemetryCollectionConfig {
|
|||||||
|
|
||||||
// ─── Error Cluster ──────────────────────────────────────────────────
|
// ─── Error Cluster ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export const ClusterStatusEnum = z.enum(['open', 'resolved', 'ignored']);
|
||||||
|
|
||||||
export interface TelemetryErrorCluster {
|
export interface TelemetryErrorCluster {
|
||||||
id: string; // ${fingerprint}:${yyyyMM}
|
id: string; // ${fingerprint}:${yyyyMM}
|
||||||
pk: string; // ${productId}:${platform}:${module}
|
pk: string; // ${productId}:${platform}:${module}
|
||||||
@ -230,9 +232,16 @@ export interface TelemetryErrorCluster {
|
|||||||
sampleErrorCode?: string;
|
sampleErrorCode?: string;
|
||||||
sampleMessage?: string;
|
sampleMessage?: string;
|
||||||
severity: 'warn' | 'error' | 'fatal';
|
severity: 'warn' | 'error' | 'fatal';
|
||||||
|
status: 'open' | 'resolved' | 'ignored';
|
||||||
|
resolvedBy?: string;
|
||||||
|
resolvedAt?: string;
|
||||||
ttl: number;
|
ttl: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export const UpdateClusterSchema = z.object({
|
||||||
|
status: ClusterStatusEnum,
|
||||||
|
});
|
||||||
|
|
||||||
// ─── Query / Admin types ────────────────────────────────────────────
|
// ─── Query / Admin types ────────────────────────────────────────────
|
||||||
|
|
||||||
export const TelemetryQuerySchema = z.object({
|
export const TelemetryQuerySchema = z.object({
|
||||||
@ -253,3 +262,15 @@ export const TelemetryQuerySchema = z.object({
|
|||||||
});
|
});
|
||||||
|
|
||||||
export type TelemetryQueryInput = z.infer<typeof TelemetryQuerySchema>;
|
export type TelemetryQueryInput = z.infer<typeof TelemetryQuerySchema>;
|
||||||
|
|
||||||
|
// ─── Ingestion Metrics (in-memory counters) ─────────────────────────
|
||||||
|
|
||||||
|
export interface TelemetryMetrics {
|
||||||
|
totalEventsIngested: number;
|
||||||
|
totalEventsRejected: number;
|
||||||
|
totalBatchRequests: number;
|
||||||
|
totalRateLimited: number;
|
||||||
|
totalPiiBlocked: number;
|
||||||
|
totalDuplicatesDropped: number;
|
||||||
|
uptimeSince: string;
|
||||||
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user