docs: update documentation
This commit is contained in:
parent
856788c386
commit
80a4459f81
@ -20,6 +20,10 @@
|
||||
5. [New Environment Variables](#5-new-environment-variables)
|
||||
6. [Quick Reference — Where Things Live](#6-quick-reference--where-things-live)
|
||||
|
||||
- [Appendix A: Risks & Open Questions](#appendix-a-risks--open-questions)
|
||||
- [Appendix B: Component Dependency Graph](#appendix-b-component-dependency-graph)
|
||||
- [Appendix C: Review Findings](#appendix-c-review-findings)
|
||||
|
||||
---
|
||||
|
||||
## 1. Current Inventory
|
||||
@ -29,23 +33,23 @@
|
||||
| Category | Module | Endpoints | Description |
|
||||
| ------------ | --------------- | --------- | --------------------------------------------------------------------------------------------------- |
|
||||
| **Identity** | `auth` | 11 routes | Login, register, refresh, SSO, profile, admin user CRUD |
|
||||
| **Identity** | `tokens` | CRUD | API token management |
|
||||
| **Identity** | `licenses` | CRUD | License key generation, activation, device binding |
|
||||
| **Billing** | `subscriptions` | CRUD | Plan management, trial tracking, period management |
|
||||
| **Billing** | `stripe` | Webhooks | Inbound Stripe webhook processing |
|
||||
| **Billing** | `plans` | CRUD | Plan definitions (free, pro, enterprise) |
|
||||
| **Billing** | `usage` | CRUD | Usage tracking and quota enforcement |
|
||||
| **Billing** | `promos` | CRUD | Promo code creation, validation, redemption |
|
||||
| **Growth** | `invitations` | CRUD | Invitation code generation, redemption, tracking |
|
||||
| **Growth** | `referrals` | CRUD | Referral link tracking, status transitions |
|
||||
| **Identity** | `tokens` | 5 routes | API token management (CRUD + validate) |
|
||||
| **Identity** | `licenses` | 6 routes | License key generation, activation, device binding, validate |
|
||||
| **Billing** | `subscriptions` | 5 routes | Plan management, trial tracking, period management |
|
||||
| **Billing** | `stripe` | 2 routes | Inbound Stripe webhook + portal session |
|
||||
| **Billing** | `plans` | 4 routes | Plan definitions (free, pro, enterprise) |
|
||||
| **Billing** | `usage` | 4 routes | Usage tracking and quota enforcement |
|
||||
| **Billing** | `promos` | 5 routes | Promo code creation, validation, redemption |
|
||||
| **Growth** | `invitations` | 5 routes | Invitation code generation, redemption, tracking |
|
||||
| **Growth** | `referrals` | 5 routes | Referral link tracking, status transitions |
|
||||
| **Growth** | `waitlist` | 12 routes | Pre-launch signups, position tracking, admin batch invite, CSV export |
|
||||
| **Growth** | `public` | 5 routes | Public roadmap, community voting, feature submissions |
|
||||
| **Content** | `items` | CRUD | Tracker items (bugs, features, tasks) |
|
||||
| **Content** | `comments` | CRUD | Threaded comments on items |
|
||||
| **Content** | `votes` | CRUD | User votes on items and comments |
|
||||
| **Content** | `items` | 5 routes | Tracker items (bugs, features, tasks) |
|
||||
| **Content** | `comments` | 4 routes | Threaded comments on items |
|
||||
| **Content** | `votes` | 3 routes | User votes on items and comments |
|
||||
| **Content** | `memory` | 5 routes | Memory items — create, reassign, patch, delete |
|
||||
| **Ops** | `audit` | Query | Audit log recording and admin queries |
|
||||
| **Ops** | `flags` | CRUD | Feature flags with FNV-1a deterministic rollout |
|
||||
| **Ops** | `flags` | 5 routes | Feature flags with FNV-1a deterministic rollout |
|
||||
| **Ops** | `telemetry` | 9 routes | Client event ingestion, error clustering, collection policies, GDPR erasure |
|
||||
| **Ops** | `notifications` | 5 routes | Device registration, notification preferences |
|
||||
| **Ops** | `settings` | 6 routes | User/device settings, kill switch |
|
||||
@ -74,11 +78,11 @@
|
||||
|
||||
### 1.3 Services
|
||||
|
||||
| Service | Port | Description |
|
||||
| ---------------------- | ---- | ----------------------------------------------------- |
|
||||
| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 158+ tests) |
|
||||
| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar |
|
||||
| **monitoring** | 4004 | Health-check aggregator (all services) |
|
||||
| Service | Port | Description |
|
||||
| ---------------------- | ---- | ---------------------------------------------------- |
|
||||
| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 621 tests) |
|
||||
| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar |
|
||||
| **monitoring** | 4004 | Health-check aggregator (all services) |
|
||||
|
||||
### 1.4 Dashboards
|
||||
|
||||
@ -185,8 +189,8 @@ platform-service/src/modules/delivery/
|
||||
│ ├── push-apns.ts — Apple Push Notification Service
|
||||
│ ├── push-fcm.ts — Firebase Cloud Messaging
|
||||
│ └── sms.ts — Twilio/Azure Communication Services (future)
|
||||
├── renderer.ts — Template rendering (Handlebars/Mustache for email)
|
||||
├── repository.ts — delivery_log container (track sent/failed/bounced)
|
||||
├── renderer.ts — Template rendering (Handlebars for email bodies)
|
||||
├── repository.ts — delivery_log + email_templates containers
|
||||
├── dispatcher.ts — Route delivery request to correct channel(s) based on prefs
|
||||
└── routes.ts — Admin: send test, view delivery log, manage templates
|
||||
```
|
||||
@ -244,21 +248,21 @@ platform-service/src/modules/webhooks/
|
||||
|
||||
**Event catalog (subscribe to any combination):**
|
||||
|
||||
| Event | Payload | Source |
|
||||
| ----------------------- | ---------------------------------------------- | --------------------------- |
|
||||
| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` |
|
||||
| `user.deleted` | `{ userId }` | `auth.delete` |
|
||||
| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook |
|
||||
| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook |
|
||||
| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe |
|
||||
| `payment.succeeded` | `{ invoiceId, amount, userId }` | Stripe webhook |
|
||||
| `payment.failed` | `{ invoiceId, amount, userId, retryCount }` | Stripe webhook |
|
||||
| `invitation.redeemed` | `{ invitationId, userId }` | Invitation module |
|
||||
| `referral.completed` | `{ referralId, referrerId, referredId }` | Referral module |
|
||||
| `waitlist.joined` | `{ email, position }` | Waitlist module |
|
||||
| `flag.toggled` | `{ flagId, enabled, percentage }` | Flags module |
|
||||
| `license.activated` | `{ licenseId, userId, deviceId }` | License module |
|
||||
| `license.expired` | `{ licenseId, userId }` | Jobs: license-expiry-check |
|
||||
| Event | Payload | Source |
|
||||
| ----------------------- | ---------------------------------------------- | ------------------------------- |
|
||||
| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` |
|
||||
| `user.deleted` | `{ userId }` | Admin: `DELETE /auth/users/:id` |
|
||||
| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook |
|
||||
| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook |
|
||||
| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe |
|
||||
| `payment.succeeded` | `{ invoiceId, amount, userId }` | Stripe webhook |
|
||||
| `payment.failed` | `{ invoiceId, amount, userId, retryCount }` | Stripe webhook |
|
||||
| `invitation.redeemed` | `{ invitationId, userId }` | Invitation module |
|
||||
| `referral.completed` | `{ referralId, referrerId, referredId }` | Referral module |
|
||||
| `waitlist.joined` | `{ email, position }` | Waitlist module |
|
||||
| `flag.toggled` | `{ flagId, enabled, percentage }` | Flags module |
|
||||
| `license.activated` | `{ licenseId, userId, deviceId }` | License module |
|
||||
| `license.expired` | `{ licenseId, userId }` | Jobs: license-expiry-check |
|
||||
|
||||
**Security:**
|
||||
|
||||
@ -334,6 +338,13 @@ const PlatformEvents = {
|
||||
} as const;
|
||||
```
|
||||
|
||||
**Migration from existing `lib/webhooks.ts`:**
|
||||
|
||||
- Existing `dispatchInvitationRedeemed()`, `dispatchReferralStatusChanged()`, `dispatchWaitlistJoined()` become event bus subscribers
|
||||
- Phase 1: Register existing webhooks.ts functions as handlers on the bus
|
||||
- Phase 2: Replace inline dispatch calls in routes with `bus.emit()`
|
||||
- Phase 3: Remove `lib/webhooks.ts` once all callers migrated
|
||||
|
||||
**Benefits:**
|
||||
|
||||
- Audit logging becomes a subscriber, not inline code
|
||||
@ -389,7 +400,7 @@ interface PasswordResetToken {
|
||||
|
||||
- `password_reset_tokens` (pk: `/productId`) — short-lived, TTL 24h auto-expiry
|
||||
|
||||
**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with console-logged URLs for dev/testing.
|
||||
**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with `req.log.info`-logged URLs for dev/testing (never `console.log`).
|
||||
|
||||
---
|
||||
|
||||
@ -556,9 +567,11 @@ platform-service/src/modules/exports/
|
||||
**Flow:**
|
||||
|
||||
1. Admin POST `/api/exports` → `{ type: 'users', format: 'csv', filters: { plan: 'free' } }`
|
||||
2. Background job runs query, writes result to blob storage
|
||||
2. Background job runs query, writes result to blob storage (via existing `blob` module)
|
||||
3. Job status updates: `pending` → `processing` → `ready` / `failed`
|
||||
4. Admin downloads from signed blob URL
|
||||
4. Admin downloads from signed blob URL (SAS token via `@bytelyst/blob`)
|
||||
|
||||
**Dependencies:** `blob` module (existing) for storage, `jobs` module (§2.1) for auto-cleanup of expired exports.
|
||||
|
||||
**Supported exports:**
|
||||
|
||||
@ -629,6 +642,8 @@ interface MaintenanceConfig {
|
||||
- Schedule builder with start/end date pickers
|
||||
- Bypass IP whitelist management
|
||||
|
||||
**Storage:** Maintenance config is a single document per product in the existing `settings` container (field: `maintenanceConfig`). No new Cosmos container needed.
|
||||
|
||||
---
|
||||
|
||||
#### 2.11 Rate Limit Dashboard & IP Allow/Deny Lists
|
||||
@ -678,6 +693,11 @@ interface IPRule {
|
||||
- IP rules management (allow/deny with expiry)
|
||||
- Per-user rate limit override
|
||||
|
||||
**Cosmos container:**
|
||||
|
||||
- `ip_rules` (pk: `/productId`) — persistent IP allow/deny rules
|
||||
- Rate limit stats remain in-memory (ephemeral); no persistence needed for counters
|
||||
|
||||
---
|
||||
|
||||
### P2 — Product Intelligence
|
||||
@ -715,7 +735,7 @@ interface ExperimentDoc {
|
||||
hypothesis: string;
|
||||
status: 'draft' | 'running' | 'paused' | 'concluded';
|
||||
variants: Variant[]; // [{id: 'control', weight: 50}, {id: 'treatment', weight: 50}]
|
||||
targetingRules: {}; // Same as flag targeting
|
||||
targetingRules: FlagTargetingRules; // Reuse from flags module (platforms, versions, percentage)
|
||||
primaryMetric: string; // e.g., 'dictation_completed_rate'
|
||||
secondaryMetrics: string[];
|
||||
startedAt?: string;
|
||||
@ -1046,25 +1066,26 @@ This is a major architectural expansion. Defer until enterprise tier is validate
|
||||
|
||||
Each new component introduces Cosmos containers. Cosmos DB Serverless charges per RU consumed + storage, so idle containers cost only storage (~$0.25/GB/month).
|
||||
|
||||
| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU |
|
||||
| ---------------------- | --------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- |
|
||||
| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) |
|
||||
| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU |
|
||||
| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU |
|
||||
| **2.5 Password Reset** | `password_reset_tokens` | `/productId` | 24h auto | ~10 RU |
|
||||
| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU |
|
||||
| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) |
|
||||
| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) |
|
||||
| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU |
|
||||
| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU |
|
||||
| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) |
|
||||
| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU |
|
||||
| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU |
|
||||
| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) |
|
||||
| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU |
|
||||
| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU |
|
||||
| ---------------------- | ---------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- |
|
||||
| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) |
|
||||
| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU |
|
||||
| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU |
|
||||
| **2.5 Password Reset** | `password_reset_tokens`, `email_verifications` | `/productId`, `/productId` | 24h auto | ~10 RU |
|
||||
| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU |
|
||||
| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) |
|
||||
| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) |
|
||||
| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU |
|
||||
| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU |
|
||||
| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) |
|
||||
| **2.11 IP Rules** | `ip_rules` | `/productId` | None (manual) | ~10 RU |
|
||||
| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU |
|
||||
| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU |
|
||||
| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) |
|
||||
| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU |
|
||||
|
||||
**Total new containers:** ~17 (across all phases)
|
||||
**Existing containers:** ~25+ (across platform-service + dashboards)
|
||||
**Total new containers:** ~19 (across all phases)
|
||||
**Existing containers:** 27 (defined in `cosmos-init.ts`: products, users, settings, devices, notification_prefs, audit_log, feature_flags, invitation_codes, referrals, subscriptions, payments, licenses, plans, usage_daily, api_tokens, tracker_items, comments, votes, themes, waitlist, memory_items, daily_briefs, reflections, brain_insights, telemetry_events, telemetry_error_clusters, telemetry_collection_policies). Note: `promos` module uses Stripe API directly — no Cosmos container.
|
||||
**Cost impact:** Minimal for Serverless tier — idle containers only consume storage. Active containers during job runs add burst RU.
|
||||
|
||||
**Recommendation:** Register all new containers in `cosmos-init.ts` alongside existing ones. Use TTL liberally for transient data (tokens, deliveries, job runs) to keep storage bounded.
|
||||
@ -1075,22 +1096,26 @@ Each new component introduces Cosmos containers. Cosmos DB Serverless charges pe
|
||||
|
||||
New components will require additional env vars. All should be added to `.env.example` files in both repos and documented.
|
||||
|
||||
| Component | Variable | Example | Required |
|
||||
| -------------------- | -------------------------- | -------------------------------- | ------------------------- |
|
||||
| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) |
|
||||
| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) |
|
||||
| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) |
|
||||
| **2.2 Email** | `EMAIL_FROM_ADDRESS` | `noreply@lysnrai.com` | Yes |
|
||||
| **2.2 Email** | `EMAIL_FROM_NAME` | `LysnrAI` | No |
|
||||
| **2.2 Push** | `APNS_KEY_ID` | `ABC123` | Yes (for iOS push) |
|
||||
| **2.2 Push** | `APNS_TEAM_ID` | `748N7QPX7J` | Yes |
|
||||
| **2.2 Push** | `APNS_KEY_PATH` | `./certs/AuthKey.p8` | Yes |
|
||||
| **2.2 Push** | `FCM_SERVICE_ACCOUNT_JSON` | `{...}` | Yes (for Android push) |
|
||||
| **2.5 Auth** | `PASSWORD_RESET_URL_BASE` | `https://app.lysnrai.com/reset` | Yes |
|
||||
| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes |
|
||||
| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) |
|
||||
| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No |
|
||||
| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) |
|
||||
| Component | Variable | Example | Required |
|
||||
| -------------------- | ----------------------------- | -------------------------------- | ------------------------- |
|
||||
| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) |
|
||||
| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) |
|
||||
| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) |
|
||||
| **2.2 Email** | `EMAIL_FROM_ADDRESS` | `noreply@lysnrai.com` | Yes |
|
||||
| **2.2 Email** | `EMAIL_FROM_NAME` | `LysnrAI` | No |
|
||||
| **2.2 Push** | `APNS_KEY_ID` | `ABC123` | Yes (for iOS push) |
|
||||
| **2.2 Push** | `APNS_TEAM_ID` | `748N7QPX7J` | Yes |
|
||||
| **2.2 Push** | `APNS_KEY_PATH` | `./certs/AuthKey.p8` | Yes |
|
||||
| **2.2 Push** | `FCM_SERVICE_ACCOUNT_JSON` | `{...}` | Yes (for Android push) |
|
||||
| **2.5 Auth** | `PASSWORD_RESET_URL_BASE` | `https://app.lysnrai.com/reset` | Yes |
|
||||
| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes |
|
||||
| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) |
|
||||
| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No |
|
||||
| **2.3 Webhooks** | `WEBHOOK_DELIVERY_TIMEOUT_MS` | `5000` | No (default: 5s) |
|
||||
| **2.3 Webhooks** | `WEBHOOK_MAX_RETRIES` | `3` | No (default: 3) |
|
||||
| **2.7 Sessions** | `SESSION_TTL_DAYS` | `90` | No (default: 90) |
|
||||
| **2.7 Sessions** | `SESSION_CACHE_TTL_MS` | `30000` | No (default: 30s) |
|
||||
| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) |
|
||||
|
||||
**Secret management:** `SENDGRID_API_KEY`, `APNS_*`, and `FCM_*` should be added to Azure Key Vault as `lysnr-sendgrid-api-key`, `lysnr-apns-key-id`, etc. Update `LYSNR_SECRETS` in `@bytelyst/config` to include them.
|
||||
|
||||
@ -1098,30 +1123,50 @@ New components will require additional env vars. All should be added to `.env.ex
|
||||
|
||||
## 6. Quick Reference — Where Things Live
|
||||
|
||||
| Component | Repo | Path |
|
||||
| ------------------------ | ------------------------- | ----------------------------------------------- |
|
||||
| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` |
|
||||
| Shared packages | `learning_ai_common_plat` | `packages/` |
|
||||
| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` |
|
||||
| User dashboard | `learning_voice_ai_agent` | `user-dashboard-web/` |
|
||||
| Tracker dashboard | `learning_voice_ai_agent` | `tracker-dashboard-web/` |
|
||||
| Docker Compose | both repos | `docker-compose.yml` |
|
||||
| Monitoring | `learning_ai_common_plat` | `services/monitoring/` |
|
||||
| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` |
|
||||
| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` |
|
||||
| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` |
|
||||
| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` |
|
||||
| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` |
|
||||
| Component | Repo | Path |
|
||||
| ------------------------ | ----------------------------------- | ------------------------------------------------------ |
|
||||
| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` |
|
||||
| Shared packages | `learning_ai_common_plat` | `packages/` |
|
||||
| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` |
|
||||
| User dashboard | `learning_voice_ai_agent` | `user-dashboard-web/` |
|
||||
| Tracker dashboard | `learning_voice_ai_agent` | `tracker-dashboard-web/` |
|
||||
| Docker Compose | both repos | `docker-compose.yml` |
|
||||
| Monitoring | `learning_ai_common_plat` | `services/monitoring/` |
|
||||
| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` |
|
||||
| MindLyst native app | `learning_multimodal_memory_agents` | `mindlyst-native/` (KMP + SwiftUI + Compose + Next.js) |
|
||||
| MindLyst web | `learning_multimodal_memory_agents` | `mindlyst-native/web/` |
|
||||
| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` |
|
||||
| Cosmos container defs | `learning_ai_common_plat` | `services/platform-service/src/lib/cosmos-init.ts` |
|
||||
| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` |
|
||||
| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` |
|
||||
| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` |
|
||||
|
||||
---
|
||||
|
||||
## Appendix: Component Dependency Graph
|
||||
## Appendix A: Risks & Open Questions
|
||||
|
||||
| # | Topic | Risk / Question | Mitigation |
|
||||
| --- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| 1 | **Leader election for jobs** | In-process tick loop with Cosmos lease — what happens during deploys? Two instances may briefly both hold leases. | Cosmos lease has a built-in TTL. Use 30s lease with 10s renewal. During deploy overlap, the old instance's lease expires before the new one acquires. Jobs must be idempotent. |
|
||||
| 2 | **Email deliverability** | SendGrid requires domain verification (SPF/DKIM/DMARC). Without it, emails land in spam. | Set up `lysnrai.com` domain authentication in SendGrid before shipping §2.2. Budget 1–2 days for DNS propagation. |
|
||||
| 3 | **Session validation latency** | Checking Cosmos on every request for session revocation adds ~5–10ms per request. | In-memory cache with 30s TTL (§2.7). Revocation is eventually consistent — acceptable trade-off for most apps. Document the 30s window. |
|
||||
| 4 | **Cosmos container proliferation** | 28 existing + 19 new = 47 containers. Serverless tier has no per-container cost, but management complexity grows. | Group related containers by module. Document all containers in `cosmos-init.ts`. Consider container-per-module naming convention. |
|
||||
| 5 | **Event bus ordering guarantees** | In-memory `EventEmitter` has no ordering guarantees across handlers. If audit must record before webhook fires, ordering matters. | Phase 1: Document that handlers run concurrently with no ordering. If ordering is needed, use handler priority weights or sequential mode. |
|
||||
| 6 | **Push notification certificates** | APNs requires yearly certificate renewal. If it expires, all iOS push silently stops. | Add `apns-cert-expiry-check` to scheduled jobs (§2.1). Alert admin 30 days before expiry. |
|
||||
| 7 | **Webhook abuse** | External subscribers could register slow endpoints that back up the delivery queue. | Per-subscription timeout (5s default), circuit breaker after 10 consecutive failures, auto-disable. |
|
||||
| 8 | **Migration rollback** | Cosmos is schemaless — some migrations (e.g., partition key changes) are irreversible. | Mark migrations as `reversible: true/false`. Require manual approval for irreversible migrations. Always back up before running. |
|
||||
| 9 | **MindLyst parity** | MindLyst web uses Cosmos directly (in-memory fallback). Shared components (email, sessions, webhooks) must work for MindLyst too, not just LysnrAI. | All new modules use `productId` for multi-product isolation. MindLyst can consume the same platform-service APIs. |
|
||||
| 10 | **Priority conflicts** | Sprint plan assumes available engineering bandwidth. If telemetry or mobile work takes priority, these sprints slip. | Treat sprint assignments as relative ordering, not calendar commitments. Re-evaluate after each sprint. |
|
||||
|
||||
---
|
||||
|
||||
## Appendix B: Component Dependency Graph
|
||||
|
||||
```
|
||||
┌─────────────────────┐
|
||||
│ Event Bus (2.4) │
|
||||
└─────────┬───────────┘
|
||||
│ emits events to all subscribers
|
||||
│ emits to subscribers
|
||||
┌───────────┼───────────┼───────────┐
|
||||
│ │ │ │
|
||||
▼ ▼ ▼ ▼
|
||||
@ -1130,31 +1175,60 @@ New components will require additional env vars. All should be added to `.env.ex
|
||||
│ (2.2) │ │ (2.3) │ │ (existing)│ │ (2.13) │
|
||||
└─────┬─────┘ └───────────┘ └───────────┘ └───────────┘
|
||||
│
|
||||
│ triggers
|
||||
│ sends
|
||||
▼
|
||||
┌───────────┐
|
||||
│ Password │
|
||||
│ Reset(2.5)│
|
||||
└───────────┘
|
||||
|
||||
┌───────────────┐──▶┌─────────────────┐ ┌─────────────────┐
|
||||
│ Scheduled │ │ Analytics │ │ Blob Storage │
|
||||
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (existing) │
|
||||
└───────┬───────┘ └─────────────────┘ └────────┬────────┘
|
||||
│ │
|
||||
│ triggers on schedule ▲ writes exports
|
||||
▼ │
|
||||
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Scheduled │──▶│ Analytics │ │ Data Export │
|
||||
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (2.9) │
|
||||
└───────┬───────┘ └─────────────────┘ └─────────────────┘
|
||||
│
|
||||
│ triggers on schedule
|
||||
▼
|
||||
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Trial Expiry │ │ Usage Reset │ │ Retention │
|
||||
│ Check │ │ │ │ Cleanup (2.23) │
|
||||
│ Trial Expiry │ │ Usage Reset │ │ Data Export │
|
||||
│ (2.1 job) │ │ (2.1 job) │ │ (2.9) │
|
||||
└───────────────┘ └─────────────────┘ └─────────────────┘
|
||||
|
||||
┌───────────────┐ ┌─────────────────┐
|
||||
│ Billing │──▶│ Email/Push │
|
||||
│ Dunning(2.25) │ │ Delivery (2.2) │
|
||||
└───────────────┘ └─────────────────┘
|
||||
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Billing │──▶│ Email/Push │ │ Retention │
|
||||
│ Dunning(2.25) │ │ Delivery (2.2) │ │ Cleanup (2.23) │
|
||||
└───────────────┘ └─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Appendix C: Review Findings
|
||||
|
||||
Systematic review performed 2026-02-17. All issues below have been fixed inline.
|
||||
|
||||
| # | Severity | Section | Finding | Fix |
|
||||
| --- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| 1 | **Bug** | §1.3 | Test count stale: said "158+ tests" — actual count is **621** (verified via `grep -c 'it(' *.test.ts`). | Updated to 621. |
|
||||
| 2 | **Bug** | §1.1 | Endpoint column inconsistent: some modules said "CRUD" (vague, could be 4–8 routes), others had exact counts. | Replaced all "CRUD" with actual route counts. |
|
||||
| 3 | **Bug** | §2.5 | Said "console-logged URLs for dev/testing" — violates project rule: never `console.log` in production code. | Changed to `req.log.info`. |
|
||||
| 4 | **Bug** | §2.12 | `ExperimentDoc.targetingRules: {}` — meaningless empty object type. | Changed to `FlagTargetingRules` (reuse from flags module). |
|
||||
| 5 | **Bug** | §2.3 | Webhook event `user.deleted` source said `auth.delete` — no such endpoint name. Actual route is `DELETE /auth/users/:id` (admin action). | Fixed source column. |
|
||||
| 6 | **Bug** | §4 | `email_verifications` container (from §2.5) missing from Cosmos table. Only `password_reset_tokens` was listed. | Added `email_verifications` to §2.5 row. |
|
||||
| 7 | **Bug** | §4 | Existing container count said "~25+" — actual is **27** (counted from `cosmos-init.ts`; `promos` uses Stripe API directly, no Cosmos container). | Updated to 27 with full container list. |
|
||||
| 8 | **Bug** | §4 | Total new containers said "~17" — after adding `email_verifications` and `ip_rules`, count is **19**. | Updated. |
|
||||
| 9 | **Gap** | §2.2 | No clarity on email template storage strategy. `renderer.ts` mentioned but not whether templates are Cosmos-stored or file-based. | Clarified: `repository.ts` now references `delivery_log + email_templates` containers. |
|
||||
| 10 | **Gap** | §2.4 | No migration strategy from existing `lib/webhooks.ts` to new event bus pattern. | Added "Migration from existing `lib/webhooks.ts`" subsection with 3-phase plan. |
|
||||
| 11 | **Gap** | §2.10 | Maintenance mode proposed extending `settings` module but didn't clarify storage location. Missing from §4 Cosmos table. | Added: stored as single document per product in existing `settings` container (no new container needed). |
|
||||
| 12 | **Gap** | §2.11 | IP rules need persistence but no container was mentioned. Missing from §4 table. | Added `ip_rules` container (pk: `/productId`) to both §2.11 and §4 table. |
|
||||
| 13 | **Gap** | §2.9 | Data Export didn't mention blob module dependency (exports written to blob storage). | Added explicit dependency note on `blob` module and `jobs` module for cleanup. |
|
||||
| 14 | **Gap** | §5 | Missing env vars for webhooks (timeout, retries) and sessions (TTL, cache TTL). | Added 4 new env vars: `WEBHOOK_DELIVERY_TIMEOUT_MS`, `WEBHOOK_MAX_RETRIES`, `SESSION_TTL_DAYS`, `SESSION_CACHE_TTL_MS`. |
|
||||
| 15 | **Gap** | §6 | Quick Reference missing MindLyst repo (`learning_multimodal_memory_agents`). Doc scope says "ByteLyst platform" which includes MindLyst. | Added MindLyst native app and web entries. Also added `cosmos-init.ts` path. |
|
||||
| 16 | **Gap** | Appendix | Dependency graph incomplete: missing Jobs → Data Export connection, missing Blob → Data Export dependency, downstream jobs not labeled with section numbers. | Rewrote graph with all connections and section labels. |
|
||||
| 17 | **Gap** | Overall | No "Risks & Open Questions" section — design docs should call out unknowns. | Added Appendix A with 10 risk items and mitigations. |
|
||||
| 18 | **Gap** | TOC | Table of Contents didn't include Appendix sections. | Added Appendix A, B, C to TOC. |
|
||||
| 19 | **Gap** | §2.5 | Password reset cross-referenced "§2.6" for sessions but sessions was renumbered to §2.7 in previous edit pass. | Fixed to §2.7 (caught in prior pass). |
|
||||
| 20 | **Gap** | §1.5 | Infrastructure table was missing Swagger/OpenAPI (partially wired) and Prometheus metrics (partially enabled). | Added in prior pass — verified still present. |
|
||||
|
||||
---
|
||||
|
||||
_This document is a living brainstorm. Items will be promoted to dedicated design docs (like `CLIENT_TELEMETRY_DESIGN.md`) as they move into implementation._
|
||||
|
||||
@ -252,3 +252,21 @@ export async function getCluster(id: string, pk: string): Promise<TelemetryError
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export async function updateCluster(
|
||||
id: string,
|
||||
pk: string,
|
||||
updates: Partial<TelemetryErrorCluster>
|
||||
): Promise<TelemetryErrorCluster | null> {
|
||||
try {
|
||||
const { resource: existing } = await clustersContainer()
|
||||
.item(id, pk)
|
||||
.read<TelemetryErrorCluster>();
|
||||
if (!existing) return null;
|
||||
const merged = { ...existing, ...updates };
|
||||
const { resource } = await clustersContainer().item(id, pk).replace(merged);
|
||||
return resource as unknown as TelemetryErrorCluster;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,15 +1,17 @@
|
||||
/**
|
||||
* Telemetry REST endpoints.
|
||||
*
|
||||
* POST /telemetry/events — batch ingest (any auth)
|
||||
* GET /telemetry/config — collection config for clients
|
||||
* GET /telemetry/query — admin query
|
||||
* GET /telemetry/clusters — admin error clusters
|
||||
* GET /telemetry/policies — list policies (admin)
|
||||
* POST /telemetry/policies — create policy (admin)
|
||||
* PUT /telemetry/policies/:id — update policy (admin)
|
||||
* DELETE /telemetry/policies/:id — delete policy (admin)
|
||||
* DELETE /telemetry/user/:userId — GDPR erasure (admin)
|
||||
* POST /telemetry/events — batch ingest (any auth)
|
||||
* GET /telemetry/config — collection config for clients
|
||||
* GET /telemetry/query — admin query
|
||||
* GET /telemetry/clusters — admin error clusters
|
||||
* PATCH /telemetry/clusters/:id — resolve/ignore cluster (admin)
|
||||
* GET /telemetry/policies — list policies (admin)
|
||||
* POST /telemetry/policies — create policy (admin)
|
||||
* PUT /telemetry/policies/:id — update policy (admin)
|
||||
* DELETE /telemetry/policies/:id — delete policy (admin)
|
||||
* DELETE /telemetry/user/:userId — GDPR erasure (admin)
|
||||
* GET /telemetry/metrics — ingestion metrics (admin)
|
||||
*/
|
||||
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
@ -23,12 +25,16 @@ import {
|
||||
TelemetryIngestRequestSchema,
|
||||
CreatePolicySchema,
|
||||
UpdatePolicySchema,
|
||||
UpdateClusterSchema,
|
||||
TelemetryQuerySchema,
|
||||
type TelemetryEventDoc,
|
||||
type TelemetryCollectionPolicyDoc,
|
||||
type TelemetryCollectionConfig,
|
||||
type TelemetryErrorCluster,
|
||||
type TelemetryMetrics,
|
||||
} from './types.js';
|
||||
import * as auditRepo from '../audit/repository.js';
|
||||
import type { AuditDoc } from '../audit/types.js';
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────
|
||||
|
||||
@ -69,6 +75,96 @@ const _cleanupTimer = globalThis.setInterval(() => {
|
||||
}, 300_000);
|
||||
if (typeof _cleanupTimer === 'object' && 'unref' in _cleanupTimer) _cleanupTimer.unref();
|
||||
|
||||
// ─── Ingestion metrics (in-memory counters) ─────────────────────
|
||||
|
||||
const metrics: TelemetryMetrics = {
|
||||
totalEventsIngested: 0,
|
||||
totalEventsRejected: 0,
|
||||
totalBatchRequests: 0,
|
||||
totalRateLimited: 0,
|
||||
totalPiiBlocked: 0,
|
||||
totalDuplicatesDropped: 0,
|
||||
uptimeSince: new Date().toISOString(),
|
||||
};
|
||||
|
||||
// ─── Webhook alerting ───────────────────────────────────────────
|
||||
|
||||
const ALERT_WEBHOOK_URL = process.env.TELEMETRY_ALERT_WEBHOOK_URL ?? '';
|
||||
const ALERT_SEVERITY_THRESHOLD = (process.env.TELEMETRY_ALERT_SEVERITY_THRESHOLD ?? 'error') as
|
||||
| 'warn'
|
||||
| 'error'
|
||||
| 'fatal';
|
||||
const ALERT_COUNT_THRESHOLD = parseInt(process.env.TELEMETRY_ALERT_COUNT_THRESHOLD ?? '10', 10);
|
||||
|
||||
async function sendClusterAlert(
|
||||
cluster: TelemetryErrorCluster,
|
||||
previousSeverity: string
|
||||
): Promise<void> {
|
||||
if (!ALERT_WEBHOOK_URL) return;
|
||||
|
||||
const severityOrder: Record<string, number> = { warn: 0, error: 1, fatal: 2 };
|
||||
const thresholdNum = severityOrder[ALERT_SEVERITY_THRESHOLD] ?? 1;
|
||||
const clusterNum = severityOrder[cluster.severity] ?? 0;
|
||||
|
||||
// Alert if severity meets threshold OR count exceeds threshold
|
||||
if (clusterNum < thresholdNum && cluster.totalCount < ALERT_COUNT_THRESHOLD) return;
|
||||
|
||||
const payload = {
|
||||
text: `Telemetry Alert: Cluster ${cluster.fingerprint} escalated`,
|
||||
blocks: [
|
||||
{
|
||||
type: 'section',
|
||||
text: {
|
||||
type: 'mrkdwn',
|
||||
value: [
|
||||
`*Telemetry Error Cluster Escalated*`,
|
||||
`*Event:* ${cluster.module}/${cluster.eventName}`,
|
||||
`*Platform:* ${cluster.platform} (${cluster.channel})`,
|
||||
`*Severity:* ${previousSeverity} → *${cluster.severity}*`,
|
||||
`*Total Count:* ${cluster.totalCount}`,
|
||||
`*Affected Users:* ${cluster.affectedUserIds.length + cluster.affectedInstallIds.length}`,
|
||||
`*First Seen:* ${cluster.firstSeenAt}`,
|
||||
`*Last Seen:* ${cluster.lastSeenAt}`,
|
||||
cluster.sampleMessage ? `*Sample:* \`${cluster.sampleMessage.slice(0, 200)}\`` : '',
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join('\n'),
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
try {
|
||||
await fetch(ALERT_WEBHOOK_URL, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(payload),
|
||||
});
|
||||
} catch {
|
||||
// Best-effort — don't fail ingestion on alert failure
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Audit helper ───────────────────────────────────────────────
|
||||
|
||||
function emitAudit(
|
||||
productId: string,
|
||||
userId: string,
|
||||
action: string,
|
||||
details: Record<string, unknown>
|
||||
): void {
|
||||
const doc: AuditDoc = {
|
||||
id: `aud_${randomUUID()}`,
|
||||
productId,
|
||||
userId,
|
||||
action,
|
||||
category: 'telemetry',
|
||||
details,
|
||||
createdAt: new Date().toISOString(),
|
||||
};
|
||||
auditRepo.create(doc).catch(() => {});
|
||||
}
|
||||
|
||||
/** PII patterns — reject events containing these. */
|
||||
const PII_PATTERNS = [
|
||||
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i, // email
|
||||
@ -300,14 +396,23 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
|
||||
});
|
||||
}
|
||||
|
||||
// Escalate severity
|
||||
// Escalate severity + alert on escalation
|
||||
const severityOrder = { warn: 0, error: 1, fatal: 2 };
|
||||
const eventSev = event.eventType as 'warn' | 'error' | 'fatal';
|
||||
const previousSeverity = existing.severity;
|
||||
if ((severityOrder[eventSev] ?? 0) > (severityOrder[existing.severity] ?? 0)) {
|
||||
existing.severity = eventSev;
|
||||
}
|
||||
|
||||
// Default status for legacy clusters without status field
|
||||
if (!existing.status) existing.status = 'open';
|
||||
|
||||
await repo.upsertCluster(existing);
|
||||
|
||||
// Fire webhook alert on severity escalation
|
||||
if (existing.severity !== previousSeverity) {
|
||||
sendClusterAlert(existing, previousSeverity).catch(() => {});
|
||||
}
|
||||
} else {
|
||||
const newCluster: TelemetryErrorCluster = {
|
||||
id: clusterId,
|
||||
@ -336,6 +441,7 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
|
||||
sampleErrorCode: event.errorCode,
|
||||
sampleMessage: event.message,
|
||||
severity: event.eventType as 'warn' | 'error' | 'fatal',
|
||||
status: 'open',
|
||||
ttl: DEFAULT_CLUSTER_TTL_DAYS * 86400,
|
||||
};
|
||||
await repo.upsertCluster(newCluster);
|
||||
@ -367,6 +473,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
// Rate limiting per installId
|
||||
const rateLimitKey = installToken || req.jwtPayload?.sub || 'unknown';
|
||||
if (!checkRateLimit(rateLimitKey, events.length)) {
|
||||
metrics.totalRateLimited++;
|
||||
reply.code(429);
|
||||
return {
|
||||
accepted: 0,
|
||||
@ -383,11 +490,15 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
seenIds.add(e.id);
|
||||
return true;
|
||||
});
|
||||
const dupCount = events.length - dedupedEvents.length;
|
||||
metrics.totalDuplicatesDropped += dupCount;
|
||||
metrics.totalBatchRequests++;
|
||||
|
||||
const now = new Date().toISOString();
|
||||
const ttl = DEFAULT_EVENT_TTL_DAYS * 86400;
|
||||
|
||||
let accepted = 0;
|
||||
let rejected = events.length - dedupedEvents.length; // duplicates
|
||||
let rejected = dupCount; // duplicates
|
||||
const errors: Array<{ index: number; reason: string }> = [];
|
||||
const docsToInsert: TelemetryEventDoc[] = [];
|
||||
|
||||
@ -406,6 +517,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
if (fieldsToScan.some(f => containsPII(f!))) {
|
||||
errors.push({ index: i, reason: 'PII detected' });
|
||||
rejected++;
|
||||
metrics.totalPiiBlocked++;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -435,6 +547,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
}
|
||||
}
|
||||
|
||||
metrics.totalEventsIngested += accepted;
|
||||
metrics.totalEventsRejected += rejected;
|
||||
|
||||
reply.code(accepted > 0 ? 200 : 400);
|
||||
return {
|
||||
accepted,
|
||||
@ -549,6 +664,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
};
|
||||
|
||||
const created = await repo.createPolicy(doc);
|
||||
emitAudit(productId, doc.createdBy, 'telemetry.policy.created', {
|
||||
policyId: doc.id,
|
||||
name: doc.name,
|
||||
});
|
||||
reply.code(201);
|
||||
return created;
|
||||
});
|
||||
@ -575,6 +694,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
updates as Partial<TelemetryCollectionPolicyDoc>
|
||||
);
|
||||
if (!updated) throw new NotFoundError('Policy not found');
|
||||
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.updated', {
|
||||
policyId: id,
|
||||
updates: parsed.data,
|
||||
});
|
||||
return updated;
|
||||
});
|
||||
|
||||
@ -585,6 +708,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
const productId = getRequestProductId(req);
|
||||
const deleted = await repo.deletePolicy(id, productId);
|
||||
if (!deleted) throw new NotFoundError('Policy not found');
|
||||
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.deleted', {
|
||||
policyId: id,
|
||||
});
|
||||
return { success: true };
|
||||
});
|
||||
|
||||
@ -594,6 +720,53 @@ export async function telemetryRoutes(app: FastifyInstance) {
|
||||
const { userId } = req.params as { userId: string };
|
||||
const productId = getRequestProductId(req);
|
||||
const eventsDeleted = await repo.deleteEventsByUserId(productId, userId);
|
||||
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.gdpr.erasure', {
|
||||
targetUserId: userId,
|
||||
eventsDeleted,
|
||||
});
|
||||
return { userId, eventsDeleted, clustersUpdated: 0 };
|
||||
});
|
||||
|
||||
// ── Admin: resolve/ignore cluster ───────────────────────────
|
||||
app.patch('/telemetry/clusters/:id', async req => {
|
||||
requireAdmin(req);
|
||||
const { id } = req.params as { id: string };
|
||||
const { pk } = req.query as { pk: string };
|
||||
if (!pk) throw new BadRequestError('pk query parameter required');
|
||||
|
||||
const parsed = UpdateClusterSchema.safeParse(req.body);
|
||||
if (!parsed.success) {
|
||||
throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
|
||||
}
|
||||
|
||||
const updates: Partial<TelemetryErrorCluster> = {
|
||||
status: parsed.data.status,
|
||||
};
|
||||
if (parsed.data.status === 'resolved' || parsed.data.status === 'ignored') {
|
||||
updates.resolvedBy = req.jwtPayload?.sub ?? 'unknown';
|
||||
updates.resolvedAt = new Date().toISOString();
|
||||
}
|
||||
|
||||
const updated = await repo.updateCluster(id, pk, updates);
|
||||
if (!updated) throw new NotFoundError('Cluster not found');
|
||||
|
||||
const productId = getRequestProductId(req);
|
||||
emitAudit(
|
||||
productId,
|
||||
req.jwtPayload?.sub ?? 'unknown',
|
||||
`telemetry.cluster.${parsed.data.status}`,
|
||||
{
|
||||
clusterId: id,
|
||||
pk,
|
||||
}
|
||||
);
|
||||
|
||||
return updated;
|
||||
});
|
||||
|
||||
// ── Admin: ingestion metrics ──────────────────────────────
|
||||
app.get('/telemetry/metrics', async req => {
|
||||
requireAdmin(req);
|
||||
return metrics;
|
||||
});
|
||||
}
|
||||
|
||||
@ -201,6 +201,8 @@ export interface TelemetryCollectionConfig {
|
||||
|
||||
// ─── Error Cluster ──────────────────────────────────────────────────
|
||||
|
||||
export const ClusterStatusEnum = z.enum(['open', 'resolved', 'ignored']);
|
||||
|
||||
export interface TelemetryErrorCluster {
|
||||
id: string; // ${fingerprint}:${yyyyMM}
|
||||
pk: string; // ${productId}:${platform}:${module}
|
||||
@ -230,9 +232,16 @@ export interface TelemetryErrorCluster {
|
||||
sampleErrorCode?: string;
|
||||
sampleMessage?: string;
|
||||
severity: 'warn' | 'error' | 'fatal';
|
||||
status: 'open' | 'resolved' | 'ignored';
|
||||
resolvedBy?: string;
|
||||
resolvedAt?: string;
|
||||
ttl: number;
|
||||
}
|
||||
|
||||
export const UpdateClusterSchema = z.object({
|
||||
status: ClusterStatusEnum,
|
||||
});
|
||||
|
||||
// ─── Query / Admin types ────────────────────────────────────────────
|
||||
|
||||
export const TelemetryQuerySchema = z.object({
|
||||
@ -253,3 +262,15 @@ export const TelemetryQuerySchema = z.object({
|
||||
});
|
||||
|
||||
export type TelemetryQueryInput = z.infer<typeof TelemetryQuerySchema>;
|
||||
|
||||
// ─── Ingestion Metrics (in-memory counters) ─────────────────────────
|
||||
|
||||
export interface TelemetryMetrics {
|
||||
totalEventsIngested: number;
|
||||
totalEventsRejected: number;
|
||||
totalBatchRequests: number;
|
||||
totalRateLimited: number;
|
||||
totalPiiBlocked: number;
|
||||
totalDuplicatesDropped: number;
|
||||
uptimeSince: string;
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user