From 80a4459f813e24545ed81c3f3786d23bc137f8cc Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Tue, 17 Feb 2026 10:49:14 -0800 Subject: [PATCH] docs: update documentation --- docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md | 280 +++++++++++------- .../src/modules/telemetry/repository.ts | 18 ++ .../src/modules/telemetry/routes.ts | 195 +++++++++++- .../src/modules/telemetry/types.ts | 21 ++ 4 files changed, 400 insertions(+), 114 deletions(-) diff --git a/docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md b/docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md index 9aced78b..cf55cd12 100644 --- a/docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md +++ b/docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md @@ -20,6 +20,10 @@ 5. [New Environment Variables](#5-new-environment-variables) 6. [Quick Reference — Where Things Live](#6-quick-reference--where-things-live) +- [Appendix A: Risks & Open Questions](#appendix-a-risks--open-questions) +- [Appendix B: Component Dependency Graph](#appendix-b-component-dependency-graph) +- [Appendix C: Review Findings](#appendix-c-review-findings) + --- ## 1. 
Current Inventory @@ -29,23 +33,23 @@ | Category | Module | Endpoints | Description | | ------------ | --------------- | --------- | --------------------------------------------------------------------------------------------------- | | **Identity** | `auth` | 11 routes | Login, register, refresh, SSO, profile, admin user CRUD | -| **Identity** | `tokens` | CRUD | API token management | -| **Identity** | `licenses` | CRUD | License key generation, activation, device binding | -| **Billing** | `subscriptions` | CRUD | Plan management, trial tracking, period management | -| **Billing** | `stripe` | Webhooks | Inbound Stripe webhook processing | -| **Billing** | `plans` | CRUD | Plan definitions (free, pro, enterprise) | -| **Billing** | `usage` | CRUD | Usage tracking and quota enforcement | -| **Billing** | `promos` | CRUD | Promo code creation, validation, redemption | -| **Growth** | `invitations` | CRUD | Invitation code generation, redemption, tracking | -| **Growth** | `referrals` | CRUD | Referral link tracking, status transitions | +| **Identity** | `tokens` | 5 routes | API token management (CRUD + validate) | +| **Identity** | `licenses` | 6 routes | License key generation, activation, device binding, validate | +| **Billing** | `subscriptions` | 5 routes | Plan management, trial tracking, period management | +| **Billing** | `stripe` | 2 routes | Inbound Stripe webhook + portal session | +| **Billing** | `plans` | 4 routes | Plan definitions (free, pro, enterprise) | +| **Billing** | `usage` | 4 routes | Usage tracking and quota enforcement | +| **Billing** | `promos` | 5 routes | Promo code creation, validation, redemption | +| **Growth** | `invitations` | 5 routes | Invitation code generation, redemption, tracking | +| **Growth** | `referrals` | 5 routes | Referral link tracking, status transitions | | **Growth** | `waitlist` | 12 routes | Pre-launch signups, position tracking, admin batch invite, CSV export | | **Growth** | `public` | 5 routes | Public 
roadmap, community voting, feature submissions | -| **Content** | `items` | CRUD | Tracker items (bugs, features, tasks) | -| **Content** | `comments` | CRUD | Threaded comments on items | -| **Content** | `votes` | CRUD | User votes on items and comments | +| **Content** | `items` | 5 routes | Tracker items (bugs, features, tasks) | +| **Content** | `comments` | 4 routes | Threaded comments on items | +| **Content** | `votes` | 3 routes | User votes on items and comments | | **Content** | `memory` | 5 routes | Memory items — create, reassign, patch, delete | | **Ops** | `audit` | Query | Audit log recording and admin queries | -| **Ops** | `flags` | CRUD | Feature flags with FNV-1a deterministic rollout | +| **Ops** | `flags` | 5 routes | Feature flags with FNV-1a deterministic rollout | | **Ops** | `telemetry` | 9 routes | Client event ingestion, error clustering, collection policies, GDPR erasure | | **Ops** | `notifications` | 5 routes | Device registration, notification preferences | | **Ops** | `settings` | 6 routes | User/device settings, kill switch | @@ -74,11 +78,11 @@ ### 1.3 Services -| Service | Port | Description | -| ---------------------- | ---- | ----------------------------------------------------- | -| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 158+ tests) | -| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar | -| **monitoring** | 4004 | Health-check aggregator (all services) | +| Service | Port | Description | +| ---------------------- | ---- | ---------------------------------------------------- | +| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 621 tests) | +| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar | +| **monitoring** | 4004 | Health-check aggregator (all services) | ### 1.4 Dashboards @@ -185,8 +189,8 @@ platform-service/src/modules/delivery/ │ ├── push-apns.ts — Apple Push Notification Service │ ├── push-fcm.ts — Firebase 
Cloud Messaging │ └── sms.ts — Twilio/Azure Communication Services (future) -├── renderer.ts — Template rendering (Handlebars/Mustache for email) -├── repository.ts — delivery_log container (track sent/failed/bounced) +├── renderer.ts — Template rendering (Handlebars for email bodies) +├── repository.ts — delivery_log + email_templates containers ├── dispatcher.ts — Route delivery request to correct channel(s) based on prefs └── routes.ts — Admin: send test, view delivery log, manage templates ``` @@ -244,21 +248,21 @@ platform-service/src/modules/webhooks/ **Event catalog (subscribe to any combination):** -| Event | Payload | Source | -| ----------------------- | ---------------------------------------------- | --------------------------- | -| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` | -| `user.deleted` | `{ userId }` | `auth.delete` | -| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook | -| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook | -| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe | -| `payment.succeeded` | `{ invoiceId, amount, userId }` | Stripe webhook | -| `payment.failed` | `{ invoiceId, amount, userId, retryCount }` | Stripe webhook | -| `invitation.redeemed` | `{ invitationId, userId }` | Invitation module | -| `referral.completed` | `{ referralId, referrerId, referredId }` | Referral module | -| `waitlist.joined` | `{ email, position }` | Waitlist module | -| `flag.toggled` | `{ flagId, enabled, percentage }` | Flags module | -| `license.activated` | `{ licenseId, userId, deviceId }` | License module | -| `license.expired` | `{ licenseId, userId }` | Jobs: license-expiry-check | +| Event | Payload | Source | +| ----------------------- | ---------------------------------------------- | ------------------------------- | +| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` | +| 
`user.deleted` | `{ userId }` | Admin: `DELETE /auth/users/:id` | +| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook | +| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook | +| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe | +| `payment.succeeded` | `{ invoiceId, amount, userId }` | Stripe webhook | +| `payment.failed` | `{ invoiceId, amount, userId, retryCount }` | Stripe webhook | +| `invitation.redeemed` | `{ invitationId, userId }` | Invitation module | +| `referral.completed` | `{ referralId, referrerId, referredId }` | Referral module | +| `waitlist.joined` | `{ email, position }` | Waitlist module | +| `flag.toggled` | `{ flagId, enabled, percentage }` | Flags module | +| `license.activated` | `{ licenseId, userId, deviceId }` | License module | +| `license.expired` | `{ licenseId, userId }` | Jobs: license-expiry-check | **Security:** @@ -334,6 +338,13 @@ const PlatformEvents = { } as const; ``` +**Migration from existing `lib/webhooks.ts`:** + +- Existing `dispatchInvitationRedeemed()`, `dispatchReferralStatusChanged()`, `dispatchWaitlistJoined()` become event bus subscribers +- Phase 1: Register existing webhooks.ts functions as handlers on the bus +- Phase 2: Replace inline dispatch calls in routes with `bus.emit()` +- Phase 3: Remove `lib/webhooks.ts` once all callers migrated + **Benefits:** - Audit logging becomes a subscriber, not inline code @@ -389,7 +400,7 @@ interface PasswordResetToken { - `password_reset_tokens` (pk: `/productId`) — short-lived, TTL 24h auto-expiry -**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with console-logged URLs for dev/testing. +**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with `req.log.info`-logged URLs for dev/testing (never `console.log`). 
--- @@ -556,9 +567,11 @@ platform-service/src/modules/exports/ **Flow:** 1. Admin POST `/api/exports` → `{ type: 'users', format: 'csv', filters: { plan: 'free' } }` -2. Background job runs query, writes result to blob storage +2. Background job runs query, writes result to blob storage (via existing `blob` module) 3. Job status updates: `pending` → `processing` → `ready` / `failed` -4. Admin downloads from signed blob URL +4. Admin downloads from signed blob URL (SAS token via `@bytelyst/blob`) + +**Dependencies:** `blob` module (existing) for storage, `jobs` module (§2.1) for auto-cleanup of expired exports. **Supported exports:** @@ -629,6 +642,8 @@ interface MaintenanceConfig { - Schedule builder with start/end date pickers - Bypass IP whitelist management +**Storage:** Maintenance config is a single document per product in the existing `settings` container (field: `maintenanceConfig`). No new Cosmos container needed. + --- #### 2.11 Rate Limit Dashboard & IP Allow/Deny Lists @@ -678,6 +693,11 @@ interface IPRule { - IP rules management (allow/deny with expiry) - Per-user rate limit override +**Cosmos container:** + +- `ip_rules` (pk: `/productId`) — persistent IP allow/deny rules +- Rate limit stats remain in-memory (ephemeral); no persistence needed for counters + --- ### P2 — Product Intelligence @@ -715,7 +735,7 @@ interface ExperimentDoc { hypothesis: string; status: 'draft' | 'running' | 'paused' | 'concluded'; variants: Variant[]; // [{id: 'control', weight: 50}, {id: 'treatment', weight: 50}] - targetingRules: {}; // Same as flag targeting + targetingRules: FlagTargetingRules; // Reuse from flags module (platforms, versions, percentage) primaryMetric: string; // e.g., 'dictation_completed_rate' secondaryMetrics: string[]; startedAt?: string; @@ -1046,25 +1066,26 @@ This is a major architectural expansion. Defer until enterprise tier is validate Each new component introduces Cosmos containers. 
Cosmos DB Serverless charges per RU consumed + storage, so idle containers cost only storage (~$0.25/GB/month). -| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU | -| ---------------------- | --------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- | -| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) | -| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU | -| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU | -| **2.5 Password Reset** | `password_reset_tokens` | `/productId` | 24h auto | ~10 RU | -| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU | -| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) | -| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) | -| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU | -| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU | -| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) | -| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU | -| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU | -| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) | -| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU | +| Component | New Containers | Partition Key | Est. TTL | Est. 
Daily RU | +| ---------------------- | ---------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- | +| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) | +| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU | +| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU | +| **2.5 Password Reset** | `password_reset_tokens`, `email_verifications` | `/productId`, `/productId` | 24h auto | ~10 RU | +| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU | +| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) | +| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) | +| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU | +| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU | +| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) | +| **2.11 IP Rules** | `ip_rules` | `/productId` | None (manual) | ~10 RU | +| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU | +| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU | +| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) | +| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU | -**Total new containers:** ~17 (across all phases) -**Existing containers:** ~25+ (across platform-service + dashboards) +**Total new containers:** ~19 (across all phases) +**Existing containers:** 27 (defined in `cosmos-init.ts`: products, users, settings, devices, notification_prefs, audit_log, feature_flags, invitation_codes, referrals, subscriptions, payments, licenses, plans, 
usage_daily, api_tokens, tracker_items, comments, votes, themes, waitlist, memory_items, daily_briefs, reflections, brain_insights, telemetry_events, telemetry_error_clusters, telemetry_collection_policies). Note: `promos` module uses Stripe API directly — no Cosmos container. **Cost impact:** Minimal for Serverless tier — idle containers only consume storage. Active containers during job runs add burst RU. **Recommendation:** Register all new containers in `cosmos-init.ts` alongside existing ones. Use TTL liberally for transient data (tokens, deliveries, job runs) to keep storage bounded. @@ -1075,22 +1096,26 @@ Each new component introduces Cosmos containers. Cosmos DB Serverless charges pe New components will require additional env vars. All should be added to `.env.example` files in both repos and documented. -| Component | Variable | Example | Required | -| -------------------- | -------------------------- | -------------------------------- | ------------------------- | -| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) | -| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) | -| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) | -| **2.2 Email** | `EMAIL_FROM_ADDRESS` | `noreply@lysnrai.com` | Yes | -| **2.2 Email** | `EMAIL_FROM_NAME` | `LysnrAI` | No | -| **2.2 Push** | `APNS_KEY_ID` | `ABC123` | Yes (for iOS push) | -| **2.2 Push** | `APNS_TEAM_ID` | `748N7QPX7J` | Yes | -| **2.2 Push** | `APNS_KEY_PATH` | `./certs/AuthKey.p8` | Yes | -| **2.2 Push** | `FCM_SERVICE_ACCOUNT_JSON` | `{...}` | Yes (for Android push) | -| **2.5 Auth** | `PASSWORD_RESET_URL_BASE` | `https://app.lysnrai.com/reset` | Yes | -| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes | -| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) | -| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No | -| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No 
(default: true in dev) | +| Component | Variable | Example | Required | +| -------------------- | ----------------------------- | -------------------------------- | ------------------------- | +| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) | +| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) | +| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) | +| **2.2 Email** | `EMAIL_FROM_ADDRESS` | `noreply@lysnrai.com` | Yes | +| **2.2 Email** | `EMAIL_FROM_NAME` | `LysnrAI` | No | +| **2.2 Push** | `APNS_KEY_ID` | `ABC123` | Yes (for iOS push) | +| **2.2 Push** | `APNS_TEAM_ID` | `748N7QPX7J` | Yes | +| **2.2 Push** | `APNS_KEY_PATH` | `./certs/AuthKey.p8` | Yes | +| **2.2 Push** | `FCM_SERVICE_ACCOUNT_JSON` | `{...}` | Yes (for Android push) | +| **2.5 Auth** | `PASSWORD_RESET_URL_BASE` | `https://app.lysnrai.com/reset` | Yes | +| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes | +| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) | +| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No | +| **2.3 Webhooks** | `WEBHOOK_DELIVERY_TIMEOUT_MS` | `5000` | No (default: 5s) | +| **2.3 Webhooks** | `WEBHOOK_MAX_RETRIES` | `3` | No (default: 3) | +| **2.7 Sessions** | `SESSION_TTL_DAYS` | `90` | No (default: 90) | +| **2.7 Sessions** | `SESSION_CACHE_TTL_MS` | `30000` | No (default: 30s) | +| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) | **Secret management:** `SENDGRID_API_KEY`, `APNS_*`, and `FCM_*` should be added to Azure Key Vault as `lysnr-sendgrid-api-key`, `lysnr-apns-key-id`, etc. Update `LYSNR_SECRETS` in `@bytelyst/config` to include them. @@ -1098,30 +1123,50 @@ New components will require additional env vars. All should be added to `.env.ex ## 6. 
Quick Reference — Where Things Live -| Component | Repo | Path | -| ------------------------ | ------------------------- | ----------------------------------------------- | -| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` | -| Shared packages | `learning_ai_common_plat` | `packages/` | -| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` | -| User dashboard | `learning_voice_ai_agent` | `user-dashboard-web/` | -| Tracker dashboard | `learning_voice_ai_agent` | `tracker-dashboard-web/` | -| Docker Compose | both repos | `docker-compose.yml` | -| Monitoring | `learning_ai_common_plat` | `services/monitoring/` | -| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` | -| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` | -| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` | -| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` | -| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` | +| Component | Repo | Path | +| ------------------------ | ----------------------------------- | ------------------------------------------------------ | +| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` | +| Shared packages | `learning_ai_common_plat` | `packages/` | +| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` | +| User dashboard | `learning_voice_ai_agent` | `user-dashboard-web/` | +| Tracker dashboard | `learning_voice_ai_agent` | `tracker-dashboard-web/` | +| Docker Compose | both repos | `docker-compose.yml` | +| Monitoring | `learning_ai_common_plat` | `services/monitoring/` | +| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` | +| MindLyst native app | `learning_multimodal_memory_agents` | `mindlyst-native/` (KMP + SwiftUI + Compose 
+ Next.js) | +| MindLyst web | `learning_multimodal_memory_agents` | `mindlyst-native/web/` | +| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` | +| Cosmos container defs | `learning_ai_common_plat` | `services/platform-service/src/lib/cosmos-init.ts` | +| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` | +| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` | +| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` | --- -## Appendix: Component Dependency Graph +## Appendix A: Risks & Open Questions + +| # | Topic | Risk / Question | Mitigation | +| --- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| 1 | **Leader election for jobs** | In-process tick loop with Cosmos lease — what happens during deploys? Two instances may briefly both hold leases. | Cosmos lease has a built-in TTL. Use 30s lease with 10s renewal. During deploy overlap, the old instance's lease expires before the new one acquires. Jobs must be idempotent. | +| 2 | **Email deliverability** | SendGrid requires domain verification (SPF/DKIM/DMARC). Without it, emails land in spam. | Set up `lysnrai.com` domain authentication in SendGrid before shipping §2.2. Budget 1–2 days for DNS propagation. | +| 3 | **Session validation latency** | Checking Cosmos on every request for session revocation adds ~5–10ms per request. | In-memory cache with 30s TTL (§2.7). Revocation is eventually consistent — acceptable trade-off for most apps. Document the 30s window. 
| +| 4 | **Cosmos container proliferation** | 27 existing + 19 new = 46 containers. Serverless tier has no per-container cost, but management complexity grows. | Group related containers by module. Document all containers in `cosmos-init.ts`. Consider container-per-module naming convention. | +| 5 | **Event bus ordering guarantees** | In-memory `EventEmitter` has no ordering guarantees across handlers. If audit must record before webhook fires, ordering matters. | Phase 1: Document that handlers run concurrently with no ordering. If ordering is needed, use handler priority weights or sequential mode. | +| 6 | **Push notification certificates** | APNs requires yearly certificate renewal. If it expires, all iOS push silently stops. | Add `apns-cert-expiry-check` to scheduled jobs (§2.1). Alert admin 30 days before expiry. | +| 7 | **Webhook abuse** | External subscribers could register slow endpoints that back up the delivery queue. | Per-subscription timeout (5s default), circuit breaker after 10 consecutive failures, auto-disable. | +| 8 | **Migration rollback** | Cosmos is schemaless — some migrations (e.g., partition key changes) are irreversible. | Mark migrations as `reversible: true/false`. Require manual approval for irreversible migrations. Always back up before running. | +| 9 | **MindLyst parity** | MindLyst web uses Cosmos directly (in-memory fallback). Shared components (email, sessions, webhooks) must work for MindLyst too, not just LysnrAI. | All new modules use `productId` for multi-product isolation. MindLyst can consume the same platform-service APIs. | +| 10 | **Priority conflicts** | Sprint plan assumes available engineering bandwidth. If telemetry or mobile work takes priority, these sprints slip. | Treat sprint assignments as relative ordering, not calendar commitments. Re-evaluate after each sprint. 
| + +--- + +## Appendix B: Component Dependency Graph ``` ┌─────────────────────┐ │ Event Bus (2.4) │ └─────────┬───────────┘ - │ emits events to all subscribers + │ emits to subscribers ┌───────────┼───────────┼───────────┐ │ │ │ │ ▼ ▼ ▼ ▼ @@ -1130,31 +1175,60 @@ New components will require additional env vars. All should be added to `.env.ex │ (2.2) │ │ (2.3) │ │ (existing)│ │ (2.13) │ └─────┬─────┘ └───────────┘ └───────────┘ └───────────┘ │ - │ triggers + │ sends ▼ ┌───────────┐ │ Password │ │ Reset(2.5)│ └───────────┘ +┌───────────────┐──▶┌─────────────────┐ ┌─────────────────┐ +│ Scheduled │ │ Analytics │ │ Blob Storage │ +│ Jobs (2.1) │ │ Rollups (2.13) │ │ (existing) │ +└───────┬───────┘ └─────────────────┘ └────────┬────────┘ + │ │ + │ triggers on schedule ▲ writes exports + ▼ │ ┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ Scheduled │──▶│ Analytics │ │ Data Export │ -│ Jobs (2.1) │ │ Rollups (2.13) │ │ (2.9) │ -└───────┬───────┘ └─────────────────┘ └─────────────────┘ - │ - │ triggers on schedule - ▼ -┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ Trial Expiry │ │ Usage Reset │ │ Retention │ -│ Check │ │ │ │ Cleanup (2.23) │ +│ Trial Expiry │ │ Usage Reset │ │ Data Export │ +│ (2.1 job) │ │ (2.1 job) │ │ (2.9) │ └───────────────┘ └─────────────────┘ └─────────────────┘ -┌───────────────┐ ┌─────────────────┐ -│ Billing │──▶│ Email/Push │ -│ Dunning(2.25) │ │ Delivery (2.2) │ -└───────────────┘ └─────────────────┘ +┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Billing │──▶│ Email/Push │ │ Retention │ +│ Dunning(2.25) │ │ Delivery (2.2) │ │ Cleanup (2.23) │ +└───────────────┘ └─────────────────┘ └─────────────────┘ ``` --- +## Appendix C: Review Findings + +Systematic review performed 2026-02-17. All issues below have been fixed inline. 
+ +| # | Severity | Section | Finding | Fix | +| --- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- | +| 1 | **Bug** | §1.3 | Test count stale: said "158+ tests" — actual count is **621** (verified via `grep -c 'it(' *.test.ts`). | Updated to 621. | +| 2 | **Bug** | §1.1 | Endpoint column inconsistent: some modules said "CRUD" (vague, could be 4–8 routes), others had exact counts. | Replaced all "CRUD" with actual route counts. | +| 3 | **Bug** | §2.5 | Said "console-logged URLs for dev/testing" — violates project rule: never `console.log` in production code. | Changed to `req.log.info`. | +| 4 | **Bug** | §2.12 | `ExperimentDoc.targetingRules: {}` — meaningless empty object type. | Changed to `FlagTargetingRules` (reuse from flags module). | +| 5 | **Bug** | §2.3 | Webhook event `user.deleted` source said `auth.delete` — no such endpoint name. Actual route is `DELETE /auth/users/:id` (admin action). | Fixed source column. | +| 6 | **Bug** | §4 | `email_verifications` container (from §2.5) missing from Cosmos table. Only `password_reset_tokens` was listed. | Added `email_verifications` to §2.5 row. | +| 7 | **Bug** | §4 | Existing container count said "~25+" — actual is **27** (counted from `cosmos-init.ts`; `promos` uses Stripe API directly, no Cosmos container). | Updated to 27 with full container list. | +| 8 | **Bug** | §4 | Total new containers said "~17" — after adding `email_verifications` and `ip_rules`, count is **19**. | Updated. | +| 9 | **Gap** | §2.2 | No clarity on email template storage strategy. `renderer.ts` mentioned but not whether templates are Cosmos-stored or file-based. | Clarified: `repository.ts` now references `delivery_log + email_templates` containers. 
| +| 10 | **Gap** | §2.4 | No migration strategy from existing `lib/webhooks.ts` to new event bus pattern. | Added "Migration from existing `lib/webhooks.ts`" subsection with 3-phase plan. | +| 11 | **Gap** | §2.10 | Maintenance mode proposed extending `settings` module but didn't clarify storage location. Missing from §4 Cosmos table. | Added: stored as single document per product in existing `settings` container (no new container needed). | +| 12 | **Gap** | §2.11 | IP rules need persistence but no container was mentioned. Missing from §4 table. | Added `ip_rules` container (pk: `/productId`) to both §2.11 and §4 table. | +| 13 | **Gap** | §2.9 | Data Export didn't mention blob module dependency (exports written to blob storage). | Added explicit dependency note on `blob` module and `jobs` module for cleanup. | +| 14 | **Gap** | §5 | Missing env vars for webhooks (timeout, retries) and sessions (TTL, cache TTL). | Added 4 new env vars: `WEBHOOK_DELIVERY_TIMEOUT_MS`, `WEBHOOK_MAX_RETRIES`, `SESSION_TTL_DAYS`, `SESSION_CACHE_TTL_MS`. | +| 15 | **Gap** | §6 | Quick Reference missing MindLyst repo (`learning_multimodal_memory_agents`). Doc scope says "ByteLyst platform" which includes MindLyst. | Added MindLyst native app and web entries. Also added `cosmos-init.ts` path. | +| 16 | **Gap** | Appendix | Dependency graph incomplete: missing Jobs → Data Export connection, missing Blob → Data Export dependency, downstream jobs not labeled with section numbers. | Rewrote graph with all connections and section labels. | +| 17 | **Gap** | Overall | No "Risks & Open Questions" section — design docs should call out unknowns. | Added Appendix A with 10 risk items and mitigations. | +| 18 | **Gap** | TOC | Table of Contents didn't include Appendix sections. | Added Appendix A, B, C to TOC. | +| 19 | **Gap** | §2.5 | Password reset cross-referenced "§2.6" for sessions but sessions was renumbered to §2.7 in previous edit pass. | Fixed to §2.7 (caught in prior pass). 
| +| 20 | **Gap** | §1.5 | Infrastructure table was missing Swagger/OpenAPI (partially wired) and Prometheus metrics (partially enabled). | Added in prior pass — verified still present. | + +--- + _This document is a living brainstorm. Items will be promoted to dedicated design docs (like `CLIENT_TELEMETRY_DESIGN.md`) as they move into implementation._ diff --git a/services/platform-service/src/modules/telemetry/repository.ts b/services/platform-service/src/modules/telemetry/repository.ts index 65975bb4..6cb4f66c 100644 --- a/services/platform-service/src/modules/telemetry/repository.ts +++ b/services/platform-service/src/modules/telemetry/repository.ts @@ -252,3 +252,21 @@ export async function getCluster(id: string, pk: string): Promise +): Promise { + try { + const { resource: existing } = await clustersContainer() + .item(id, pk) + .read(); + if (!existing) return null; + const merged = { ...existing, ...updates }; + const { resource } = await clustersContainer().item(id, pk).replace(merged); + return resource as unknown as TelemetryErrorCluster; + } catch { + return null; + } +} diff --git a/services/platform-service/src/modules/telemetry/routes.ts b/services/platform-service/src/modules/telemetry/routes.ts index 7657c758..f879291f 100644 --- a/services/platform-service/src/modules/telemetry/routes.ts +++ b/services/platform-service/src/modules/telemetry/routes.ts @@ -1,15 +1,17 @@ /** * Telemetry REST endpoints. 
* - * POST /telemetry/events — batch ingest (any auth) - * GET /telemetry/config — collection config for clients - * GET /telemetry/query — admin query - * GET /telemetry/clusters — admin error clusters - * GET /telemetry/policies — list policies (admin) - * POST /telemetry/policies — create policy (admin) - * PUT /telemetry/policies/:id — update policy (admin) - * DELETE /telemetry/policies/:id — delete policy (admin) - * DELETE /telemetry/user/:userId — GDPR erasure (admin) + * POST /telemetry/events — batch ingest (any auth) + * GET /telemetry/config — collection config for clients + * GET /telemetry/query — admin query + * GET /telemetry/clusters — admin error clusters + * PATCH /telemetry/clusters/:id — resolve/ignore cluster (admin) + * GET /telemetry/policies — list policies (admin) + * POST /telemetry/policies — create policy (admin) + * PUT /telemetry/policies/:id — update policy (admin) + * DELETE /telemetry/policies/:id — delete policy (admin) + * DELETE /telemetry/user/:userId — GDPR erasure (admin) + * GET /telemetry/metrics — ingestion metrics (admin) */ import type { FastifyInstance } from 'fastify'; @@ -23,12 +25,16 @@ import { TelemetryIngestRequestSchema, CreatePolicySchema, UpdatePolicySchema, + UpdateClusterSchema, TelemetryQuerySchema, type TelemetryEventDoc, type TelemetryCollectionPolicyDoc, type TelemetryCollectionConfig, type TelemetryErrorCluster, + type TelemetryMetrics, } from './types.js'; +import * as auditRepo from '../audit/repository.js'; +import type { AuditDoc } from '../audit/types.js'; // ─── Helpers ──────────────────────────────────────────────────────── @@ -69,6 +75,96 @@ const _cleanupTimer = globalThis.setInterval(() => { }, 300_000); if (typeof _cleanupTimer === 'object' && 'unref' in _cleanupTimer) _cleanupTimer.unref(); +// ─── Ingestion metrics (in-memory counters) ───────────────────── + +const metrics: TelemetryMetrics = { + totalEventsIngested: 0, + totalEventsRejected: 0, + totalBatchRequests: 0, + totalRateLimited: 
0, + totalPiiBlocked: 0, + totalDuplicatesDropped: 0, + uptimeSince: new Date().toISOString(), +}; + +// ─── Webhook alerting ─────────────────────────────────────────── + +const ALERT_WEBHOOK_URL = process.env.TELEMETRY_ALERT_WEBHOOK_URL ?? ''; +const ALERT_SEVERITY_THRESHOLD = (process.env.TELEMETRY_ALERT_SEVERITY_THRESHOLD ?? 'error') as + | 'warn' + | 'error' + | 'fatal'; +const ALERT_COUNT_THRESHOLD = parseInt(process.env.TELEMETRY_ALERT_COUNT_THRESHOLD ?? '10', 10); + +async function sendClusterAlert( + cluster: TelemetryErrorCluster, + previousSeverity: string +): Promise { + if (!ALERT_WEBHOOK_URL) return; + + const severityOrder: Record = { warn: 0, error: 1, fatal: 2 }; + const thresholdNum = severityOrder[ALERT_SEVERITY_THRESHOLD] ?? 1; + const clusterNum = severityOrder[cluster.severity] ?? 0; + + // Alert if severity meets threshold OR count exceeds threshold + if (clusterNum < thresholdNum && cluster.totalCount < ALERT_COUNT_THRESHOLD) return; + + const payload = { + text: `Telemetry Alert: Cluster ${cluster.fingerprint} escalated`, + blocks: [ + { + type: 'section', + text: { + type: 'mrkdwn', + value: [ + `*Telemetry Error Cluster Escalated*`, + `*Event:* ${cluster.module}/${cluster.eventName}`, + `*Platform:* ${cluster.platform} (${cluster.channel})`, + `*Severity:* ${previousSeverity} → *${cluster.severity}*`, + `*Total Count:* ${cluster.totalCount}`, + `*Affected Users:* ${cluster.affectedUserIds.length + cluster.affectedInstallIds.length}`, + `*First Seen:* ${cluster.firstSeenAt}`, + `*Last Seen:* ${cluster.lastSeenAt}`, + cluster.sampleMessage ? 
`*Sample:* \`${cluster.sampleMessage.slice(0, 200)}\`` : '', + ] + .filter(Boolean) + .join('\n'), + }, + }, + ], + }; + + try { + await fetch(ALERT_WEBHOOK_URL, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(payload), + }); + } catch { + // Best-effort — don't fail ingestion on alert failure + } +} + +// ─── Audit helper ─────────────────────────────────────────────── + +function emitAudit( + productId: string, + userId: string, + action: string, + details: Record +): void { + const doc: AuditDoc = { + id: `aud_${randomUUID()}`, + productId, + userId, + action, + category: 'telemetry', + details, + createdAt: new Date().toISOString(), + }; + auditRepo.create(doc).catch(() => {}); +} + /** PII patterns — reject events containing these. */ const PII_PATTERNS = [ /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i, // email @@ -300,14 +396,23 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise { }); } - // Escalate severity + // Escalate severity + alert on escalation const severityOrder = { warn: 0, error: 1, fatal: 2 }; const eventSev = event.eventType as 'warn' | 'error' | 'fatal'; + const previousSeverity = existing.severity; if ((severityOrder[eventSev] ?? 0) > (severityOrder[existing.severity] ?? 
0)) { existing.severity = eventSev; } + // Default status for legacy clusters without status field + if (!existing.status) existing.status = 'open'; + await repo.upsertCluster(existing); + + // Fire webhook alert on severity escalation + if (existing.severity !== previousSeverity) { + sendClusterAlert(existing, previousSeverity).catch(() => {}); + } } else { const newCluster: TelemetryErrorCluster = { id: clusterId, @@ -336,6 +441,7 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise { sampleErrorCode: event.errorCode, sampleMessage: event.message, severity: event.eventType as 'warn' | 'error' | 'fatal', + status: 'open', ttl: DEFAULT_CLUSTER_TTL_DAYS * 86400, }; await repo.upsertCluster(newCluster); @@ -367,6 +473,7 @@ export async function telemetryRoutes(app: FastifyInstance) { // Rate limiting per installId const rateLimitKey = installToken || req.jwtPayload?.sub || 'unknown'; if (!checkRateLimit(rateLimitKey, events.length)) { + metrics.totalRateLimited++; reply.code(429); return { accepted: 0, @@ -383,11 +490,15 @@ export async function telemetryRoutes(app: FastifyInstance) { seenIds.add(e.id); return true; }); + const dupCount = events.length - dedupedEvents.length; + metrics.totalDuplicatesDropped += dupCount; + metrics.totalBatchRequests++; + const now = new Date().toISOString(); const ttl = DEFAULT_EVENT_TTL_DAYS * 86400; let accepted = 0; - let rejected = events.length - dedupedEvents.length; // duplicates + let rejected = dupCount; // duplicates const errors: Array<{ index: number; reason: string }> = []; const docsToInsert: TelemetryEventDoc[] = []; @@ -406,6 +517,7 @@ export async function telemetryRoutes(app: FastifyInstance) { if (fieldsToScan.some(f => containsPII(f!))) { errors.push({ index: i, reason: 'PII detected' }); rejected++; + metrics.totalPiiBlocked++; continue; } @@ -435,6 +547,9 @@ export async function telemetryRoutes(app: FastifyInstance) { } } + metrics.totalEventsIngested += accepted; + 
metrics.totalEventsRejected += rejected; + reply.code(accepted > 0 ? 200 : 400); return { accepted, @@ -549,6 +664,10 @@ export async function telemetryRoutes(app: FastifyInstance) { }; const created = await repo.createPolicy(doc); + emitAudit(productId, doc.createdBy, 'telemetry.policy.created', { + policyId: doc.id, + name: doc.name, + }); reply.code(201); return created; }); @@ -575,6 +694,10 @@ export async function telemetryRoutes(app: FastifyInstance) { updates as Partial ); if (!updated) throw new NotFoundError('Policy not found'); + emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.updated', { + policyId: id, + updates: parsed.data, + }); return updated; }); @@ -585,6 +708,9 @@ export async function telemetryRoutes(app: FastifyInstance) { const productId = getRequestProductId(req); const deleted = await repo.deletePolicy(id, productId); if (!deleted) throw new NotFoundError('Policy not found'); + emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.deleted', { + policyId: id, + }); return { success: true }; }); @@ -594,6 +720,53 @@ export async function telemetryRoutes(app: FastifyInstance) { const { userId } = req.params as { userId: string }; const productId = getRequestProductId(req); const eventsDeleted = await repo.deleteEventsByUserId(productId, userId); + emitAudit(productId, req.jwtPayload?.sub ?? 
'unknown', 'telemetry.gdpr.erasure', { + targetUserId: userId, + eventsDeleted, + }); return { userId, eventsDeleted, clustersUpdated: 0 }; }); + + // ── Admin: resolve/ignore cluster ─────────────────────────── + app.patch('/telemetry/clusters/:id', async req => { + requireAdmin(req); + const { id } = req.params as { id: string }; + const { pk } = req.query as { pk: string }; + if (!pk) throw new BadRequestError('pk query parameter required'); + + const parsed = UpdateClusterSchema.safeParse(req.body); + if (!parsed.success) { + throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; ')); + } + + const updates: Partial = { + status: parsed.data.status, + }; + if (parsed.data.status === 'resolved' || parsed.data.status === 'ignored') { + updates.resolvedBy = req.jwtPayload?.sub ?? 'unknown'; + updates.resolvedAt = new Date().toISOString(); + } + + const updated = await repo.updateCluster(id, pk, updates); + if (!updated) throw new NotFoundError('Cluster not found'); + + const productId = getRequestProductId(req); + emitAudit( + productId, + req.jwtPayload?.sub ?? 
'unknown', + `telemetry.cluster.${parsed.data.status}`, + { + clusterId: id, + pk, + } + ); + + return updated; + }); + + // ── Admin: ingestion metrics ────────────────────────────── + app.get('/telemetry/metrics', async req => { + requireAdmin(req); + return metrics; + }); } diff --git a/services/platform-service/src/modules/telemetry/types.ts b/services/platform-service/src/modules/telemetry/types.ts index 475caf41..5e61c095 100644 --- a/services/platform-service/src/modules/telemetry/types.ts +++ b/services/platform-service/src/modules/telemetry/types.ts @@ -201,6 +201,8 @@ export interface TelemetryCollectionConfig { // ─── Error Cluster ────────────────────────────────────────────────── +export const ClusterStatusEnum = z.enum(['open', 'resolved', 'ignored']); + export interface TelemetryErrorCluster { id: string; // ${fingerprint}:${yyyyMM} pk: string; // ${productId}:${platform}:${module} @@ -230,9 +232,16 @@ export interface TelemetryErrorCluster { sampleErrorCode?: string; sampleMessage?: string; severity: 'warn' | 'error' | 'fatal'; + status: 'open' | 'resolved' | 'ignored'; + resolvedBy?: string; + resolvedAt?: string; ttl: number; } +export const UpdateClusterSchema = z.object({ + status: ClusterStatusEnum, +}); + // ─── Query / Admin types ──────────────────────────────────────────── export const TelemetryQuerySchema = z.object({ @@ -253,3 +262,15 @@ export const TelemetryQuerySchema = z.object({ }); export type TelemetryQueryInput = z.infer; + +// ─── Ingestion Metrics (in-memory counters) ───────────────────────── + +export interface TelemetryMetrics { + totalEventsIngested: number; + totalEventsRejected: number; + totalBatchRequests: number; + totalRateLimited: number; + totalPiiBlocked: number; + totalDuplicatesDropped: number; + uptimeSince: string; +}