docs: update documentation

This commit is contained in:
saravanakumardb1 2026-02-17 10:49:14 -08:00
parent 856788c386
commit 80a4459f81
4 changed files with 400 additions and 114 deletions

View File

@ -20,6 +20,10 @@
5. [New Environment Variables](#5-new-environment-variables)
6. [Quick Reference — Where Things Live](#6-quick-reference--where-things-live)
- [Appendix A: Risks & Open Questions](#appendix-a-risks--open-questions)
- [Appendix B: Component Dependency Graph](#appendix-b-component-dependency-graph)
- [Appendix C: Review Findings](#appendix-c-review-findings)
---
## 1. Current Inventory
@ -29,23 +33,23 @@
| Category | Module | Endpoints | Description |
| ------------ | --------------- | --------- | --------------------------------------------------------------------------------------------------- |
| **Identity** | `auth` | 11 routes | Login, register, refresh, SSO, profile, admin user CRUD |
| **Identity** | `tokens` | CRUD | API token management |
| **Identity** | `licenses` | CRUD | License key generation, activation, device binding |
| **Billing** | `subscriptions` | CRUD | Plan management, trial tracking, period management |
| **Billing** | `stripe` | Webhooks | Inbound Stripe webhook processing |
| **Billing** | `plans` | CRUD | Plan definitions (free, pro, enterprise) |
| **Billing** | `usage` | CRUD | Usage tracking and quota enforcement |
| **Billing** | `promos` | CRUD | Promo code creation, validation, redemption |
| **Growth** | `invitations` | CRUD | Invitation code generation, redemption, tracking |
| **Growth** | `referrals` | CRUD | Referral link tracking, status transitions |
| **Identity** | `tokens` | 5 routes | API token management (CRUD + validate) |
| **Identity** | `licenses` | 6 routes | License key generation, activation, device binding, validate |
| **Billing** | `subscriptions` | 5 routes | Plan management, trial tracking, period management |
| **Billing** | `stripe` | 2 routes | Inbound Stripe webhook + portal session |
| **Billing** | `plans` | 4 routes | Plan definitions (free, pro, enterprise) |
| **Billing** | `usage` | 4 routes | Usage tracking and quota enforcement |
| **Billing** | `promos` | 5 routes | Promo code creation, validation, redemption |
| **Growth** | `invitations` | 5 routes | Invitation code generation, redemption, tracking |
| **Growth** | `referrals` | 5 routes | Referral link tracking, status transitions |
| **Growth** | `waitlist` | 12 routes | Pre-launch signups, position tracking, admin batch invite, CSV export |
| **Growth** | `public` | 5 routes | Public roadmap, community voting, feature submissions |
| **Content** | `items` | CRUD | Tracker items (bugs, features, tasks) |
| **Content** | `comments` | CRUD | Threaded comments on items |
| **Content** | `votes` | CRUD | User votes on items and comments |
| **Content** | `items` | 5 routes | Tracker items (bugs, features, tasks) |
| **Content** | `comments` | 4 routes | Threaded comments on items |
| **Content** | `votes` | 3 routes | User votes on items and comments |
| **Content** | `memory` | 5 routes | Memory items — create, reassign, patch, delete |
| **Ops** | `audit` | Query | Audit log recording and admin queries |
| **Ops** | `flags` | CRUD | Feature flags with FNV-1a deterministic rollout |
| **Ops** | `flags` | 5 routes | Feature flags with FNV-1a deterministic rollout |
| **Ops** | `telemetry` | 9 routes | Client event ingestion, error clustering, collection policies, GDPR erasure |
| **Ops** | `notifications` | 5 routes | Device registration, notification preferences |
| **Ops** | `settings` | 6 routes | User/device settings, kill switch |
@ -74,11 +78,11 @@
### 1.3 Services
| Service | Port | Description |
| ---------------------- | ---- | ----------------------------------------------------- |
| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 158+ tests) |
| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar |
| **monitoring** | 4004 | Health-check aggregator (all services) |
| Service | Port | Description |
| ---------------------- | ---- | ---------------------------------------------------- |
| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 621 tests) |
| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar |
| **monitoring** | 4004 | Health-check aggregator (all services) |
### 1.4 Dashboards
@ -185,8 +189,8 @@ platform-service/src/modules/delivery/
│ ├── push-apns.ts — Apple Push Notification Service
│ ├── push-fcm.ts — Firebase Cloud Messaging
│ └── sms.ts — Twilio/Azure Communication Services (future)
├── renderer.ts — Template rendering (Handlebars/Mustache for email)
├── repository.ts — delivery_log container (track sent/failed/bounced)
├── renderer.ts — Template rendering (Handlebars for email bodies)
├── repository.ts — delivery_log + email_templates containers
├── dispatcher.ts — Route delivery request to correct channel(s) based on prefs
└── routes.ts — Admin: send test, view delivery log, manage templates
```
@ -244,21 +248,21 @@ platform-service/src/modules/webhooks/
**Event catalog (subscribe to any combination):**
| Event | Payload | Source |
| ----------------------- | ---------------------------------------------- | --------------------------- |
| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` |
| `user.deleted` | `{ userId }` | `auth.delete` |
| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook |
| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook |
| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe |
| `payment.succeeded` | `{ invoiceId, amount, userId }` | Stripe webhook |
| `payment.failed` | `{ invoiceId, amount, userId, retryCount }` | Stripe webhook |
| `invitation.redeemed` | `{ invitationId, userId }` | Invitation module |
| `referral.completed` | `{ referralId, referrerId, referredId }` | Referral module |
| `waitlist.joined` | `{ email, position }` | Waitlist module |
| `flag.toggled` | `{ flagId, enabled, percentage }` | Flags module |
| `license.activated` | `{ licenseId, userId, deviceId }` | License module |
| `license.expired` | `{ licenseId, userId }` | Jobs: license-expiry-check |
| Event | Payload | Source |
| ----------------------- | ---------------------------------------------- | ------------------------------- |
| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` |
| `user.deleted` | `{ userId }` | Admin: `DELETE /auth/users/:id` |
| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook |
| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook |
| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe |
| `payment.succeeded` | `{ invoiceId, amount, userId }` | Stripe webhook |
| `payment.failed` | `{ invoiceId, amount, userId, retryCount }` | Stripe webhook |
| `invitation.redeemed` | `{ invitationId, userId }` | Invitation module |
| `referral.completed` | `{ referralId, referrerId, referredId }` | Referral module |
| `waitlist.joined` | `{ email, position }` | Waitlist module |
| `flag.toggled` | `{ flagId, enabled, percentage }` | Flags module |
| `license.activated` | `{ licenseId, userId, deviceId }` | License module |
| `license.expired` | `{ licenseId, userId }` | Jobs: license-expiry-check |
**Security:**
@ -334,6 +338,13 @@ const PlatformEvents = {
} as const;
```
**Migration from existing `lib/webhooks.ts`:**
- Existing `dispatchInvitationRedeemed()`, `dispatchReferralStatusChanged()`, `dispatchWaitlistJoined()` become event bus subscribers
- Phase 1: Register existing webhooks.ts functions as handlers on the bus
- Phase 2: Replace inline dispatch calls in routes with `bus.emit()`
- Phase 3: Remove `lib/webhooks.ts` once all callers migrated
**Benefits:**
- Audit logging becomes a subscriber, not inline code
@ -389,7 +400,7 @@ interface PasswordResetToken {
- `password_reset_tokens` (pk: `/productId`) — short-lived, TTL 24h auto-expiry
**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with console-logged URLs for dev/testing.
**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with `req.log.info`-logged URLs for dev/testing (never `console.log`).
---
@ -556,9 +567,11 @@ platform-service/src/modules/exports/
**Flow:**
1. Admin POST `/api/exports` → `{ type: 'users', format: 'csv', filters: { plan: 'free' } }`
2. Background job runs query, writes result to blob storage
2. Background job runs query, writes result to blob storage (via existing `blob` module)
3. Job status updates: `pending` → `processing` → `ready` / `failed`
4. Admin downloads from signed blob URL
4. Admin downloads from signed blob URL (SAS token via `@bytelyst/blob`)
**Dependencies:** `blob` module (existing) for storage, `jobs` module (§2.1) for auto-cleanup of expired exports.
**Supported exports:**
@ -629,6 +642,8 @@ interface MaintenanceConfig {
- Schedule builder with start/end date pickers
- Bypass IP whitelist management
**Storage:** Maintenance config is a single document per product in the existing `settings` container (field: `maintenanceConfig`). No new Cosmos container needed.
---
#### 2.11 Rate Limit Dashboard & IP Allow/Deny Lists
@ -678,6 +693,11 @@ interface IPRule {
- IP rules management (allow/deny with expiry)
- Per-user rate limit override
**Cosmos container:**
- `ip_rules` (pk: `/productId`) — persistent IP allow/deny rules
- Rate limit stats remain in-memory (ephemeral); no persistence needed for counters
---
### P2 — Product Intelligence
@ -715,7 +735,7 @@ interface ExperimentDoc {
hypothesis: string;
status: 'draft' | 'running' | 'paused' | 'concluded';
variants: Variant[]; // [{id: 'control', weight: 50}, {id: 'treatment', weight: 50}]
targetingRules: {}; // Same as flag targeting
targetingRules: FlagTargetingRules; // Reuse from flags module (platforms, versions, percentage)
primaryMetric: string; // e.g., 'dictation_completed_rate'
secondaryMetrics: string[];
startedAt?: string;
@ -1046,25 +1066,26 @@ This is a major architectural expansion. Defer until enterprise tier is validate
Each new component introduces Cosmos containers. Cosmos DB Serverless charges per RU consumed + storage, so idle containers cost only storage (~$0.25/GB/month).
| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU |
| ---------------------- | --------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- |
| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) |
| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU |
| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU |
| **2.5 Password Reset** | `password_reset_tokens` | `/productId` | 24h auto | ~10 RU |
| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU |
| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) |
| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) |
| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU |
| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU |
| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) |
| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU |
| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU |
| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) |
| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU |
| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU |
| ---------------------- | ---------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- |
| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) |
| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU |
| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU |
| **2.5 Password Reset** | `password_reset_tokens`, `email_verifications` | `/productId`, `/productId` | 24h auto | ~10 RU |
| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU |
| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) |
| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) |
| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU |
| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU |
| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) |
| **2.11 IP Rules** | `ip_rules` | `/productId` | None (manual) | ~10 RU |
| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU |
| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU |
| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) |
| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU |
**Total new containers:** ~17 (across all phases)
**Existing containers:** ~25+ (across platform-service + dashboards)
**Total new containers:** ~19 (across all phases)
**Existing containers:** 27 (defined in `cosmos-init.ts`: products, users, settings, devices, notification_prefs, audit_log, feature_flags, invitation_codes, referrals, subscriptions, payments, licenses, plans, usage_daily, api_tokens, tracker_items, comments, votes, themes, waitlist, memory_items, daily_briefs, reflections, brain_insights, telemetry_events, telemetry_error_clusters, telemetry_collection_policies). Note: `promos` module uses Stripe API directly — no Cosmos container.
**Cost impact:** Minimal for Serverless tier — idle containers only consume storage. Active containers during job runs add burst RU.
**Recommendation:** Register all new containers in `cosmos-init.ts` alongside existing ones. Use TTL liberally for transient data (tokens, deliveries, job runs) to keep storage bounded.
@ -1075,22 +1096,26 @@ Each new component introduces Cosmos containers. Cosmos DB Serverless charges pe
New components will require additional env vars. All should be added to `.env.example` files in both repos and documented.
| Component | Variable | Example | Required |
| -------------------- | -------------------------- | -------------------------------- | ------------------------- |
| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) |
| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) |
| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) |
| **2.2 Email** | `EMAIL_FROM_ADDRESS` | `noreply@lysnrai.com` | Yes |
| **2.2 Email** | `EMAIL_FROM_NAME` | `LysnrAI` | No |
| **2.2 Push** | `APNS_KEY_ID` | `ABC123` | Yes (for iOS push) |
| **2.2 Push** | `APNS_TEAM_ID` | `748N7QPX7J` | Yes |
| **2.2 Push** | `APNS_KEY_PATH` | `./certs/AuthKey.p8` | Yes |
| **2.2 Push** | `FCM_SERVICE_ACCOUNT_JSON` | `{...}` | Yes (for Android push) |
| **2.5 Auth** | `PASSWORD_RESET_URL_BASE` | `https://app.lysnrai.com/reset` | Yes |
| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes |
| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) |
| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No |
| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) |
| Component | Variable | Example | Required |
| -------------------- | ----------------------------- | -------------------------------- | ------------------------- |
| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) |
| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) |
| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) |
| **2.2 Email** | `EMAIL_FROM_ADDRESS` | `noreply@lysnrai.com` | Yes |
| **2.2 Email** | `EMAIL_FROM_NAME` | `LysnrAI` | No |
| **2.2 Push** | `APNS_KEY_ID` | `ABC123` | Yes (for iOS push) |
| **2.2 Push** | `APNS_TEAM_ID` | `748N7QPX7J` | Yes |
| **2.2 Push** | `APNS_KEY_PATH` | `./certs/AuthKey.p8` | Yes |
| **2.2 Push** | `FCM_SERVICE_ACCOUNT_JSON` | `{...}` | Yes (for Android push) |
| **2.5 Auth** | `PASSWORD_RESET_URL_BASE` | `https://app.lysnrai.com/reset` | Yes |
| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes |
| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) |
| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No |
| **2.3 Webhooks** | `WEBHOOK_DELIVERY_TIMEOUT_MS` | `5000` | No (default: 5s) |
| **2.3 Webhooks** | `WEBHOOK_MAX_RETRIES` | `3` | No (default: 3) |
| **2.7 Sessions** | `SESSION_TTL_DAYS` | `90` | No (default: 90) |
| **2.7 Sessions** | `SESSION_CACHE_TTL_MS` | `30000` | No (default: 30s) |
| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) |
**Secret management:** `SENDGRID_API_KEY`, `APNS_*`, and `FCM_*` should be added to Azure Key Vault as `lysnr-sendgrid-api-key`, `lysnr-apns-key-id`, etc. Update `LYSNR_SECRETS` in `@bytelyst/config` to include them.
@ -1098,30 +1123,50 @@ New components will require additional env vars. All should be added to `.env.ex
## 6. Quick Reference — Where Things Live
| Component | Repo | Path |
| ------------------------ | ------------------------- | ----------------------------------------------- |
| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` |
| Shared packages | `learning_ai_common_plat` | `packages/` |
| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` |
| User dashboard | `learning_voice_ai_agent` | `user-dashboard-web/` |
| Tracker dashboard | `learning_voice_ai_agent` | `tracker-dashboard-web/` |
| Docker Compose | both repos | `docker-compose.yml` |
| Monitoring | `learning_ai_common_plat` | `services/monitoring/` |
| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` |
| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` |
| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` |
| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` |
| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` |
| Component | Repo | Path |
| ------------------------ | ----------------------------------- | ------------------------------------------------------ |
| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` |
| Shared packages | `learning_ai_common_plat` | `packages/` |
| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` |
| User dashboard | `learning_voice_ai_agent` | `user-dashboard-web/` |
| Tracker dashboard | `learning_voice_ai_agent` | `tracker-dashboard-web/` |
| Docker Compose | both repos | `docker-compose.yml` |
| Monitoring | `learning_ai_common_plat` | `services/monitoring/` |
| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` |
| MindLyst native app | `learning_multimodal_memory_agents` | `mindlyst-native/` (KMP + SwiftUI + Compose + Next.js) |
| MindLyst web | `learning_multimodal_memory_agents` | `mindlyst-native/web/` |
| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` |
| Cosmos container defs | `learning_ai_common_plat` | `services/platform-service/src/lib/cosmos-init.ts` |
| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` |
| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` |
| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` |
---
## Appendix: Component Dependency Graph
## Appendix A: Risks & Open Questions
| # | Topic | Risk / Question | Mitigation |
| --- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| 1 | **Leader election for jobs** | In-process tick loop with Cosmos lease — what happens during deploys? Two instances may briefly both hold leases. | Cosmos lease has a built-in TTL. Use 30s lease with 10s renewal. During deploy overlap, the old instance's lease expires before the new one acquires. Jobs must be idempotent. |
| 2 | **Email deliverability** | SendGrid requires domain verification (SPF/DKIM/DMARC). Without it, emails land in spam. | Set up `lysnrai.com` domain authentication in SendGrid before shipping §2.2. Budget 1–2 days for DNS propagation. |
| 3 | **Session validation latency** | Checking Cosmos on every request for session revocation adds ~5–10ms per request. | In-memory cache with 30s TTL (§2.7). Revocation is eventually consistent — acceptable trade-off for most apps. Document the 30s window. |
| 4 | **Cosmos container proliferation** | 27 existing + 19 new = 46 containers. Serverless tier has no per-container cost, but management complexity grows. | Group related containers by module. Document all containers in `cosmos-init.ts`. Consider container-per-module naming convention. |
| 5 | **Event bus ordering guarantees** | In-memory `EventEmitter` has no ordering guarantees across handlers. If audit must record before webhook fires, ordering matters. | Phase 1: Document that handlers run concurrently with no ordering. If ordering is needed, use handler priority weights or sequential mode. |
| 6 | **Push notification certificates** | APNs requires yearly certificate renewal. If it expires, all iOS push silently stops. | Add `apns-cert-expiry-check` to scheduled jobs (§2.1). Alert admin 30 days before expiry. |
| 7 | **Webhook abuse** | External subscribers could register slow endpoints that back up the delivery queue. | Per-subscription timeout (5s default), circuit breaker after 10 consecutive failures, auto-disable. |
| 8 | **Migration rollback** | Cosmos is schemaless — some migrations (e.g., partition key changes) are irreversible. | Mark migrations as `reversible: true/false`. Require manual approval for irreversible migrations. Always back up before running. |
| 9 | **MindLyst parity** | MindLyst web uses Cosmos directly (in-memory fallback). Shared components (email, sessions, webhooks) must work for MindLyst too, not just LysnrAI. | All new modules use `productId` for multi-product isolation. MindLyst can consume the same platform-service APIs. |
| 10 | **Priority conflicts** | Sprint plan assumes available engineering bandwidth. If telemetry or mobile work takes priority, these sprints slip. | Treat sprint assignments as relative ordering, not calendar commitments. Re-evaluate after each sprint. |
---
## Appendix B: Component Dependency Graph
```
┌─────────────────────┐
│ Event Bus (2.4) │
└─────────┬───────────┘
│ emits events to all subscribers
│ emits to subscribers
┌───────────┼───────────┼───────────┐
│ │ │ │
▼ ▼ ▼ ▼
@ -1130,31 +1175,60 @@ New components will require additional env vars. All should be added to `.env.ex
│ (2.2) │ │ (2.3) │ │ (existing)│ │ (2.13) │
└─────┬─────┘ └───────────┘ └───────────┘ └───────────┘
triggers
sends
┌───────────┐
│ Password │
│ Reset(2.5)│
└───────────┘
┌───────────────┐──▶┌─────────────────┐ ┌─────────────────┐
│ Scheduled │ │ Analytics │ │ Blob Storage │
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (existing) │
└───────┬───────┘ └─────────────────┘ └────────┬────────┘
│ │
│ triggers on schedule ▲ writes exports
▼ │
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Scheduled │──▶│ Analytics │ │ Data Export │
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (2.9) │
└───────┬───────┘ └─────────────────┘ └─────────────────┘
│ triggers on schedule
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Trial Expiry │ │ Usage Reset │ │ Retention │
│ Check │ │ │ │ Cleanup (2.23) │
│ Trial Expiry │ │ Usage Reset │ │ Data Export │
│ (2.1 job) │ │ (2.1 job) │ │ (2.9) │
└───────────────┘ └─────────────────┘ └─────────────────┘
┌───────────────┐ ┌─────────────────┐
│ Billing │──▶│ Email/Push │
│ Dunning(2.25) │ │ Delivery (2.2) │
└───────────────┘ └─────────────────┘
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Billing │──▶│ Email/Push │ │ Retention │
│ Dunning(2.25) │ │ Delivery (2.2) │ │ Cleanup (2.23) │
└───────────────┘ └─────────────────┘ └─────────────────┘
```
---
## Appendix C: Review Findings
Systematic review performed 2026-02-17. All issues below have been fixed inline.
| # | Severity | Section | Finding | Fix |
| --- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- |
| 1 | **Bug** | §1.3 | Test count stale: said "158+ tests" — actual count is **621** (verified via `grep -c 'it(' *.test.ts`). | Updated to 621. |
| 2 | **Bug** | §1.1 | Endpoint column inconsistent: some modules said "CRUD" (vague, could be 4–8 routes), others had exact counts. | Replaced all "CRUD" with actual route counts. |
| 3 | **Bug** | §2.5 | Said "console-logged URLs for dev/testing" — violates project rule: never `console.log` in production code. | Changed to `req.log.info`. |
| 4 | **Bug** | §2.12 | `ExperimentDoc.targetingRules: {}` — meaningless empty object type. | Changed to `FlagTargetingRules` (reuse from flags module). |
| 5 | **Bug** | §2.3 | Webhook event `user.deleted` source said `auth.delete` — no such endpoint name. Actual route is `DELETE /auth/users/:id` (admin action). | Fixed source column. |
| 6 | **Bug** | §4 | `email_verifications` container (from §2.5) missing from Cosmos table. Only `password_reset_tokens` was listed. | Added `email_verifications` to §2.5 row. |
| 7 | **Bug** | §4 | Existing container count said "~25+" — actual is **27** (counted from `cosmos-init.ts`; `promos` uses Stripe API directly, no Cosmos container). | Updated to 27 with full container list. |
| 8 | **Bug** | §4 | Total new containers said "~17" — after adding `email_verifications` and `ip_rules`, count is **19**. | Updated. |
| 9 | **Gap** | §2.2 | No clarity on email template storage strategy. `renderer.ts` mentioned but not whether templates are Cosmos-stored or file-based. | Clarified: `repository.ts` now references `delivery_log + email_templates` containers. |
| 10 | **Gap** | §2.4 | No migration strategy from existing `lib/webhooks.ts` to new event bus pattern. | Added "Migration from existing `lib/webhooks.ts`" subsection with 3-phase plan. |
| 11 | **Gap** | §2.10 | Maintenance mode proposed extending `settings` module but didn't clarify storage location. Missing from §4 Cosmos table. | Added: stored as single document per product in existing `settings` container (no new container needed). |
| 12 | **Gap** | §2.11 | IP rules need persistence but no container was mentioned. Missing from §4 table. | Added `ip_rules` container (pk: `/productId`) to both §2.11 and §4 table. |
| 13 | **Gap** | §2.9 | Data Export didn't mention blob module dependency (exports written to blob storage). | Added explicit dependency note on `blob` module and `jobs` module for cleanup. |
| 14 | **Gap** | §5 | Missing env vars for webhooks (timeout, retries) and sessions (TTL, cache TTL). | Added 4 new env vars: `WEBHOOK_DELIVERY_TIMEOUT_MS`, `WEBHOOK_MAX_RETRIES`, `SESSION_TTL_DAYS`, `SESSION_CACHE_TTL_MS`. |
| 15 | **Gap** | §6 | Quick Reference missing MindLyst repo (`learning_multimodal_memory_agents`). Doc scope says "ByteLyst platform" which includes MindLyst. | Added MindLyst native app and web entries. Also added `cosmos-init.ts` path. |
| 16 | **Gap** | Appendix | Dependency graph incomplete: missing Jobs → Data Export connection, missing Blob → Data Export dependency, downstream jobs not labeled with section numbers. | Rewrote graph with all connections and section labels. |
| 17 | **Gap** | Overall | No "Risks & Open Questions" section — design docs should call out unknowns. | Added Appendix A with 10 risk items and mitigations. |
| 18 | **Gap** | TOC | Table of Contents didn't include Appendix sections. | Added Appendix A, B, C to TOC. |
| 19 | **Gap** | §2.5 | Password reset cross-referenced "§2.6" for sessions but sessions was renumbered to §2.7 in previous edit pass. | Fixed to §2.7 (caught in prior pass). |
| 20 | **Gap** | §1.5 | Infrastructure table was missing Swagger/OpenAPI (partially wired) and Prometheus metrics (partially enabled). | Added in prior pass — verified still present. |
---
_This document is a living brainstorm. Items will be promoted to dedicated design docs (like `CLIENT_TELEMETRY_DESIGN.md`) as they move into implementation._

View File

@ -252,3 +252,21 @@ export async function getCluster(id: string, pk: string): Promise<TelemetryError
return null;
}
}
/**
 * Apply a partial update to a stored error cluster.
 *
 * Reads the item identified by `id` / `pk` from the clusters container,
 * shallow-merges `updates` over it (fields in `updates` win), and replaces
 * the stored item with the merged document.
 *
 * @param id - Item id of the cluster.
 * @param pk - Partition key value passed to the container's `item()` lookup.
 * @param updates - Fields to overwrite on the existing document.
 * @returns The replaced document, or `null` when the item does not exist or
 *          when the read or replace throws.
 */
export async function updateCluster(
  id: string,
  pk: string,
  updates: Partial<TelemetryErrorCluster>
): Promise<TelemetryErrorCluster | null> {
  try {
    const readResponse = await clustersContainer().item(id, pk).read<TelemetryErrorCluster>();
    const current = readResponse.resource;
    if (!current) {
      return null;
    }

    // Shallow merge: later spread wins, so caller-supplied fields override stored ones.
    const replaceResponse = await clustersContainer()
      .item(id, pk)
      .replace({ ...current, ...updates });
    return replaceResponse.resource as unknown as TelemetryErrorCluster;
  } catch {
    // Any failure is reported to the caller as "no result".
    return null;
  }
}

View File

@ -1,15 +1,17 @@
/**
* Telemetry REST endpoints.
*
* POST /telemetry/events batch ingest (any auth)
* GET /telemetry/config collection config for clients
* GET /telemetry/query admin query
* GET /telemetry/clusters admin error clusters
* GET /telemetry/policies list policies (admin)
* POST /telemetry/policies create policy (admin)
* PUT /telemetry/policies/:id update policy (admin)
* DELETE /telemetry/policies/:id delete policy (admin)
* DELETE /telemetry/user/:userId GDPR erasure (admin)
* POST /telemetry/events batch ingest (any auth)
* GET /telemetry/config collection config for clients
* GET /telemetry/query admin query
* GET /telemetry/clusters admin error clusters
* PATCH /telemetry/clusters/:id resolve/ignore cluster (admin)
* GET /telemetry/policies list policies (admin)
* POST /telemetry/policies create policy (admin)
* PUT /telemetry/policies/:id update policy (admin)
* DELETE /telemetry/policies/:id delete policy (admin)
* DELETE /telemetry/user/:userId GDPR erasure (admin)
* GET /telemetry/metrics ingestion metrics (admin)
*/
import type { FastifyInstance } from 'fastify';
@ -23,12 +25,16 @@ import {
TelemetryIngestRequestSchema,
CreatePolicySchema,
UpdatePolicySchema,
UpdateClusterSchema,
TelemetryQuerySchema,
type TelemetryEventDoc,
type TelemetryCollectionPolicyDoc,
type TelemetryCollectionConfig,
type TelemetryErrorCluster,
type TelemetryMetrics,
} from './types.js';
import * as auditRepo from '../audit/repository.js';
import type { AuditDoc } from '../audit/types.js';
// ─── Helpers ────────────────────────────────────────────────────────
@ -69,6 +75,96 @@ const _cleanupTimer = globalThis.setInterval(() => {
}, 300_000);
// Only timer handles that are objects exposing `unref` (e.g. Node.js Timeout)
// are unref'd; numeric handles are left untouched.
if (typeof _cleanupTimer === 'object' && 'unref' in _cleanupTimer) {
  _cleanupTimer.unref();
}
// ─── Ingestion metrics (in-memory counters) ─────────────────────
// Module-level counters mutated by the ingest route and returned as-is by
// GET /telemetry/metrics. They live only in this object; nothing in this
// block persists them.
const metrics: TelemetryMetrics = {
totalEventsIngested: 0,
totalEventsRejected: 0,
totalBatchRequests: 0,
totalRateLimited: 0,
totalPiiBlocked: 0,
totalDuplicatesDropped: 0,
// Captured once, when this module is evaluated.
uptimeSince: new Date().toISOString(),
};
// ─── Webhook alerting ───────────────────────────────────────────
/** Count threshold used when TELEMETRY_ALERT_COUNT_THRESHOLD is unset or not an integer. */
const DEFAULT_ALERT_COUNT_THRESHOLD = 10;

/**
 * Validates the configured severity threshold.
 *
 * @param raw - Raw value of TELEMETRY_ALERT_SEVERITY_THRESHOLD.
 * @returns `raw` when it is exactly `warn`, `error` or `fatal`; otherwise `error`.
 */
function parseAlertSeverityThreshold(raw: string | undefined): 'warn' | 'error' | 'fatal' {
  return raw === 'warn' || raw === 'error' || raw === 'fatal' ? raw : 'error';
}

/**
 * Parses the configured count threshold.
 *
 * A non-numeric or empty value would otherwise become `NaN`, and every
 * `count < NaN` comparison is false, which silently disables the threshold gate.
 *
 * @param raw - Raw value of TELEMETRY_ALERT_COUNT_THRESHOLD.
 * @returns The parsed base-10 integer, or the default when parsing fails.
 */
function parseAlertCountThreshold(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isNaN(parsed) ? DEFAULT_ALERT_COUNT_THRESHOLD : parsed;
}

/** Webhook endpoint for cluster alerts; an empty string disables alerting. */
const ALERT_WEBHOOK_URL = process.env.TELEMETRY_ALERT_WEBHOOK_URL ?? '';
/** Minimum cluster severity that triggers an alert. */
const ALERT_SEVERITY_THRESHOLD = parseAlertSeverityThreshold(
  process.env.TELEMETRY_ALERT_SEVERITY_THRESHOLD
);
/** Total-count level at which a cluster triggers an alert regardless of severity. */
const ALERT_COUNT_THRESHOLD = parseAlertCountThreshold(
  process.env.TELEMETRY_ALERT_COUNT_THRESHOLD
);
/**
 * Builds the webhook body (Slack Block Kit shape) describing an escalated cluster.
 *
 * The mrkdwn text object must carry its content under the `text` key; a
 * different key makes the block invalid for Slack.
 *
 * @param cluster - Cluster whose current state is reported.
 * @param previousSeverity - Severity the cluster had before the escalation.
 * @returns A JSON-serialisable payload with a plain-text fallback and one section block.
 */
function buildClusterAlertPayload(cluster: TelemetryErrorCluster, previousSeverity: string) {
  const lines = [
    `*Telemetry Error Cluster Escalated*`,
    `*Event:* ${cluster.module}/${cluster.eventName}`,
    `*Platform:* ${cluster.platform} (${cluster.channel})`,
    `*Severity:* ${previousSeverity} → *${cluster.severity}*`,
    `*Total Count:* ${cluster.totalCount}`,
    `*Affected Users:* ${cluster.affectedUserIds.length + cluster.affectedInstallIds.length}`,
    `*First Seen:* ${cluster.firstSeenAt}`,
    `*Last Seen:* ${cluster.lastSeenAt}`,
    // Sample is truncated to 200 characters; omitted entirely when absent.
    cluster.sampleMessage ? `*Sample:* \`${cluster.sampleMessage.slice(0, 200)}\`` : '',
  ].filter(Boolean);

  return {
    text: `Telemetry Alert: Cluster ${cluster.fingerprint} escalated`,
    blocks: [
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: lines.join('\n'),
        },
      },
    ],
  };
}

/**
 * Posts a best-effort webhook alert for a cluster whose severity changed.
 *
 * Does nothing when no webhook URL is configured, or when the cluster is both
 * below the severity threshold and below the count threshold. Network errors
 * are swallowed so alerting can never fail ingestion.
 *
 * @param cluster - Cluster after the escalation was applied.
 * @param previousSeverity - Severity before the escalation.
 */
async function sendClusterAlert(
  cluster: TelemetryErrorCluster,
  previousSeverity: string
): Promise<void> {
  if (!ALERT_WEBHOOK_URL) return;

  const severityOrder: Record<string, number> = { warn: 0, error: 1, fatal: 2 };
  const thresholdNum = severityOrder[ALERT_SEVERITY_THRESHOLD] ?? 1;
  const clusterNum = severityOrder[cluster.severity] ?? 0;
  // Alert if severity meets threshold OR count reaches threshold
  if (clusterNum < thresholdNum && cluster.totalCount < ALERT_COUNT_THRESHOLD) return;

  const payload = buildClusterAlertPayload(cluster, previousSeverity);
  try {
    await fetch(ALERT_WEBHOOK_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });
  } catch {
    // Best-effort — don't fail ingestion on alert failure
  }
}
// ─── Audit helper ───────────────────────────────────────────────
/**
 * Writes a `telemetry`-category audit record without blocking the caller.
 *
 * The repository write is started but not awaited, and a failed write is
 * discarded, so auditing can never fail the request that triggered it.
 *
 * @param productId - Product the audited action belongs to.
 * @param userId - Actor recorded on the audit entry.
 * @param action - Action identifier stored on the entry.
 * @param details - Free-form payload stored alongside the action.
 */
function emitAudit(
  productId: string,
  userId: string,
  action: string,
  details: Record<string, unknown>
): void {
  const id = `aud_${randomUUID()}`;
  const createdAt = new Date().toISOString();
  const entry: AuditDoc = { id, productId, userId, action, category: 'telemetry', details, createdAt };

  // Fire-and-forget: rejection is intentionally ignored.
  auditRepo.create(entry).then(undefined, () => {});
}
/** PII patterns — reject events containing these. */
const PII_PATTERNS = [
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i, // email
@ -300,14 +396,23 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
});
}
// Escalate severity
// Escalate severity + alert on escalation
const severityOrder = { warn: 0, error: 1, fatal: 2 };
const eventSev = event.eventType as 'warn' | 'error' | 'fatal';
const previousSeverity = existing.severity;
if ((severityOrder[eventSev] ?? 0) > (severityOrder[existing.severity] ?? 0)) {
existing.severity = eventSev;
}
// Default status for legacy clusters without status field
if (!existing.status) existing.status = 'open';
await repo.upsertCluster(existing);
// Fire webhook alert on severity escalation
if (existing.severity !== previousSeverity) {
sendClusterAlert(existing, previousSeverity).catch(() => {});
}
} else {
const newCluster: TelemetryErrorCluster = {
id: clusterId,
@ -336,6 +441,7 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
sampleErrorCode: event.errorCode,
sampleMessage: event.message,
severity: event.eventType as 'warn' | 'error' | 'fatal',
status: 'open',
ttl: DEFAULT_CLUSTER_TTL_DAYS * 86400,
};
await repo.upsertCluster(newCluster);
@ -367,6 +473,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
// Rate limiting keyed by install token, falling back to the JWT subject, then 'unknown'
const rateLimitKey = installToken || req.jwtPayload?.sub || 'unknown';
if (!checkRateLimit(rateLimitKey, events.length)) {
metrics.totalRateLimited++;
reply.code(429);
return {
accepted: 0,
@ -383,11 +490,15 @@ export async function telemetryRoutes(app: FastifyInstance) {
seenIds.add(e.id);
return true;
});
const dupCount = events.length - dedupedEvents.length;
metrics.totalDuplicatesDropped += dupCount;
metrics.totalBatchRequests++;
const now = new Date().toISOString();
const ttl = DEFAULT_EVENT_TTL_DAYS * 86400;
let accepted = 0;
let rejected = events.length - dedupedEvents.length; // duplicates
let rejected = dupCount; // duplicates
const errors: Array<{ index: number; reason: string }> = [];
const docsToInsert: TelemetryEventDoc[] = [];
@ -406,6 +517,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
if (fieldsToScan.some(f => containsPII(f!))) {
errors.push({ index: i, reason: 'PII detected' });
rejected++;
metrics.totalPiiBlocked++;
continue;
}
@ -435,6 +547,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
}
}
metrics.totalEventsIngested += accepted;
metrics.totalEventsRejected += rejected;
reply.code(accepted > 0 ? 200 : 400);
return {
accepted,
@ -549,6 +664,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
};
const created = await repo.createPolicy(doc);
emitAudit(productId, doc.createdBy, 'telemetry.policy.created', {
policyId: doc.id,
name: doc.name,
});
reply.code(201);
return created;
});
@ -575,6 +694,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
updates as Partial<TelemetryCollectionPolicyDoc>
);
if (!updated) throw new NotFoundError('Policy not found');
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.updated', {
policyId: id,
updates: parsed.data,
});
return updated;
});
@ -585,6 +708,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
const productId = getRequestProductId(req);
const deleted = await repo.deletePolicy(id, productId);
if (!deleted) throw new NotFoundError('Policy not found');
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.deleted', {
policyId: id,
});
return { success: true };
});
@ -594,6 +720,53 @@ export async function telemetryRoutes(app: FastifyInstance) {
const { userId } = req.params as { userId: string };
const productId = getRequestProductId(req);
const eventsDeleted = await repo.deleteEventsByUserId(productId, userId);
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.gdpr.erasure', {
targetUserId: userId,
eventsDeleted,
});
return { userId, eventsDeleted, clustersUpdated: 0 };
});
// ── Admin: resolve/ignore cluster ───────────────────────────
// PATCH /telemetry/clusters/:id?pk=<partition key>
// Body is validated by UpdateClusterSchema. Responds with the updated cluster,
// 400 for a missing/invalid pk or body, 404 when the cluster is not found.
app.patch('/telemetry/clusters/:id', async req => {
  requireAdmin(req);
  const { id } = req.params as { id: string };

  // Query values are untrusted: a repeated `pk` arrives as an array, so check the type.
  const { pk } = req.query as { pk?: unknown };
  if (typeof pk !== 'string' || pk.length === 0) {
    throw new BadRequestError('pk query parameter required');
  }

  // Resolve the product before writing so a failure here cannot leave an
  // un-audited update behind.
  const productId = getRequestProductId(req);
  // Cluster partition keys are `${productId}:${platform}:${module}`; refuse keys
  // belonging to another product, matching the product scoping of sibling routes.
  if (!pk.startsWith(`${productId}:`)) throw new NotFoundError('Cluster not found');

  const parsed = UpdateClusterSchema.safeParse(req.body);
  if (!parsed.success) {
    throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
  }

  const actor = req.jwtPayload?.sub ?? 'unknown';
  const updates: Partial<TelemetryErrorCluster> = {
    status: parsed.data.status,
  };
  // Only closing transitions record who closed the cluster and when.
  if (parsed.data.status === 'resolved' || parsed.data.status === 'ignored') {
    updates.resolvedBy = actor;
    updates.resolvedAt = new Date().toISOString();
  }

  const updated = await repo.updateCluster(id, pk, updates);
  if (!updated) throw new NotFoundError('Cluster not found');

  emitAudit(productId, actor, `telemetry.cluster.${parsed.data.status}`, {
    clusterId: id,
    pk,
  });
  return updated;
});
// ── Admin: ingestion metrics ──────────────────────────────
// GET /telemetry/metrics — returns the module-level `metrics` counter object.
// requireAdmin(req) runs first, before the counters are returned.
app.get('/telemetry/metrics', async req => {
requireAdmin(req);
return metrics;
});
}

View File

@ -201,6 +201,8 @@ export interface TelemetryCollectionConfig {
// ─── Error Cluster ──────────────────────────────────────────────────
/** Zod enum of the triage states a cluster can be in: `open`, `resolved` or `ignored`. */
export const ClusterStatusEnum = z.enum(['open', 'resolved', 'ignored']);
export interface TelemetryErrorCluster {
id: string; // ${fingerprint}:${yyyyMM}
pk: string; // ${productId}:${platform}:${module}
@ -230,9 +232,16 @@ export interface TelemetryErrorCluster {
sampleErrorCode?: string;
sampleMessage?: string;
severity: 'warn' | 'error' | 'fatal';
status: 'open' | 'resolved' | 'ignored';
resolvedBy?: string;
resolvedAt?: string;
ttl: number;
}
/**
 * Request body schema for updating a cluster's triage status.
 * `status` accepts any ClusterStatusEnum value, including `open`.
 */
export const UpdateClusterSchema = z.object({
status: ClusterStatusEnum,
});
// ─── Query / Admin types ────────────────────────────────────────────
export const TelemetryQuerySchema = z.object({
@ -253,3 +262,15 @@ export const TelemetryQuerySchema = z.object({
});
export type TelemetryQueryInput = z.infer<typeof TelemetryQuerySchema>;
// ─── Ingestion Metrics (in-memory counters) ─────────────────────────
/** Shape of the ingestion counters returned by GET /telemetry/metrics. */
export interface TelemetryMetrics {
/** Running sum of per-batch accepted event counts. */
totalEventsIngested: number;
/** Running sum of per-batch rejected counts; the per-batch count starts at the duplicate count. */
totalEventsRejected: number;
/** Incremented once per ingest request that reaches de-duplication. */
totalBatchRequests: number;
/** Incremented once per ingest request refused by the rate limiter. */
totalRateLimited: number;
/** Incremented once per event rejected with "PII detected". */
totalPiiBlocked: number;
/** Running sum of events removed by in-batch de-duplication. */
totalDuplicatesDropped: number;
/** ISO-8601 timestamp string set when the counters object is created. */
uptimeSince: string;
}