docs: update documentation

This commit is contained in:
saravanakumardb1 2026-02-17 10:49:14 -08:00
parent 856788c386
commit 80a4459f81
4 changed files with 400 additions and 114 deletions

View File

@ -20,6 +20,10 @@
5. [New Environment Variables](#5-new-environment-variables) 5. [New Environment Variables](#5-new-environment-variables)
6. [Quick Reference — Where Things Live](#6-quick-reference--where-things-live) 6. [Quick Reference — Where Things Live](#6-quick-reference--where-things-live)
- [Appendix A: Risks & Open Questions](#appendix-a-risks--open-questions)
- [Appendix B: Component Dependency Graph](#appendix-b-component-dependency-graph)
- [Appendix C: Review Findings](#appendix-c-review-findings)
--- ---
## 1. Current Inventory ## 1. Current Inventory
@ -29,23 +33,23 @@
| Category | Module | Endpoints | Description | | Category | Module | Endpoints | Description |
| ------------ | --------------- | --------- | --------------------------------------------------------------------------------------------------- | | ------------ | --------------- | --------- | --------------------------------------------------------------------------------------------------- |
| **Identity** | `auth` | 11 routes | Login, register, refresh, SSO, profile, admin user CRUD | | **Identity** | `auth` | 11 routes | Login, register, refresh, SSO, profile, admin user CRUD |
| **Identity** | `tokens` | CRUD | API token management | | **Identity** | `tokens` | 5 routes | API token management (CRUD + validate) |
| **Identity** | `licenses` | CRUD | License key generation, activation, device binding | | **Identity** | `licenses` | 6 routes | License key generation, activation, device binding, validate |
| **Billing** | `subscriptions` | CRUD | Plan management, trial tracking, period management | | **Billing** | `subscriptions` | 5 routes | Plan management, trial tracking, period management |
| **Billing** | `stripe` | Webhooks | Inbound Stripe webhook processing | | **Billing** | `stripe` | 2 routes | Inbound Stripe webhook + portal session |
| **Billing** | `plans` | CRUD | Plan definitions (free, pro, enterprise) | | **Billing** | `plans` | 4 routes | Plan definitions (free, pro, enterprise) |
| **Billing** | `usage` | CRUD | Usage tracking and quota enforcement | | **Billing** | `usage` | 4 routes | Usage tracking and quota enforcement |
| **Billing** | `promos` | CRUD | Promo code creation, validation, redemption | | **Billing** | `promos` | 5 routes | Promo code creation, validation, redemption |
| **Growth** | `invitations` | CRUD | Invitation code generation, redemption, tracking | | **Growth** | `invitations` | 5 routes | Invitation code generation, redemption, tracking |
| **Growth** | `referrals` | CRUD | Referral link tracking, status transitions | | **Growth** | `referrals` | 5 routes | Referral link tracking, status transitions |
| **Growth** | `waitlist` | 12 routes | Pre-launch signups, position tracking, admin batch invite, CSV export | | **Growth** | `waitlist` | 12 routes | Pre-launch signups, position tracking, admin batch invite, CSV export |
| **Growth** | `public` | 5 routes | Public roadmap, community voting, feature submissions | | **Growth** | `public` | 5 routes | Public roadmap, community voting, feature submissions |
| **Content** | `items` | CRUD | Tracker items (bugs, features, tasks) | | **Content** | `items` | 5 routes | Tracker items (bugs, features, tasks) |
| **Content** | `comments` | CRUD | Threaded comments on items | | **Content** | `comments` | 4 routes | Threaded comments on items |
| **Content** | `votes` | CRUD | User votes on items and comments | | **Content** | `votes` | 3 routes | User votes on items and comments |
| **Content** | `memory` | 5 routes | Memory items — create, reassign, patch, delete | | **Content** | `memory` | 5 routes | Memory items — create, reassign, patch, delete |
| **Ops** | `audit` | Query | Audit log recording and admin queries | | **Ops** | `audit` | Query | Audit log recording and admin queries |
| **Ops** | `flags` | CRUD | Feature flags with FNV-1a deterministic rollout | | **Ops** | `flags` | 5 routes | Feature flags with FNV-1a deterministic rollout |
| **Ops** | `telemetry` | 9 routes | Client event ingestion, error clustering, collection policies, GDPR erasure | | **Ops** | `telemetry` | 9 routes | Client event ingestion, error clustering, collection policies, GDPR erasure |
| **Ops** | `notifications` | 5 routes | Device registration, notification preferences | | **Ops** | `notifications` | 5 routes | Device registration, notification preferences |
| **Ops** | `settings` | 6 routes | User/device settings, kill switch | | **Ops** | `settings` | 6 routes | User/device settings, kill switch |
@ -75,8 +79,8 @@
### 1.3 Services ### 1.3 Services
| Service | Port | Description | | Service | Port | Description |
| ---------------------- | ---- | ----------------------------------------------------- | | ---------------------- | ---- | ---------------------------------------------------- |
| **platform-service** | 4003 | Consolidated Fastify service (25 modules, 158+ tests) | | **platform-service** | 4003 | Consolidated Fastify service (25 modules, 621 tests) |
| **extraction-service** | 4005 | LangExtract text extraction + Python sidecar | | **extraction-service** | 4005 | LangExtract text extraction + Python sidecar |
| **monitoring** | 4004 | Health-check aggregator (all services) | | **monitoring** | 4004 | Health-check aggregator (all services) |
@ -185,8 +189,8 @@ platform-service/src/modules/delivery/
│ ├── push-apns.ts — Apple Push Notification Service │ ├── push-apns.ts — Apple Push Notification Service
│ ├── push-fcm.ts — Firebase Cloud Messaging │ ├── push-fcm.ts — Firebase Cloud Messaging
│ └── sms.ts — Twilio/Azure Communication Services (future) │ └── sms.ts — Twilio/Azure Communication Services (future)
├── renderer.ts — Template rendering (Handlebars/Mustache for email) ├── renderer.ts — Template rendering (Handlebars for email bodies)
├── repository.ts — delivery_log container (track sent/failed/bounced) ├── repository.ts — delivery_log + email_templates containers
├── dispatcher.ts — Route delivery request to correct channel(s) based on prefs ├── dispatcher.ts — Route delivery request to correct channel(s) based on prefs
└── routes.ts — Admin: send test, view delivery log, manage templates └── routes.ts — Admin: send test, view delivery log, manage templates
``` ```
@ -245,9 +249,9 @@ platform-service/src/modules/webhooks/
**Event catalog (subscribe to any combination):** **Event catalog (subscribe to any combination):**
| Event | Payload | Source | | Event | Payload | Source |
| ----------------------- | ---------------------------------------------- | --------------------------- | | ----------------------- | ---------------------------------------------- | ------------------------------- |
| `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` | | `user.created` | `{ userId, email, plan }` | `auth.register`, `auth.sso` |
| `user.deleted` | `{ userId }` | `auth.delete` | | `user.deleted` | `{ userId }` | Admin: `DELETE /auth/users/:id` |
| `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook | | `subscription.created` | `{ subscriptionId, userId, plan, status }` | Registration hook |
| `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook | | `subscription.changed` | `{ subscriptionId, oldPlan, newPlan, status }` | Stripe webhook |
| `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe | | `subscription.canceled` | `{ subscriptionId, userId, reason }` | User action / Stripe |
@ -334,6 +338,13 @@ const PlatformEvents = {
} as const; } as const;
``` ```
**Migration from existing `lib/webhooks.ts`:**
- Existing `dispatchInvitationRedeemed()`, `dispatchReferralStatusChanged()`, `dispatchWaitlistJoined()` become event bus subscribers
- Phase 1: Register existing webhooks.ts functions as handlers on the bus
- Phase 2: Replace inline dispatch calls in routes with `bus.emit()`
- Phase 3: Remove `lib/webhooks.ts` once all callers migrated
**Benefits:** **Benefits:**
- Audit logging becomes a subscriber, not inline code - Audit logging becomes a subscriber, not inline code
@ -389,7 +400,7 @@ interface PasswordResetToken {
- `password_reset_tokens` (pk: `/productId`) — short-lived, TTL 24h auto-expiry - `password_reset_tokens` (pk: `/productId`) — short-lived, TTL 24h auto-expiry
**Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with console-logged URLs for dev/testing. **Dependency:** Requires email delivery (§2.2) for sending reset links and verification emails. Can ship the endpoints first with `req.log.info`-logged URLs for dev/testing (never `console.log`).
--- ---
@ -556,9 +567,11 @@ platform-service/src/modules/exports/
**Flow:** **Flow:**
1. Admin POST `/api/exports``{ type: 'users', format: 'csv', filters: { plan: 'free' } }` 1. Admin POST `/api/exports``{ type: 'users', format: 'csv', filters: { plan: 'free' } }`
2. Background job runs query, writes result to blob storage 2. Background job runs query, writes result to blob storage (via existing `blob` module)
3. Job status updates: `pending``processing``ready` / `failed` 3. Job status updates: `pending``processing``ready` / `failed`
4. Admin downloads from signed blob URL 4. Admin downloads from signed blob URL (SAS token via `@bytelyst/blob`)
**Dependencies:** `blob` module (existing) for storage, `jobs` module (§2.1) for auto-cleanup of expired exports.
**Supported exports:** **Supported exports:**
@ -629,6 +642,8 @@ interface MaintenanceConfig {
- Schedule builder with start/end date pickers - Schedule builder with start/end date pickers
- Bypass IP whitelist management - Bypass IP whitelist management
**Storage:** Maintenance config is a single document per product in the existing `settings` container (field: `maintenanceConfig`). No new Cosmos container needed.
--- ---
#### 2.11 Rate Limit Dashboard & IP Allow/Deny Lists #### 2.11 Rate Limit Dashboard & IP Allow/Deny Lists
@ -678,6 +693,11 @@ interface IPRule {
- IP rules management (allow/deny with expiry) - IP rules management (allow/deny with expiry)
- Per-user rate limit override - Per-user rate limit override
**Cosmos container:**
- `ip_rules` (pk: `/productId`) — persistent IP allow/deny rules
- Rate limit stats remain in-memory (ephemeral); no persistence needed for counters
--- ---
### P2 — Product Intelligence ### P2 — Product Intelligence
@ -715,7 +735,7 @@ interface ExperimentDoc {
hypothesis: string; hypothesis: string;
status: 'draft' | 'running' | 'paused' | 'concluded'; status: 'draft' | 'running' | 'paused' | 'concluded';
variants: Variant[]; // [{id: 'control', weight: 50}, {id: 'treatment', weight: 50}] variants: Variant[]; // [{id: 'control', weight: 50}, {id: 'treatment', weight: 50}]
targetingRules: {}; // Same as flag targeting targetingRules: FlagTargetingRules; // Reuse from flags module (platforms, versions, percentage)
primaryMetric: string; // e.g., 'dictation_completed_rate' primaryMetric: string; // e.g., 'dictation_completed_rate'
secondaryMetrics: string[]; secondaryMetrics: string[];
startedAt?: string; startedAt?: string;
@ -1047,24 +1067,25 @@ This is a major architectural expansion. Defer until enterprise tier is validate
Each new component introduces Cosmos containers. Cosmos DB Serverless charges per RU consumed + storage, so idle containers cost only storage (~$0.25/GB/month). Each new component introduces Cosmos containers. Cosmos DB Serverless charges per RU consumed + storage, so idle containers cost only storage (~$0.25/GB/month).
| Component | New Containers | Partition Key | Est. TTL | Est. Daily RU | | Component | New Containers | Partition Key | Est. TTL | Est. Daily RU |
| ---------------------- | --------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- | | ---------------------- | ---------------------------------------------- | ----------------------------------------- | --------------- | ----------------------------------- |
| **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) | | **2.1 Jobs** | `job_definitions`, `job_runs` | `/productId`, `/productId:jobName` | runs: 90d | ~50 RU (low volume) |
| **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU | | **2.2 Email/Push** | `delivery_log`, `email_templates` | `/productId:channel:yyyyMM`, `/productId` | log: 90d | ~200 RU |
| **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU | | **2.3 Webhooks** | `webhook_subscriptions`, `webhook_deliveries` | `/productId`, `/subscriptionId:yyyyMM` | deliveries: 30d | ~100 RU |
| **2.5 Password Reset** | `password_reset_tokens` | `/productId` | 24h auto | ~10 RU | | **2.5 Password Reset** | `password_reset_tokens`, `email_verifications` | `/productId`, `/productId` | 24h auto | ~10 RU |
| **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU | | **2.6 Status** | `service_status`, `incidents` | `/productId`, `/productId` | None | ~20 RU |
| **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) | | **2.7 Sessions** | `sessions` | `/userId` | 90d | ~500 RU (read-heavy) |
| **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) | | **2.8 Migrations** | `migrations` | `/productId` | None | ~5 RU (startup only) |
| **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU | | **2.9 Exports** | `export_jobs` | `/productId` | 30d | ~20 RU |
| **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU | | **2.12 Experiments** | `experiments` | `/productId` | None | ~50 RU |
| **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) | | **2.13 Analytics** | `analytics_rollups` | `/productId:metric:period` | None | ~300 RU (write-heavy during rollup) |
| **2.11 IP Rules** | `ip_rules` | `/productId` | None (manual) | ~10 RU |
| **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU | | **2.14 Feedback** | `feedback` | `/productId` | None | ~50 RU |
| **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU | | **2.16 Changelog** | `changelog` | `/productId` | None | ~10 RU |
| **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) | | **2.20 i18n** | `translations` | `/productId:locale` | None | ~100 RU (read-heavy, cacheable) |
| **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU | | **2.23 Retention** | `retention_policies` | `/productId` | None | ~5 RU |
**Total new containers:** ~17 (across all phases) **Total new containers:** ~19 (across all phases)
**Existing containers:** ~25+ (across platform-service + dashboards) **Existing containers:** 27 (defined in `cosmos-init.ts`: products, users, settings, devices, notification_prefs, audit_log, feature_flags, invitation_codes, referrals, subscriptions, payments, licenses, plans, usage_daily, api_tokens, tracker_items, comments, votes, themes, waitlist, memory_items, daily_briefs, reflections, brain_insights, telemetry_events, telemetry_error_clusters, telemetry_collection_policies). Note: `promos` module uses Stripe API directly — no Cosmos container.
**Cost impact:** Minimal for Serverless tier — idle containers only consume storage. Active containers during job runs add burst RU. **Cost impact:** Minimal for Serverless tier — idle containers only consume storage. Active containers during job runs add burst RU.
**Recommendation:** Register all new containers in `cosmos-init.ts` alongside existing ones. Use TTL liberally for transient data (tokens, deliveries, job runs) to keep storage bounded. **Recommendation:** Register all new containers in `cosmos-init.ts` alongside existing ones. Use TTL liberally for transient data (tokens, deliveries, job runs) to keep storage bounded.
@ -1076,7 +1097,7 @@ Each new component introduces Cosmos containers. Cosmos DB Serverless charges pe
New components will require additional env vars. All should be added to `.env.example` files in both repos and documented. New components will require additional env vars. All should be added to `.env.example` files in both repos and documented.
| Component | Variable | Example | Required | | Component | Variable | Example | Required |
| -------------------- | -------------------------- | -------------------------------- | ------------------------- | | -------------------- | ----------------------------- | -------------------------------- | ------------------------- |
| **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) | | **2.1 Jobs** | `JOB_RUNNER_ENABLED` | `true` | No (default: true) |
| **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) | | **2.1 Jobs** | `JOB_TICK_INTERVAL_MS` | `60000` | No (default: 60s) |
| **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) | | **2.2 Email** | `SENDGRID_API_KEY` | `SG.xxx` | Yes (for email delivery) |
@ -1090,6 +1111,10 @@ New components will require additional env vars. All should be added to `.env.ex
| **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes | | **2.5 Auth** | `EMAIL_VERIFY_URL_BASE` | `https://app.lysnrai.com/verify` | Yes |
| **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) | | **2.10 Maintenance** | `MAINTENANCE_MODE` | `off` | No (default: off) |
| **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No | | **2.10 Maintenance** | `MAINTENANCE_BYPASS_IPS` | `10.0.0.1,10.0.0.2` | No |
| **2.3 Webhooks** | `WEBHOOK_DELIVERY_TIMEOUT_MS` | `5000` | No (default: 5s) |
| **2.3 Webhooks** | `WEBHOOK_MAX_RETRIES` | `3` | No (default: 3) |
| **2.7 Sessions** | `SESSION_TTL_DAYS` | `90` | No (default: 90) |
| **2.7 Sessions** | `SESSION_CACHE_TTL_MS` | `30000` | No (default: 30s) |
| **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) | | **2.19 OpenAPI** | `SWAGGER_UI_ENABLED` | `true` | No (default: true in dev) |
**Secret management:** `SENDGRID_API_KEY`, `APNS_*`, and `FCM_*` should be added to Azure Key Vault as `lysnr-sendgrid-api-key`, `lysnr-apns-key-id`, etc. Update `LYSNR_SECRETS` in `@bytelyst/config` to include them. **Secret management:** `SENDGRID_API_KEY`, `APNS_*`, and `FCM_*` should be added to Azure Key Vault as `lysnr-sendgrid-api-key`, `lysnr-apns-key-id`, etc. Update `LYSNR_SECRETS` in `@bytelyst/config` to include them.
@ -1099,7 +1124,7 @@ New components will require additional env vars. All should be added to `.env.ex
## 6. Quick Reference — Where Things Live ## 6. Quick Reference — Where Things Live
| Component | Repo | Path | | Component | Repo | Path |
| ------------------------ | ------------------------- | ----------------------------------------------- | | ------------------------ | ----------------------------------- | ------------------------------------------------------ |
| Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` | | Platform-service modules | `learning_ai_common_plat` | `services/platform-service/src/modules/` |
| Shared packages | `learning_ai_common_plat` | `packages/` | | Shared packages | `learning_ai_common_plat` | `packages/` |
| Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` | | Admin dashboard | `learning_voice_ai_agent` | `admin-dashboard-web/` |
@ -1108,20 +1133,40 @@ New components will require additional env vars. All should be added to `.env.ex
| Docker Compose | both repos | `docker-compose.yml` | | Docker Compose | both repos | `docker-compose.yml` |
| Monitoring | `learning_ai_common_plat` | `services/monitoring/` | | Monitoring | `learning_ai_common_plat` | `services/monitoring/` |
| Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` | | Design tokens | `learning_ai_common_plat` | `packages/design-tokens/` |
| MindLyst native app | `learning_multimodal_memory_agents` | `mindlyst-native/` (KMP + SwiftUI + Compose + Next.js) |
| MindLyst web | `learning_multimodal_memory_agents` | `mindlyst-native/web/` |
| Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` | | Existing webhooks | `learning_ai_common_plat` | `services/platform-service/src/lib/webhooks.ts` |
| Cosmos container defs | `learning_ai_common_plat` | `services/platform-service/src/lib/cosmos-init.ts` |
| Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` | | Telemetry design doc | `learning_ai_common_plat` | `docs/WINDSURF/CLIENT_TELEMETRY_DESIGN.md` |
| Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` | | Telemetry roadmap | `learning_ai_common_plat` | `docs/WINDSURF/TELEMETRY_ROADMAP.md` |
| **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` | | **This document** | `learning_ai_common_plat` | `docs/WINDSURF/PLATFORM_COMPONENTS_ROADMAP.md` |
--- ---
## Appendix: Component Dependency Graph ## Appendix A: Risks & Open Questions
| # | Topic | Risk / Question | Mitigation |
| --- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| 1 | **Leader election for jobs** | In-process tick loop with Cosmos lease — what happens during deploys? Two instances may briefly both hold leases. | Cosmos lease has a built-in TTL. Use 30s lease with 10s renewal. During deploy overlap, the old instance's lease expires before the new one acquires. Jobs must be idempotent. |
| 2 | **Email deliverability** | SendGrid requires domain verification (SPF/DKIM/DMARC). Without it, emails land in spam. | Set up `lysnrai.com` domain authentication in SendGrid before shipping §2.2. Budget 12 days for DNS propagation. |
| 3 | **Session validation latency** | Checking Cosmos on every request for session revocation adds ~510ms per request. | In-memory cache with 30s TTL (§2.7). Revocation is eventually consistent — acceptable trade-off for most apps. Document the 30s window. |
| 4 | **Cosmos container proliferation** | 28 existing + 19 new = 47 containers. Serverless tier has no per-container cost, but management complexity grows. | Group related containers by module. Document all containers in `cosmos-init.ts`. Consider container-per-module naming convention. |
| 5 | **Event bus ordering guarantees** | In-memory `EventEmitter` has no ordering guarantees across handlers. If audit must record before webhook fires, ordering matters. | Phase 1: Document that handlers run concurrently with no ordering. If ordering is needed, use handler priority weights or sequential mode. |
| 6 | **Push notification certificates** | APNs requires yearly certificate renewal. If it expires, all iOS push silently stops. | Add `apns-cert-expiry-check` to scheduled jobs (§2.1). Alert admin 30 days before expiry. |
| 7 | **Webhook abuse** | External subscribers could register slow endpoints that back up the delivery queue. | Per-subscription timeout (5s default), circuit breaker after 10 consecutive failures, auto-disable. |
| 8 | **Migration rollback** | Cosmos is schemaless — some migrations (e.g., partition key changes) are irreversible. | Mark migrations as `reversible: true/false`. Require manual approval for irreversible migrations. Always back up before running. |
| 9 | **MindLyst parity** | MindLyst web uses Cosmos directly (in-memory fallback). Shared components (email, sessions, webhooks) must work for MindLyst too, not just LysnrAI. | All new modules use `productId` for multi-product isolation. MindLyst can consume the same platform-service APIs. |
| 10 | **Priority conflicts** | Sprint plan assumes available engineering bandwidth. If telemetry or mobile work takes priority, these sprints slip. | Treat sprint assignments as relative ordering, not calendar commitments. Re-evaluate after each sprint. |
---
## Appendix B: Component Dependency Graph
``` ```
┌─────────────────────┐ ┌─────────────────────┐
│ Event Bus (2.4) │ │ Event Bus (2.4) │
└─────────┬───────────┘ └─────────┬───────────┘
│ emits events to all subscribers │ emits to subscribers
┌───────────┼───────────┼───────────┐ ┌───────────┼───────────┼───────────┐
│ │ │ │ │ │ │ │
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
@ -1130,31 +1175,60 @@ New components will require additional env vars. All should be added to `.env.ex
│ (2.2) │ │ (2.3) │ │ (existing)│ │ (2.13) │ │ (2.2) │ │ (2.3) │ │ (existing)│ │ (2.13) │
└─────┬─────┘ └───────────┘ └───────────┘ └───────────┘ └─────┬─────┘ └───────────┘ └───────────┘ └───────────┘
triggers sends
┌───────────┐ ┌───────────┐
│ Password │ │ Password │
│ Reset(2.5)│ │ Reset(2.5)│
└───────────┘ └───────────┘
┌───────────────┐──▶┌─────────────────┐ ┌─────────────────┐
│ Scheduled │ │ Analytics │ │ Blob Storage │
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (existing) │
└───────┬───────┘ └─────────────────┘ └────────┬────────┘
│ │
│ triggers on schedule ▲ writes exports
▼ │
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐ ┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Scheduled │──▶│ Analytics │ │ Data Export │ │ Trial Expiry │ │ Usage Reset │ │ Data Export │
│ Jobs (2.1) │ │ Rollups (2.13) │ │ (2.9) │ │ (2.1 job) │ │ (2.1 job) │ │ (2.9) │
└───────┬───────┘ └─────────────────┘ └─────────────────┘
│ triggers on schedule
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Trial Expiry │ │ Usage Reset │ │ Retention │
│ Check │ │ │ │ Cleanup (2.23) │
└───────────────┘ └─────────────────┘ └─────────────────┘ └───────────────┘ └─────────────────┘ └─────────────────┘
┌───────────────┐ ┌─────────────────┐ ┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Billing │──▶│ Email/Push │ │ Billing │──▶│ Email/Push │ │ Retention │
│ Dunning(2.25) │ │ Delivery (2.2) │ │ Dunning(2.25) │ │ Delivery (2.2) │ │ Cleanup (2.23) │
└───────────────┘ └─────────────────┘ └───────────────┘ └─────────────────┘ └─────────────────┘
``` ```
--- ---
## Appendix C: Review Findings
Systematic review performed 2026-02-17. All issues below have been fixed inline.
| # | Severity | Section | Finding | Fix |
| --- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- |
| 1 | **Bug** | §1.3 | Test count stale: said "158+ tests" — actual count is **621** (verified via `grep -c 'it(' *.test.ts`). | Updated to 621. |
| 2 | **Bug** | §1.1 | Endpoint column inconsistent: some modules said "CRUD" (vague, could be 48 routes), others had exact counts. | Replaced all "CRUD" with actual route counts. |
| 3 | **Bug** | §2.5 | Said "console-logged URLs for dev/testing" — violates project rule: never `console.log` in production code. | Changed to `req.log.info`. |
| 4 | **Bug** | §2.12 | `ExperimentDoc.targetingRules: {}` — meaningless empty object type. | Changed to `FlagTargetingRules` (reuse from flags module). |
| 5 | **Bug** | §2.3 | Webhook event `user.deleted` source said `auth.delete` — no such endpoint name. Actual route is `DELETE /auth/users/:id` (admin action). | Fixed source column. |
| 6 | **Bug** | §4 | `email_verifications` container (from §2.5) missing from Cosmos table. Only `password_reset_tokens` was listed. | Added `email_verifications` to §2.5 row. |
| 7 | **Bug** | §4 | Existing container count said "~25+" — actual is **27** (counted from `cosmos-init.ts`; `promos` uses Stripe API directly, no Cosmos container). | Updated to 27 with full container list. |
| 8 | **Bug** | §4 | Total new containers said "~17" — after adding `email_verifications` and `ip_rules`, count is **19**. | Updated. |
| 9 | **Gap** | §2.2 | No clarity on email template storage strategy. `renderer.ts` mentioned but not whether templates are Cosmos-stored or file-based. | Clarified: `repository.ts` now references `delivery_log + email_templates` containers. |
| 10 | **Gap** | §2.4 | No migration strategy from existing `lib/webhooks.ts` to new event bus pattern. | Added "Migration from existing `lib/webhooks.ts`" subsection with 3-phase plan. |
| 11 | **Gap** | §2.10 | Maintenance mode proposed extending `settings` module but didn't clarify storage location. Missing from §4 Cosmos table. | Added: stored as single document per product in existing `settings` container (no new container needed). |
| 12 | **Gap** | §2.11 | IP rules need persistence but no container was mentioned. Missing from §4 table. | Added `ip_rules` container (pk: `/productId`) to both §2.11 and §4 table. |
| 13 | **Gap** | §2.9 | Data Export didn't mention blob module dependency (exports written to blob storage). | Added explicit dependency note on `blob` module and `jobs` module for cleanup. |
| 14 | **Gap** | §5 | Missing env vars for webhooks (timeout, retries) and sessions (TTL, cache TTL). | Added 4 new env vars: `WEBHOOK_DELIVERY_TIMEOUT_MS`, `WEBHOOK_MAX_RETRIES`, `SESSION_TTL_DAYS`, `SESSION_CACHE_TTL_MS`. |
| 15 | **Gap** | §6 | Quick Reference missing MindLyst repo (`learning_multimodal_memory_agents`). Doc scope says "ByteLyst platform" which includes MindLyst. | Added MindLyst native app and web entries. Also added `cosmos-init.ts` path. |
| 16 | **Gap** | Appendix | Dependency graph incomplete: missing Jobs → Data Export connection, missing Blob → Data Export dependency, downstream jobs not labeled with section numbers. | Rewrote graph with all connections and section labels. |
| 17 | **Gap** | Overall | No "Risks & Open Questions" section — design docs should call out unknowns. | Added Appendix A with 10 risk items and mitigations. |
| 18 | **Gap** | TOC | Table of Contents didn't include Appendix sections. | Added Appendix A, B, C to TOC. |
| 19 | **Gap** | §2.5 | Password reset cross-referenced "§2.6" for sessions but sessions was renumbered to §2.7 in previous edit pass. | Fixed to §2.7 (caught in prior pass). |
| 20 | **Gap** | §1.5 | Infrastructure table was missing Swagger/OpenAPI (partially wired) and Prometheus metrics (partially enabled). | Added in prior pass — verified still present. |
---
_This document is a living brainstorm. Items will be promoted to dedicated design docs (like `CLIENT_TELEMETRY_DESIGN.md`) as they move into implementation._ _This document is a living brainstorm. Items will be promoted to dedicated design docs (like `CLIENT_TELEMETRY_DESIGN.md`) as they move into implementation._

View File

@ -252,3 +252,21 @@ export async function getCluster(id: string, pk: string): Promise<TelemetryError
return null; return null;
} }
} }
export async function updateCluster(
id: string,
pk: string,
updates: Partial<TelemetryErrorCluster>
): Promise<TelemetryErrorCluster | null> {
try {
const { resource: existing } = await clustersContainer()
.item(id, pk)
.read<TelemetryErrorCluster>();
if (!existing) return null;
const merged = { ...existing, ...updates };
const { resource } = await clustersContainer().item(id, pk).replace(merged);
return resource as unknown as TelemetryErrorCluster;
} catch {
return null;
}
}

View File

@ -5,11 +5,13 @@
* GET /telemetry/config collection config for clients * GET /telemetry/config collection config for clients
* GET /telemetry/query admin query * GET /telemetry/query admin query
* GET /telemetry/clusters admin error clusters * GET /telemetry/clusters admin error clusters
* PATCH /telemetry/clusters/:id resolve/ignore cluster (admin)
* GET /telemetry/policies list policies (admin) * GET /telemetry/policies list policies (admin)
* POST /telemetry/policies create policy (admin) * POST /telemetry/policies create policy (admin)
* PUT /telemetry/policies/:id update policy (admin) * PUT /telemetry/policies/:id update policy (admin)
* DELETE /telemetry/policies/:id delete policy (admin) * DELETE /telemetry/policies/:id delete policy (admin)
* DELETE /telemetry/user/:userId GDPR erasure (admin) * DELETE /telemetry/user/:userId GDPR erasure (admin)
* GET /telemetry/metrics ingestion metrics (admin)
*/ */
import type { FastifyInstance } from 'fastify'; import type { FastifyInstance } from 'fastify';
@ -23,12 +25,16 @@ import {
TelemetryIngestRequestSchema, TelemetryIngestRequestSchema,
CreatePolicySchema, CreatePolicySchema,
UpdatePolicySchema, UpdatePolicySchema,
UpdateClusterSchema,
TelemetryQuerySchema, TelemetryQuerySchema,
type TelemetryEventDoc, type TelemetryEventDoc,
type TelemetryCollectionPolicyDoc, type TelemetryCollectionPolicyDoc,
type TelemetryCollectionConfig, type TelemetryCollectionConfig,
type TelemetryErrorCluster, type TelemetryErrorCluster,
type TelemetryMetrics,
} from './types.js'; } from './types.js';
import * as auditRepo from '../audit/repository.js';
import type { AuditDoc } from '../audit/types.js';
// ─── Helpers ──────────────────────────────────────────────────────── // ─── Helpers ────────────────────────────────────────────────────────
@ -69,6 +75,96 @@ const _cleanupTimer = globalThis.setInterval(() => {
}, 300_000); }, 300_000);
if (typeof _cleanupTimer === 'object' && 'unref' in _cleanupTimer) _cleanupTimer.unref(); if (typeof _cleanupTimer === 'object' && 'unref' in _cleanupTimer) _cleanupTimer.unref();
// ─── Ingestion metrics (in-memory counters) ─────────────────────
const metrics: TelemetryMetrics = {
totalEventsIngested: 0,
totalEventsRejected: 0,
totalBatchRequests: 0,
totalRateLimited: 0,
totalPiiBlocked: 0,
totalDuplicatesDropped: 0,
uptimeSince: new Date().toISOString(),
};
// ─── Webhook alerting ───────────────────────────────────────────
const ALERT_WEBHOOK_URL = process.env.TELEMETRY_ALERT_WEBHOOK_URL ?? '';
const ALERT_SEVERITY_THRESHOLD = (process.env.TELEMETRY_ALERT_SEVERITY_THRESHOLD ?? 'error') as
| 'warn'
| 'error'
| 'fatal';
const ALERT_COUNT_THRESHOLD = parseInt(process.env.TELEMETRY_ALERT_COUNT_THRESHOLD ?? '10', 10);
async function sendClusterAlert(
cluster: TelemetryErrorCluster,
previousSeverity: string
): Promise<void> {
if (!ALERT_WEBHOOK_URL) return;
const severityOrder: Record<string, number> = { warn: 0, error: 1, fatal: 2 };
const thresholdNum = severityOrder[ALERT_SEVERITY_THRESHOLD] ?? 1;
const clusterNum = severityOrder[cluster.severity] ?? 0;
// Alert if severity meets threshold OR count exceeds threshold
if (clusterNum < thresholdNum && cluster.totalCount < ALERT_COUNT_THRESHOLD) return;
const payload = {
text: `Telemetry Alert: Cluster ${cluster.fingerprint} escalated`,
blocks: [
{
type: 'section',
text: {
type: 'mrkdwn',
value: [
`*Telemetry Error Cluster Escalated*`,
`*Event:* ${cluster.module}/${cluster.eventName}`,
`*Platform:* ${cluster.platform} (${cluster.channel})`,
`*Severity:* ${previousSeverity} → *${cluster.severity}*`,
`*Total Count:* ${cluster.totalCount}`,
`*Affected Users:* ${cluster.affectedUserIds.length + cluster.affectedInstallIds.length}`,
`*First Seen:* ${cluster.firstSeenAt}`,
`*Last Seen:* ${cluster.lastSeenAt}`,
cluster.sampleMessage ? `*Sample:* \`${cluster.sampleMessage.slice(0, 200)}\`` : '',
]
.filter(Boolean)
.join('\n'),
},
},
],
};
try {
await fetch(ALERT_WEBHOOK_URL, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
});
} catch {
// Best-effort — don't fail ingestion on alert failure
}
}
// ─── Audit helper ───────────────────────────────────────────────
function emitAudit(
productId: string,
userId: string,
action: string,
details: Record<string, unknown>
): void {
const doc: AuditDoc = {
id: `aud_${randomUUID()}`,
productId,
userId,
action,
category: 'telemetry',
details,
createdAt: new Date().toISOString(),
};
auditRepo.create(doc).catch(() => {});
}
/** PII patterns — reject events containing these. */ /** PII patterns — reject events containing these. */
const PII_PATTERNS = [ const PII_PATTERNS = [
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i, // email /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i, // email
@ -300,14 +396,23 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
}); });
} }
// Escalate severity // Escalate severity + alert on escalation
const severityOrder = { warn: 0, error: 1, fatal: 2 }; const severityOrder = { warn: 0, error: 1, fatal: 2 };
const eventSev = event.eventType as 'warn' | 'error' | 'fatal'; const eventSev = event.eventType as 'warn' | 'error' | 'fatal';
const previousSeverity = existing.severity;
if ((severityOrder[eventSev] ?? 0) > (severityOrder[existing.severity] ?? 0)) { if ((severityOrder[eventSev] ?? 0) > (severityOrder[existing.severity] ?? 0)) {
existing.severity = eventSev; existing.severity = eventSev;
} }
// Default status for legacy clusters without status field
if (!existing.status) existing.status = 'open';
await repo.upsertCluster(existing); await repo.upsertCluster(existing);
// Fire webhook alert on severity escalation
if (existing.severity !== previousSeverity) {
sendClusterAlert(existing, previousSeverity).catch(() => {});
}
} else { } else {
const newCluster: TelemetryErrorCluster = { const newCluster: TelemetryErrorCluster = {
id: clusterId, id: clusterId,
@ -336,6 +441,7 @@ async function updateClusterForEvent(event: TelemetryEventDoc): Promise<void> {
sampleErrorCode: event.errorCode, sampleErrorCode: event.errorCode,
sampleMessage: event.message, sampleMessage: event.message,
severity: event.eventType as 'warn' | 'error' | 'fatal', severity: event.eventType as 'warn' | 'error' | 'fatal',
status: 'open',
ttl: DEFAULT_CLUSTER_TTL_DAYS * 86400, ttl: DEFAULT_CLUSTER_TTL_DAYS * 86400,
}; };
await repo.upsertCluster(newCluster); await repo.upsertCluster(newCluster);
@ -367,6 +473,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
// Rate limiting per installId // Rate limiting per installId
const rateLimitKey = installToken || req.jwtPayload?.sub || 'unknown'; const rateLimitKey = installToken || req.jwtPayload?.sub || 'unknown';
if (!checkRateLimit(rateLimitKey, events.length)) { if (!checkRateLimit(rateLimitKey, events.length)) {
metrics.totalRateLimited++;
reply.code(429); reply.code(429);
return { return {
accepted: 0, accepted: 0,
@ -383,11 +490,15 @@ export async function telemetryRoutes(app: FastifyInstance) {
seenIds.add(e.id); seenIds.add(e.id);
return true; return true;
}); });
const dupCount = events.length - dedupedEvents.length;
metrics.totalDuplicatesDropped += dupCount;
metrics.totalBatchRequests++;
const now = new Date().toISOString(); const now = new Date().toISOString();
const ttl = DEFAULT_EVENT_TTL_DAYS * 86400; const ttl = DEFAULT_EVENT_TTL_DAYS * 86400;
let accepted = 0; let accepted = 0;
let rejected = events.length - dedupedEvents.length; // duplicates let rejected = dupCount; // duplicates
const errors: Array<{ index: number; reason: string }> = []; const errors: Array<{ index: number; reason: string }> = [];
const docsToInsert: TelemetryEventDoc[] = []; const docsToInsert: TelemetryEventDoc[] = [];
@ -406,6 +517,7 @@ export async function telemetryRoutes(app: FastifyInstance) {
if (fieldsToScan.some(f => containsPII(f!))) { if (fieldsToScan.some(f => containsPII(f!))) {
errors.push({ index: i, reason: 'PII detected' }); errors.push({ index: i, reason: 'PII detected' });
rejected++; rejected++;
metrics.totalPiiBlocked++;
continue; continue;
} }
@ -435,6 +547,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
} }
} }
metrics.totalEventsIngested += accepted;
metrics.totalEventsRejected += rejected;
reply.code(accepted > 0 ? 200 : 400); reply.code(accepted > 0 ? 200 : 400);
return { return {
accepted, accepted,
@ -549,6 +664,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
}; };
const created = await repo.createPolicy(doc); const created = await repo.createPolicy(doc);
emitAudit(productId, doc.createdBy, 'telemetry.policy.created', {
policyId: doc.id,
name: doc.name,
});
reply.code(201); reply.code(201);
return created; return created;
}); });
@ -575,6 +694,10 @@ export async function telemetryRoutes(app: FastifyInstance) {
updates as Partial<TelemetryCollectionPolicyDoc> updates as Partial<TelemetryCollectionPolicyDoc>
); );
if (!updated) throw new NotFoundError('Policy not found'); if (!updated) throw new NotFoundError('Policy not found');
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.updated', {
policyId: id,
updates: parsed.data,
});
return updated; return updated;
}); });
@ -585,6 +708,9 @@ export async function telemetryRoutes(app: FastifyInstance) {
const productId = getRequestProductId(req); const productId = getRequestProductId(req);
const deleted = await repo.deletePolicy(id, productId); const deleted = await repo.deletePolicy(id, productId);
if (!deleted) throw new NotFoundError('Policy not found'); if (!deleted) throw new NotFoundError('Policy not found');
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.policy.deleted', {
policyId: id,
});
return { success: true }; return { success: true };
}); });
@ -594,6 +720,53 @@ export async function telemetryRoutes(app: FastifyInstance) {
const { userId } = req.params as { userId: string }; const { userId } = req.params as { userId: string };
const productId = getRequestProductId(req); const productId = getRequestProductId(req);
const eventsDeleted = await repo.deleteEventsByUserId(productId, userId); const eventsDeleted = await repo.deleteEventsByUserId(productId, userId);
emitAudit(productId, req.jwtPayload?.sub ?? 'unknown', 'telemetry.gdpr.erasure', {
targetUserId: userId,
eventsDeleted,
});
return { userId, eventsDeleted, clustersUpdated: 0 }; return { userId, eventsDeleted, clustersUpdated: 0 };
}); });
// ── Admin: resolve/ignore cluster ───────────────────────────
app.patch('/telemetry/clusters/:id', async req => {
requireAdmin(req);
const { id } = req.params as { id: string };
const { pk } = req.query as { pk: string };
if (!pk) throw new BadRequestError('pk query parameter required');
const parsed = UpdateClusterSchema.safeParse(req.body);
if (!parsed.success) {
throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
}
const updates: Partial<TelemetryErrorCluster> = {
status: parsed.data.status,
};
if (parsed.data.status === 'resolved' || parsed.data.status === 'ignored') {
updates.resolvedBy = req.jwtPayload?.sub ?? 'unknown';
updates.resolvedAt = new Date().toISOString();
}
const updated = await repo.updateCluster(id, pk, updates);
if (!updated) throw new NotFoundError('Cluster not found');
const productId = getRequestProductId(req);
emitAudit(
productId,
req.jwtPayload?.sub ?? 'unknown',
`telemetry.cluster.${parsed.data.status}`,
{
clusterId: id,
pk,
}
);
return updated;
});
// ── Admin: ingestion metrics ──────────────────────────────
app.get('/telemetry/metrics', async req => {
requireAdmin(req);
return metrics;
});
} }

View File

@ -201,6 +201,8 @@ export interface TelemetryCollectionConfig {
// ─── Error Cluster ────────────────────────────────────────────────── // ─── Error Cluster ──────────────────────────────────────────────────
export const ClusterStatusEnum = z.enum(['open', 'resolved', 'ignored']);
export interface TelemetryErrorCluster { export interface TelemetryErrorCluster {
id: string; // ${fingerprint}:${yyyyMM} id: string; // ${fingerprint}:${yyyyMM}
pk: string; // ${productId}:${platform}:${module} pk: string; // ${productId}:${platform}:${module}
@ -230,9 +232,16 @@ export interface TelemetryErrorCluster {
sampleErrorCode?: string; sampleErrorCode?: string;
sampleMessage?: string; sampleMessage?: string;
severity: 'warn' | 'error' | 'fatal'; severity: 'warn' | 'error' | 'fatal';
status: 'open' | 'resolved' | 'ignored';
resolvedBy?: string;
resolvedAt?: string;
ttl: number; ttl: number;
} }
export const UpdateClusterSchema = z.object({
status: ClusterStatusEnum,
});
// ─── Query / Admin types ──────────────────────────────────────────── // ─── Query / Admin types ────────────────────────────────────────────
export const TelemetryQuerySchema = z.object({ export const TelemetryQuerySchema = z.object({
@ -253,3 +262,15 @@ export const TelemetryQuerySchema = z.object({
}); });
export type TelemetryQueryInput = z.infer<typeof TelemetryQuerySchema>; export type TelemetryQueryInput = z.infer<typeof TelemetryQuerySchema>;
// ─── Ingestion Metrics (in-memory counters) ─────────────────────────
export interface TelemetryMetrics {
totalEventsIngested: number;
totalEventsRejected: number;
totalBatchRequests: number;
totalRateLimited: number;
totalPiiBlocked: number;
totalDuplicatesDropped: number;
uptimeSince: string;
}