diff --git a/packages/llm-router/README.md b/packages/llm-router/README.md new file mode 100644 index 00000000..b15c376d --- /dev/null +++ b/packages/llm-router/README.md @@ -0,0 +1,134 @@ +# @bytelyst/llm-router + +Pure-code LLM router for free-tier API providers. No LLM-in-the-loop — deterministic routing with automatic fallback, health tracking, and round-robin load distribution. + +## Features + +- **4 free providers** out of the box: Groq, OpenRouter, Together AI, Cerebras +- **Prompt classification** — regex-based detection of code/math/reasoning/creative prompts +- **Smart selection** — routes to the best model for each prompt category +- **Round-robin** — distributes load across providers to maximize free-tier usage +- **Auto-fallback** — retries on 429/5xx with next-best provider +- **Health tracking** — sliding-window stats (latency, error rate, rate-limit rate) +- **Telemetry hook** — log every routing decision for analysis +- **OpenAI-compatible** — same request/response format as OpenAI chat completions +- **Zero dependencies** — pure TypeScript, uses native `fetch` + +## Quick Start + +```bash +# Set at least one API key +export GROQ_API_KEY=gsk_... +export OPENROUTER_API_KEY=sk-or-... +export TOGETHER_API_KEY=... +export CEREBRAS_API_KEY=... 
+``` + +```typescript +import { LlmRouter } from '@bytelyst/llm-router'; + +const router = new LlmRouter(); + +// Automatic routing — classifier picks best provider+model +const result = await router.chat({ + messages: [{ role: 'user', content: 'Write a quicksort in TypeScript' }], +}); + +console.log(result.response.choices[0].message.content); +console.log(`Served by: ${result.provider}/${result.model} in ${result.totalLatencyMs}ms`); +``` + +## Explicit Provider Routing + +```typescript +// Force a specific provider:model +const result = await router.chat({ + messages: [{ role: 'user', content: 'Hello' }], + model: 'groq:llama-3.3-70b-versatile', +}); +``` + +## Telemetry + +```typescript +const router = new LlmRouter({ + onTelemetry: entry => { + // entry: { event, provider, model, attempt, latencyMs, category, tokens?, error? } + console.log(`[${entry.event}] ${entry.provider}/${entry.model} — ${entry.latencyMs}ms`); + }, +}); +``` + +## Health Monitoring + +```typescript +const snapshots = router.getHealth(); +// Returns: HealthSnapshot[] with per-provider stats +// { provider, model, totalRequests, successes, rateLimits, errors, avgLatencyMs, p95LatencyMs, healthy } +``` + +## Configuration + +```typescript +const router = new LlmRouter({ + // Override default providers + providers: [...], + // Health window (default: 60s) + healthWindowMs: 120_000, + // Error rate to mark unhealthy (default: 50%) + errorThreshold: 0.4, + // Rate-limit rate to mark unhealthy (default: 30%) + rateLimitThreshold: 0.2, + // Request timeout (default: 30s) + timeoutMs: 15_000, + // Max retry attempts (default: 3) + maxRetries: 4, +}); +``` + +## Provider Selection Logic + +1. **Classify** prompt → code, math, reasoning, creative, or general +2. **Score** each available model based on category match, speed tier, context window, and model size +3. **Filter** unhealthy models (based on sliding-window error/rate-limit rates) +4. 
**Round-robin** across top-scoring providers to spread rate-limit load +5. **Fallback** on 429/5xx → exclude failed model, pick next best + +## Default Provider Registry + +| Provider | Models | Speed | Strengths | +| -------------- | ---------------------------------------- | ---------- | ------------------------ | +| **Groq** | Llama 3.3 70B, Llama 3.1 8B, Gemma 2 9B | ⚡ Fastest | General, reasoning, code | +| **OpenRouter** | DeepSeek R1, Llama 3.3 70B, Gemma 2 9B | Medium | Reasoning, code, math | +| **Together** | Llama 3.3 70B Turbo, DeepSeek R1 Distill | Medium | General, reasoning, code | +| **Cerebras** | Llama 3.3 70B | ⚡ Fastest | General, reasoning, code | + +## Adding Custom Providers + +Any OpenAI-compatible endpoint works: + +```typescript +import { LlmRouter, DEFAULT_PROVIDERS } from '@bytelyst/llm-router'; + +const router = new LlmRouter({ + providers: [ + ...DEFAULT_PROVIDERS, + { + name: 'my-provider', + baseUrl: 'https://my-api.example.com/v1', + apiKeyEnv: 'MY_PROVIDER_KEY', + rpmLimit: 60, + tpmLimit: 100_000, + models: [ + { + id: 'my-model', + label: 'My Model', + contextWindow: 32_000, + strengths: ['general', 'code'], + speedTier: 2, + }, + ], + }, + ], +}); +``` diff --git a/packages/llm-router/package.json b/packages/llm-router/package.json new file mode 100644 index 00000000..a3b65e32 --- /dev/null +++ b/packages/llm-router/package.json @@ -0,0 +1,26 @@ +{ + "name": "@bytelyst/llm-router", + "version": "0.1.0", + "description": "Pure-code LLM router for free-tier API providers with round-robin, fallback, and health tracking", + "type": "module", + "exports": { + ".": { + "import": "./dist/index.js", + "types": "./dist/index.d.ts" + } + }, + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "files": [ + "dist" + ], + "scripts": { + "build": "tsc", + "test": "vitest run", + "typecheck": "tsc --noEmit" + }, + "devDependencies": { + "vitest": "^3.0.0", + "typescript": "^5.7.0" + } +} diff --git 
a/packages/llm-router/src/__tests__/classifier.test.ts b/packages/llm-router/src/__tests__/classifier.test.ts new file mode 100644 index 00000000..440bfbe7 --- /dev/null +++ b/packages/llm-router/src/__tests__/classifier.test.ts @@ -0,0 +1,73 @@ +import { describe, it, expect } from 'vitest'; +import { classifyPrompt } from '../classifier.js'; + +describe('classifyPrompt', () => { + it('classifies code prompts', () => { + const result = classifyPrompt([ + { role: 'user', content: 'Write a typescript function to sort an array' }, + ]); + expect(result.category).toBe('code'); + expect(result.estimatedTokens).toBeGreaterThan(0); + }); + + it('classifies code with keywords like refactor and debug', () => { + const result = classifyPrompt([ + { role: 'user', content: 'Debug this error in my React component and refactor the handler' }, + ]); + expect(result.category).toBe('code'); + }); + + it('classifies math prompts', () => { + const result = classifyPrompt([ + { role: 'user', content: 'Calculate the integral of x^2 from 0 to 5' }, + ]); + expect(result.category).toBe('math'); + }); + + it('classifies reasoning prompts', () => { + const result = classifyPrompt([ + { + role: 'user', + content: + 'Explain step by step why this approach has trade-offs and analyze the implications', + }, + ]); + expect(result.category).toBe('reasoning'); + }); + + it('classifies creative prompts', () => { + const result = classifyPrompt([ + { role: 'user', content: 'Write a short story about a robot who learns to paint' }, + ]); + expect(result.category).toBe('creative'); + }); + + it('defaults to general for ambiguous prompts', () => { + const result = classifyPrompt([{ role: 'user', content: 'Hello, how are you?' 
}]); + expect(result.category).toBe('general'); + }); + + it('estimates tokens roughly correctly', () => { + const text = 'a'.repeat(400); // ~100 tokens + const result = classifyPrompt([{ role: 'user', content: text }]); + expect(result.estimatedTokens).toBe(100); + }); + + it('handles multi-message conversations', () => { + const result = classifyPrompt([ + { role: 'system', content: 'You are a coding assistant' }, + { role: 'user', content: 'Fix the bug in my python function' }, + ]); + expect(result.category).toBe('code'); + }); + + it('detects code blocks in backticks', () => { + const result = classifyPrompt([ + { + role: 'user', + content: 'What is wrong with this?\n```\nconst x = 1;\nconsole.log(x);\n```', + }, + ]); + expect(result.category).toBe('code'); + }); +}); diff --git a/packages/llm-router/src/__tests__/health.test.ts b/packages/llm-router/src/__tests__/health.test.ts new file mode 100644 index 00000000..31f864d9 --- /dev/null +++ b/packages/llm-router/src/__tests__/health.test.ts @@ -0,0 +1,121 @@ +import { describe, it, expect, beforeEach } from 'vitest'; +import { HealthTracker } from '../health.js'; + +describe('HealthTracker', () => { + let tracker: HealthTracker; + + beforeEach(() => { + tracker = new HealthTracker({ windowMs: 10_000, errorThreshold: 0.5, rateLimitThreshold: 0.3 }); + }); + + it('reports healthy with no data', () => { + expect(tracker.isHealthy('groq', 'llama-3.3-70b')).toBe(true); + }); + + it('reports healthy with all successes', () => { + for (let i = 0; i < 5; i++) { + tracker.record('groq', 'llama-3.3-70b', { + timestamp: Date.now(), + latencyMs: 200, + status: 'success', + }); + } + expect(tracker.isHealthy('groq', 'llama-3.3-70b')).toBe(true); + }); + + it('marks unhealthy when error rate exceeds threshold', () => { + for (let i = 0; i < 5; i++) { + tracker.record('groq', 'llama-3.3-70b', { + timestamp: Date.now(), + latencyMs: 200, + status: 'error', + }); + } + expect(tracker.isHealthy('groq', 
'llama-3.3-70b')).toBe(false); + }); + + it('marks unhealthy when rate-limit rate exceeds threshold', () => { + // 2 successes + 3 rate limits = 60% rate limit rate > 30% threshold + for (let i = 0; i < 2; i++) { + tracker.record('openrouter', 'model-a', { + timestamp: Date.now(), + latencyMs: 100, + status: 'success', + }); + } + for (let i = 0; i < 3; i++) { + tracker.record('openrouter', 'model-a', { + timestamp: Date.now(), + latencyMs: 50, + status: 'rate_limit', + }); + } + expect(tracker.isHealthy('openrouter', 'model-a')).toBe(false); + }); + + it('assumes healthy with fewer than 3 records', () => { + tracker.record('groq', 'llama-3.3-70b', { + timestamp: Date.now(), + latencyMs: 200, + status: 'error', + }); + tracker.record('groq', 'llama-3.3-70b', { + timestamp: Date.now(), + latencyMs: 200, + status: 'error', + }); + // Only 2 records — not enough data, should still be healthy + expect(tracker.isHealthy('groq', 'llama-3.3-70b')).toBe(true); + }); + + it('computes avg and p95 latency', () => { + const latencies = [100, 200, 300, 400, 500]; + for (const latencyMs of latencies) { + tracker.record('groq', 'model-a', { + timestamp: Date.now(), + latencyMs, + status: 'success', + }); + } + const snap = tracker.snapshot('groq', 'model-a'); + expect(snap.avgLatencyMs).toBe(300); + expect(snap.p95LatencyMs).toBe(500); + expect(snap.successes).toBe(5); + }); + + it('tracks different providers independently', () => { + tracker.record('groq', 'model-a', { + timestamp: Date.now(), + latencyMs: 100, + status: 'success', + }); + tracker.record('openrouter', 'model-b', { + timestamp: Date.now(), + latencyMs: 500, + status: 'error', + }); + + const snapA = tracker.snapshot('groq', 'model-a'); + const snapB = tracker.snapshot('openrouter', 'model-b'); + expect(snapA.successes).toBe(1); + expect(snapB.errors).toBe(1); + }); + + it('returns all snapshots', () => { + tracker.record('groq', 'model-a', { timestamp: Date.now(), latencyMs: 100, status: 'success' }); + 
tracker.record('together', 'model-b', { + timestamp: Date.now(), + latencyMs: 200, + status: 'success', + }); + + const all = tracker.allSnapshots(); + expect(all).toHaveLength(2); + }); + + it('resets all data', () => { + tracker.record('groq', 'model-a', { timestamp: Date.now(), latencyMs: 100, status: 'success' }); + tracker.reset(); + expect(tracker.allSnapshots()).toHaveLength(0); + }); +}); diff --git a/packages/llm-router/src/__tests__/registry.test.ts b/packages/llm-router/src/__tests__/registry.test.ts new file mode 100644 index 00000000..4bf9fd9e --- /dev/null +++ b/packages/llm-router/src/__tests__/registry.test.ts @@ -0,0 +1,83 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { getAvailableProviders, DEFAULT_PROVIDERS } from '../registry.js'; +import type { ProviderConfig } from '../types.js'; + +describe('getAvailableProviders', () => { + const saved: Record = {}; + + beforeEach(() => { + // Save and clear all default provider env vars + for (const p of DEFAULT_PROVIDERS) { + saved[p.apiKeyEnv] = process.env[p.apiKeyEnv]; + delete process.env[p.apiKeyEnv]; + } + }); + + afterEach(() => { + // Restore original env + for (const [key, val] of Object.entries(saved)) { + if (val === undefined) { + delete process.env[key]; + } else { + process.env[key] = val; + } + } + }); + + it('returns empty array when no API keys are set', () => { + expect(getAvailableProviders()).toEqual([]); + }); + + it('returns only providers with API keys set', () => { + process.env.GROQ_API_KEY = 'gsk_test'; + const result = getAvailableProviders(); + expect(result).toHaveLength(1); + expect(result[0]!.name).toBe('groq'); + }); + + it('returns multiple providers when multiple keys are set', () => { + process.env.GROQ_API_KEY = 'gsk_test'; + process.env.CEREBRAS_API_KEY = 'csk_test'; + const result = getAvailableProviders(); + expect(result).toHaveLength(2); + const names = result.map(p => p.name); + expect(names).toContain('groq'); + 
expect(names).toContain('cerebras'); + }); + + it('excludes providers with empty string API key', () => { + process.env.GROQ_API_KEY = ''; + expect(getAvailableProviders()).toEqual([]); + }); + + it('works with custom provider list', () => { + const custom: ProviderConfig[] = [ + { + name: 'custom', + baseUrl: 'https://example.com/v1', + apiKeyEnv: 'CUSTOM_TEST_KEY', + rpmLimit: 10, + tpmLimit: 0, + models: [], + }, + ]; + expect(getAvailableProviders(custom)).toEqual([]); + + process.env.CUSTOM_TEST_KEY = 'test'; + expect(getAvailableProviders(custom)).toHaveLength(1); + delete process.env.CUSTOM_TEST_KEY; + }); + + it('DEFAULT_PROVIDERS includes all 4 providers', () => { + expect(DEFAULT_PROVIDERS).toHaveLength(4); + const names = DEFAULT_PROVIDERS.map(p => p.name); + expect(names).toEqual(['groq', 'openrouter', 'together', 'cerebras']); + }); + + it('OpenRouter provider has recommended extra headers', () => { + const openrouter = DEFAULT_PROVIDERS.find(p => p.name === 'openrouter'); + expect(openrouter?.extraHeaders).toBeDefined(); + expect(openrouter?.extraHeaders?.['HTTP-Referer']).toBeDefined(); + expect(openrouter?.extraHeaders?.['X-Title']).toBeDefined(); + }); +}); diff --git a/packages/llm-router/src/__tests__/router.test.ts b/packages/llm-router/src/__tests__/router.test.ts new file mode 100644 index 00000000..9ece98e8 --- /dev/null +++ b/packages/llm-router/src/__tests__/router.test.ts @@ -0,0 +1,290 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { LlmRouter } from '../router.js'; +import type { ProviderConfig, ChatCompletionResponse } from '../types.js'; +import * as client from '../client.js'; + +// Mock the HTTP client +vi.mock('../client.js', () => ({ + sendChatCompletion: vi.fn(), +})); + +const MOCK_RESPONSE: ChatCompletionResponse = { + id: 'chatcmpl-test', + object: 'chat.completion', + created: Date.now(), + model: 'test-model', + choices: [ + { + index: 0, + message: { role: 'assistant', content: 'Hello!' 
}, + finish_reason: 'stop', + }, + ], + usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 }, +}; + +const TEST_PROVIDERS: ProviderConfig[] = [ + { + name: 'test-fast', + baseUrl: 'https://fast.test/v1', + apiKeyEnv: 'TEST_FAST_KEY', + rpmLimit: 30, + tpmLimit: 10_000, + models: [ + { + id: 'fast-model', + label: 'Fast', + contextWindow: 8_192, + strengths: ['general'], + speedTier: 1, + }, + ], + }, + { + name: 'test-quality', + baseUrl: 'https://quality.test/v1', + apiKeyEnv: 'TEST_QUALITY_KEY', + rpmLimit: 20, + tpmLimit: 0, + models: [ + { + id: 'quality-model', + label: 'Quality', + contextWindow: 128_000, + strengths: ['code', 'reasoning'], + speedTier: 2, + }, + ], + }, +]; + +describe('LlmRouter', () => { + beforeEach(() => { + vi.resetAllMocks(); + // Set fake API keys + process.env.TEST_FAST_KEY = 'test-key-fast'; + process.env.TEST_QUALITY_KEY = 'test-key-quality'; + }); + + afterEach(() => { + delete process.env.TEST_FAST_KEY; + delete process.env.TEST_QUALITY_KEY; + }); + + it('throws if no providers have API keys', () => { + delete process.env.TEST_FAST_KEY; + delete process.env.TEST_QUALITY_KEY; + expect(() => new LlmRouter({ providers: TEST_PROVIDERS })).toThrow('No providers available'); + }); + + it('routes a simple prompt to a provider', async () => { + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 150, + status: 200, + }); + + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + const result = await router.chat({ + messages: [{ role: 'user', content: 'Hello' }], + }); + + expect(result.response.choices[0]!.message.content).toBe('Hello!'); + expect(result.attempts).toBe(1); + expect(result.provider).toBeDefined(); + expect(result.model).toBeDefined(); + }); + + it('retries on 429 with fallback provider', async () => { + // First call: rate limited + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: null as unknown as ChatCompletionResponse, + 
latencyMs: 50, + status: 429, + }); + // Second call: success + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 200, + status: 200, + }); + + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + const result = await router.chat({ + messages: [{ role: 'user', content: 'Hello' }], + }); + + expect(result.attempts).toBe(2); + expect(result.response.choices[0]!.message.content).toBe('Hello!'); + }); + + it('retries on error with fallback provider', async () => { + // First call: error + vi.mocked(client.sendChatCompletion).mockRejectedValueOnce(new Error('Network error')); + // Second call: success + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 200, + status: 200, + }); + + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + const result = await router.chat({ + messages: [{ role: 'user', content: 'Hello' }], + }); + + expect(result.attempts).toBe(2); + }); + + it('throws after exhausting all retries', async () => { + vi.mocked(client.sendChatCompletion).mockRejectedValue(new Error('All down')); + + const router = new LlmRouter({ providers: TEST_PROVIDERS, maxRetries: 2 }); + await expect(router.chat({ messages: [{ role: 'user', content: 'Hello' }] })).rejects.toThrow( + 'All providers exhausted' + ); + }); + + it('routes code prompts to code-capable models', async () => { + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 200, + status: 200, + }); + + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + await router.chat({ + messages: [{ role: 'user', content: 'Write a typescript function to sort an array' }], + }); + + // Should have been called with quality-model (has 'code' strength) + const callArgs = vi.mocked(client.sendChatCompletion).mock.calls[0]!; + expect(callArgs[1]).toBe('quality-model'); + }); + + it('fires telemetry callback on success', async () => { + 
vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 150, + status: 200, + }); + + const telemetry = vi.fn(); + const router = new LlmRouter({ providers: TEST_PROVIDERS, onTelemetry: telemetry }); + await router.chat({ messages: [{ role: 'user', content: 'Hello' }] }); + + expect(telemetry).toHaveBeenCalledWith( + expect.objectContaining({ event: 'success', attempt: 1 }) + ); + }); + + it('fires telemetry callback on rate limit', async () => { + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: null as unknown as ChatCompletionResponse, + latencyMs: 50, + status: 429, + }); + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 200, + status: 200, + }); + + const telemetry = vi.fn(); + const router = new LlmRouter({ providers: TEST_PROVIDERS, onTelemetry: telemetry }); + await router.chat({ messages: [{ role: 'user', content: 'Hello' }] }); + + expect(telemetry).toHaveBeenCalledWith(expect.objectContaining({ event: 'rate_limit' })); + }); + + it('handles explicit provider:model routing', async () => { + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 100, + status: 200, + }); + + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + const result = await router.chat({ + messages: [{ role: 'user', content: 'Hello' }], + model: 'test-fast:fast-model', + }); + + expect(result.provider).toBe('test-fast'); + expect(result.model).toBe('fast-model'); + }); + + it('throws for unknown explicit provider', async () => { + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + await expect( + router.chat({ messages: [{ role: 'user', content: 'Hello' }], model: 'unknown:model' }) + ).rejects.toThrow('Provider "unknown" not found'); + }); + + it('returns health snapshots', async () => { + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 150, + 
status: 200, + }); + + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + await router.chat({ messages: [{ role: 'user', content: 'Hello' }] }); + + const health = router.getHealth(); + expect(health.length).toBeGreaterThan(0); + expect(health[0]!.successes).toBe(1); + }); + + it('lists available providers', () => { + const router = new LlmRouter({ providers: TEST_PROVIDERS }); + expect(router.getProviders()).toEqual(['test-fast', 'test-quality']); + }); + + it('fires telemetry for explicit model routing', async () => { + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: MOCK_RESPONSE, + latencyMs: 100, + status: 200, + }); + + const telemetry = vi.fn(); + const router = new LlmRouter({ providers: TEST_PROVIDERS, onTelemetry: telemetry }); + await router.chat({ + messages: [{ role: 'user', content: 'Hello' }], + model: 'test-fast:fast-model', + }); + + expect(telemetry).toHaveBeenCalledWith( + expect.objectContaining({ + event: 'success', + provider: 'test-fast', + model: 'fast-model', + category: 'explicit', + }) + ); + }); + + it('records health on explicit model 429', async () => { + vi.mocked(client.sendChatCompletion).mockResolvedValueOnce({ + response: null as unknown as ChatCompletionResponse, + latencyMs: 50, + status: 429, + }); + + const telemetry = vi.fn(); + const router = new LlmRouter({ providers: TEST_PROVIDERS, onTelemetry: telemetry }); + await expect( + router.chat({ messages: [{ role: 'user', content: 'Hello' }], model: 'test-fast:fast-model' }) + ).rejects.toThrow('Rate limited'); + + expect(telemetry).toHaveBeenCalledWith( + expect.objectContaining({ event: 'rate_limit', provider: 'test-fast' }) + ); + + // Health should have recorded the rate limit + const health = router.getHealth(); + expect(health).toHaveLength(1); + expect(health[0]!.rateLimits).toBe(1); + }); +}); diff --git a/packages/llm-router/src/__tests__/selector.test.ts b/packages/llm-router/src/__tests__/selector.test.ts new file mode 100644 
index 00000000..429d629f --- /dev/null +++ b/packages/llm-router/src/__tests__/selector.test.ts @@ -0,0 +1,138 @@ +import { describe, it, expect, beforeEach } from 'vitest'; +import { + selectCandidates, + pickNext, + excludeCandidate, + createRoundRobinState, +} from '../selector.js'; +import { HealthTracker } from '../health.js'; +import type { ProviderConfig } from '../types.js'; + +const MOCK_PROVIDERS: ProviderConfig[] = [ + { + name: 'fast-provider', + baseUrl: 'https://fast.example.com/v1', + apiKeyEnv: 'FAST_KEY', + rpmLimit: 30, + tpmLimit: 10_000, + models: [ + { + id: 'small-8b', + label: 'Small 8B', + contextWindow: 8_192, + strengths: ['general'], + speedTier: 1, + }, + { + id: 'large-70b', + label: 'Large 70B', + contextWindow: 128_000, + strengths: ['code', 'reasoning'], + speedTier: 1, + }, + ], + }, + { + name: 'quality-provider', + baseUrl: 'https://quality.example.com/v1', + apiKeyEnv: 'QUALITY_KEY', + rpmLimit: 20, + tpmLimit: 0, + models: [ + { + id: 'deepseek-r1', + label: 'DeepSeek R1', + contextWindow: 64_000, + strengths: ['reasoning', 'code', 'math'], + speedTier: 3, + }, + ], + }, +]; + +describe('selectCandidates', () => { + let health: HealthTracker; + + beforeEach(() => { + health = new HealthTracker(); + }); + + it('returns candidates sorted by score for code', () => { + const candidates = selectCandidates(MOCK_PROVIDERS, 'code', health); + expect(candidates.length).toBeGreaterThan(0); + // large-70b and deepseek-r1 should score high for code + const names = candidates.map(c => c.model.id); + expect(names[0]).toBe('large-70b'); // speed 1 + code strength + 70b bonus + }); + + it('returns candidates sorted by score for general', () => { + const candidates = selectCandidates(MOCK_PROVIDERS, 'general', health); + // small-8b has 'general' strength + speed tier 1 + expect(candidates[0]!.model.id).toBe('small-8b'); + }); + + it('filters out unhealthy providers', () => { + // Make fast-provider/large-70b unhealthy + for (let i = 0; i < 5; 
i++) { + health.record('fast-provider', 'large-70b', { + timestamp: Date.now(), + latencyMs: 100, + status: 'error', + }); + } + const candidates = selectCandidates(MOCK_PROVIDERS, 'code', health); + const ids = candidates.map(c => `${c.provider.name}::${c.model.id}`); + expect(ids).not.toContain('fast-provider::large-70b'); + }); +}); + +describe('pickNext', () => { + it('returns null for empty candidates', () => { + const state = createRoundRobinState(); + expect(pickNext([], state)).toBeNull(); + }); + + it('returns the only candidate when there is one', () => { + const state = createRoundRobinState(); + const candidate = { provider: MOCK_PROVIDERS[0]!, model: MOCK_PROVIDERS[0]!.models[0]! }; + expect(pickNext([candidate], state)).toBe(candidate); + }); + + it('round-robins across providers', () => { + const state = createRoundRobinState(); + const candidates = [ + { provider: MOCK_PROVIDERS[0]!, model: MOCK_PROVIDERS[0]!.models[0]! }, + { provider: MOCK_PROVIDERS[1]!, model: MOCK_PROVIDERS[1]!.models[0]! }, + ]; + + const first = pickNext(candidates, state); + const second = pickNext(candidates, state); + expect(first!.provider.name).not.toBe(second!.provider.name); + }); + + it('uses independent state per instance', () => { + const stateA = createRoundRobinState(); + const stateB = createRoundRobinState(); + const candidates = [ + { provider: MOCK_PROVIDERS[0]!, model: MOCK_PROVIDERS[0]!.models[0]! }, + { provider: MOCK_PROVIDERS[1]!, model: MOCK_PROVIDERS[1]!.models[0]! }, + ]; + + const fromA = pickNext(candidates, stateA); + const fromB = pickNext(candidates, stateB); + // Both start at same position since states are independent + expect(fromA!.provider.name).toBe(fromB!.provider.name); + }); +}); + +describe('excludeCandidate', () => { + it('removes the specified candidate', () => { + const candidates = [ + { provider: MOCK_PROVIDERS[0]!, model: MOCK_PROVIDERS[0]!.models[0]! }, + { provider: MOCK_PROVIDERS[1]!, model: MOCK_PROVIDERS[1]!.models[0]! 
// ── Keyword patterns for classification ────────────────────────
//
// Each pattern contributes at most one point to its category, no matter how
// many times it matches. None of the patterns carries the `g` flag, so
// `RegExp.prototype.test` is stateless and the tables can be shared safely.

const CODE_PATTERNS = [
  /\b(function|const |let |var |class |import |export |return |async |await )\b/,
  // `print(` must not be followed by `\b`: a word boundary after `(` only
  // exists when the next character is a word character, which would reject
  // the very common `print("...")` / `print('...')` forms.
  /\b(?:def |lambda )\b|\bprint\(|\bif __name__\b/,
  // Arrow functions, with or without whitespace before the arrow: `)=>`, `) =>`.
  /[{}();]\s*=>/,
  /```[\s\S]*```/,
  /\b(typescript|javascript|python|rust|golang|java|kotlin|swift|sql|html|css|react|node)\b/i,
  /\b(debug|refactor|compile|build|deploy|lint|test|api|endpoint|route|middleware)\b/i,
  /\b(fix|bug|error|exception|stack trace|undefined|null|NaN)\b/i,
];

const MATH_PATTERNS = [
  /\b(calculate|compute|solve|equation|formula|integral|derivative|matrix)\b/i,
  /\b(probability|statistics|regression|correlation|variance|median|mean)\b/i,
  /\b(algebra|geometry|calculus|theorem|proof|hypothesis)\b/i,
  /[+\-*/^=]{2,}/,
  /\d+\s*[+\-*/^]\s*\d+/,
];

const REASONING_PATTERNS = [
  /\b(explain|analyze|compare|evaluate|reason|logic|argument|conclusion)\b/i,
  /\b(why|how does|what if|pros and cons|trade-?offs|implications)\b/i,
  /\b(step[- ]by[- ]step|chain of thought|think through|break down)\b/i,
  /\b(strategy|approach|methodology|framework|architecture|design)\b/i,
];

const CREATIVE_PATTERNS = [
  /\b(write|compose|draft|create|generate|story|poem|essay|blog|article)\b/i,
  /\b(creative|imaginative|brainstorm|ideas|fiction|narrative|dialogue)\b/i,
  /\b(rewrite|rephrase|summarize|translate|tone|style|voice)\b/i,
];

// ── Token estimation ───────────────────────────────────────────

/**
 * Rough token estimate: ~4 chars per token for English text.
 * Good enough for routing decisions.
 *
 * @param text - Text to measure.
 * @returns `ceil(text.length / 4)`; `0` for the empty string.
 */
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

// ── Classifier ─────────────────────────────────────────────────

/**
 * Count how many of `patterns` match `text` at least once.
 *
 * @param text - Text to scan.
 * @param patterns - Patterns to try; each adds at most 1 to the result.
 * @returns Number of matching patterns, in `0..patterns.length`.
 */
function countMatches(text: string, patterns: readonly RegExp[]): number {
  let count = 0;
  for (const pattern of patterns) {
    if (pattern.test(text)) count++;
  }
  return count;
}

/**
 * Classify a prompt into a category based on keyword matching.
 * No LLM needed — pure regex heuristics.
 *
 * The contents of all messages (every role) are joined with newlines and
 * scored against each category's pattern table. The highest score wins; ties
 * go to the category listed first in `scores` (code, math, reasoning,
 * creative). `general` is returned only when no other category matches.
 *
 * @param messages - Conversation messages; only `content` is inspected.
 * @returns The winning category and a rough token estimate of the joined text.
 */
export function classifyPrompt(
  messages: { role: string; content: string }[]
): ClassificationResult {
  const fullText = messages.map(m => m.content).join('\n');
  const estimatedTokens = estimateTokens(fullText);

  const scores: Record<PromptCategory, number> = {
    code: countMatches(fullText, CODE_PATTERNS),
    math: countMatches(fullText, MATH_PATTERNS),
    reasoning: countMatches(fullText, REASONING_PATTERNS),
    creative: countMatches(fullText, CREATIVE_PATTERNS),
    general: 1, // baseline: wins only when every other score is 0
  };

  // Pick highest scoring category. The strict `>` keeps the first category
  // that reaches a given score, which is what makes the key order above the
  // tie-break order.
  let best: PromptCategory = 'general';
  let bestScore = 0;
  for (const [cat, score] of Object.entries(scores) as [PromptCategory, number][]) {
    if (score > bestScore) {
      bestScore = score;
      best = cat;
    }
  }

  return { category: best, estimatedTokens };
}
/**
 * Send an OpenAI-compatible chat completion request to a provider.
 *
 * The request is always sent with `stream: false`. `temperature`,
 * `max_tokens` and `top_p` are forwarded only when they are defined on
 * `request`. `provider.extraHeaders` are spread last, so they can override
 * the default `Content-Type` / `Authorization` headers.
 *
 * @param provider - Provider to call; its API key is read from the
 *   environment variable named by `provider.apiKeyEnv`.
 * @param modelId - Model identifier sent as `model` in the request body.
 * @param request - Chat completion request (messages plus sampling options).
 * @param timeoutMs - Abort the request, including reading the body, after
 *   this many milliseconds. Defaults to 30 000.
 * @returns The parsed response with the time to response headers in
 *   `latencyMs` and the HTTP status. On HTTP 429 the promise resolves (it
 *   does not reject) with `status: 429` and `response` set to `null`, so
 *   callers must check `status` before touching `response`.
 * @throws Error when the API key is missing, when the provider answers with
 *   any other non-2xx status, or when `fetch` fails or is aborted by the
 *   timeout.
 */
export async function sendChatCompletion(
  provider: ProviderConfig,
  modelId: string,
  request: ChatCompletionRequest,
  timeoutMs: number = 30_000
): Promise<{ response: ChatCompletionResponse; latencyMs: number; status: number }> {
  const apiKey = process.env[provider.apiKeyEnv];
  if (!apiKey) {
    throw new Error(`Missing API key: env var ${provider.apiKeyEnv} is not set`);
  }

  // Tolerate a base URL configured with trailing slashes so we never request
  // `.../v1//chat/completions`.
  const url = `${provider.baseUrl.replace(/\/+$/, '')}/chat/completions`;
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
    Authorization: `Bearer ${apiKey}`,
    ...provider.extraHeaders,
  };

  const body = JSON.stringify({
    model: modelId,
    messages: request.messages,
    ...(request.temperature !== undefined && { temperature: request.temperature }),
    ...(request.max_tokens !== undefined && { max_tokens: request.max_tokens }),
    ...(request.top_p !== undefined && { top_p: request.top_p }),
    stream: false,
  });

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  const start = Date.now();

  try {
    const res = await fetch(url, {
      method: 'POST',
      headers,
      body,
      signal: controller.signal,
    });

    const latencyMs = Date.now() - start;

    if (res.status === 429) {
      // The body is not needed, but leaving it unread can keep the underlying
      // connection busy until garbage collection. Cancel it explicitly; a
      // failure to cancel must not mask the rate-limit result.
      await res.body?.cancel().catch(() => undefined);
      return {
        response: null as unknown as ChatCompletionResponse,
        latencyMs,
        status: 429,
      };
    }

    if (!res.ok) {
      // Best effort: include the first 200 chars of the error body, if readable.
      const text = await res.text().catch(() => '');
      throw new Error(`${provider.name} returned ${res.status}: ${text.slice(0, 200)}`);
    }

    const data = (await res.json()) as ChatCompletionResponse;
    return { response: data, latencyMs, status: res.status };
  } finally {
    // Runs on every exit path so the timer can never fire after we are done.
    clearTimeout(timer);
  }
}
/**
 * Sliding-window health tracker for provider+model pairs.
 * Tracks latency, error rates, and rate-limit hits.
 *
 * Only records whose `timestamp` lies within the last `windowMs`
 * milliseconds are kept; older ones are discarded whenever a pair is
 * written or read. A pair counts as healthy while it has fewer than three
 * records in the window, or while both its error rate and its rate-limit
 * rate are strictly below their thresholds.
 */
export class HealthTracker {
  /** Per-pair state, keyed by an unambiguous encoding of (provider, model). */
  private records = new Map<string, TrackedPair>();
  private readonly windowMs: number;
  private readonly errorThreshold: number;
  private readonly rateLimitThreshold: number;

  /**
   * @param opts.windowMs - Sliding window length in ms (default 60 000).
   * @param opts.errorThreshold - Error rate at or above which a pair is unhealthy (default 0.5).
   * @param opts.rateLimitThreshold - Rate-limit rate at or above which a pair is unhealthy (default 0.3).
   */
  constructor(opts?: { windowMs?: number; errorThreshold?: number; rateLimitThreshold?: number }) {
    this.windowMs = opts?.windowMs ?? 60_000;
    this.errorThreshold = opts?.errorThreshold ?? 0.5;
    this.rateLimitThreshold = opts?.rateLimitThreshold ?? 0.3;
  }

  /**
   * Map key for a pair. JSON-encoding the tuple cannot collide, whatever
   * characters the names contain (model ids routinely contain `:` and `/`).
   */
  private key(provider: string, model: string): string {
    return JSON.stringify([provider, model]);
  }

  /** Keep only the records that are still inside the sliding window. */
  private prune(records: RequestRecord[]): RequestRecord[] {
    const cutoff = Date.now() - this.windowMs;
    return records.filter(r => r.timestamp >= cutoff);
  }

  /** Record a completed request (success, rate_limit, or error). */
  record(provider: string, model: string, entry: RequestRecord): void {
    const k = this.key(provider, model);
    const existing = this.records.get(k)?.records ?? [];
    existing.push(entry);
    this.records.set(k, { provider, model, records: this.prune(existing) });
  }

  /**
   * Get health snapshot for a provider+model pair.
   *
   * Reading also prunes the stored records and starts tracking the pair if
   * it was unknown, so it will appear in {@link allSnapshots} afterwards.
   */
  snapshot(provider: string, model: string): HealthSnapshot {
    const k = this.key(provider, model);
    const records = this.prune(this.records.get(k)?.records ?? []);
    this.records.set(k, { provider, model, records });

    const total = records.length;
    let successes = 0;
    let rateLimits = 0;
    let errors = 0;
    const successLatencies: number[] = [];
    for (const r of records) {
      if (r.status === 'success') {
        successes++;
        successLatencies.push(r.latencyMs);
      } else if (r.status === 'rate_limit') {
        rateLimits++;
      } else if (r.status === 'error') {
        errors++;
      }
    }
    successLatencies.sort((a, b) => a - b);

    const n = successLatencies.length;
    const avgLatencyMs = n > 0 ? successLatencies.reduce((a, b) => a + b, 0) / n : 0;

    // Nearest-rank percentile: the ceil(0.95 * n)-th smallest value. Integer
    // arithmetic avoids float error; for n >= 1 the index is within 0..n-1.
    const p95LatencyMs = n > 0 ? (successLatencies[Math.ceil((n * 95) / 100) - 1] ?? 0) : 0;

    // Healthy = not too many errors or rate limits
    const errorRate = total > 0 ? errors / total : 0;
    const rateLimitRate = total > 0 ? rateLimits / total : 0;
    const healthy =
      total < 3 || // not enough data → assume healthy
      (errorRate < this.errorThreshold && rateLimitRate < this.rateLimitThreshold);

    return {
      provider,
      model,
      totalRequests: total,
      successes,
      rateLimits,
      errors,
      avgLatencyMs: Math.round(avgLatencyMs),
      p95LatencyMs: Math.round(p95LatencyMs),
      healthy,
    };
  }

  /** Check if a specific provider+model is currently healthy. */
  isHealthy(provider: string, model: string): boolean {
    return this.snapshot(provider, model).healthy;
  }

  /** Get snapshots for every pair that has been recorded or queried. */
  allSnapshots(): HealthSnapshot[] {
    // Copy first: snapshot() writes back into the map while we iterate.
    // The names come from the stored entry rather than from parsing the key.
    return [...this.records.values()].map(pair => this.snapshot(pair.provider, pair.model));
  }

  /** Clear all tracking data. */
  reset(): void {
    this.records.clear();
  }
}

/** Records kept for one provider+model pair, together with the names they belong to. */
interface TrackedPair {
  provider: string;
  model: string;
  records: RequestRecord[];
}

// ─────────────────────────────────────────────────────────────────
// The package barrel file and the start of registry.ts shared this physical
// line; they are kept verbatim as comments so this span stays a
// self-contained unit.
//
// diff --git a/packages/llm-router/src/index.ts b/packages/llm-router/src/index.ts
// new file mode 100644
// index 00000000..f7dbe513
// --- /dev/null
// +++ b/packages/llm-router/src/index.ts
// @@ -0,0 +1,25 @@
// export { LlmRouter } from './router.js';
// export type { TelemetryEntry } from './router.js';
//
// export { DEFAULT_PROVIDERS, getAvailableProviders } from './registry.js';
// export { classifyPrompt } from './classifier.js';
// export { HealthTracker } from './health.js';
// export { selectCandidates, pickNext, excludeCandidate, createRoundRobinState } from './selector.js';
// export type { SelectionCandidate } from './selector.js';
// export { sendChatCompletion } from './client.js';
//
// export type {
//   ModelConfig,
//   ProviderConfig,
//   PromptCategory,
//   ClassificationResult,
//   HealthSnapshot,
//   RequestRecord,
//   RouterConfig,
//   ChatMessage,
//   ChatCompletionRequest,
//   ChatCompletionChoice,
//   ChatCompletionUsage,
//   ChatCompletionResponse,
//   RouteResult,
// } from './types.js';
//
// diff --git a/packages/llm-router/src/registry.ts b/packages/llm-router/src/registry.ts
// new file mode 100644
// index 00000000..2d533111
// --- /dev/null
// +++ b/packages/llm-router/src/registry.ts
// @@ -0,0 +1,134 @@
// import type { ProviderConfig } from './types.js';
//
// /**
//  * Default free-tier provider configurations.
//  * All use OpenAI-compatible /v1/chat/completions endpoints.
/**
 * Default free-tier provider configurations.
 * All use OpenAI-compatible /v1/chat/completions endpoints.
 */
export const DEFAULT_PROVIDERS: ProviderConfig[] = [
  // ── Groq ─────────────────────────────────────────────────────
  // Free tier: 30 RPM, 14.4K TPM (large), 30K TPM (small)
  {
    name: 'groq',
    baseUrl: 'https://api.groq.com/openai/v1',
    apiKeyEnv: 'GROQ_API_KEY',
    rpmLimit: 30,
    tpmLimit: 14_400,
    models: [
      {
        id: 'llama-3.3-70b-versatile',
        label: 'Llama 3.3 70B',
        contextWindow: 128_000,
        strengths: ['general', 'reasoning', 'code'],
        speedTier: 1,
      },
      {
        id: 'llama-3.1-8b-instant',
        label: 'Llama 3.1 8B Instant',
        contextWindow: 128_000,
        strengths: ['general'],
        speedTier: 1,
      },
      {
        id: 'gemma2-9b-it',
        label: 'Gemma 2 9B',
        contextWindow: 8_192,
        strengths: ['general', 'creative'],
        speedTier: 1,
      },
    ],
  },

  // ── OpenRouter ───────────────────────────────────────────────
  // Free models available (rate-limited per model)
  {
    name: 'openrouter',
    baseUrl: 'https://openrouter.ai/api/v1',
    apiKeyEnv: 'OPENROUTER_API_KEY',
    extraHeaders: {
      'HTTP-Referer': 'https://bytelyst.com',
      'X-Title': 'ByteLyst LLM Router',
    },
    rpmLimit: 20,
    tpmLimit: 0,
    models: [
      {
        id: 'deepseek/deepseek-r1:free',
        label: 'DeepSeek R1 (Free)',
        contextWindow: 64_000,
        strengths: ['reasoning', 'code', 'math'],
        speedTier: 3,
      },
      {
        id: 'meta-llama/llama-3.3-70b-instruct:free',
        label: 'Llama 3.3 70B (Free)',
        contextWindow: 128_000,
        strengths: ['general', 'reasoning', 'code'],
        speedTier: 2,
      },
      {
        id: 'google/gemma-2-9b-it:free',
        label: 'Gemma 2 9B (Free)',
        contextWindow: 8_192,
        strengths: ['general', 'creative'],
        speedTier: 2,
      },
    ],
  },

  // ── Together AI ──────────────────────────────────────────────
  // Free tier: limited RPM, several open models
  {
    name: 'together',
    baseUrl: 'https://api.together.xyz/v1',
    apiKeyEnv: 'TOGETHER_API_KEY',
    rpmLimit: 20,
    tpmLimit: 0,
    models: [
      {
        id: 'meta-llama/Llama-3.3-70B-Instruct-Turbo',
        label: 'Llama 3.3 70B Turbo',
        contextWindow: 128_000,
        strengths: ['general', 'reasoning', 'code'],
        speedTier: 2,
      },
      {
        id: 'deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
        label: 'DeepSeek R1 Distill 70B',
        contextWindow: 128_000,
        strengths: ['reasoning', 'math', 'code'],
        speedTier: 2,
      },
    ],
  },

  // ── Cerebras ─────────────────────────────────────────────────
  // Free inference tier — extremely fast
  {
    name: 'cerebras',
    baseUrl: 'https://api.cerebras.ai/v1',
    apiKeyEnv: 'CEREBRAS_API_KEY',
    rpmLimit: 30,
    tpmLimit: 60_000,
    models: [
      {
        id: 'llama-3.3-70b',
        label: 'Llama 3.3 70B (Cerebras)',
        contextWindow: 128_000,
        strengths: ['general', 'reasoning', 'code'],
        speedTier: 1,
      },
    ],
  },
];

/**
 * Filter providers to only those with API keys present in env.
 *
 * @param providers - Providers to check (defaults to {@link DEFAULT_PROVIDERS}).
 * @returns The providers whose `apiKeyEnv` variable is set to a non-blank
 *   value, in their original order.
 */
export function getAvailableProviders(
  providers: ProviderConfig[] = DEFAULT_PROVIDERS
): ProviderConfig[] {
  return providers.filter(p => {
    const key = process.env[p.apiKeyEnv];
    // A whitespace-only value is not a usable key.
    return key !== undefined && key.trim() !== '';
  });
}

// ─────────────────────────────────────────────────────────────────
// diff --git a/packages/llm-router/src/router.ts b/packages/llm-router/src/router.ts
// new file mode 100644
// index 00000000..e8cdd821
// --- /dev/null
// +++ b/packages/llm-router/src/router.ts
// @@ -0,0 +1,285 @@
//
// router.ts imports, kept verbatim as comments because this span lists
// registry.ts, router.ts and the head of selector.ts as one unit:
//
// import type {
//   ChatCompletionRequest,
//   RouterConfig,
//   ProviderConfig,
//   RouteResult,
//   HealthSnapshot,
// } from './types.js';
// import { DEFAULT_PROVIDERS, getAvailableProviders } from './registry.js';
// import { classifyPrompt } from './classifier.js';
// import { HealthTracker } from './health.js';
// import { selectCandidates, pickNext, excludeCandidate, createRoundRobinState } from './selector.js';
// import { sendChatCompletion } from './client.js';

/**
 * Routes chat completion requests across the configured providers.
 *
 * Providers without an API key in the environment are dropped at
 * construction time. Every attempt is recorded in a {@link HealthTracker}
 * and reported through the optional telemetry hook.
 */
export class LlmRouter {
  private readonly providers: ProviderConfig[];
  private readonly health: HealthTracker;
  private readonly timeoutMs: number;
  private readonly maxRetries: number;
  private readonly log: (entry: TelemetryEntry) => void;
  private readonly roundRobinState: Map<string, number>;

  /**
   * @param config - Router options plus an optional `onTelemetry` hook that
   *   receives one entry per attempt.
   * @throws Error if none of the providers has its API key env var set.
   */
  constructor(config?: RouterConfig & { onTelemetry?: (entry: TelemetryEntry) => void }) {
    const allProviders = config?.providers ?? DEFAULT_PROVIDERS;
    this.providers = getAvailableProviders(allProviders);

    if (this.providers.length === 0) {
      throw new Error(
        'No providers available. Set at least one API key env var: ' +
          allProviders.map(p => p.apiKeyEnv).join(', ')
      );
    }

    this.health = new HealthTracker({
      windowMs: config?.healthWindowMs,
      errorThreshold: config?.errorThreshold,
      rateLimitThreshold: config?.rateLimitThreshold,
    });

    this.timeoutMs = config?.timeoutMs ?? 30_000;
    this.maxRetries = config?.maxRetries ?? 3;

    const hook = config?.onTelemetry;
    this.log = entry => {
      if (!hook) return;
      try {
        hook(entry);
      } catch {
        // Telemetry is best-effort. A throwing hook must never turn a served
        // completion into a provider failure (and a second, billed request).
      }
    };

    this.roundRobinState = createRoundRobinState();
  }

  /**
   * Route a chat completion request to the best available provider.
   *
   * If `request.model` contains `:` or `/`, the request goes to exactly that
   * model with no fallback. Otherwise the prompt is classified, healthy
   * candidates are ranked, and up to `maxRetries` attempts are made. After a
   * 429 or any thrown error (HTTP error status, timeout, network failure)
   * the failing provider+model is excluded and the next candidate is tried.
   *
   * @throws Error if no healthy candidate exists, or if every attempt failed.
   */
  async chat(request: ChatCompletionRequest): Promise<RouteResult> {
    const startTime = Date.now();

    // If user specified a specific provider:model or provider/model, use exactly that
    if (request.model && (request.model.includes(':') || request.model.includes('/'))) {
      return this.chatWithExplicitModel(request, startTime);
    }

    // Classify the prompt
    const classification = classifyPrompt(request.messages);

    // Get ranked candidates
    let candidates = selectCandidates(this.providers, classification.category, this.health);

    if (candidates.length === 0) {
      throw new Error('No healthy providers available for routing');
    }

    let lastError: Error | null = null;
    let attemptsMade = 0;

    for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
      const pick = pickNext(candidates, this.roundRobinState);
      if (!pick) break; // every candidate has already failed

      attemptsMade = attempt;
      const { provider, model } = pick;
      const attemptStart = Date.now();

      try {
        const result = await sendChatCompletion(provider, model.id, request, this.timeoutMs);

        if (result.status === 429) {
          // Rate limited — record and try next provider
          this.health.record(provider.name, model.id, {
            timestamp: Date.now(),
            latencyMs: result.latencyMs,
            status: 'rate_limit',
          });

          this.log({
            event: 'rate_limit',
            provider: provider.name,
            model: model.id,
            attempt,
            latencyMs: result.latencyMs,
            category: classification.category,
          });

          // Remember why this attempt failed so the final error is informative.
          lastError = new Error(`Rate limited by ${provider.name} for model ${model.id}`);
          candidates = excludeCandidate(candidates, provider.name, model.id);
          continue;
        }

        // Success
        this.health.record(provider.name, model.id, {
          timestamp: Date.now(),
          latencyMs: result.latencyMs,
          status: 'success',
        });

        this.log({
          event: 'success',
          provider: provider.name,
          model: model.id,
          attempt,
          latencyMs: result.latencyMs,
          category: classification.category,
          tokens: result.response.usage?.total_tokens,
        });

        return {
          response: result.response,
          provider: provider.name,
          model: model.id,
          totalLatencyMs: Date.now() - startTime,
          attempts: attempt,
        };
      } catch (err) {
        lastError = err instanceof Error ? err : new Error(String(err));
        const attemptLatency = Date.now() - attemptStart;

        this.health.record(provider.name, model.id, {
          timestamp: Date.now(),
          latencyMs: attemptLatency,
          status: 'error',
        });

        this.log({
          event: 'error',
          provider: provider.name,
          model: model.id,
          attempt,
          latencyMs: attemptLatency,
          category: classification.category,
          error: lastError.message,
        });

        candidates = excludeCandidate(candidates, provider.name, model.id);
      }
    }

    // Report the attempts actually made; the loop can stop before maxRetries
    // when it runs out of candidates.
    throw new Error(
      `All providers exhausted after ${attemptsMade} attempts. Last error: ${lastError?.message ?? 'unknown'}`
    );
  }

  /**
   * Work out which provider and model an explicit `model` string refers to.
   *
   * The preferred form is `provider:model` or `provider/model`, split at
   * whichever separator comes first. If the prefix is not a configured
   * provider, the whole string is looked up as a bare model id, because many
   * ids contain `/` or `:` themselves (for example `deepseek/deepseek-r1:free`).
   *
   * @throws Error if no provider matches, or if the model part is empty.
   */
  private resolveExplicitModel(raw: string): { provider: ProviderConfig; modelId: string } {
    const sepIdx = raw.search(/[:/]/);
    const providerName = sepIdx === -1 ? raw : raw.slice(0, sepIdx);
    const modelId = sepIdx === -1 ? '' : raw.slice(sepIdx + 1);

    const named = this.providers.find(p => p.name === providerName);
    if (named) {
      if (modelId === '') {
        throw new Error(`Model id missing in "${raw}"; expected "provider:model"`);
      }
      return { provider: named, modelId };
    }

    const owner = this.providers.find(p => p.models.some(m => m.id === raw));
    if (owner) {
      return { provider: owner, modelId: raw };
    }

    throw new Error(
      `Provider "${providerName}" not found. Available: ${this.providers.map(p => p.name).join(', ')}`
    );
  }

  /**
   * Handle explicit provider:model routing (bypass classifier).
   * Makes a single attempt; rate limits and errors are recorded, logged and
   * then thrown to the caller without falling back to another provider.
   */
  private async chatWithExplicitModel(
    request: ChatCompletionRequest,
    startTime: number
  ): Promise<RouteResult> {
    const { provider, modelId } = this.resolveExplicitModel(request.model ?? '');

    let result: Awaited<ReturnType<typeof sendChatCompletion>>;
    try {
      result = await sendChatCompletion(provider, modelId, request, this.timeoutMs);
      if (result.status !== 429 && result.response == null) {
        throw new Error(`${provider.name} returned an empty completion payload`);
      }
    } catch (err) {
      const latency = Date.now() - startTime;
      this.health.record(provider.name, modelId, {
        timestamp: Date.now(),
        latencyMs: latency,
        status: 'error',
      });

      this.log({
        event: 'error',
        provider: provider.name,
        model: modelId,
        attempt: 1,
        latencyMs: latency,
        category: 'explicit',
        error: err instanceof Error ? err.message : String(err),
      });

      throw err;
    }

    // The rate-limit case is handled outside the try block so it is recorded
    // once, as a rate limit, without sniffing error messages.
    if (result.status === 429) {
      this.health.record(provider.name, modelId, {
        timestamp: Date.now(),
        latencyMs: result.latencyMs,
        status: 'rate_limit',
      });

      this.log({
        event: 'rate_limit',
        provider: provider.name,
        model: modelId,
        attempt: 1,
        latencyMs: result.latencyMs,
        category: 'explicit',
      });

      throw new Error(`Rate limited by ${provider.name} for model ${modelId}`);
    }

    this.health.record(provider.name, modelId, {
      timestamp: Date.now(),
      latencyMs: result.latencyMs,
      status: 'success',
    });

    this.log({
      event: 'success',
      provider: provider.name,
      model: modelId,
      attempt: 1,
      latencyMs: result.latencyMs,
      category: 'explicit',
      tokens: result.response.usage?.total_tokens,
    });

    return {
      response: result.response,
      provider: provider.name,
      model: modelId,
      totalLatencyMs: Date.now() - startTime,
      attempts: 1,
    };
  }

  /** Get health snapshots for all tracked provider+model pairs. */
  getHealth(): HealthSnapshot[] {
    return this.health.allSnapshots();
  }

  /** Get list of available (configured) providers. */
  getProviders(): string[] {
    return this.providers.map(p => p.name);
  }

  /** Reset health tracking data. */
  resetHealth(): void {
    this.health.reset();
  }
}

// ── Telemetry types ────────────────────────────────────────────

/** One routing attempt, as passed to the `onTelemetry` hook. */
export interface TelemetryEntry {
  event: 'success' | 'rate_limit' | 'error';
  provider: string;
  model: string;
  /** 1-based attempt number within the current `chat()` call */
  attempt: number;
  latencyMs: number;
  /** Prompt category from the classifier, or `'explicit'` for forced models */
  category: string;
  /** `usage.total_tokens` from the response, when the provider reports it */
  tokens?: number;
  /** Error message, present on `error` events */
  error?: string;
}

// ─────────────────────────────────────────────────────────────────
// diff --git a/packages/llm-router/src/selector.ts b/packages/llm-router/src/selector.ts
// new file mode 100644
// index 00000000..1d82e9f7
// --- /dev/null
// +++ b/packages/llm-router/src/selector.ts
// @@ -0,0 +1,101 @@
// import type { ModelConfig, PromptCategory, ProviderConfig } from './types.js';
// import type { HealthTracker } from './health.js';

/** A provider together with one of its models. */
export interface SelectionCandidate {
  provider: ProviderConfig;
  model: ModelConfig;
}

/** Create a fresh round-robin state map (one per router instance). */
export function createRoundRobinState(): Map<string, number> {
  return new Map<string, number>();
}

// Opening words of the next doc comment, which shared this physical line:
//
// /**
//  * Score a model for a given prompt category.
//  * Higher = better fit.
/**
 * Score a model for a given prompt category.
 * Higher = better fit.
 *
 * Points awarded:
 * - +10 if `category` is one of the model's declared strengths;
 * - +2 per speed step, i.e. `(4 - speedTier) * 2` (tier 1 → 6, tier 3 → 2);
 * - +3 for reasoning/creative prompts when the context window is ≥ 64 000;
 * - for code/math/reasoning prompts, +5 if the id contains "70b" and +4 if
 *   it contains "r1" (both case-insensitive).
 */
function scoreModel(model: ModelConfig, category: PromptCategory): number {
  let score = 0;

  // Direct strength match is the strongest signal
  if (model.strengths.includes(category)) {
    score += 10;
  }

  // Speed bonus (lower tier = faster = better for simple tasks)
  score += (4 - model.speedTier) * 2;

  // Context window bonus for reasoning/creative (often longer)
  if ((category === 'reasoning' || category === 'creative') && model.contextWindow >= 64_000) {
    score += 3;
  }

  // Prefer larger models for code/math/reasoning
  if (category === 'code' || category === 'math' || category === 'reasoning') {
    if (/70b/i.test(model.id)) score += 5;
    if (/r1/i.test(model.id)) score += 4;
  }

  return score;
}

/**
 * Select the best provider+model candidates for a prompt category.
 * Returns candidates sorted by score (best first), filtered by health.
 *
 * @param providers - Providers whose models are considered.
 * @param category - Prompt category used for scoring.
 * @param health - Tracker consulted for each pair; unhealthy pairs are skipped.
 * @param minContextTokens - Optional lower bound on `contextWindow`, e.g. the
 *   estimated prompt size. Models with a smaller window are skipped. The
 *   default of 0 disables the filter.
 * @returns Candidates in descending score order; equal scores keep their
 *   provider/model declaration order. Each returned object also carries its
 *   numeric `score`, although the declared type does not expose it.
 */
export function selectCandidates(
  providers: ProviderConfig[],
  category: PromptCategory,
  health: HealthTracker,
  minContextTokens: number = 0
): SelectionCandidate[] {
  const candidates: (SelectionCandidate & { score: number })[] = [];

  for (const provider of providers) {
    for (const model of provider.models) {
      // A model that cannot hold the prompt would only produce an error.
      if (model.contextWindow < minContextTokens) continue;
      if (!health.isHealthy(provider.name, model.id)) continue;

      candidates.push({ provider, model, score: scoreModel(model, category) });
    }
  }

  // Sort by score descending
  candidates.sort((a, b) => b.score - a.score);

  return candidates;
}

// Opening words of the next doc comment, which shared this physical line:
//
// /**
//  * Pick the next candidate using round-robin within the top tier.
//  * Groups candidates by provider, rotates between them to spread rate-limit load.
/**
 * Pick the next candidate, rotating between providers to spread
 * rate-limit load.
 *
 * The rotation covers every provider that still has a candidate in the
 * list, not only the best-scoring ones. Within the chosen provider the
 * first candidate in list order is returned, which is the best-scoring one
 * when the list comes from `selectCandidates`.
 *
 * @param candidates - Remaining candidates, best first.
 * @param state - Round-robin counters, keyed by the comma-joined provider
 *   names present in `candidates`; the matching counter is incremented on
 *   every call that sees more than one candidate.
 * @returns The chosen candidate, or `null` if `candidates` is empty.
 */
export function pickNext(
  candidates: SelectionCandidate[],
  state: Map<string, number>
): SelectionCandidate | null {
  const [first] = candidates;
  if (first === undefined) return null;
  if (candidates.length === 1) return first;

  // Group by provider name for round-robin
  const providerNames = [...new Set(candidates.map(c => c.provider.name))];
  const key = providerNames.join(',');

  const turn = state.get(key) ?? 0;
  state.set(key, turn + 1);
  const targetProvider = providerNames[turn % providerNames.length];

  // Pick the best model from the selected provider
  return candidates.find(c => c.provider.name === targetProvider) ?? first;
}

/**
 * Remove a candidate from the list (after failure) and return remaining.
 * The input array is not modified.
 */
export function excludeCandidate(
  candidates: SelectionCandidate[],
  provider: string,
  model: string
): SelectionCandidate[] {
  return candidates.filter(c => !(c.provider.name === provider && c.model.id === model));
}

// ─────────────────────────────────────────────────────────────────
// diff --git a/packages/llm-router/src/types.ts b/packages/llm-router/src/types.ts
// new file mode 100644
// index 00000000..7b7c1d17
// --- /dev/null
// +++ b/packages/llm-router/src/types.ts
// @@ -0,0 +1,136 @@

// ── Provider & Model Types ─────────────────────────────────────

export interface ModelConfig {
  /** Model identifier as the provider expects it */
  id: string;
  /** Human-readable label */
  label: string;
  /** Max context window tokens */
  contextWindow: number;
  /** What this model is good at */
  strengths: PromptCategory[];
  /** Relative speed tier: 1 = fastest, 3 = slowest */
  speedTier: 1 | 2 | 3;
}

export interface ProviderConfig {
  /** Unique provider name */
  name: string;
  /** OpenAI-compatible base URL (e.g. https://api.groq.com/openai/v1) */
  baseUrl: string;
  /** Environment variable name that holds the API key */
  apiKeyEnv: string;
  /** Available models on this provider */
  models: ModelConfig[];
  /** Extra headers to send with every request */
  extraHeaders?: Record<string, string>;
  /** Free-tier rate limit: requests per minute (0 = unknown) */
  rpmLimit: number;
  /** Free-tier rate limit: tokens per minute (0 = unknown) */
  tpmLimit: number;
}

// ── Prompt Classification ──────────────────────────────────────

export type PromptCategory = 'code' | 'math' | 'reasoning' | 'creative' | 'general';

export interface ClassificationResult {
  category: PromptCategory;
  /** Rough token count of the whole prompt */
  estimatedTokens: number;
}

// ── Health Tracking ────────────────────────────────────────────

export interface HealthSnapshot {
  provider: string;
  model: string;
  /** Total requests in the window */
  totalRequests: number;
  /** Successful requests */
  successes: number;
  /** 429 rate-limit hits */
  rateLimits: number;
  /** Failed requests: non-429 HTTP error statuses, timeouts and network errors */
  errors: number;
  /** Average latency in ms (successes only) */
  avgLatencyMs: number;
  /** p95 latency in ms (successes only) */
  p95LatencyMs: number;
  /** Whether this provider is currently considered healthy */
  healthy: boolean;
}

export interface RequestRecord {
  /** Epoch milliseconds at which the request completed */
  timestamp: number;
  latencyMs: number;
  status: 'success' | 'rate_limit' | 'error';
}

// ── Router Config ──────────────────────────────────────────────

export interface RouterConfig {
  /** Provider configurations (use DEFAULT_PROVIDERS if omitted) */
  providers?: ProviderConfig[];
  /** Health window in ms (default: 60_000 = 1 minute) */
  healthWindowMs?: number;
  /** Error rate threshold to mark unhealthy (default: 0.5 = 50%) */
  errorThreshold?: number;
  /** Rate-limit rate threshold to mark unhealthy (default: 0.3 = 30%) */
  rateLimitThreshold?: number;
  /** Request timeout in ms (default: 30_000) */
  timeoutMs?: number;
  /** Max attempts per request across providers, counting the first (default: 3) */
  maxRetries?: number;
}

// ── OpenAI-Compatible Request/Response ─────────────────────────

export interface ChatMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

export interface ChatCompletionRequest {
  messages: ChatMessage[];
  /**
   * Optional: force a specific model, written as `provider:model` (or
   * `provider/model`). A value containing neither `:` nor `/` is ignored by
   * `LlmRouter.chat`, which then routes automatically.
   */
  model?: string;
  temperature?: number;
  max_tokens?: number;
  top_p?: number;
  /** Accepted for OpenAI compatibility; `sendChatCompletion` always sends `stream: false` */
  stream?: boolean;
}

export interface ChatCompletionChoice {
  index: number;
  message: ChatMessage;
  finish_reason: string | null;
}

export interface ChatCompletionUsage {
  prompt_tokens: number;
  completion_tokens: number;
  total_tokens: number;
}

export interface ChatCompletionResponse {
  id: string;
  object: 'chat.completion';
  created: number;
  model: string;
  choices: ChatCompletionChoice[];
  usage?: ChatCompletionUsage;
}

// ── Router Result (wraps response + metadata) ──────────────────

export interface RouteResult {
  response: ChatCompletionResponse;
  /** Which provider served this request */
  provider: string;
  /** Which model was used */
  model: string;
  /** Total latency in ms (including retries) */
  totalLatencyMs: number;
  /** How many attempts were made */
  attempts: number;
}

// ─────────────────────────────────────────────────────────────────
// The two remaining files of this diff are configuration, not TypeScript
// definitions; they are kept verbatim as comments so this span stays a
// self-contained unit.
//
// diff --git a/packages/llm-router/tsconfig.json b/packages/llm-router/tsconfig.json
// new file mode 100644
// index 00000000..8635ab2d
// --- /dev/null
// +++ b/packages/llm-router/tsconfig.json
// @@ -0,0 +1,9 @@
// {
//   "extends": "../../tsconfig.base.json",
//   "compilerOptions": {
//     "outDir": "dist",
//     "rootDir": "src"
//   },
//   "include": ["src/**/*.ts"],
//   "exclude": ["src/**/*.test.ts"]
// }
//
// diff --git a/packages/llm-router/vitest.config.ts b/packages/llm-router/vitest.config.ts
// new file mode 100644
// index 00000000..cc7b9264
// --- /dev/null
// +++ b/packages/llm-router/vitest.config.ts
// @@ -0,0 +1,8 @@
// import { defineConfig } from 'vitest/config';
//
// export default defineConfig({
//   test: {
//     globals: true,
//     passWithNoTests: true,
//   },
// });