From a77b3ff931ef048198f5366681ddf886bcf3f4a4 Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Mon, 6 Apr 2026 11:30:22 -0700 Subject: [PATCH] =?UTF-8?q?refactor(extraction-service):=20provider-agnost?= =?UTF-8?q?ic=20transcription=20=E2=80=94=20OpenAI=20+=20Azure=20Speech=20?= =?UTF-8?q?+=20Mock?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TranscriptionProvider interface with transcribe() + isConfigured() - OpenAITranscriptionProvider: Whisper API (existing behavior) - AzureTranscriptionProvider: Azure Speech REST API for short audio - MockTranscriptionProvider: deterministic results for testing - Factory: getSTT() singleton with env-driven auto-detection - STT_PROVIDER=openai|azure|mock (explicit) - Auto-detect: AZURE_SPEECH_KEY → azure, OPENAI_API_KEY → openai, else mock - Config: add STT_PROVIDER, AZURE_SPEECH_KEY, AZURE_SPEECH_REGION env vars - Route refactored: audio download (common) → provider.transcribe() (swappable) - deriveFilename() extracted to types.ts (shared by route + providers) - 35 transcription tests (was 12), 171 total passing - Follows same pattern as @bytelyst/llm provider abstraction --- services/extraction-service/src/lib/config.ts | 3 + .../src/modules/transcribe/factory.ts | 72 ++++++ .../src/modules/transcribe/providers/azure.ts | 119 +++++++++ .../src/modules/transcribe/providers/mock.ts | 28 +++ .../modules/transcribe/providers/openai.ts | 74 ++++++ .../src/modules/transcribe/routes.test.ts | 233 ++++++++++++++++-- .../src/modules/transcribe/routes.ts | 179 +++++--------- .../src/modules/transcribe/types.ts | 70 ++++++ 8 files changed, 642 insertions(+), 136 deletions(-) create mode 100644 services/extraction-service/src/modules/transcribe/factory.ts create mode 100644 services/extraction-service/src/modules/transcribe/providers/azure.ts create mode 100644 services/extraction-service/src/modules/transcribe/providers/mock.ts create mode 100644 services/extraction-service/src/modules/transcribe/providers/openai.ts diff --git a/services/extraction-service/src/lib/config.ts b/services/extraction-service/src/lib/config.ts index 5939546d..2aeb0d3e 100644 --- a/services/extraction-service/src/lib/config.ts +++ b/services/extraction-service/src/lib/config.ts @@ -19,9 +19,12 @@ const envSchema = z.object({ EXTRACTION_QUEUE_FILE: z.string().optional(), EXTRACTION_QUEUE_POLL_MS: z.coerce.number().default(100), EXTRACTION_QUEUE_LEASE_MS: z.coerce.number().default(30000), + STT_PROVIDER: z.enum(['openai', 'azure', 'mock']).optional(), OPENAI_API_KEY: z.string().optional(), OPENAI_BASE_URL: z.string().default('https://api.openai.com/v1'), WHISPER_MODEL: z.string().default('whisper-1'), + AZURE_SPEECH_KEY: z.string().optional(), + AZURE_SPEECH_REGION: z.string().optional(), }); export const config = envSchema.parse(process.env); diff --git a/services/extraction-service/src/modules/transcribe/factory.ts b/services/extraction-service/src/modules/transcribe/factory.ts new file mode 100644 index 00000000..505f4d1b --- /dev/null +++ b/services/extraction-service/src/modules/transcribe/factory.ts @@ -0,0 +1,72 @@ +/** + * Transcription provider factory. + * + * Creates a TranscriptionProvider based on STT_PROVIDER env var. + * Auto-detects Azure vs OpenAI from available credentials if not explicit. + * + * Priority: STT_PROVIDER > auto-detect (AZURE_SPEECH_KEY → azure, OPENAI_API_KEY → openai) + */ + +import { OpenAITranscriptionProvider } from './providers/openai.js'; +import { AzureTranscriptionProvider } from './providers/azure.js'; +import { MockTranscriptionProvider } from './providers/mock.js'; +import type { TranscriptionProvider, STTProviderType } from './types.js'; + +let _provider: TranscriptionProvider | null = null; + +/** + * Resolve provider type from env vars. + */ +function resolveProviderType(): STTProviderType { + const explicit = (process.env.STT_PROVIDER || '').toLowerCase(); + if (explicit === 'openai') return 'openai'; + if (explicit === 'azure') return 'azure'; + if (explicit === 'mock') return 'mock'; + + // Auto-detect from available credentials + if (process.env.AZURE_SPEECH_KEY && process.env.AZURE_SPEECH_REGION) return 'azure'; + if (process.env.OPENAI_API_KEY) return 'openai'; + + return 'mock'; +} + +/** + * Create a transcription provider by type. + */ +export function createSTTProvider(type: STTProviderType): TranscriptionProvider { + switch (type) { + case 'openai': + return new OpenAITranscriptionProvider(); + case 'azure': + return new AzureTranscriptionProvider(); + case 'mock': + return new MockTranscriptionProvider(); + default: + throw new Error(`Unknown STT_PROVIDER: '${type}'. Valid: openai, azure, mock`); + } +} + +/** + * Get the singleton transcription provider. + */ +export function getSTT(): TranscriptionProvider { + if (!_provider) { + const type = resolveProviderType(); + _provider = createSTTProvider(type); + } + return _provider; +} + +/** + * Set the singleton transcription provider (for testing). + */ +export function setSTT(provider: TranscriptionProvider): void { + _provider = provider; +} + +/** + * @internal Reset singleton (for testing). + */ +export function _resetSTT(): void { + _provider = null; +} diff --git a/services/extraction-service/src/modules/transcribe/providers/azure.ts b/services/extraction-service/src/modules/transcribe/providers/azure.ts new file mode 100644 index 00000000..03263b12 --- /dev/null +++ b/services/extraction-service/src/modules/transcribe/providers/azure.ts @@ -0,0 +1,119 @@ +/** + * Azure Speech Services transcription provider. + * + * Uses the Speech to text REST API for short audio (≤60s). + * Endpoint: https://.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1 + * Auth: Ocp-Apim-Subscription-Key header. + * + * Requires AZURE_SPEECH_KEY + AZURE_SPEECH_REGION env vars. + */ + +import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js'; + +export interface AzureSTTConfig { + speechKey: string; + speechRegion: string; +} + +/** Map filename extension to Azure-compatible Content-Type. */ +function audioContentType(filename: string): string { + const ext = filename.split('.').pop()?.toLowerCase(); + switch (ext) { + case 'wav': + return 'audio/wav; codecs=audio/pcm; samplerate=16000'; + case 'ogg': + return 'audio/ogg; codecs=opus'; + case 'mp3': + case 'mpeg': + case 'mpga': + return 'audio/mpeg'; + case 'webm': + return 'audio/webm; codecs=opus'; + case 'flac': + return 'audio/flac'; + case 'm4a': + case 'mp4': + return 'audio/mp4'; + default: + return 'audio/wav'; + } +} + +export class AzureTranscriptionProvider implements TranscriptionProvider { + private config: AzureSTTConfig; + + constructor(config?: Partial) { + this.config = { + speechKey: config?.speechKey ?? process.env.AZURE_SPEECH_KEY ?? '', + speechRegion: config?.speechRegion ?? process.env.AZURE_SPEECH_REGION ?? '', + }; + } + + isConfigured(): boolean { + return Boolean(this.config.speechKey && this.config.speechRegion); + } + + async transcribe(input: TranscriptionInput): Promise { + if (!this.isConfigured()) { + throw new Error( + 'Azure Speech is not configured (missing AZURE_SPEECH_KEY or AZURE_SPEECH_REGION)' + ); + } + + const { speechKey, speechRegion } = this.config; + const language = input.language || 'en-US'; + // Azure expects full locale (e.g. "en-US"), not just ISO 639-1 "en" + const locale = language.length <= 3 ? `${language}-${language.toUpperCase()}` : language; + + const url = new URL( + `https://${speechRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1` + ); + url.searchParams.set('language', locale); + url.searchParams.set('format', 'detailed'); + + const contentType = audioContentType(input.filename); + + const response = await fetch(url.toString(), { + method: 'POST', + headers: { + 'Ocp-Apim-Subscription-Key': speechKey, + 'Content-Type': contentType, + Accept: 'application/json', + }, + body: input.audio, + signal: AbortSignal.timeout(120_000), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Azure Speech error ${response.status}: ${errorText}`); + } + + const data = (await response.json()) as { + RecognitionStatus: string; + DisplayText?: string; + Duration?: number; + NBest?: Array<{ + Display: string; + Confidence: number; + }>; + }; + + if (data.RecognitionStatus !== 'Success') { + throw new Error(`Azure Speech recognition failed: ${data.RecognitionStatus}`); + } + + // Duration is in 100-nanosecond ticks + const durationSeconds = data.Duration ? data.Duration / 10_000_000 : null; + + // Use NBest[0] if available (higher quality), fall back to DisplayText + const text = data.NBest?.[0]?.Display ?? data.DisplayText ?? ''; + + return { + text, + language: locale, + durationSeconds, + model: `azure-speech-${speechRegion}`, + }; + } +} diff --git a/services/extraction-service/src/modules/transcribe/providers/mock.ts b/services/extraction-service/src/modules/transcribe/providers/mock.ts new file mode 100644 index 00000000..20e52345 --- /dev/null +++ b/services/extraction-service/src/modules/transcribe/providers/mock.ts @@ -0,0 +1,28 @@ +/** + * Mock transcription provider for testing. + * + * Returns deterministic transcription results without calling any external API. + * Always reports as configured. + */ + +import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js'; + +export class MockTranscriptionProvider implements TranscriptionProvider { + isConfigured(): boolean { + return true; + } + + async transcribe(input: TranscriptionInput): Promise { + // Simulate ~50ms processing time + await new Promise(resolve => setTimeout(resolve, 50)); + + const audioSizeKB = Math.round(input.audio.byteLength / 1024); + + return { + text: `[Mock transcription of ${input.filename} (${audioSizeKB}KB)]`, + language: input.language ?? 'en', + durationSeconds: Math.max(1, Math.round(input.audio.byteLength / 16000)), + model: 'mock-stt', + }; + } +} diff --git a/services/extraction-service/src/modules/transcribe/providers/openai.ts b/services/extraction-service/src/modules/transcribe/providers/openai.ts new file mode 100644 index 00000000..cc3cc918 --- /dev/null +++ b/services/extraction-service/src/modules/transcribe/providers/openai.ts @@ -0,0 +1,74 @@ +/** + * OpenAI Whisper transcription provider. + * + * Uses POST /v1/audio/transcriptions (multipart/form-data). + * Requires OPENAI_API_KEY env var. + */ + +import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js'; + +export interface OpenAISTTConfig { + apiKey: string; + baseUrl?: string; + model?: string; +} + +export class OpenAITranscriptionProvider implements TranscriptionProvider { + private config: Required; + + constructor(config?: Partial) { + this.config = { + apiKey: config?.apiKey ?? process.env.OPENAI_API_KEY ?? '', + baseUrl: ( + config?.baseUrl ?? + process.env.OPENAI_BASE_URL ?? + 'https://api.openai.com/v1' + ).replace(/\/+$/, ''), + model: config?.model ?? process.env.WHISPER_MODEL ?? 'whisper-1', + }; + } + + isConfigured(): boolean { + return Boolean(this.config.apiKey); + } + + async transcribe(input: TranscriptionInput): Promise { + if (!this.isConfigured()) { + throw new Error('OpenAI is not configured (missing OPENAI_API_KEY)'); + } + + const formData = new FormData(); + formData.append('file', new Blob([input.audio]), input.filename); + formData.append('model', this.config.model); + if (input.language) formData.append('language', input.language); + if (input.prompt) formData.append('prompt', input.prompt); + formData.append('response_format', 'verbose_json'); + + const response = await fetch(`${this.config.baseUrl}/audio/transcriptions`, { + method: 'POST', + headers: { + Authorization: `Bearer ${this.config.apiKey}`, + }, + body: formData, + signal: AbortSignal.timeout(120_000), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`OpenAI Whisper error ${response.status}: ${errorText}`); + } + + const data = (await response.json()) as { + text: string; + language?: string; + duration?: number; + }; + + return { + text: data.text, + language: data.language ?? input.language ?? null, + durationSeconds: data.duration ?? null, + model: this.config.model, + }; + } +} diff --git a/services/extraction-service/src/modules/transcribe/routes.test.ts b/services/extraction-service/src/modules/transcribe/routes.test.ts index 59969331..d2bc79b4 100644 --- a/services/extraction-service/src/modules/transcribe/routes.test.ts +++ b/services/extraction-service/src/modules/transcribe/routes.test.ts @@ -1,5 +1,15 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, SUPPORTED_AUDIO_TYPES } from './types.js'; +import { + TranscribeRequestSchema, + MAX_AUDIO_SIZE_BYTES, + SUPPORTED_AUDIO_TYPES, + deriveFilename, +} from './types.js'; +import type { TranscriptionProvider } from './types.js'; +import { MockTranscriptionProvider } from './providers/mock.js'; +import { OpenAITranscriptionProvider } from './providers/openai.js'; +import { AzureTranscriptionProvider } from './providers/azure.js'; +import { createSTTProvider, _resetSTT, getSTT } from './factory.js'; // ── Schema validation tests ───────────────────────────────────── @@ -91,34 +101,219 @@ describe('transcription constants', () => { }); }); -// ── Route integration tests (mocked fetch) ────────────────────── +// ── deriveFilename tests ──────────────────────────────────────── -describe('transcribe route', () => { - const originalFetch = globalThis.fetch; +describe('deriveFilename', () => { + it('extracts filename from URL path', () => { + expect(deriveFilename('https://blob.example.com/uploads/meeting.wav?sv=2024', null)).toBe( + 'meeting.wav' + ); + }); + it('extracts filename with various extensions', () => { + expect(deriveFilename('https://x.com/a.mp3', null)).toBe('a.mp3'); + expect(deriveFilename('https://x.com/b.m4a', null)).toBe('b.m4a'); + expect(deriveFilename('https://x.com/c.webm', null)).toBe('c.webm'); + expect(deriveFilename('https://x.com/d.flac', null)).toBe('d.flac'); + }); + + it('falls back to content-type mapping', () => { + expect(deriveFilename('https://x.com/blob123', 'audio/mpeg')).toBe('audio.mp3'); + expect(deriveFilename('https://x.com/blob123', 'audio/wav')).toBe('audio.wav'); + expect(deriveFilename('https://x.com/blob123', 'audio/webm')).toBe('audio.webm'); + expect(deriveFilename('https://x.com/blob123', 'audio/ogg')).toBe('audio.ogg'); + }); + + it('handles content-type with charset parameter', () => { + expect(deriveFilename('https://x.com/blob', 'audio/mpeg; charset=utf-8')).toBe('audio.mp3'); + }); + + it('defaults to audio.mp3 when no match', () => { + expect(deriveFilename('https://x.com/blob', null)).toBe('audio.mp3'); + expect(deriveFilename('https://x.com/blob', 'application/octet-stream')).toBe('audio.mp3'); + }); +}); + +// ── MockTranscriptionProvider tests ───────────────────────────── + +describe('MockTranscriptionProvider', () => { + it('is always configured', () => { + const provider = new MockTranscriptionProvider(); + expect(provider.isConfigured()).toBe(true); + }); + + it('returns deterministic transcription with filename and size', async () => { + const provider = new MockTranscriptionProvider(); + const audio = new ArrayBuffer(16000); + const result = await provider.transcribe({ + audio, + filename: 'test.mp3', + }); + + expect(result.text).toContain('Mock transcription'); + expect(result.text).toContain('test.mp3'); + expect(result.text).toContain('16KB'); + expect(result.model).toBe('mock-stt'); + expect(result.durationSeconds).toBeGreaterThan(0); + }); + + it('uses provided language', async () => { + const provider = new MockTranscriptionProvider(); + const result = await provider.transcribe({ + audio: new ArrayBuffer(100), + filename: 'a.wav', + language: 'es', + }); + expect(result.language).toBe('es'); + }); + + it('defaults language to en', async () => { + const provider = new MockTranscriptionProvider(); + const result = await provider.transcribe({ + audio: new ArrayBuffer(100), + filename: 'a.wav', + }); + expect(result.language).toBe('en'); + }); +}); + +// ── OpenAITranscriptionProvider tests ─────────────────────────── + +describe('OpenAITranscriptionProvider', () => { + it('is not configured without OPENAI_API_KEY', () => { + const provider = new OpenAITranscriptionProvider({ apiKey: '' }); + expect(provider.isConfigured()).toBe(false); + }); + + it('is configured with apiKey', () => { + const provider = new OpenAITranscriptionProvider({ apiKey: 'sk-test' }); + expect(provider.isConfigured()).toBe(true); + }); + + it('throws when transcribing without config', async () => { + const provider = new OpenAITranscriptionProvider({ apiKey: '' }); + await expect( + provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.mp3' }) + ).rejects.toThrow('not configured'); + }); +}); + +// ── AzureTranscriptionProvider tests ──────────────────────────── + +describe('AzureTranscriptionProvider', () => { + it('is not configured without both key and region', () => { + expect(new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' }).isConfigured()).toBe( + false + ); + expect( + new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: '' }).isConfigured() + ).toBe(false); + expect( + new AzureTranscriptionProvider({ speechKey: '', speechRegion: 'eastus' }).isConfigured() + ).toBe(false); + }); + + it('is configured with both key and region', () => { + const provider = new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' }); + expect(provider.isConfigured()).toBe(true); + }); + + it('throws when transcribing without config', async () => { + const provider = new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' }); + await expect( + provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.wav' }) + ).rejects.toThrow('not configured'); + }); +}); + +// ── Factory tests ─────────────────────────────────────────────── + +describe('STT factory', () => { beforeEach(() => { - vi.stubEnv('OPENAI_API_KEY', 'test-key'); + _resetSTT(); }); afterEach(() => { - globalThis.fetch = originalFetch; + _resetSTT(); vi.unstubAllEnvs(); }); - it('deriveFilename extracts extension from URL path', async () => { - // Import the module to test the helper indirectly via the route - // The deriveFilename is not exported, but we can verify behavior through integration - const { TranscribeRequestSchema } = await import('./types.js'); - const req = TranscribeRequestSchema.parse({ - audioUrl: 'https://blob.example.com/uploads/meeting.wav?sv=2024-01-01&sig=abc', - }); - expect(req.audioUrl).toContain('meeting.wav'); + it('createSTTProvider returns OpenAI provider', () => { + const provider = createSTTProvider('openai'); + expect(provider).toBeInstanceOf(OpenAITranscriptionProvider); }); - it('schema defaults responseFormat to text', () => { - const result = TranscribeRequestSchema.parse({ - audioUrl: 'https://example.com/a.mp3', - }); - expect(result.responseFormat).toBe('text'); + it('createSTTProvider returns Azure provider', () => { + const provider = createSTTProvider('azure'); + expect(provider).toBeInstanceOf(AzureTranscriptionProvider); + }); + + it('createSTTProvider returns Mock provider', () => { + const provider = createSTTProvider('mock'); + expect(provider).toBeInstanceOf(MockTranscriptionProvider); + }); + + it('createSTTProvider throws for unknown type', () => { + expect(() => createSTTProvider('invalid' as 'openai')).toThrow('Unknown STT_PROVIDER'); + }); + + it('getSTT auto-detects mock when no credentials', () => { + vi.stubEnv('OPENAI_API_KEY', ''); + vi.stubEnv('AZURE_SPEECH_KEY', ''); + vi.stubEnv('AZURE_SPEECH_REGION', ''); + vi.stubEnv('STT_PROVIDER', ''); + _resetSTT(); + const provider = getSTT(); + expect(provider).toBeInstanceOf(MockTranscriptionProvider); + }); + + it('getSTT auto-detects openai when OPENAI_API_KEY set', () => { + vi.stubEnv('OPENAI_API_KEY', 'sk-test'); + vi.stubEnv('AZURE_SPEECH_KEY', ''); + vi.stubEnv('AZURE_SPEECH_REGION', ''); + vi.stubEnv('STT_PROVIDER', ''); + _resetSTT(); + const provider = getSTT(); + expect(provider).toBeInstanceOf(OpenAITranscriptionProvider); + }); + + it('getSTT auto-detects azure when AZURE_SPEECH_KEY + REGION set', () => { + vi.stubEnv('OPENAI_API_KEY', 'sk-test'); + vi.stubEnv('AZURE_SPEECH_KEY', 'my-key'); + vi.stubEnv('AZURE_SPEECH_REGION', 'eastus'); + vi.stubEnv('STT_PROVIDER', ''); + _resetSTT(); + const provider = getSTT(); + expect(provider).toBeInstanceOf(AzureTranscriptionProvider); + }); + + it('getSTT respects explicit STT_PROVIDER over auto-detect', () => { + vi.stubEnv('OPENAI_API_KEY', 'sk-test'); + vi.stubEnv('AZURE_SPEECH_KEY', 'my-key'); + vi.stubEnv('AZURE_SPEECH_REGION', 'eastus'); + vi.stubEnv('STT_PROVIDER', 'mock'); + _resetSTT(); + const provider = getSTT(); + expect(provider).toBeInstanceOf(MockTranscriptionProvider); + }); + + it('getSTT returns singleton', () => { + vi.stubEnv('STT_PROVIDER', 'mock'); + _resetSTT(); + const p1 = getSTT(); + const p2 = getSTT(); + expect(p1).toBe(p2); + }); + + it('TranscriptionProvider interface is satisfied by all providers', () => { + const providers: TranscriptionProvider[] = [ + new MockTranscriptionProvider(), + new OpenAITranscriptionProvider({ apiKey: 'test' }), + new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' }), + ]; + for (const p of providers) { + expect(typeof p.transcribe).toBe('function'); + expect(typeof p.isConfigured).toBe('function'); + } }); }); diff --git a/services/extraction-service/src/modules/transcribe/routes.ts b/services/extraction-service/src/modules/transcribe/routes.ts index 22ef1553..6180464b 100644 --- a/services/extraction-service/src/modules/transcribe/routes.ts +++ b/services/extraction-service/src/modules/transcribe/routes.ts @@ -1,16 +1,22 @@ /** - * Transcription routes — speech-to-text via OpenAI Whisper API. + * Transcription routes — provider-agnostic speech-to-text. * - * POST /transcribe — Download audio from URL, transcribe via Whisper. + * POST /transcribe — Download audio from URL, transcribe via configured provider. * Product-agnostic: any product backend can call this endpoint. * - * Requires OPENAI_API_KEY environment variable. + * Provider selection: STT_PROVIDER env var (openai | azure | mock). + * Auto-detects from available credentials if not explicit. */ import type { FastifyInstance } from 'fastify'; import { BadRequestError } from '@bytelyst/errors'; -import { config } from '../../lib/config.js'; -import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, type TranscribeResponse } from './types.js'; +import { + TranscribeRequestSchema, + MAX_AUDIO_SIZE_BYTES, + deriveFilename, + type TranscribeResponse, +} from './types.js'; +import { getSTT } from './factory.js'; export async function transcribeRoutes(app: FastifyInstance): Promise { /** @@ -18,9 +24,10 @@ export async function transcribeRoutes(app: FastifyInstance): Promise { * * Flow: * 1. Validate request (audioUrl, optional language/model/prompt) - * 2. Download audio from the provided URL - * 3. Send to OpenAI Whisper API as multipart/form-data - * 4. Return transcribed text + metadata + * 2. Check provider is configured + * 3. Download audio from the provided URL + * 4. Delegate to configured TranscriptionProvider + * 5. Return transcribed text + metadata */ app.post('/transcribe', async (req, reply) => { const parsed = TranscribeRequestSchema.safeParse(req.body); @@ -28,17 +35,19 @@ export async function transcribeRoutes(app: FastifyInstance): Promise { throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; ')); } - if (!config.OPENAI_API_KEY) { + const provider = getSTT(); + + if (!provider.isConfigured()) { return reply.status(503).send({ - error: 'Transcription not available — OPENAI_API_KEY not configured', + error: 'Transcription not available — no STT provider configured', }); } - const { audioUrl, model, language, prompt, responseFormat, productId } = parsed.data; + const { audioUrl, language, prompt, productId } = parsed.data; const requestId = req.headers['x-request-id'] as string | undefined; req.log.info( - { audioUrl: audioUrl.substring(0, 80), language, model, productId }, + { audioUrl: audioUrl.substring(0, 80), language, productId }, 'transcription request' ); @@ -87,120 +96,56 @@ export async function transcribeRoutes(app: FastifyInstance): Promise { // ── Step 2: Determine filename from URL or content-type ────── const filename = deriveFilename(audioUrl, contentType); - // ── Step 3: Call OpenAI Whisper API ────────────────────────── - const whisperModel = model || config.WHISPER_MODEL; - const baseUrl = config.OPENAI_BASE_URL.replace(/\/+$/, ''); - - const formData = new FormData(); - formData.append('file', new Blob([audioBuffer]), filename); - formData.append('model', whisperModel); - if (language) formData.append('language', language); - if (prompt) formData.append('prompt', prompt); - formData.append('response_format', responseFormat === 'text' ? 'verbose_json' : responseFormat); - - let whisperResponse: Response; + // ── Step 3: Delegate to provider ───────────────────────────── + let result: TranscribeResponse; try { - whisperResponse = await fetch(`${baseUrl}/audio/transcriptions`, { - method: 'POST', - headers: { - Authorization: `Bearer ${config.OPENAI_API_KEY}`, - }, - body: formData, - signal: AbortSignal.timeout(120_000), + const sttResult = await provider.transcribe({ + audio: audioBuffer, + filename, + language, + prompt, }); + + const durationMs = Date.now() - startMs; + + req.log.info( + { + durationMs, + model: sttResult.model, + language: sttResult.language, + audioSizeBytes: audioBuffer.byteLength, + textLength: sttResult.text.length, + productId, + }, + 'transcription complete' + ); + + result = { + text: sttResult.text, + language: sttResult.language, + durationSeconds: sttResult.durationSeconds, + model: sttResult.model, + durationMs, + requestId, + }; } catch (err) { - const message = err instanceof Error ? err.message : 'Whisper API call failed'; - req.log.error({ error: message }, 'whisper API failed'); + const message = err instanceof Error ? err.message : 'Transcription failed'; + req.log.error({ error: message }, 'transcription provider failed'); + + if (message.includes('429') || message.includes('rate limit')) { + return reply.status(429).send({ + error: 'Transcription rate limit exceeded', + detail: message, + retryAfter: 60, + }); + } + return reply.status(502).send({ error: 'Transcription failed', detail: message, }); } - if (!whisperResponse.ok) { - const errorText = await whisperResponse.text(); - req.log.error({ status: whisperResponse.status, error: errorText }, 'whisper API error'); - - if (whisperResponse.status === 429) { - return reply.status(429).send({ - error: 'Transcription rate limit exceeded', - detail: errorText, - retryAfter: 60, - }); - } - - return reply.status(502).send({ - error: `Whisper API error ${whisperResponse.status}`, - detail: errorText, - }); - } - - // ── Step 4: Parse response ─────────────────────────────────── - const data = (await whisperResponse.json()) as { - text: string; - language?: string; - duration?: number; - }; - - const durationMs = Date.now() - startMs; - - req.log.info( - { - durationMs, - model: whisperModel, - language: data.language, - audioSizeBytes: audioBuffer.byteLength, - textLength: data.text.length, - productId, - }, - 'transcription complete' - ); - - const result: TranscribeResponse = { - text: data.text, - language: data.language ?? language ?? null, - durationSeconds: data.duration ?? null, - model: whisperModel, - durationMs, - requestId, - }; - return reply.send(result); }); } - -// ── Helpers ─────────────────────────────────────────────────────── - -function deriveFilename(url: string, contentType: string | null): string { - // Try to extract extension from URL path - try { - const pathname = new URL(url).pathname; - const lastSegment = pathname.split('/').pop(); - if (lastSegment && /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i.test(lastSegment)) { - return lastSegment; - } - } catch { - // URL parsing failed — use fallback - } - - // Map content-type to extension - const extMap: Record = { - 'audio/mpeg': 'audio.mp3', - 'audio/mp4': 'audio.m4a', - 'audio/mp4a-latm': 'audio.m4a', - 'audio/x-m4a': 'audio.m4a', - 'audio/wav': 'audio.wav', - 'audio/x-wav': 'audio.wav', - 'audio/webm': 'audio.webm', - 'audio/ogg': 'audio.ogg', - 'audio/flac': 'audio.flac', - 'video/mp4': 'audio.mp4', - }; - - if (contentType) { - const base = contentType.split(';')[0].trim().toLowerCase(); - if (extMap[base]) return extMap[base]; - } - - return 'audio.mp3'; -} diff --git a/services/extraction-service/src/modules/transcribe/types.ts b/services/extraction-service/src/modules/transcribe/types.ts index 861b8d30..501d8da6 100644 --- a/services/extraction-service/src/modules/transcribe/types.ts +++ b/services/extraction-service/src/modules/transcribe/types.ts @@ -1,5 +1,41 @@ import { z } from 'zod'; +// ── Provider interface ────────────────────────────────────── + +/** Audio input passed to a transcription provider. */ +export interface TranscriptionInput { + /** Raw audio bytes. */ + audio: ArrayBuffer; + /** Filename with extension (used for content-type inference). */ + filename: string; + /** ISO 639-1 language hint. */ + language?: string; + /** Prompt to guide transcription style. */ + prompt?: string; +} + +/** Result from a transcription provider. */ +export interface TranscriptionResult { + /** The transcribed text. */ + text: string; + /** Detected or specified language code. */ + language: string | null; + /** Duration of the audio in seconds (when available). */ + durationSeconds: number | null; + /** Provider-specific model identifier. */ + model: string; +} + +/** Cloud-agnostic speech-to-text provider. */ +export interface TranscriptionProvider { + /** Transcribe audio to text. */ + transcribe(input: TranscriptionInput): Promise; + /** Check if the provider is configured with valid credentials. */ + isConfigured(): boolean; +} + +export type STTProviderType = 'openai' | 'azure' | 'mock'; + // ── Transcription request schema ──────────────────────────────── export const TranscribeRequestSchema = z.object({ @@ -53,3 +89,37 @@ export const SUPPORTED_AUDIO_TYPES = new Set([ /** Max audio file size: 25 MB (OpenAI Whisper limit). */ export const MAX_AUDIO_SIZE_BYTES = 25 * 1024 * 1024; + +// ── Filename helper (shared by route + providers) ───────────── + +export function deriveFilename(url: string, contentType: string | null): string { + try { + const pathname = new URL(url).pathname; + const lastSegment = pathname.split('/').pop(); + if (lastSegment && /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i.test(lastSegment)) { + return lastSegment; + } + } catch { + // URL parsing failed — use fallback + } + + const extMap: Record = { + 'audio/mpeg': 'audio.mp3', + 'audio/mp4': 'audio.m4a', + 'audio/mp4a-latm': 'audio.m4a', + 'audio/x-m4a': 'audio.m4a', + 'audio/wav': 'audio.wav', + 'audio/x-wav': 'audio.wav', + 'audio/webm': 'audio.webm', + 'audio/ogg': 'audio.ogg', + 'audio/flac': 'audio.flac', + 'video/mp4': 'audio.mp4', + }; + + if (contentType) { + const base = contentType.split(';')[0].trim().toLowerCase(); + if (extMap[base]) return extMap[base]; + } + + return 'audio.mp3'; +}