refactor(extraction-service): provider-agnostic transcription — OpenAI + Azure Speech + Mock

- TranscriptionProvider interface with transcribe() + isConfigured()
- OpenAITranscriptionProvider: Whisper API (existing behavior)
- AzureTranscriptionProvider: Azure Speech REST API for short audio
- MockTranscriptionProvider: deterministic results for testing
- Factory: getSTT() singleton with env-driven auto-detection
- STT_PROVIDER=openai|azure|mock (explicit)
- Auto-detect: AZURE_SPEECH_KEY + AZURE_SPEECH_REGION → azure, OPENAI_API_KEY → openai, else mock
- Config: add STT_PROVIDER, AZURE_SPEECH_KEY, AZURE_SPEECH_REGION env vars
- Route refactored: audio download (common) → provider.transcribe() (swappable)
- deriveFilename() extracted to types.ts (shared by route + providers)
- 35 transcription tests (was 12), 171 total passing
- Follows same pattern as @bytelyst/llm provider abstraction
2026-04-06 11:30:22 -07:00 · 2026-04-06 11:30:22 -07:00 · a77b3ff931
commit a77b3ff931
parent f8e15880d2
8 changed files with 642 additions and 136 deletions
--- a/services/extraction-service/src/lib/config.ts
+++ b/services/extraction-service/src/lib/config.ts
@ -19,9 +19,12 @@ const envSchema = z.object({
  EXTRACTION_QUEUE_FILE: z.string().optional(),
  EXTRACTION_QUEUE_POLL_MS: z.coerce.number().default(100),
  EXTRACTION_QUEUE_LEASE_MS: z.coerce.number().default(30000),
+  STT_PROVIDER: z.enum(['openai', 'azure', 'mock']).optional(),
  OPENAI_API_KEY: z.string().optional(),
  OPENAI_BASE_URL: z.string().default('https://api.openai.com/v1'),
  WHISPER_MODEL: z.string().default('whisper-1'),
+  AZURE_SPEECH_KEY: z.string().optional(),
+  AZURE_SPEECH_REGION: z.string().optional(),
 });

 export const config = envSchema.parse(process.env);
--- a/services/extraction-service/src/modules/transcribe/factory.ts
+++ b/services/extraction-service/src/modules/transcribe/factory.ts
@ -0,0 +1,72 @@
+/**
+ * Transcription provider factory.
+ *
+ * Creates a TranscriptionProvider based on the STT_PROVIDER env var.
+ * If STT_PROVIDER is not set, the provider is auto-detected from the available credentials.
+ *
+ * Priority: STT_PROVIDER > auto-detect (AZURE_SPEECH_KEY + AZURE_SPEECH_REGION → azure,
+ * OPENAI_API_KEY → openai, otherwise mock)
+ */
+
+import { OpenAITranscriptionProvider } from './providers/openai.js';
+import { AzureTranscriptionProvider } from './providers/azure.js';
+import { MockTranscriptionProvider } from './providers/mock.js';
+import type { TranscriptionProvider, STTProviderType } from './types.js';
+
+let _provider: TranscriptionProvider | null = null; // cached singleton; null until getSTT() creates one or setSTT() installs one
+
+/**
+ * Resolve the provider type from environment variables.
+ *
+ * An explicit STT_PROVIDER (case-insensitive, surrounding whitespace ignored)
+ * always wins. When it is unset or empty, the type is auto-detected:
+ * Azure if both AZURE_SPEECH_KEY and AZURE_SPEECH_REGION are set, otherwise
+ * OpenAI if OPENAI_API_KEY is set, otherwise the mock provider.
+ *
+ * @returns The provider type to instantiate.
+ * @throws Error if STT_PROVIDER is set to an unrecognized value. Failing fast
+ *   avoids silently falling back to another provider (or to mock output)
+ *   because of a typo.
+ */
+function resolveProviderType(): STTProviderType {
+  const explicit = (process.env.STT_PROVIDER ?? '').trim().toLowerCase();
+  if (explicit === 'openai' || explicit === 'azure' || explicit === 'mock') {
+    return explicit;
+  }
+  if (explicit !== '') {
+    throw new Error(`Unknown STT_PROVIDER: '${explicit}'. Valid: openai, azure, mock`);
+  }
+
+  // Auto-detect from available credentials (Azure needs both key and region).
+  if (process.env.AZURE_SPEECH_KEY && process.env.AZURE_SPEECH_REGION) return 'azure';
+  if (process.env.OPENAI_API_KEY) return 'openai';
+
+  return 'mock';
+}
+
+/**
+ * Instantiate the transcription provider that corresponds to `type`.
+ *
+ * Each provider is built with its no-argument constructor.
+ *
+ * @throws Error when `type` is not one of openai, azure, or mock.
+ */
+export function createSTTProvider(type: STTProviderType): TranscriptionProvider {
+  if (type === 'openai') return new OpenAITranscriptionProvider();
+  if (type === 'azure') return new AzureTranscriptionProvider();
+  if (type === 'mock') return new MockTranscriptionProvider();
+
+  // Only reachable when a caller bypasses the type system.
+  throw new Error(`Unknown STT_PROVIDER: '${type}'. Valid: openai, azure, mock`);
+}
+
+/**
+ * Return the shared transcription provider, creating it on first use.
+ *
+ * The provider type is resolved from the environment only when no provider is
+ * cached; later calls return the same instance.
+ */
+export function getSTT(): TranscriptionProvider {
+  if (_provider) return _provider;
+
+  _provider = createSTTProvider(resolveProviderType());
+  return _provider;
+}
+
+/**
+ * Set the singleton transcription provider (for testing).
+ *
+ * The given provider replaces any cached instance and is what getSTT()
+ * returns until it is replaced or reset.
+ */
+export function setSTT(provider: TranscriptionProvider): void {
+  _provider = provider;
+}
+
+/**
+ * @internal Reset singleton (for testing).
+ *
+ * Clears the cached provider so that the next getSTT() call resolves the
+ * provider type again and builds a new instance.
+ */
+export function _resetSTT(): void {
+  _provider = null;
+}
--- a/services/extraction-service/src/modules/transcribe/providers/azure.ts
+++ b/services/extraction-service/src/modules/transcribe/providers/azure.ts
@ -0,0 +1,119 @@
+/**
+ * Azure Speech Services transcription provider.
+ *
+ * Uses the Speech to text REST API for short audio (≤60s).
+ * Endpoint: https://<REGION>.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1
+ * Auth: Ocp-Apim-Subscription-Key header.
+ *
+ * Requires AZURE_SPEECH_KEY + AZURE_SPEECH_REGION env vars.
+ */
+
+import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
+
+export interface AzureSTTConfig {
+  speechKey: string; // sent as the Ocp-Apim-Subscription-Key request header
+  speechRegion: string; // used as the subdomain of the stt.speech.microsoft.com endpoint
+}
+
+/**
+ * Pick the Content-Type header value to send to Azure for an audio file.
+ *
+ * The type is chosen from the text after the last "." in `filename`, compared
+ * case-insensitively. Unrecognized or missing extensions yield plain "audio/wav".
+ * NOTE(review): the short-audio REST API appears to document only WAV/PCM and
+ * OGG/Opus input — confirm the other container types are accepted.
+ */
+function audioContentType(filename: string): string {
+  const mpeg = 'audio/mpeg';
+  const mp4 = 'audio/mp4';
+  const typeByExtension = new Map<string, string>([
+    ['wav', 'audio/wav; codecs=audio/pcm; samplerate=16000'],
+    ['ogg', 'audio/ogg; codecs=opus'],
+    ['mp3', mpeg],
+    ['mpeg', mpeg],
+    ['mpga', mpeg],
+    ['webm', 'audio/webm; codecs=opus'],
+    ['flac', 'audio/flac'],
+    ['m4a', mp4],
+    ['mp4', mp4],
+  ]);
+
+  const extension = filename.split('.').pop()?.toLowerCase() ?? '';
+  return typeByExtension.get(extension) ?? 'audio/wav';
+}
+
+/**
+ * Default Azure locale for bare language codes whose "xx-XX" doubling is not a
+ * real locale (e.g. "en" must become "en-US", not "en-EN").
+ */
+const AZURE_DEFAULT_LOCALES: ReadonlyMap<string, string> = new Map([
+  ['en', 'en-US'],
+  ['ja', 'ja-JP'],
+  ['zh', 'zh-CN'],
+  ['ko', 'ko-KR'],
+  ['hi', 'hi-IN'],
+  ['ar', 'ar-SA'],
+  ['he', 'he-IL'],
+  ['sv', 'sv-SE'],
+  ['da', 'da-DK'],
+  ['cs', 'cs-CZ'],
+  ['el', 'el-GR'],
+  ['uk', 'uk-UA'],
+  ['vi', 'vi-VN'],
+]);
+
+/**
+ * Expand a bare language code to a full Azure locale.
+ *
+ * Values longer than three characters are treated as full locales and returned
+ * unchanged. Codes without a table entry keep the "xx-XX" doubling, which is
+ * correct for languages such as es, fr, de and it.
+ */
+function toAzureLocale(language: string): string {
+  if (language.length > 3) return language;
+  return (
+    AZURE_DEFAULT_LOCALES.get(language.toLowerCase()) ?? `${language}-${language.toUpperCase()}`
+  );
+}
+
+/**
+ * Transcription provider backed by the Azure Speech short-audio REST endpoint.
+ *
+ * Credentials come from the constructor argument, falling back to the
+ * AZURE_SPEECH_KEY / AZURE_SPEECH_REGION env vars.
+ */
+export class AzureTranscriptionProvider implements TranscriptionProvider {
+  private readonly config: AzureSTTConfig;
+
+  /**
+   * @param config - Optional overrides; any field left out is read from the
+   *   corresponding env var, and defaults to an empty string if that is unset.
+   */
+  constructor(config?: Partial<AzureSTTConfig>) {
+    this.config = {
+      speechKey: config?.speechKey ?? process.env.AZURE_SPEECH_KEY ?? '',
+      speechRegion: config?.speechRegion ?? process.env.AZURE_SPEECH_REGION ?? '',
+    };
+  }
+
+  /** True when both the key and the region are non-empty (they are not validated remotely). */
+  isConfigured(): boolean {
+    return Boolean(this.config.speechKey && this.config.speechRegion);
+  }
+
+  /**
+   * Send the audio to Azure Speech and return the recognized text.
+   *
+   * `input.prompt` is not forwarded; only the audio, filename and language are used.
+   *
+   * @param input - Audio bytes, a filename (drives the Content-Type header) and
+   *   an optional language; a missing language defaults to "en-US".
+   * @returns Recognized text, the locale sent to Azure, the audio duration in
+   *   seconds (null if Azure reports none) and a region-tagged model label.
+   * @throws Error if the provider is not configured, the HTTP response is not
+   *   OK, or RecognitionStatus is anything other than "Success".
+   */
+  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
+    if (!this.isConfigured()) {
+      throw new Error(
+        'Azure Speech is not configured (missing AZURE_SPEECH_KEY or AZURE_SPEECH_REGION)'
+      );
+    }
+
+    const { speechKey, speechRegion } = this.config;
+    // Azure expects a full locale (e.g. "en-US"), not just ISO 639-1 "en".
+    const locale = toAzureLocale(input.language || 'en-US');
+
+    const url = new URL(
+      `https://${speechRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1`
+    );
+    url.searchParams.set('language', locale);
+    url.searchParams.set('format', 'detailed');
+
+    const contentType = audioContentType(input.filename);
+
+    const response = await fetch(url.toString(), {
+      method: 'POST',
+      headers: {
+        'Ocp-Apim-Subscription-Key': speechKey,
+        'Content-Type': contentType,
+        Accept: 'application/json',
+      },
+      body: input.audio,
+      signal: AbortSignal.timeout(120_000),
+    });
+
+    if (!response.ok) {
+      const errorText = await response.text();
+      throw new Error(`Azure Speech error ${response.status}: ${errorText}`);
+    }
+
+    const data = (await response.json()) as {
+      RecognitionStatus: string;
+      DisplayText?: string;
+      Duration?: number;
+      NBest?: Array<{
+        Display: string;
+        Confidence: number;
+      }>;
+    };
+
+    if (data.RecognitionStatus !== 'Success') {
+      throw new Error(`Azure Speech recognition failed: ${data.RecognitionStatus}`);
+    }
+
+    // Duration is in 100-nanosecond ticks. Check the type rather than
+    // truthiness so a reported duration of 0 is not turned into null.
+    const durationSeconds = typeof data.Duration === 'number' ? data.Duration / 10_000_000 : null;
+
+    // Use NBest[0] if available (higher quality), fall back to DisplayText
+    const text = data.NBest?.[0]?.Display ?? data.DisplayText ?? '';
+
+    return {
+      text,
+      language: locale,
+      durationSeconds,
+      model: `azure-speech-${speechRegion}`,
+    };
+  }
+}
--- a/services/extraction-service/src/modules/transcribe/providers/mock.ts
+++ b/services/extraction-service/src/modules/transcribe/providers/mock.ts
@ -0,0 +1,28 @@
+/**
+ * Mock transcription provider for testing.
+ *
+ * Returns deterministic transcription results without calling any external API.
+ * Always reports as configured.
+ */
+
+import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
+
+/** Provider that fabricates a result from the input's filename and size, without any external call. */
+export class MockTranscriptionProvider implements TranscriptionProvider {
+  /** Always true: the mock needs no credentials. */
+  isConfigured(): boolean {
+    return true;
+  }
+
+  /**
+   * Produce a placeholder transcription after a short artificial delay.
+   *
+   * The text embeds the filename and the audio size in KB; the duration is the
+   * byte length divided by 16000, rounded, with a minimum of 1 second.
+   */
+  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
+    // Pause for ~50ms to imitate provider latency.
+    await new Promise(done => setTimeout(done, 50));
+
+    const { audio, filename, language } = input;
+    const byteCount = audio.byteLength;
+    const sizeKB = Math.round(byteCount / 1024);
+    const approxSeconds = Math.round(byteCount / 16000);
+
+    return {
+      text: `[Mock transcription of ${filename} (${sizeKB}KB)]`,
+      language: language ?? 'en',
+      durationSeconds: Math.max(1, approxSeconds),
+      model: 'mock-stt',
+    };
+  }
+}
--- a/services/extraction-service/src/modules/transcribe/providers/openai.ts
+++ b/services/extraction-service/src/modules/transcribe/providers/openai.ts
@ -0,0 +1,74 @@
+/**
+ * OpenAI Whisper transcription provider.
+ *
+ * Uses POST /v1/audio/transcriptions (multipart/form-data).
+ * Requires OPENAI_API_KEY env var.
+ */
+
+import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
+
+export interface OpenAISTTConfig {
+  apiKey: string; // sent as the Bearer token in the Authorization header
+  baseUrl?: string; // API root; the provider strips trailing slashes and appends /audio/transcriptions
+  model?: string; // value of the "model" form field; the provider falls back to WHISPER_MODEL, then 'whisper-1'
+}
+
+/**
+ * Transcription provider that posts audio to an OpenAI-style
+ * `/audio/transcriptions` endpoint as multipart form data.
+ */
+export class OpenAITranscriptionProvider implements TranscriptionProvider {
+  private config: Required<OpenAISTTConfig>;
+
+  /**
+   * @param config - Optional overrides. Missing fields fall back to the
+   *   OPENAI_API_KEY, OPENAI_BASE_URL and WHISPER_MODEL env vars, then to
+   *   built-in defaults. Trailing slashes are stripped from the base URL.
+   */
+  constructor(config?: Partial<OpenAISTTConfig>) {
+    const apiKey = config?.apiKey ?? process.env.OPENAI_API_KEY ?? '';
+    const rawBaseUrl =
+      config?.baseUrl ?? process.env.OPENAI_BASE_URL ?? 'https://api.openai.com/v1';
+    const model = config?.model ?? process.env.WHISPER_MODEL ?? 'whisper-1';
+
+    this.config = { apiKey, baseUrl: rawBaseUrl.replace(/\/+$/, ''), model };
+  }
+
+  /** True when an API key is present (the key is not validated remotely). */
+  isConfigured(): boolean {
+    return Boolean(this.config.apiKey);
+  }
+
+  /**
+   * Upload the audio and return the transcription.
+   *
+   * The request always asks for `verbose_json`; language and prompt are sent
+   * only when provided.
+   *
+   * @throws Error if no API key is configured or the HTTP response is not OK.
+   */
+  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
+    if (!this.isConfigured()) {
+      throw new Error('OpenAI is not configured (missing OPENAI_API_KEY)');
+    }
+
+    const { apiKey, baseUrl, model } = this.config;
+    const { audio, filename, language, prompt } = input;
+
+    const form = new FormData();
+    form.append('file', new Blob([audio]), filename);
+    form.append('model', model);
+    if (language) form.append('language', language);
+    if (prompt) form.append('prompt', prompt);
+    form.append('response_format', 'verbose_json');
+
+    const endpoint = `${baseUrl}/audio/transcriptions`;
+    const res = await fetch(endpoint, {
+      method: 'POST',
+      headers: { Authorization: `Bearer ${apiKey}` },
+      body: form,
+      signal: AbortSignal.timeout(120_000),
+    });
+
+    if (!res.ok) {
+      const failureBody = await res.text();
+      throw new Error(`OpenAI Whisper error ${res.status}: ${failureBody}`);
+    }
+
+    const payload = (await res.json()) as {
+      text: string;
+      language?: string;
+      duration?: number;
+    };
+
+    // Prefer the language reported by the API, then the caller's hint.
+    return {
+      text: payload.text,
+      language: payload.language ?? language ?? null,
+      durationSeconds: payload.duration ?? null,
+      model,
+    };
+  }
+}
--- a/services/extraction-service/src/modules/transcribe/routes.test.ts
+++ b/services/extraction-service/src/modules/transcribe/routes.test.ts
@ -1,5 +1,15 @@
 import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
-import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, SUPPORTED_AUDIO_TYPES } from './types.js';
+import {
+  TranscribeRequestSchema,
+  MAX_AUDIO_SIZE_BYTES,
+  SUPPORTED_AUDIO_TYPES,
+  deriveFilename,
+} from './types.js';
+import type { TranscriptionProvider } from './types.js';
+import { MockTranscriptionProvider } from './providers/mock.js';
+import { OpenAITranscriptionProvider } from './providers/openai.js';
+import { AzureTranscriptionProvider } from './providers/azure.js';
+import { createSTTProvider, _resetSTT, getSTT } from './factory.js';

 // ── Schema validation tests ─────────────────────────────────────

@ -91,34 +101,219 @@ describe('transcription constants', () => {
  });
 });

-// ── Route integration tests (mocked fetch) ──────────────────────
+// ── deriveFilename tests ────────────────────────────────────────

-describe('transcribe route', () => {
-  const originalFetch = globalThis.fetch;
+describe('deriveFilename', () => {
+  it('extracts filename from URL path', () => {
+    expect(deriveFilename('https://blob.example.com/uploads/meeting.wav?sv=2024', null)).toBe(
+      'meeting.wav'
+    );
+  });

+  it('extracts filename with various extensions', () => {
+    expect(deriveFilename('https://x.com/a.mp3', null)).toBe('a.mp3');
+    expect(deriveFilename('https://x.com/b.m4a', null)).toBe('b.m4a');
+    expect(deriveFilename('https://x.com/c.webm', null)).toBe('c.webm');
+    expect(deriveFilename('https://x.com/d.flac', null)).toBe('d.flac');
+  });
+
+  it('falls back to content-type mapping', () => {
+    expect(deriveFilename('https://x.com/blob123', 'audio/mpeg')).toBe('audio.mp3');
+    expect(deriveFilename('https://x.com/blob123', 'audio/wav')).toBe('audio.wav');
+    expect(deriveFilename('https://x.com/blob123', 'audio/webm')).toBe('audio.webm');
+    expect(deriveFilename('https://x.com/blob123', 'audio/ogg')).toBe('audio.ogg');
+  });
+
+  it('handles content-type with charset parameter', () => {
+    expect(deriveFilename('https://x.com/blob', 'audio/mpeg; charset=utf-8')).toBe('audio.mp3');
+  });
+
+  it('defaults to audio.mp3 when no match', () => {
+    expect(deriveFilename('https://x.com/blob', null)).toBe('audio.mp3');
+    expect(deriveFilename('https://x.com/blob', 'application/octet-stream')).toBe('audio.mp3');
+  });
+});
+
+// ── MockTranscriptionProvider tests ─────────────────────────────
+
+describe('MockTranscriptionProvider', () => {
+  it('is always configured', () => {
+    const provider = new MockTranscriptionProvider();
+    expect(provider.isConfigured()).toBe(true);
+  });
+
+  it('returns deterministic transcription with filename and size', async () => {
+    const provider = new MockTranscriptionProvider();
+    const audio = new ArrayBuffer(16000);
+    const result = await provider.transcribe({
+      audio,
+      filename: 'test.mp3',
+    });
+
+    expect(result.text).toContain('Mock transcription');
+    expect(result.text).toContain('test.mp3');
+    expect(result.text).toContain('16KB');
+    expect(result.model).toBe('mock-stt');
+    expect(result.durationSeconds).toBeGreaterThan(0);
+  });
+
+  it('uses provided language', async () => {
+    const provider = new MockTranscriptionProvider();
+    const result = await provider.transcribe({
+      audio: new ArrayBuffer(100),
+      filename: 'a.wav',
+      language: 'es',
+    });
+    expect(result.language).toBe('es');
+  });
+
+  it('defaults language to en', async () => {
+    const provider = new MockTranscriptionProvider();
+    const result = await provider.transcribe({
+      audio: new ArrayBuffer(100),
+      filename: 'a.wav',
+    });
+    expect(result.language).toBe('en');
+  });
+});
+
+// ── OpenAITranscriptionProvider tests ───────────────────────────
+
+describe('OpenAITranscriptionProvider', () => {
+  it('is not configured without OPENAI_API_KEY', () => {
+    const provider = new OpenAITranscriptionProvider({ apiKey: '' });
+    expect(provider.isConfigured()).toBe(false);
+  });
+
+  it('is configured with apiKey', () => {
+    const provider = new OpenAITranscriptionProvider({ apiKey: 'sk-test' });
+    expect(provider.isConfigured()).toBe(true);
+  });
+
+  it('throws when transcribing without config', async () => {
+    const provider = new OpenAITranscriptionProvider({ apiKey: '' });
+    await expect(
+      provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.mp3' })
+    ).rejects.toThrow('not configured');
+  });
+});
+
+// ── AzureTranscriptionProvider tests ────────────────────────────
+
+describe('AzureTranscriptionProvider', () => {
+  it('is not configured without both key and region', () => {
+    expect(new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' }).isConfigured()).toBe(
+      false
+    );
+    expect(
+      new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: '' }).isConfigured()
+    ).toBe(false);
+    expect(
+      new AzureTranscriptionProvider({ speechKey: '', speechRegion: 'eastus' }).isConfigured()
+    ).toBe(false);
+  });
+
+  it('is configured with both key and region', () => {
+    const provider = new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' });
+    expect(provider.isConfigured()).toBe(true);
+  });
+
+  it('throws when transcribing without config', async () => {
+    const provider = new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' });
+    await expect(
+      provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.wav' })
+    ).rejects.toThrow('not configured');
+  });
+});
+
+// ── Factory tests ───────────────────────────────────────────────
+
+describe('STT factory', () => {
  beforeEach(() => {
-    vi.stubEnv('OPENAI_API_KEY', 'test-key');
+    _resetSTT();
  });

  afterEach(() => {
-    globalThis.fetch = originalFetch;
+    _resetSTT();
    vi.unstubAllEnvs();
  });

-  it('deriveFilename extracts extension from URL path', async () => {
-    // Import the module to test the helper indirectly via the route
-    // The deriveFilename is not exported, but we can verify behavior through integration
-    const { TranscribeRequestSchema } = await import('./types.js');
-    const req = TranscribeRequestSchema.parse({
-      audioUrl: 'https://blob.example.com/uploads/meeting.wav?sv=2024-01-01&sig=abc',
-    });
-    expect(req.audioUrl).toContain('meeting.wav');
+  it('createSTTProvider returns OpenAI provider', () => {
+    const provider = createSTTProvider('openai');
+    expect(provider).toBeInstanceOf(OpenAITranscriptionProvider);
  });

-  it('schema defaults responseFormat to text', () => {
-    const result = TranscribeRequestSchema.parse({
-      audioUrl: 'https://example.com/a.mp3',
-    });
-    expect(result.responseFormat).toBe('text');
+  it('createSTTProvider returns Azure provider', () => {
+    const provider = createSTTProvider('azure');
+    expect(provider).toBeInstanceOf(AzureTranscriptionProvider);
+  });
+
+  it('createSTTProvider returns Mock provider', () => {
+    const provider = createSTTProvider('mock');
+    expect(provider).toBeInstanceOf(MockTranscriptionProvider);
+  });
+
+  it('createSTTProvider throws for unknown type', () => {
+    expect(() => createSTTProvider('invalid' as 'openai')).toThrow('Unknown STT_PROVIDER');
+  });
+
+  it('getSTT auto-detects mock when no credentials', () => {
+    vi.stubEnv('OPENAI_API_KEY', '');
+    vi.stubEnv('AZURE_SPEECH_KEY', '');
+    vi.stubEnv('AZURE_SPEECH_REGION', '');
+    vi.stubEnv('STT_PROVIDER', '');
+    _resetSTT();
+    const provider = getSTT();
+    expect(provider).toBeInstanceOf(MockTranscriptionProvider);
+  });
+
+  it('getSTT auto-detects openai when OPENAI_API_KEY set', () => {
+    vi.stubEnv('OPENAI_API_KEY', 'sk-test');
+    vi.stubEnv('AZURE_SPEECH_KEY', '');
+    vi.stubEnv('AZURE_SPEECH_REGION', '');
+    vi.stubEnv('STT_PROVIDER', '');
+    _resetSTT();
+    const provider = getSTT();
+    expect(provider).toBeInstanceOf(OpenAITranscriptionProvider);
+  });
+
+  it('getSTT auto-detects azure when AZURE_SPEECH_KEY + REGION set', () => {
+    vi.stubEnv('OPENAI_API_KEY', 'sk-test');
+    vi.stubEnv('AZURE_SPEECH_KEY', 'my-key');
+    vi.stubEnv('AZURE_SPEECH_REGION', 'eastus');
+    vi.stubEnv('STT_PROVIDER', '');
+    _resetSTT();
+    const provider = getSTT();
+    expect(provider).toBeInstanceOf(AzureTranscriptionProvider);
+  });
+
+  it('getSTT respects explicit STT_PROVIDER over auto-detect', () => {
+    vi.stubEnv('OPENAI_API_KEY', 'sk-test');
+    vi.stubEnv('AZURE_SPEECH_KEY', 'my-key');
+    vi.stubEnv('AZURE_SPEECH_REGION', 'eastus');
+    vi.stubEnv('STT_PROVIDER', 'mock');
+    _resetSTT();
+    const provider = getSTT();
+    expect(provider).toBeInstanceOf(MockTranscriptionProvider);
+  });
+
+  it('getSTT returns singleton', () => {
+    vi.stubEnv('STT_PROVIDER', 'mock');
+    _resetSTT();
+    const p1 = getSTT();
+    const p2 = getSTT();
+    expect(p1).toBe(p2);
+  });
+
+  it('TranscriptionProvider interface is satisfied by all providers', () => {
+    const providers: TranscriptionProvider[] = [
+      new MockTranscriptionProvider(),
+      new OpenAITranscriptionProvider({ apiKey: 'test' }),
+      new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' }),
+    ];
+    for (const p of providers) {
+      expect(typeof p.transcribe).toBe('function');
+      expect(typeof p.isConfigured).toBe('function');
+    }
  });
 });
--- a/services/extraction-service/src/modules/transcribe/routes.ts
+++ b/services/extraction-service/src/modules/transcribe/routes.ts
@ -1,16 +1,22 @@
 /**
- * Transcription routes — speech-to-text via OpenAI Whisper API.
+ * Transcription routes — provider-agnostic speech-to-text.
 *
- * POST /transcribe — Download audio from URL, transcribe via Whisper.
+ * POST /transcribe — Download audio from URL, transcribe via configured provider.
 * Product-agnostic: any product backend can call this endpoint.
 *
- * Requires OPENAI_API_KEY environment variable.
+ * Provider selection: STT_PROVIDER env var (openai | azure | mock).
+ * Auto-detects from available credentials if not explicit.
 */

 import type { FastifyInstance } from 'fastify';
 import { BadRequestError } from '@bytelyst/errors';
-import { config } from '../../lib/config.js';
-import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, type TranscribeResponse } from './types.js';
+import {
+  TranscribeRequestSchema,
+  MAX_AUDIO_SIZE_BYTES,
+  deriveFilename,
+  type TranscribeResponse,
+} from './types.js';
+import { getSTT } from './factory.js';

 export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
  /**
@ -18,9 +24,10 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
   *
   * Flow:
   * 1. Validate request (audioUrl, optional language/model/prompt)
-   * 2. Download audio from the provided URL
-   * 3. Send to OpenAI Whisper API as multipart/form-data
-   * 4. Return transcribed text + metadata
+   * 2. Check provider is configured
+   * 3. Download audio from the provided URL
+   * 4. Delegate to configured TranscriptionProvider
+   * 5. Return transcribed text + metadata
   */
  app.post('/transcribe', async (req, reply) => {
    const parsed = TranscribeRequestSchema.safeParse(req.body);
@ -28,17 +35,19 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
      throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
    }

-    if (!config.OPENAI_API_KEY) {
+    const provider = getSTT();
+
+    if (!provider.isConfigured()) {
      return reply.status(503).send({
-        error: 'Transcription not available — OPENAI_API_KEY not configured',
+        error: 'Transcription not available — no STT provider configured',
      });
    }

-    const { audioUrl, model, language, prompt, responseFormat, productId } = parsed.data;
+    const { audioUrl, language, prompt, productId } = parsed.data;
    const requestId = req.headers['x-request-id'] as string | undefined;

    req.log.info(
-      { audioUrl: audioUrl.substring(0, 80), language, model, productId },
+      { audioUrl: audioUrl.substring(0, 80), language, productId },
      'transcription request'
    );

@ -87,120 +96,56 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
    // ── Step 2: Determine filename from URL or content-type ──────
    const filename = deriveFilename(audioUrl, contentType);

-    // ── Step 3: Call OpenAI Whisper API ──────────────────────────
-    const whisperModel = model || config.WHISPER_MODEL;
-    const baseUrl = config.OPENAI_BASE_URL.replace(/\/+$/, '');
-
-    const formData = new FormData();
-    formData.append('file', new Blob([audioBuffer]), filename);
-    formData.append('model', whisperModel);
-    if (language) formData.append('language', language);
-    if (prompt) formData.append('prompt', prompt);
-    formData.append('response_format', responseFormat === 'text' ? 'verbose_json' : responseFormat);
-
-    let whisperResponse: Response;
+    // ── Step 3: Delegate to provider ─────────────────────────────
+    let result: TranscribeResponse;
    try {
-      whisperResponse = await fetch(`${baseUrl}/audio/transcriptions`, {
-        method: 'POST',
-        headers: {
-          Authorization: `Bearer ${config.OPENAI_API_KEY}`,
-        },
-        body: formData,
-        signal: AbortSignal.timeout(120_000),
+      const sttResult = await provider.transcribe({
+        audio: audioBuffer,
+        filename,
+        language,
+        prompt,
      });
+
+      const durationMs = Date.now() - startMs;
+
+      req.log.info(
+        {
+          durationMs,
+          model: sttResult.model,
+          language: sttResult.language,
+          audioSizeBytes: audioBuffer.byteLength,
+          textLength: sttResult.text.length,
+          productId,
+        },
+        'transcription complete'
+      );
+
+      result = {
+        text: sttResult.text,
+        language: sttResult.language,
+        durationSeconds: sttResult.durationSeconds,
+        model: sttResult.model,
+        durationMs,
+        requestId,
+      };
    } catch (err) {
-      const message = err instanceof Error ? err.message : 'Whisper API call failed';
-      req.log.error({ error: message }, 'whisper API failed');
+      const message = err instanceof Error ? err.message : 'Transcription failed';
+      req.log.error({ error: message }, 'transcription provider failed');
+
+      if (message.includes('429') || message.includes('rate limit')) {
+        return reply.status(429).send({
+          error: 'Transcription rate limit exceeded',
+          detail: message,
+          retryAfter: 60,
+        });
+      }
+
      return reply.status(502).send({
        error: 'Transcription failed',
        detail: message,
      });
    }

-    if (!whisperResponse.ok) {
-      const errorText = await whisperResponse.text();
-      req.log.error({ status: whisperResponse.status, error: errorText }, 'whisper API error');
-
-      if (whisperResponse.status === 429) {
-        return reply.status(429).send({
-          error: 'Transcription rate limit exceeded',
-          detail: errorText,
-          retryAfter: 60,
-        });
-      }
-
-      return reply.status(502).send({
-        error: `Whisper API error ${whisperResponse.status}`,
-        detail: errorText,
-      });
-    }
-
-    // ── Step 4: Parse response ───────────────────────────────────
-    const data = (await whisperResponse.json()) as {
-      text: string;
-      language?: string;
-      duration?: number;
-    };
-
-    const durationMs = Date.now() - startMs;
-
-    req.log.info(
-      {
-        durationMs,
-        model: whisperModel,
-        language: data.language,
-        audioSizeBytes: audioBuffer.byteLength,
-        textLength: data.text.length,
-        productId,
-      },
-      'transcription complete'
-    );
-
-    const result: TranscribeResponse = {
-      text: data.text,
-      language: data.language ?? language ?? null,
-      durationSeconds: data.duration ?? null,
-      model: whisperModel,
-      durationMs,
-      requestId,
-    };
-
    return reply.send(result);
  });
 }
-
-// ── Helpers ───────────────────────────────────────────────────────
-
-function deriveFilename(url: string, contentType: string | null): string {
-  // Try to extract extension from URL path
-  try {
-    const pathname = new URL(url).pathname;
-    const lastSegment = pathname.split('/').pop();
-    if (lastSegment && /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i.test(lastSegment)) {
-      return lastSegment;
-    }
-  } catch {
-    // URL parsing failed — use fallback
-  }
-
-  // Map content-type to extension
-  const extMap: Record<string, string> = {
-    'audio/mpeg': 'audio.mp3',
-    'audio/mp4': 'audio.m4a',
-    'audio/mp4a-latm': 'audio.m4a',
-    'audio/x-m4a': 'audio.m4a',
-    'audio/wav': 'audio.wav',
-    'audio/x-wav': 'audio.wav',
-    'audio/webm': 'audio.webm',
-    'audio/ogg': 'audio.ogg',
-    'audio/flac': 'audio.flac',
-    'video/mp4': 'audio.mp4',
-  };
-
-  if (contentType) {
-    const base = contentType.split(';')[0].trim().toLowerCase();
-    if (extMap[base]) return extMap[base];
-  }
-
-  return 'audio.mp3';
-}
--- a/services/extraction-service/src/modules/transcribe/types.ts
+++ b/services/extraction-service/src/modules/transcribe/types.ts
@ -1,5 +1,41 @@
 import { z } from 'zod';

+// ── Provider interface ──────────────────────────────────────
+
+/** Audio input passed to a transcription provider. */
+export interface TranscriptionInput {
+  /** Raw audio bytes. */
+  audio: ArrayBuffer;
+  /** Filename with extension (used for content-type inference, and as the upload filename by the OpenAI provider). */
+  filename: string;
+  /** ISO 639-1 language hint (e.g. "en"). The Azure provider also accepts a full locale such as "en-US" and uses "en-US" when this is omitted. */
+  language?: string;
+  /** Prompt to guide transcription style. Forwarded by the OpenAI provider; the Azure and mock providers do not use it. */
+  prompt?: string;
+}
+
+/** Result from a transcription provider. */
+export interface TranscriptionResult {
+  /** The transcribed text. */
+  text: string;
+  /** Detected or specified language code; null when the provider reports none and no hint was given. */
+  language: string | null;
+  /** Duration of the audio in seconds (when available); null when the provider does not report it. */
+  durationSeconds: number | null;
+  /** Provider-specific model identifier (e.g. the Whisper model name, "azure-speech-<region>", or "mock-stt"). */
+  model: string;
+}
+
+/** Cloud-agnostic speech-to-text provider. */
+export interface TranscriptionProvider {
+  /** Transcribe audio to text. The OpenAI and Azure implementations reject when unconfigured or when the remote service returns an error. */
+  transcribe(input: TranscriptionInput): Promise<TranscriptionResult>;
+  /** Check whether the provider has the credentials it needs. Implementations only check that they are present; they are not validated against the service. */
+  isConfigured(): boolean;
+}
+
+export type STTProviderType = 'openai' | 'azure' | 'mock'; // accepted STT_PROVIDER values; one per provider implementation
+
 // ── Transcription request schema ────────────────────────────────

 export const TranscribeRequestSchema = z.object({
@ -53,3 +89,37 @@ export const SUPPORTED_AUDIO_TYPES = new Set([

 /** Max audio file size: 25 MB (OpenAI Whisper limit). */
 export const MAX_AUDIO_SIZE_BYTES = 25 * 1024 * 1024;
+
+// ── Filename helper (shared by route + providers) ─────────────
+
+/**
+ * Derive a filename with a recognizable audio extension for a downloaded file.
+ *
+ * Resolution order:
+ * 1. The last path segment of `url`, if it ends in a known audio extension.
+ * 2. A generic `audio.<ext>` name mapped from `contentType` (parameters such
+ *    as `; charset=...` are ignored and matching is case-insensitive).
+ * 3. `audio.mp3` as the final fallback.
+ *
+ * @param url - Source URL of the audio; an unparseable URL is tolerated.
+ * @param contentType - Content-Type header value of the download, or null.
+ * @returns A non-empty filename string.
+ */
+export function deriveFilename(url: string, contentType: string | null): string {
+  try {
+    const lastSegment = new URL(url).pathname.split('/').pop();
+    if (lastSegment && /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i.test(lastSegment)) {
+      return lastSegment;
+    }
+  } catch {
+    // URL parsing failed — use fallback
+  }
+
+  // A Map rather than a plain object: the content type comes from a remote
+  // server, and values such as "constructor" or "__proto__" would otherwise
+  // match inherited Object.prototype members and return a non-string.
+  const filenameByMimeType = new Map<string, string>([
+    ['audio/mpeg', 'audio.mp3'],
+    ['audio/mp4', 'audio.m4a'],
+    ['audio/mp4a-latm', 'audio.m4a'],
+    ['audio/x-m4a', 'audio.m4a'],
+    ['audio/wav', 'audio.wav'],
+    ['audio/x-wav', 'audio.wav'],
+    ['audio/webm', 'audio.webm'],
+    ['audio/ogg', 'audio.ogg'],
+    ['audio/flac', 'audio.flac'],
+    ['video/mp4', 'audio.mp4'],
+  ]);
+
+  if (contentType) {
+    const mimeType = (contentType.split(';')[0] ?? '').trim().toLowerCase();
+    const mapped = filenameByMimeType.get(mimeType);
+    if (mapped) return mapped;
+  }
+
+  return 'audio.mp3';
+}