From 031e910607e4ad15aee20c2ba1bf05effd310e87 Mon Sep 17 00:00:00 2001 From: saravanakumardb1 Date: Mon, 6 Apr 2026 11:40:27 -0700 Subject: [PATCH] =?UTF-8?q?fix(extraction-service):=20review=20fixes=20?= =?UTF-8?q?=E2=80=94=20locale=20mapping,=20model=20passthrough,=20content-?= =?UTF-8?q?type=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG 1: Azure locale derivation produced 'en-EN' (invalid) for 2-letter codes. → Added toAzureLocale() with 28-language mapping table (en→en-US, pt→pt-BR, etc.) → Exported for testing; falls back to code-CODE for unmapped languages. BUG 2: model field from request schema was silently dropped after provider refactor. → Added optional model field to TranscriptionInput interface. → OpenAI provider now uses input.model override (falls back to config.model). → Route passes model through to provider.transcribe(). GAP 4: SUPPORTED_AUDIO_TYPES was defined but never validated against. → Route now rejects unsupported content-types with a clear error message. → Allows application/octet-stream (Azure Blob SAS URLs often return this). GAP 5: Client JSDoc still said 'via OpenAI Whisper API' — now 'via configured STT provider'. GAP 8: Azure WAV content-type hardcoded samplerate=16000 — now generic audio/wav. Tests: 42 transcription tests (was 35), 178 total passing. → toAzureLocale: 4 tests (locale mapping, passthrough, fallback, case-insensitive) → setSTT: 1 test (singleton override) → model passthrough: 2 tests (mock ignores, input accepts) --- packages/extraction/src/client.ts | 2 +- .../src/modules/transcribe/providers/azure.ts | 51 +++++++++++++-- .../modules/transcribe/providers/openai.ts | 6 +- .../src/modules/transcribe/routes.test.ts | 64 ++++++++++++++++++- .../src/modules/transcribe/routes.ts | 22 ++++++- .../src/modules/transcribe/types.ts | 2 + 6 files changed, 136 insertions(+), 11 deletions(-) diff --git a/packages/extraction/src/client.ts b/packages/extraction/src/client.ts index c9de8e48..7c105add 100644 --- a/packages/extraction/src/client.ts +++ b/packages/extraction/src/client.ts @@ -29,7 +29,7 @@ export interface ExtractionClient { /** Get a single task by ID. */ getTask(id: string, productId?: string): Promise; - /** Transcribe audio from a URL via OpenAI Whisper API. */ + /** Transcribe audio from a URL via the configured STT provider. */ transcribe(req: TranscribeRequest): Promise; } diff --git a/services/extraction-service/src/modules/transcribe/providers/azure.ts b/services/extraction-service/src/modules/transcribe/providers/azure.ts index 03263b12..baf7e350 100644 --- a/services/extraction-service/src/modules/transcribe/providers/azure.ts +++ b/services/extraction-service/src/modules/transcribe/providers/azure.ts @@ -20,7 +20,7 @@ function audioContentType(filename: string): string { const ext = filename.split('.').pop()?.toLowerCase(); switch (ext) { case 'wav': - return 'audio/wav; codecs=audio/pcm; samplerate=16000'; + return 'audio/wav'; case 'ogg': return 'audio/ogg; codecs=opus'; case 'mp3': @@ -39,6 +39,51 @@ function audioContentType(filename: string): string { } } +/** + * Map ISO 639-1 two-letter code to Azure BCP-47 locale. + * Azure requires full locale (e.g. "en-US"), not just "en". + * Falls back to `${code}-${CODE}` only for unmapped languages. + */ +const ISO_TO_LOCALE: Record = { + en: 'en-US', + es: 'es-ES', + fr: 'fr-FR', + de: 'de-DE', + it: 'it-IT', + pt: 'pt-BR', + ja: 'ja-JP', + ko: 'ko-KR', + zh: 'zh-CN', + ar: 'ar-SA', + hi: 'hi-IN', + ru: 'ru-RU', + nl: 'nl-NL', + sv: 'sv-SE', + pl: 'pl-PL', + tr: 'tr-TR', + vi: 'vi-VN', + th: 'th-TH', + id: 'id-ID', + uk: 'uk-UA', + cs: 'cs-CZ', + da: 'da-DK', + fi: 'fi-FI', + el: 'el-GR', + he: 'he-IL', + hu: 'hu-HU', + nb: 'nb-NO', + ro: 'ro-RO', + sk: 'sk-SK', + ms: 'ms-MY', +}; + +export function toAzureLocale(language: string): string { + // Already a full BCP-47 locale (e.g. "en-US", "pt-BR") + if (language.length > 3) return language; + const lower = language.toLowerCase(); + return ISO_TO_LOCALE[lower] ?? `${lower}-${lower.toUpperCase()}`; +} + export class AzureTranscriptionProvider implements TranscriptionProvider { private config: AzureSTTConfig; @@ -61,9 +106,7 @@ export class AzureTranscriptionProvider implements TranscriptionProvider { } const { speechKey, speechRegion } = this.config; - const language = input.language || 'en-US'; - // Azure expects full locale (e.g. "en-US"), not just ISO 639-1 "en" - const locale = language.length <= 3 ? `${language}-${language.toUpperCase()}` : language; + const locale = toAzureLocale(input.language || 'en'); const url = new URL( `https://${speechRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1` diff --git a/services/extraction-service/src/modules/transcribe/providers/openai.ts b/services/extraction-service/src/modules/transcribe/providers/openai.ts index cc3cc918..a566db4c 100644 --- a/services/extraction-service/src/modules/transcribe/providers/openai.ts +++ b/services/extraction-service/src/modules/transcribe/providers/openai.ts @@ -37,9 +37,11 @@ export class OpenAITranscriptionProvider implements TranscriptionProvider { throw new Error('OpenAI is not configured (missing OPENAI_API_KEY)'); } + const effectiveModel = input.model || this.config.model; + const formData = new FormData(); formData.append('file', new Blob([input.audio]), input.filename); - formData.append('model', this.config.model); + formData.append('model', effectiveModel); if (input.language) formData.append('language', input.language); if (input.prompt) formData.append('prompt', input.prompt); formData.append('response_format', 'verbose_json'); @@ -68,7 +70,7 @@ export class OpenAITranscriptionProvider implements TranscriptionProvider { text: data.text, language: data.language ?? input.language ?? null, durationSeconds: data.duration ?? null, - model: this.config.model, + model: effectiveModel, }; } } diff --git a/services/extraction-service/src/modules/transcribe/routes.test.ts b/services/extraction-service/src/modules/transcribe/routes.test.ts index d2bc79b4..03e427bd 100644 --- a/services/extraction-service/src/modules/transcribe/routes.test.ts +++ b/services/extraction-service/src/modules/transcribe/routes.test.ts @@ -9,7 +9,8 @@ import type { TranscriptionProvider } from './types.js'; import { MockTranscriptionProvider } from './providers/mock.js'; import { OpenAITranscriptionProvider } from './providers/openai.js'; import { AzureTranscriptionProvider } from './providers/azure.js'; -import { createSTTProvider, _resetSTT, getSTT } from './factory.js'; +import { createSTTProvider, _resetSTT, getSTT, setSTT } from './factory.js'; +import { toAzureLocale } from './providers/azure.js'; // ── Schema validation tests ───────────────────────────────────── @@ -198,6 +199,36 @@ describe('OpenAITranscriptionProvider', () => { }); }); +// ── toAzureLocale tests ───────────────────────────────────────── + +describe('toAzureLocale', () => { + it('maps common 2-letter codes to correct Azure locales', () => { + expect(toAzureLocale('en')).toBe('en-US'); + expect(toAzureLocale('es')).toBe('es-ES'); + expect(toAzureLocale('fr')).toBe('fr-FR'); + expect(toAzureLocale('de')).toBe('de-DE'); + expect(toAzureLocale('pt')).toBe('pt-BR'); + expect(toAzureLocale('ja')).toBe('ja-JP'); + expect(toAzureLocale('zh')).toBe('zh-CN'); + expect(toAzureLocale('hi')).toBe('hi-IN'); + }); + + it('passes through full BCP-47 locales unchanged', () => { + expect(toAzureLocale('en-US')).toBe('en-US'); + expect(toAzureLocale('pt-BR')).toBe('pt-BR'); + expect(toAzureLocale('zh-TW')).toBe('zh-TW'); + }); + + it('falls back to code-CODE for unmapped languages', () => { + expect(toAzureLocale('xx')).toBe('xx-XX'); + }); + + it('is case-insensitive for 2-letter codes', () => { + expect(toAzureLocale('EN')).toBe('en-US'); + expect(toAzureLocale('Es')).toBe('es-ES'); + }); +}); + // ── AzureTranscriptionProvider tests ──────────────────────────── describe('AzureTranscriptionProvider', () => { @@ -305,6 +336,14 @@ describe('STT factory', () => { expect(p1).toBe(p2); }); + it('setSTT overrides the singleton', () => { + vi.stubEnv('STT_PROVIDER', 'openai'); + _resetSTT(); + const mock = new MockTranscriptionProvider(); + setSTT(mock); + expect(getSTT()).toBe(mock); + }); + it('TranscriptionProvider interface is satisfied by all providers', () => { const providers: TranscriptionProvider[] = [ new MockTranscriptionProvider(), @@ -317,3 +356,26 @@ describe('STT factory', () => { } }); }); + +// ── Model passthrough tests ───────────────────────────────────── + +describe('model passthrough', () => { + it('MockTranscriptionProvider ignores model (returns mock-stt)', async () => { + const provider = new MockTranscriptionProvider(); + const result = await provider.transcribe({ + audio: new ArrayBuffer(100), + filename: 'a.wav', + model: 'whisper-large-v3', + }); + expect(result.model).toBe('mock-stt'); + }); + + it('TranscriptionInput accepts optional model field', () => { + const input = { + audio: new ArrayBuffer(10), + filename: 'a.mp3', + model: 'whisper-1', + }; + expect(input.model).toBe('whisper-1'); + }); +}); diff --git a/services/extraction-service/src/modules/transcribe/routes.ts b/services/extraction-service/src/modules/transcribe/routes.ts index 6180464b..5e72cd1b 100644 --- a/services/extraction-service/src/modules/transcribe/routes.ts +++ b/services/extraction-service/src/modules/transcribe/routes.ts @@ -13,6 +13,7 @@ import { BadRequestError } from '@bytelyst/errors'; import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, + SUPPORTED_AUDIO_TYPES, deriveFilename, type TranscribeResponse, } from './types.js'; @@ -43,7 +44,7 @@ export async function transcribeRoutes(app: FastifyInstance): Promise { }); } - const { audioUrl, language, prompt, productId } = parsed.data; + const { audioUrl, model, language, prompt, productId } = parsed.data; const requestId = req.headers['x-request-id'] as string | undefined; req.log.info( @@ -93,10 +94,24 @@ export async function transcribeRoutes(app: FastifyInstance): Promise { throw new BadRequestError(`Failed to download audio: ${message}`); } - // ── Step 2: Determine filename from URL or content-type ────── + // ── Step 2: Validate audio content-type ───────────────────── + if (contentType) { + const baseType = contentType.split(';')[0].trim().toLowerCase(); + if ( + baseType && + !SUPPORTED_AUDIO_TYPES.has(baseType) && + !baseType.startsWith('application/octet-stream') + ) { + throw new BadRequestError( + `Unsupported audio format: ${baseType}. Supported: ${[...SUPPORTED_AUDIO_TYPES].join(', ')}` + ); + } + } + + // ── Step 3: Determine filename from URL or content-type ────── const filename = deriveFilename(audioUrl, contentType); - // ── Step 3: Delegate to provider ───────────────────────────── + // ── Step 4: Delegate to provider ───────────────────────────── let result: TranscribeResponse; try { const sttResult = await provider.transcribe({ @@ -104,6 +119,7 @@ export async function transcribeRoutes(app: FastifyInstance): Promise { filename, language, prompt, + model, }); const durationMs = Date.now() - startMs; diff --git a/services/extraction-service/src/modules/transcribe/types.ts b/services/extraction-service/src/modules/transcribe/types.ts index 501d8da6..b1cedf1b 100644 --- a/services/extraction-service/src/modules/transcribe/types.ts +++ b/services/extraction-service/src/modules/transcribe/types.ts @@ -12,6 +12,8 @@ export interface TranscriptionInput { language?: string; /** Prompt to guide transcription style. */ prompt?: string; + /** Model override (provider-specific, e.g. 'whisper-1' for OpenAI). */ + model?: string; } /** Result from a transcription provider. */