fix(extraction-service): review fixes — locale mapping, model passthrough, content-type validation
BUG 1: Azure locale derivation produced 'en-EN' (invalid) for 2-letter codes. → Added toAzureLocale() with 28-language mapping table (en→en-US, pt→pt-BR, etc.) → Exported for testing; falls back to code-CODE for unmapped languages. BUG 2: model field from request schema was silently dropped after provider refactor. → Added optional model field to TranscriptionInput interface. → OpenAI provider now uses input.model override (falls back to config.model). → Route passes model through to provider.transcribe(). GAP 4: SUPPORTED_AUDIO_TYPES was defined but never validated against. → Route now rejects unsupported content-types with a clear error message. → Allows application/octet-stream (Azure Blob SAS URLs often return this). GAP 5: Client JSDoc still said 'via OpenAI Whisper API' — now 'via configured STT provider'. GAP 8: Azure WAV content-type hardcoded samplerate=16000 — now generic audio/wav. Tests: 42 transcription tests (was 35), 178 total passing. → toAzureLocale: 4 tests (locale mapping, passthrough, fallback, case-insensitive) → setSTT: 1 test (singleton override) → model passthrough: 2 tests (mock ignores, input accepts)
This commit is contained in:
parent
a77b3ff931
commit
031e910607
@ -29,7 +29,7 @@ export interface ExtractionClient {
|
|||||||
/** Get a single task by ID. */
|
/** Get a single task by ID. */
|
||||||
getTask(id: string, productId?: string): Promise<ExtractionTask>;
|
getTask(id: string, productId?: string): Promise<ExtractionTask>;
|
||||||
|
|
||||||
/** Transcribe audio from a URL via OpenAI Whisper API. */
|
/** Transcribe audio from a URL via the configured STT provider. */
|
||||||
transcribe(req: TranscribeRequest): Promise<TranscribeResponse>;
|
transcribe(req: TranscribeRequest): Promise<TranscribeResponse>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ function audioContentType(filename: string): string {
|
|||||||
const ext = filename.split('.').pop()?.toLowerCase();
|
const ext = filename.split('.').pop()?.toLowerCase();
|
||||||
switch (ext) {
|
switch (ext) {
|
||||||
case 'wav':
|
case 'wav':
|
||||||
return 'audio/wav; codecs=audio/pcm; samplerate=16000';
|
return 'audio/wav';
|
||||||
case 'ogg':
|
case 'ogg':
|
||||||
return 'audio/ogg; codecs=opus';
|
return 'audio/ogg; codecs=opus';
|
||||||
case 'mp3':
|
case 'mp3':
|
||||||
@ -39,6 +39,51 @@ function audioContentType(filename: string): string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map ISO 639-1 two-letter code to Azure BCP-47 locale.
|
||||||
|
* Azure requires full locale (e.g. "en-US"), not just "en".
|
||||||
|
* Falls back to `${code}-${CODE}` only for unmapped languages.
|
||||||
|
*/
|
||||||
|
const ISO_TO_LOCALE: Record<string, string> = {
|
||||||
|
en: 'en-US',
|
||||||
|
es: 'es-ES',
|
||||||
|
fr: 'fr-FR',
|
||||||
|
de: 'de-DE',
|
||||||
|
it: 'it-IT',
|
||||||
|
pt: 'pt-BR',
|
||||||
|
ja: 'ja-JP',
|
||||||
|
ko: 'ko-KR',
|
||||||
|
zh: 'zh-CN',
|
||||||
|
ar: 'ar-SA',
|
||||||
|
hi: 'hi-IN',
|
||||||
|
ru: 'ru-RU',
|
||||||
|
nl: 'nl-NL',
|
||||||
|
sv: 'sv-SE',
|
||||||
|
pl: 'pl-PL',
|
||||||
|
tr: 'tr-TR',
|
||||||
|
vi: 'vi-VN',
|
||||||
|
th: 'th-TH',
|
||||||
|
id: 'id-ID',
|
||||||
|
uk: 'uk-UA',
|
||||||
|
cs: 'cs-CZ',
|
||||||
|
da: 'da-DK',
|
||||||
|
fi: 'fi-FI',
|
||||||
|
el: 'el-GR',
|
||||||
|
he: 'he-IL',
|
||||||
|
hu: 'hu-HU',
|
||||||
|
nb: 'nb-NO',
|
||||||
|
ro: 'ro-RO',
|
||||||
|
sk: 'sk-SK',
|
||||||
|
ms: 'ms-MY',
|
||||||
|
};
|
||||||
|
|
||||||
|
export function toAzureLocale(language: string): string {
|
||||||
|
// Already a full BCP-47 locale (e.g. "en-US", "pt-BR")
|
||||||
|
if (language.length > 3) return language;
|
||||||
|
const lower = language.toLowerCase();
|
||||||
|
return ISO_TO_LOCALE[lower] ?? `${lower}-${lower.toUpperCase()}`;
|
||||||
|
}
|
||||||
|
|
||||||
export class AzureTranscriptionProvider implements TranscriptionProvider {
|
export class AzureTranscriptionProvider implements TranscriptionProvider {
|
||||||
private config: AzureSTTConfig;
|
private config: AzureSTTConfig;
|
||||||
|
|
||||||
@ -61,9 +106,7 @@ export class AzureTranscriptionProvider implements TranscriptionProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const { speechKey, speechRegion } = this.config;
|
const { speechKey, speechRegion } = this.config;
|
||||||
const language = input.language || 'en-US';
|
const locale = toAzureLocale(input.language || 'en');
|
||||||
// Azure expects full locale (e.g. "en-US"), not just ISO 639-1 "en"
|
|
||||||
const locale = language.length <= 3 ? `${language}-${language.toUpperCase()}` : language;
|
|
||||||
|
|
||||||
const url = new URL(
|
const url = new URL(
|
||||||
`https://${speechRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1`
|
`https://${speechRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1`
|
||||||
|
|||||||
@ -37,9 +37,11 @@ export class OpenAITranscriptionProvider implements TranscriptionProvider {
|
|||||||
throw new Error('OpenAI is not configured (missing OPENAI_API_KEY)');
|
throw new Error('OpenAI is not configured (missing OPENAI_API_KEY)');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const effectiveModel = input.model || this.config.model;
|
||||||
|
|
||||||
const formData = new FormData();
|
const formData = new FormData();
|
||||||
formData.append('file', new Blob([input.audio]), input.filename);
|
formData.append('file', new Blob([input.audio]), input.filename);
|
||||||
formData.append('model', this.config.model);
|
formData.append('model', effectiveModel);
|
||||||
if (input.language) formData.append('language', input.language);
|
if (input.language) formData.append('language', input.language);
|
||||||
if (input.prompt) formData.append('prompt', input.prompt);
|
if (input.prompt) formData.append('prompt', input.prompt);
|
||||||
formData.append('response_format', 'verbose_json');
|
formData.append('response_format', 'verbose_json');
|
||||||
@ -68,7 +70,7 @@ export class OpenAITranscriptionProvider implements TranscriptionProvider {
|
|||||||
text: data.text,
|
text: data.text,
|
||||||
language: data.language ?? input.language ?? null,
|
language: data.language ?? input.language ?? null,
|
||||||
durationSeconds: data.duration ?? null,
|
durationSeconds: data.duration ?? null,
|
||||||
model: this.config.model,
|
model: effectiveModel,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,7 +9,8 @@ import type { TranscriptionProvider } from './types.js';
|
|||||||
import { MockTranscriptionProvider } from './providers/mock.js';
|
import { MockTranscriptionProvider } from './providers/mock.js';
|
||||||
import { OpenAITranscriptionProvider } from './providers/openai.js';
|
import { OpenAITranscriptionProvider } from './providers/openai.js';
|
||||||
import { AzureTranscriptionProvider } from './providers/azure.js';
|
import { AzureTranscriptionProvider } from './providers/azure.js';
|
||||||
import { createSTTProvider, _resetSTT, getSTT } from './factory.js';
|
import { createSTTProvider, _resetSTT, getSTT, setSTT } from './factory.js';
|
||||||
|
import { toAzureLocale } from './providers/azure.js';
|
||||||
|
|
||||||
// ── Schema validation tests ─────────────────────────────────────
|
// ── Schema validation tests ─────────────────────────────────────
|
||||||
|
|
||||||
@ -198,6 +199,36 @@ describe('OpenAITranscriptionProvider', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ── toAzureLocale tests ─────────────────────────────────────────
|
||||||
|
|
||||||
|
describe('toAzureLocale', () => {
|
||||||
|
it('maps common 2-letter codes to correct Azure locales', () => {
|
||||||
|
expect(toAzureLocale('en')).toBe('en-US');
|
||||||
|
expect(toAzureLocale('es')).toBe('es-ES');
|
||||||
|
expect(toAzureLocale('fr')).toBe('fr-FR');
|
||||||
|
expect(toAzureLocale('de')).toBe('de-DE');
|
||||||
|
expect(toAzureLocale('pt')).toBe('pt-BR');
|
||||||
|
expect(toAzureLocale('ja')).toBe('ja-JP');
|
||||||
|
expect(toAzureLocale('zh')).toBe('zh-CN');
|
||||||
|
expect(toAzureLocale('hi')).toBe('hi-IN');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('passes through full BCP-47 locales unchanged', () => {
|
||||||
|
expect(toAzureLocale('en-US')).toBe('en-US');
|
||||||
|
expect(toAzureLocale('pt-BR')).toBe('pt-BR');
|
||||||
|
expect(toAzureLocale('zh-TW')).toBe('zh-TW');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('falls back to code-CODE for unmapped languages', () => {
|
||||||
|
expect(toAzureLocale('xx')).toBe('xx-XX');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('is case-insensitive for 2-letter codes', () => {
|
||||||
|
expect(toAzureLocale('EN')).toBe('en-US');
|
||||||
|
expect(toAzureLocale('Es')).toBe('es-ES');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
// ── AzureTranscriptionProvider tests ────────────────────────────
|
// ── AzureTranscriptionProvider tests ────────────────────────────
|
||||||
|
|
||||||
describe('AzureTranscriptionProvider', () => {
|
describe('AzureTranscriptionProvider', () => {
|
||||||
@ -305,6 +336,14 @@ describe('STT factory', () => {
|
|||||||
expect(p1).toBe(p2);
|
expect(p1).toBe(p2);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('setSTT overrides the singleton', () => {
|
||||||
|
vi.stubEnv('STT_PROVIDER', 'openai');
|
||||||
|
_resetSTT();
|
||||||
|
const mock = new MockTranscriptionProvider();
|
||||||
|
setSTT(mock);
|
||||||
|
expect(getSTT()).toBe(mock);
|
||||||
|
});
|
||||||
|
|
||||||
it('TranscriptionProvider interface is satisfied by all providers', () => {
|
it('TranscriptionProvider interface is satisfied by all providers', () => {
|
||||||
const providers: TranscriptionProvider[] = [
|
const providers: TranscriptionProvider[] = [
|
||||||
new MockTranscriptionProvider(),
|
new MockTranscriptionProvider(),
|
||||||
@ -317,3 +356,26 @@ describe('STT factory', () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ── Model passthrough tests ─────────────────────────────────────
|
||||||
|
|
||||||
|
describe('model passthrough', () => {
|
||||||
|
it('MockTranscriptionProvider ignores model (returns mock-stt)', async () => {
|
||||||
|
const provider = new MockTranscriptionProvider();
|
||||||
|
const result = await provider.transcribe({
|
||||||
|
audio: new ArrayBuffer(100),
|
||||||
|
filename: 'a.wav',
|
||||||
|
model: 'whisper-large-v3',
|
||||||
|
});
|
||||||
|
expect(result.model).toBe('mock-stt');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('TranscriptionInput accepts optional model field', () => {
|
||||||
|
const input = {
|
||||||
|
audio: new ArrayBuffer(10),
|
||||||
|
filename: 'a.mp3',
|
||||||
|
model: 'whisper-1',
|
||||||
|
};
|
||||||
|
expect(input.model).toBe('whisper-1');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|||||||
@ -13,6 +13,7 @@ import { BadRequestError } from '@bytelyst/errors';
|
|||||||
import {
|
import {
|
||||||
TranscribeRequestSchema,
|
TranscribeRequestSchema,
|
||||||
MAX_AUDIO_SIZE_BYTES,
|
MAX_AUDIO_SIZE_BYTES,
|
||||||
|
SUPPORTED_AUDIO_TYPES,
|
||||||
deriveFilename,
|
deriveFilename,
|
||||||
type TranscribeResponse,
|
type TranscribeResponse,
|
||||||
} from './types.js';
|
} from './types.js';
|
||||||
@ -43,7 +44,7 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const { audioUrl, language, prompt, productId } = parsed.data;
|
const { audioUrl, model, language, prompt, productId } = parsed.data;
|
||||||
const requestId = req.headers['x-request-id'] as string | undefined;
|
const requestId = req.headers['x-request-id'] as string | undefined;
|
||||||
|
|
||||||
req.log.info(
|
req.log.info(
|
||||||
@ -93,10 +94,24 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
|
|||||||
throw new BadRequestError(`Failed to download audio: ${message}`);
|
throw new BadRequestError(`Failed to download audio: ${message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Step 2: Determine filename from URL or content-type ──────
|
// ── Step 2: Validate audio content-type ─────────────────────
|
||||||
|
if (contentType) {
|
||||||
|
const baseType = contentType.split(';')[0].trim().toLowerCase();
|
||||||
|
if (
|
||||||
|
baseType &&
|
||||||
|
!SUPPORTED_AUDIO_TYPES.has(baseType) &&
|
||||||
|
!baseType.startsWith('application/octet-stream')
|
||||||
|
) {
|
||||||
|
throw new BadRequestError(
|
||||||
|
`Unsupported audio format: ${baseType}. Supported: ${[...SUPPORTED_AUDIO_TYPES].join(', ')}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Step 3: Determine filename from URL or content-type ──────
|
||||||
const filename = deriveFilename(audioUrl, contentType);
|
const filename = deriveFilename(audioUrl, contentType);
|
||||||
|
|
||||||
// ── Step 3: Delegate to provider ─────────────────────────────
|
// ── Step 4: Delegate to provider ─────────────────────────────
|
||||||
let result: TranscribeResponse;
|
let result: TranscribeResponse;
|
||||||
try {
|
try {
|
||||||
const sttResult = await provider.transcribe({
|
const sttResult = await provider.transcribe({
|
||||||
@ -104,6 +119,7 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
|
|||||||
filename,
|
filename,
|
||||||
language,
|
language,
|
||||||
prompt,
|
prompt,
|
||||||
|
model,
|
||||||
});
|
});
|
||||||
|
|
||||||
const durationMs = Date.now() - startMs;
|
const durationMs = Date.now() - startMs;
|
||||||
|
|||||||
@ -12,6 +12,8 @@ export interface TranscriptionInput {
|
|||||||
language?: string;
|
language?: string;
|
||||||
/** Prompt to guide transcription style. */
|
/** Prompt to guide transcription style. */
|
||||||
prompt?: string;
|
prompt?: string;
|
||||||
|
/** Model override (provider-specific, e.g. 'whisper-1' for OpenAI). */
|
||||||
|
model?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Result from a transcription provider. */
|
/** Result from a transcription provider. */
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user