refactor(extraction-service): provider-agnostic transcription — OpenAI + Azure Speech + Mock
- TranscriptionProvider interface with transcribe() + isConfigured() - OpenAITranscriptionProvider: Whisper API (existing behavior) - AzureTranscriptionProvider: Azure Speech REST API for short audio - MockTranscriptionProvider: deterministic results for testing - Factory: getSTT() singleton with env-driven auto-detection - STT_PROVIDER=openai|azure|mock (explicit) - Auto-detect: AZURE_SPEECH_KEY → azure, OPENAI_API_KEY → openai, else mock - Config: add STT_PROVIDER, AZURE_SPEECH_KEY, AZURE_SPEECH_REGION env vars - Route refactored: audio download (common) → provider.transcribe() (swappable) - deriveFilename() extracted to types.ts (shared by route + providers) - 35 transcription tests (was 12), 171 total passing - Follows same pattern as @bytelyst/llm provider abstraction
This commit is contained in:
parent
f8e15880d2
commit
a77b3ff931
@ -19,9 +19,12 @@ const envSchema = z.object({
|
||||
EXTRACTION_QUEUE_FILE: z.string().optional(),
|
||||
EXTRACTION_QUEUE_POLL_MS: z.coerce.number().default(100),
|
||||
EXTRACTION_QUEUE_LEASE_MS: z.coerce.number().default(30000),
|
||||
STT_PROVIDER: z.enum(['openai', 'azure', 'mock']).optional(),
|
||||
OPENAI_API_KEY: z.string().optional(),
|
||||
OPENAI_BASE_URL: z.string().default('https://api.openai.com/v1'),
|
||||
WHISPER_MODEL: z.string().default('whisper-1'),
|
||||
AZURE_SPEECH_KEY: z.string().optional(),
|
||||
AZURE_SPEECH_REGION: z.string().optional(),
|
||||
});
|
||||
|
||||
export const config = envSchema.parse(process.env);
|
||||
|
||||
@ -0,0 +1,72 @@
|
||||
/**
|
||||
* Transcription provider factory.
|
||||
*
|
||||
* Creates a TranscriptionProvider based on STT_PROVIDER env var.
|
||||
* Auto-detects Azure vs OpenAI from available credentials if not explicit.
|
||||
*
|
||||
* Priority: STT_PROVIDER > auto-detect (AZURE_SPEECH_KEY + AZURE_SPEECH_REGION → azure, OPENAI_API_KEY → openai, otherwise mock)
|
||||
*/
|
||||
|
||||
import { OpenAITranscriptionProvider } from './providers/openai.js';
|
||||
import { AzureTranscriptionProvider } from './providers/azure.js';
|
||||
import { MockTranscriptionProvider } from './providers/mock.js';
|
||||
import type { TranscriptionProvider, STTProviderType } from './types.js';
|
||||
|
||||
let _provider: TranscriptionProvider | null = null;
|
||||
|
||||
/**
 * Decide which STT provider to use from the process environment.
 *
 * An explicit `STT_PROVIDER` (compared case-insensitively) wins when it is one
 * of `openai`, `azure` or `mock`. Any other value — including unset or empty —
 * falls through to credential auto-detection: Azure when both
 * `AZURE_SPEECH_KEY` and `AZURE_SPEECH_REGION` are non-empty, then OpenAI when
 * `OPENAI_API_KEY` is non-empty, otherwise the mock provider.
 *
 * @returns The provider type to instantiate.
 */
function resolveProviderType(): STTProviderType {
  const env = process.env;
  const requested = (env.STT_PROVIDER || '').toLowerCase();

  switch (requested) {
    case 'openai':
    case 'azure':
    case 'mock':
      return requested;
    default:
      // Not a recognised explicit choice — infer from credentials below.
      break;
  }

  // Azure is only usable with both key and region, so require the pair.
  const hasAzureCredentials = Boolean(env.AZURE_SPEECH_KEY && env.AZURE_SPEECH_REGION);
  if (hasAzureCredentials) return 'azure';

  return env.OPENAI_API_KEY ? 'openai' : 'mock';
}
|
||||
|
||||
/**
 * Instantiate the transcription provider for an explicit provider type.
 *
 * Each provider is constructed without arguments, so it picks up its own
 * settings from the environment.
 *
 * @param type - Which provider implementation to build.
 * @returns A new provider instance (not cached — see `getSTT` for the singleton).
 * @throws Error when `type` is not one of `openai`, `azure` or `mock`; only
 *   reachable when a caller bypasses the type system.
 */
export function createSTTProvider(type: STTProviderType): TranscriptionProvider {
  if (type === 'openai') return new OpenAITranscriptionProvider();
  if (type === 'azure') return new AzureTranscriptionProvider();
  if (type === 'mock') return new MockTranscriptionProvider();

  throw new Error(`Unknown STT_PROVIDER: '${type}'. Valid: openai, azure, mock`);
}
|
||||
|
||||
/**
 * Return the process-wide transcription provider, creating it on first use.
 *
 * The provider type is resolved from the environment only once; later calls
 * return the cached instance until it is replaced via `setSTT` or cleared via
 * `_resetSTT`.
 *
 * @returns The shared provider instance.
 */
export function getSTT(): TranscriptionProvider {
  if (_provider) return _provider;

  _provider = createSTTProvider(resolveProviderType());
  return _provider;
}
|
||||
|
||||
/**
 * Replace the cached transcription provider (intended for tests).
 *
 * Subsequent `getSTT()` calls return `provider` instead of resolving one from
 * the environment.
 *
 * @param provider - The instance to install as the singleton.
 */
export function setSTT(provider: TranscriptionProvider): void {
  _provider = provider;
}
|
||||
|
||||
/**
 * @internal Clear the cached provider (intended for tests).
 *
 * The next `getSTT()` call re-resolves the provider type from the environment
 * and builds a fresh instance.
 */
export function _resetSTT(): void {
  _provider = null;
}
|
||||
@ -0,0 +1,119 @@
|
||||
/**
|
||||
* Azure Speech Services transcription provider.
|
||||
*
|
||||
* Uses the Speech to text REST API for short audio (≤60s).
|
||||
* Endpoint: https://<REGION>.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1
|
||||
* Auth: Ocp-Apim-Subscription-Key header.
|
||||
*
|
||||
* Requires AZURE_SPEECH_KEY + AZURE_SPEECH_REGION env vars.
|
||||
*/
|
||||
|
||||
import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
|
||||
|
||||
/** Credentials and region needed to call the Azure Speech REST endpoint. */
export interface AzureSTTConfig {
  /** Subscription key, sent as the `Ocp-Apim-Subscription-Key` request header. */
  speechKey: string;
  /** Azure region (e.g. `eastus`); used as the host-name prefix of the endpoint. */
  speechRegion: string;
}
|
||||
|
||||
/**
 * Choose the `Content-Type` header value for an audio upload.
 *
 * The decision is based on the text after the last `.` in `filename`,
 * compared case-insensitively. A name without a dot is treated as if the whole
 * name were the extension. Unrecognised extensions fall back to `audio/wav`.
 *
 * @param filename - Name of the audio file, ideally with an extension.
 * @returns The content-type string to send with the audio bytes.
 */
function audioContentType(filename: string): string {
  const extension = filename.split('.').pop()?.toLowerCase();

  if (extension === 'wav') return 'audio/wav; codecs=audio/pcm; samplerate=16000';
  if (extension === 'ogg') return 'audio/ogg; codecs=opus';
  if (extension === 'webm') return 'audio/webm; codecs=opus';
  if (extension === 'flac') return 'audio/flac';
  if (extension === 'mp3' || extension === 'mpeg' || extension === 'mpga') return 'audio/mpeg';
  if (extension === 'm4a' || extension === 'mp4') return 'audio/mp4';

  return 'audio/wav';
}
|
||||
|
||||
/**
 * Transcription provider backed by the Azure Speech REST API for short audio.
 *
 * Credentials come from the constructor argument when given, otherwise from
 * the `AZURE_SPEECH_KEY` / `AZURE_SPEECH_REGION` environment variables.
 */
export class AzureTranscriptionProvider implements TranscriptionProvider {
  /**
   * Default locale for bare language codes whose doubled form (`en` → `en-EN`)
   * is not a real locale. Codes absent from this table keep the doubling
   * heuristic, which is correct for e.g. `es-ES`, `fr-FR`, `de-DE`, `it-IT`.
   */
  private static readonly DEFAULT_LOCALES: ReadonlyMap<string, string> = new Map([
    ['en', 'en-US'],
    ['ja', 'ja-JP'],
    ['zh', 'zh-CN'],
    ['ko', 'ko-KR'],
    ['hi', 'hi-IN'],
    ['ar', 'ar-SA'],
    ['he', 'he-IL'],
    ['da', 'da-DK'],
    ['sv', 'sv-SE'],
    ['cs', 'cs-CZ'],
    ['el', 'el-GR'],
    ['uk', 'uk-UA'],
    ['vi', 'vi-VN'],
    ['nb', 'nb-NO'],
    ['fil', 'fil-PH'],
  ]);

  private config: AzureSTTConfig;

  /**
   * @param config - Optional overrides; any field left undefined falls back to
   *   the matching environment variable, then to an empty string.
   */
  constructor(config?: Partial<AzureSTTConfig>) {
    this.config = {
      speechKey: config?.speechKey ?? process.env.AZURE_SPEECH_KEY ?? '',
      speechRegion: config?.speechRegion ?? process.env.AZURE_SPEECH_REGION ?? '',
    };
  }

  /** @returns `true` only when both the key and the region are non-empty. */
  isConfigured(): boolean {
    return Boolean(this.config.speechKey && this.config.speechRegion);
  }

  /**
   * Expand a language hint into the full locale Azure expects.
   *
   * Values longer than three characters are assumed to already be locales and
   * are returned untouched. Shorter codes are lower-cased and looked up in
   * `DEFAULT_LOCALES`; unknown codes fall back to `xx-XX`.
   */
  private static toLocale(language: string): string {
    if (language.length > 3) return language;

    const code = language.toLowerCase();
    return AzureTranscriptionProvider.DEFAULT_LOCALES.get(code) ?? `${code}-${code.toUpperCase()}`;
  }

  /**
   * Send the audio to Azure Speech and return the recognised text.
   *
   * @param input - Audio bytes, filename (its extension selects the
   *   Content-Type) and an optional language hint. `input.prompt` is not used
   *   by this provider.
   * @returns Text, the locale that was requested, audio duration in seconds
   *   (or `null` when Azure reports none) and a model label containing the region.
   * @throws Error when the provider is not configured, when the HTTP response
   *   is not OK (message contains the status code and body), or when
   *   `RecognitionStatus` is anything other than `Success`.
   */
  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
    if (!this.isConfigured()) {
      throw new Error(
        'Azure Speech is not configured (missing AZURE_SPEECH_KEY or AZURE_SPEECH_REGION)'
      );
    }

    const { speechKey, speechRegion } = this.config;

    // Azure expects a full locale (e.g. "en-US"), not just ISO 639-1 "en".
    const locale = AzureTranscriptionProvider.toLocale(input.language || 'en-US');

    const url = new URL(
      `https://${speechRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1`
    );
    url.searchParams.set('language', locale);
    url.searchParams.set('format', 'detailed');

    const contentType = audioContentType(input.filename);

    const response = await fetch(url.toString(), {
      method: 'POST',
      headers: {
        'Ocp-Apim-Subscription-Key': speechKey,
        'Content-Type': contentType,
        Accept: 'application/json',
      },
      body: input.audio,
      signal: AbortSignal.timeout(120_000),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Azure Speech error ${response.status}: ${errorText}`);
    }

    const data = (await response.json()) as {
      RecognitionStatus: string;
      DisplayText?: string;
      Duration?: number;
      NBest?: Array<{
        Display: string;
        Confidence: number;
      }>;
    };

    if (data.RecognitionStatus !== 'Success') {
      throw new Error(`Azure Speech recognition failed: ${data.RecognitionStatus}`);
    }

    // Duration is in 100-nanosecond ticks. Use a nullish check so a reported
    // duration of 0 is passed through as 0 rather than collapsed to null.
    const durationSeconds = data.Duration != null ? data.Duration / 10_000_000 : null;

    // Use NBest[0] if available, fall back to DisplayText.
    const text = data.NBest?.[0]?.Display ?? data.DisplayText ?? '';

    return {
      text,
      language: locale,
      durationSeconds,
      model: `azure-speech-${speechRegion}`,
    };
  }
}
|
||||
@ -0,0 +1,28 @@
|
||||
/**
|
||||
* Mock transcription provider for testing.
|
||||
*
|
||||
* Returns deterministic transcription results without calling any external API.
|
||||
* Always reports as configured.
|
||||
*/
|
||||
|
||||
import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
|
||||
|
||||
/**
 * In-memory stand-in for a real speech-to-text backend.
 *
 * Produces a result derived purely from the input's filename and byte length,
 * so the same input always yields the same output.
 */
export class MockTranscriptionProvider implements TranscriptionProvider {
  /** The mock needs no credentials, so it always reports as configured. */
  isConfigured(): boolean {
    return true;
  }

  /**
   * Fabricate a transcription after a short artificial delay.
   *
   * @param input - Audio bytes and filename; `language` is echoed back when set.
   * @returns A placeholder text naming the file and its size in KB, the given
   *   language (default `en`), a duration of one second per 16000 bytes
   *   (minimum 1), and the model label `mock-stt`.
   */
  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
    // Simulate ~50ms processing time
    await new Promise(done => setTimeout(done, 50));

    const byteCount = input.audio.byteLength;
    const sizeInKB = Math.round(byteCount / 1024);
    const fakeDurationSeconds = Math.max(1, Math.round(byteCount / 16000));

    return {
      text: `[Mock transcription of ${input.filename} (${sizeInKB}KB)]`,
      language: input.language ?? 'en',
      durationSeconds: fakeDurationSeconds,
      model: 'mock-stt',
    };
  }
}
|
||||
@ -0,0 +1,74 @@
|
||||
/**
|
||||
* OpenAI Whisper transcription provider.
|
||||
*
|
||||
* Uses POST /v1/audio/transcriptions (multipart/form-data).
|
||||
* Requires OPENAI_API_KEY env var.
|
||||
*/
|
||||
|
||||
import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
|
||||
|
||||
/** Settings for the OpenAI Whisper transcription provider. */
export interface OpenAISTTConfig {
  /** API key, sent as a `Bearer` token in the `Authorization` header. */
  apiKey: string;
  /** API root the `/audio/transcriptions` path is appended to. */
  baseUrl?: string;
  /** Model name sent in the `model` form field. */
  model?: string;
}
|
||||
|
||||
/**
 * Transcription provider that uploads audio to an OpenAI-compatible
 * `/audio/transcriptions` endpoint as multipart form data.
 */
export class OpenAITranscriptionProvider implements TranscriptionProvider {
  private config: Required<OpenAISTTConfig>;

  /**
   * @param config - Optional overrides. Each undefined field falls back to its
   *   environment variable (`OPENAI_API_KEY`, `OPENAI_BASE_URL`,
   *   `WHISPER_MODEL`) and then to a built-in default.
   */
  constructor(config?: Partial<OpenAISTTConfig>) {
    const apiKey = config?.apiKey ?? process.env.OPENAI_API_KEY ?? '';
    const rawBaseUrl =
      config?.baseUrl ?? process.env.OPENAI_BASE_URL ?? 'https://api.openai.com/v1';
    const model = config?.model ?? process.env.WHISPER_MODEL ?? 'whisper-1';

    // Drop trailing slashes so the endpoint path can be appended verbatim.
    this.config = { apiKey, baseUrl: rawBaseUrl.replace(/\/+$/, ''), model };
  }

  /** @returns `true` when a non-empty API key is available. */
  isConfigured(): boolean {
    return !!this.config.apiKey;
  }

  /**
   * Upload the audio and return the transcription.
   *
   * @param input - Audio bytes, upload filename, and optional language/prompt
   *   hints (each is only sent when non-empty).
   * @returns Text, the language reported by the API (falling back to the hint,
   *   then `null`), the reported duration or `null`, and the configured model.
   * @throws Error when no API key is configured, or when the HTTP response is
   *   not OK (message contains the status code and response body).
   */
  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
    if (!this.isConfigured()) {
      throw new Error('OpenAI is not configured (missing OPENAI_API_KEY)');
    }

    const { apiKey, baseUrl, model } = this.config;

    const form = new FormData();
    form.append('file', new Blob([input.audio]), input.filename);
    form.append('model', model);
    if (input.language) {
      form.append('language', input.language);
    }
    if (input.prompt) {
      form.append('prompt', input.prompt);
    }
    // Always request verbose JSON so language and duration come back too.
    form.append('response_format', 'verbose_json');

    const endpoint = `${baseUrl}/audio/transcriptions`;
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { Authorization: `Bearer ${apiKey}` },
      body: form,
      signal: AbortSignal.timeout(120_000),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`OpenAI Whisper error ${response.status}: ${errorText}`);
    }

    const payload = (await response.json()) as {
      text: string;
      language?: string;
      duration?: number;
    };

    const language = payload.language ?? input.language ?? null;
    const durationSeconds = payload.duration ?? null;

    return { text: payload.text, language, durationSeconds, model };
  }
}
|
||||
@ -1,5 +1,15 @@
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
||||
import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, SUPPORTED_AUDIO_TYPES } from './types.js';
|
||||
import {
|
||||
TranscribeRequestSchema,
|
||||
MAX_AUDIO_SIZE_BYTES,
|
||||
SUPPORTED_AUDIO_TYPES,
|
||||
deriveFilename,
|
||||
} from './types.js';
|
||||
import type { TranscriptionProvider } from './types.js';
|
||||
import { MockTranscriptionProvider } from './providers/mock.js';
|
||||
import { OpenAITranscriptionProvider } from './providers/openai.js';
|
||||
import { AzureTranscriptionProvider } from './providers/azure.js';
|
||||
import { createSTTProvider, _resetSTT, getSTT } from './factory.js';
|
||||
|
||||
// ── Schema validation tests ─────────────────────────────────────
|
||||
|
||||
@ -91,34 +101,219 @@ describe('transcription constants', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ── Route integration tests (mocked fetch) ──────────────────────
|
||||
// ── deriveFilename tests ────────────────────────────────────────
|
||||
|
||||
describe('transcribe route', () => {
|
||||
const originalFetch = globalThis.fetch;
|
||||
describe('deriveFilename', () => {
|
||||
it('extracts filename from URL path', () => {
|
||||
expect(deriveFilename('https://blob.example.com/uploads/meeting.wav?sv=2024', null)).toBe(
|
||||
'meeting.wav'
|
||||
);
|
||||
});
|
||||
|
||||
it('extracts filename with various extensions', () => {
|
||||
expect(deriveFilename('https://x.com/a.mp3', null)).toBe('a.mp3');
|
||||
expect(deriveFilename('https://x.com/b.m4a', null)).toBe('b.m4a');
|
||||
expect(deriveFilename('https://x.com/c.webm', null)).toBe('c.webm');
|
||||
expect(deriveFilename('https://x.com/d.flac', null)).toBe('d.flac');
|
||||
});
|
||||
|
||||
it('falls back to content-type mapping', () => {
|
||||
expect(deriveFilename('https://x.com/blob123', 'audio/mpeg')).toBe('audio.mp3');
|
||||
expect(deriveFilename('https://x.com/blob123', 'audio/wav')).toBe('audio.wav');
|
||||
expect(deriveFilename('https://x.com/blob123', 'audio/webm')).toBe('audio.webm');
|
||||
expect(deriveFilename('https://x.com/blob123', 'audio/ogg')).toBe('audio.ogg');
|
||||
});
|
||||
|
||||
it('handles content-type with charset parameter', () => {
|
||||
expect(deriveFilename('https://x.com/blob', 'audio/mpeg; charset=utf-8')).toBe('audio.mp3');
|
||||
});
|
||||
|
||||
it('defaults to audio.mp3 when no match', () => {
|
||||
expect(deriveFilename('https://x.com/blob', null)).toBe('audio.mp3');
|
||||
expect(deriveFilename('https://x.com/blob', 'application/octet-stream')).toBe('audio.mp3');
|
||||
});
|
||||
});
|
||||
|
||||
// ── MockTranscriptionProvider tests ─────────────────────────────
|
||||
|
||||
describe('MockTranscriptionProvider', () => {
|
||||
it('is always configured', () => {
|
||||
const provider = new MockTranscriptionProvider();
|
||||
expect(provider.isConfigured()).toBe(true);
|
||||
});
|
||||
|
||||
it('returns deterministic transcription with filename and size', async () => {
|
||||
const provider = new MockTranscriptionProvider();
|
||||
const audio = new ArrayBuffer(16000);
|
||||
const result = await provider.transcribe({
|
||||
audio,
|
||||
filename: 'test.mp3',
|
||||
});
|
||||
|
||||
expect(result.text).toContain('Mock transcription');
|
||||
expect(result.text).toContain('test.mp3');
|
||||
expect(result.text).toContain('16KB');
|
||||
expect(result.model).toBe('mock-stt');
|
||||
expect(result.durationSeconds).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('uses provided language', async () => {
|
||||
const provider = new MockTranscriptionProvider();
|
||||
const result = await provider.transcribe({
|
||||
audio: new ArrayBuffer(100),
|
||||
filename: 'a.wav',
|
||||
language: 'es',
|
||||
});
|
||||
expect(result.language).toBe('es');
|
||||
});
|
||||
|
||||
it('defaults language to en', async () => {
|
||||
const provider = new MockTranscriptionProvider();
|
||||
const result = await provider.transcribe({
|
||||
audio: new ArrayBuffer(100),
|
||||
filename: 'a.wav',
|
||||
});
|
||||
expect(result.language).toBe('en');
|
||||
});
|
||||
});
|
||||
|
||||
// ── OpenAITranscriptionProvider tests ───────────────────────────
|
||||
|
||||
describe('OpenAITranscriptionProvider', () => {
|
||||
it('is not configured without OPENAI_API_KEY', () => {
|
||||
const provider = new OpenAITranscriptionProvider({ apiKey: '' });
|
||||
expect(provider.isConfigured()).toBe(false);
|
||||
});
|
||||
|
||||
it('is configured with apiKey', () => {
|
||||
const provider = new OpenAITranscriptionProvider({ apiKey: 'sk-test' });
|
||||
expect(provider.isConfigured()).toBe(true);
|
||||
});
|
||||
|
||||
it('throws when transcribing without config', async () => {
|
||||
const provider = new OpenAITranscriptionProvider({ apiKey: '' });
|
||||
await expect(
|
||||
provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.mp3' })
|
||||
).rejects.toThrow('not configured');
|
||||
});
|
||||
});
|
||||
|
||||
// ── AzureTranscriptionProvider tests ────────────────────────────
|
||||
|
||||
describe('AzureTranscriptionProvider', () => {
|
||||
it('is not configured without both key and region', () => {
|
||||
expect(new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' }).isConfigured()).toBe(
|
||||
false
|
||||
);
|
||||
expect(
|
||||
new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: '' }).isConfigured()
|
||||
).toBe(false);
|
||||
expect(
|
||||
new AzureTranscriptionProvider({ speechKey: '', speechRegion: 'eastus' }).isConfigured()
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it('is configured with both key and region', () => {
|
||||
const provider = new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' });
|
||||
expect(provider.isConfigured()).toBe(true);
|
||||
});
|
||||
|
||||
it('throws when transcribing without config', async () => {
|
||||
const provider = new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' });
|
||||
await expect(
|
||||
provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.wav' })
|
||||
).rejects.toThrow('not configured');
|
||||
});
|
||||
});
|
||||
|
||||
// ── Factory tests ───────────────────────────────────────────────
|
||||
|
||||
describe('STT factory', () => {
|
||||
beforeEach(() => {
|
||||
vi.stubEnv('OPENAI_API_KEY', 'test-key');
|
||||
_resetSTT();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
globalThis.fetch = originalFetch;
|
||||
_resetSTT();
|
||||
vi.unstubAllEnvs();
|
||||
});
|
||||
|
||||
it('deriveFilename extracts extension from URL path', async () => {
|
||||
// Import the module to test the helper indirectly via the route
|
||||
// The deriveFilename is not exported, but we can verify behavior through integration
|
||||
const { TranscribeRequestSchema } = await import('./types.js');
|
||||
const req = TranscribeRequestSchema.parse({
|
||||
audioUrl: 'https://blob.example.com/uploads/meeting.wav?sv=2024-01-01&sig=abc',
|
||||
});
|
||||
expect(req.audioUrl).toContain('meeting.wav');
|
||||
it('createSTTProvider returns OpenAI provider', () => {
|
||||
const provider = createSTTProvider('openai');
|
||||
expect(provider).toBeInstanceOf(OpenAITranscriptionProvider);
|
||||
});
|
||||
|
||||
it('schema defaults responseFormat to text', () => {
|
||||
const result = TranscribeRequestSchema.parse({
|
||||
audioUrl: 'https://example.com/a.mp3',
|
||||
});
|
||||
expect(result.responseFormat).toBe('text');
|
||||
it('createSTTProvider returns Azure provider', () => {
|
||||
const provider = createSTTProvider('azure');
|
||||
expect(provider).toBeInstanceOf(AzureTranscriptionProvider);
|
||||
});
|
||||
|
||||
it('createSTTProvider returns Mock provider', () => {
|
||||
const provider = createSTTProvider('mock');
|
||||
expect(provider).toBeInstanceOf(MockTranscriptionProvider);
|
||||
});
|
||||
|
||||
it('createSTTProvider throws for unknown type', () => {
|
||||
expect(() => createSTTProvider('invalid' as 'openai')).toThrow('Unknown STT_PROVIDER');
|
||||
});
|
||||
|
||||
it('getSTT auto-detects mock when no credentials', () => {
|
||||
vi.stubEnv('OPENAI_API_KEY', '');
|
||||
vi.stubEnv('AZURE_SPEECH_KEY', '');
|
||||
vi.stubEnv('AZURE_SPEECH_REGION', '');
|
||||
vi.stubEnv('STT_PROVIDER', '');
|
||||
_resetSTT();
|
||||
const provider = getSTT();
|
||||
expect(provider).toBeInstanceOf(MockTranscriptionProvider);
|
||||
});
|
||||
|
||||
it('getSTT auto-detects openai when OPENAI_API_KEY set', () => {
|
||||
vi.stubEnv('OPENAI_API_KEY', 'sk-test');
|
||||
vi.stubEnv('AZURE_SPEECH_KEY', '');
|
||||
vi.stubEnv('AZURE_SPEECH_REGION', '');
|
||||
vi.stubEnv('STT_PROVIDER', '');
|
||||
_resetSTT();
|
||||
const provider = getSTT();
|
||||
expect(provider).toBeInstanceOf(OpenAITranscriptionProvider);
|
||||
});
|
||||
|
||||
it('getSTT auto-detects azure when AZURE_SPEECH_KEY + REGION set', () => {
|
||||
vi.stubEnv('OPENAI_API_KEY', 'sk-test');
|
||||
vi.stubEnv('AZURE_SPEECH_KEY', 'my-key');
|
||||
vi.stubEnv('AZURE_SPEECH_REGION', 'eastus');
|
||||
vi.stubEnv('STT_PROVIDER', '');
|
||||
_resetSTT();
|
||||
const provider = getSTT();
|
||||
expect(provider).toBeInstanceOf(AzureTranscriptionProvider);
|
||||
});
|
||||
|
||||
it('getSTT respects explicit STT_PROVIDER over auto-detect', () => {
|
||||
vi.stubEnv('OPENAI_API_KEY', 'sk-test');
|
||||
vi.stubEnv('AZURE_SPEECH_KEY', 'my-key');
|
||||
vi.stubEnv('AZURE_SPEECH_REGION', 'eastus');
|
||||
vi.stubEnv('STT_PROVIDER', 'mock');
|
||||
_resetSTT();
|
||||
const provider = getSTT();
|
||||
expect(provider).toBeInstanceOf(MockTranscriptionProvider);
|
||||
});
|
||||
|
||||
it('getSTT returns singleton', () => {
|
||||
vi.stubEnv('STT_PROVIDER', 'mock');
|
||||
_resetSTT();
|
||||
const p1 = getSTT();
|
||||
const p2 = getSTT();
|
||||
expect(p1).toBe(p2);
|
||||
});
|
||||
|
||||
it('TranscriptionProvider interface is satisfied by all providers', () => {
|
||||
const providers: TranscriptionProvider[] = [
|
||||
new MockTranscriptionProvider(),
|
||||
new OpenAITranscriptionProvider({ apiKey: 'test' }),
|
||||
new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' }),
|
||||
];
|
||||
for (const p of providers) {
|
||||
expect(typeof p.transcribe).toBe('function');
|
||||
expect(typeof p.isConfigured).toBe('function');
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,16 +1,22 @@
|
||||
/**
|
||||
* Transcription routes — speech-to-text via OpenAI Whisper API.
|
||||
* Transcription routes — provider-agnostic speech-to-text.
|
||||
*
|
||||
* POST /transcribe — Download audio from URL, transcribe via Whisper.
|
||||
* POST /transcribe — Download audio from URL, transcribe via configured provider.
|
||||
* Product-agnostic: any product backend can call this endpoint.
|
||||
*
|
||||
* Requires OPENAI_API_KEY environment variable.
|
||||
* Provider selection: STT_PROVIDER env var (openai | azure | mock).
|
||||
* Auto-detects from available credentials if not explicit.
|
||||
*/
|
||||
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import { BadRequestError } from '@bytelyst/errors';
|
||||
import { config } from '../../lib/config.js';
|
||||
import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, type TranscribeResponse } from './types.js';
|
||||
import {
|
||||
TranscribeRequestSchema,
|
||||
MAX_AUDIO_SIZE_BYTES,
|
||||
deriveFilename,
|
||||
type TranscribeResponse,
|
||||
} from './types.js';
|
||||
import { getSTT } from './factory.js';
|
||||
|
||||
export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
|
||||
/**
|
||||
@ -18,9 +24,10 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
|
||||
*
|
||||
* Flow:
|
||||
* 1. Validate request (audioUrl, optional language/model/prompt)
|
||||
* 2. Download audio from the provided URL
|
||||
* 3. Send to OpenAI Whisper API as multipart/form-data
|
||||
* 4. Return transcribed text + metadata
|
||||
* 2. Check provider is configured
|
||||
* 3. Download audio from the provided URL
|
||||
* 4. Delegate to configured TranscriptionProvider
|
||||
* 5. Return transcribed text + metadata
|
||||
*/
|
||||
app.post('/transcribe', async (req, reply) => {
|
||||
const parsed = TranscribeRequestSchema.safeParse(req.body);
|
||||
@ -28,17 +35,19 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
|
||||
throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
|
||||
}
|
||||
|
||||
if (!config.OPENAI_API_KEY) {
|
||||
const provider = getSTT();
|
||||
|
||||
if (!provider.isConfigured()) {
|
||||
return reply.status(503).send({
|
||||
error: 'Transcription not available — OPENAI_API_KEY not configured',
|
||||
error: 'Transcription not available — no STT provider configured',
|
||||
});
|
||||
}
|
||||
|
||||
const { audioUrl, model, language, prompt, responseFormat, productId } = parsed.data;
|
||||
const { audioUrl, language, prompt, productId } = parsed.data;
|
||||
const requestId = req.headers['x-request-id'] as string | undefined;
|
||||
|
||||
req.log.info(
|
||||
{ audioUrl: audioUrl.substring(0, 80), language, model, productId },
|
||||
{ audioUrl: audioUrl.substring(0, 80), language, productId },
|
||||
'transcription request'
|
||||
);
|
||||
|
||||
@ -87,120 +96,56 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
|
||||
// ── Step 2: Determine filename from URL or content-type ──────
|
||||
const filename = deriveFilename(audioUrl, contentType);
|
||||
|
||||
// ── Step 3: Call OpenAI Whisper API ──────────────────────────
|
||||
const whisperModel = model || config.WHISPER_MODEL;
|
||||
const baseUrl = config.OPENAI_BASE_URL.replace(/\/+$/, '');
|
||||
|
||||
const formData = new FormData();
|
||||
formData.append('file', new Blob([audioBuffer]), filename);
|
||||
formData.append('model', whisperModel);
|
||||
if (language) formData.append('language', language);
|
||||
if (prompt) formData.append('prompt', prompt);
|
||||
formData.append('response_format', responseFormat === 'text' ? 'verbose_json' : responseFormat);
|
||||
|
||||
let whisperResponse: Response;
|
||||
// ── Step 3: Delegate to provider ─────────────────────────────
|
||||
let result: TranscribeResponse;
|
||||
try {
|
||||
whisperResponse = await fetch(`${baseUrl}/audio/transcriptions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
Authorization: `Bearer ${config.OPENAI_API_KEY}`,
|
||||
},
|
||||
body: formData,
|
||||
signal: AbortSignal.timeout(120_000),
|
||||
const sttResult = await provider.transcribe({
|
||||
audio: audioBuffer,
|
||||
filename,
|
||||
language,
|
||||
prompt,
|
||||
});
|
||||
|
||||
const durationMs = Date.now() - startMs;
|
||||
|
||||
req.log.info(
|
||||
{
|
||||
durationMs,
|
||||
model: sttResult.model,
|
||||
language: sttResult.language,
|
||||
audioSizeBytes: audioBuffer.byteLength,
|
||||
textLength: sttResult.text.length,
|
||||
productId,
|
||||
},
|
||||
'transcription complete'
|
||||
);
|
||||
|
||||
result = {
|
||||
text: sttResult.text,
|
||||
language: sttResult.language,
|
||||
durationSeconds: sttResult.durationSeconds,
|
||||
model: sttResult.model,
|
||||
durationMs,
|
||||
requestId,
|
||||
};
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : 'Whisper API call failed';
|
||||
req.log.error({ error: message }, 'whisper API failed');
|
||||
const message = err instanceof Error ? err.message : 'Transcription failed';
|
||||
req.log.error({ error: message }, 'transcription provider failed');
|
||||
|
||||
if (message.includes('429') || message.includes('rate limit')) {
|
||||
return reply.status(429).send({
|
||||
error: 'Transcription rate limit exceeded',
|
||||
detail: message,
|
||||
retryAfter: 60,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(502).send({
|
||||
error: 'Transcription failed',
|
||||
detail: message,
|
||||
});
|
||||
}
|
||||
|
||||
if (!whisperResponse.ok) {
|
||||
const errorText = await whisperResponse.text();
|
||||
req.log.error({ status: whisperResponse.status, error: errorText }, 'whisper API error');
|
||||
|
||||
if (whisperResponse.status === 429) {
|
||||
return reply.status(429).send({
|
||||
error: 'Transcription rate limit exceeded',
|
||||
detail: errorText,
|
||||
retryAfter: 60,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(502).send({
|
||||
error: `Whisper API error ${whisperResponse.status}`,
|
||||
detail: errorText,
|
||||
});
|
||||
}
|
||||
|
||||
// ── Step 4: Parse response ───────────────────────────────────
|
||||
const data = (await whisperResponse.json()) as {
|
||||
text: string;
|
||||
language?: string;
|
||||
duration?: number;
|
||||
};
|
||||
|
||||
const durationMs = Date.now() - startMs;
|
||||
|
||||
req.log.info(
|
||||
{
|
||||
durationMs,
|
||||
model: whisperModel,
|
||||
language: data.language,
|
||||
audioSizeBytes: audioBuffer.byteLength,
|
||||
textLength: data.text.length,
|
||||
productId,
|
||||
},
|
||||
'transcription complete'
|
||||
);
|
||||
|
||||
const result: TranscribeResponse = {
|
||||
text: data.text,
|
||||
language: data.language ?? language ?? null,
|
||||
durationSeconds: data.duration ?? null,
|
||||
model: whisperModel,
|
||||
durationMs,
|
||||
requestId,
|
||||
};
|
||||
|
||||
return reply.send(result);
|
||||
});
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Work out a filename (with extension) for downloaded audio.
 *
 * Resolution order:
 *   1. the last path segment of `url`, when it ends in a known audio extension;
 *   2. a generic `audio.<ext>` name chosen from the response content type;
 *   3. `audio.mp3`.
 *
 * @param url - Location the audio was downloaded from; need not be a valid URL.
 * @param contentType - `Content-Type` header of the download, or `null`.
 *   Parameters after `;` are ignored and matching is case-insensitive.
 * @returns A filename that always carries an extension.
 */
function deriveFilename(url: string, contentType: string | null): string {
  // Try to extract the filename from the URL path
  try {
    const lastSegment = new URL(url).pathname.split('/').pop();
    if (lastSegment && /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i.test(lastSegment)) {
      return lastSegment;
    }
  } catch {
    // URL parsing failed — use fallback
  }

  // A Map (not an object literal) so that header text such as "constructor"
  // or "__proto__" cannot resolve to an Object.prototype member and leak a
  // non-string result.
  const filenameByType = new Map<string, string>([
    ['audio/mpeg', 'audio.mp3'],
    ['audio/mp4', 'audio.m4a'],
    ['audio/mp4a-latm', 'audio.m4a'],
    ['audio/x-m4a', 'audio.m4a'],
    ['audio/wav', 'audio.wav'],
    ['audio/x-wav', 'audio.wav'],
    ['audio/webm', 'audio.webm'],
    ['audio/ogg', 'audio.ogg'],
    ['audio/flac', 'audio.flac'],
    ['video/mp4', 'audio.mp4'],
  ]);

  if (contentType) {
    const base = (contentType.split(';')[0] ?? '').trim().toLowerCase();
    const mapped = filenameByType.get(base);
    if (mapped) return mapped;
  }

  return 'audio.mp3';
}
|
||||
|
||||
@ -1,5 +1,41 @@
|
||||
import { z } from 'zod';
|
||||
|
||||
// ── Provider interface ──────────────────────────────────────
|
||||
|
||||
/** Everything a transcription provider needs to process one audio clip. */
export interface TranscriptionInput {
  /** Raw audio bytes, exactly as downloaded. */
  audio: ArrayBuffer;
  /** Filename with extension (used for content-type inference and as the upload name). */
  filename: string;
  /** ISO 639-1 language hint; providers choose their own default when omitted. */
  language?: string;
  /** Prompt to guide transcription style; providers may ignore it. */
  prompt?: string;
}
|
||||
|
||||
/** Provider-independent outcome of a transcription. */
export interface TranscriptionResult {
  /** The transcribed text. */
  text: string;
  /** Detected or specified language code, or `null` when neither is known. */
  language: string | null;
  /** Duration of the audio in seconds, or `null` when the provider reports none. */
  durationSeconds: number | null;
  /** Provider-specific model identifier. */
  model: string;
}
|
||||
|
||||
/** Cloud-agnostic speech-to-text provider contract. */
export interface TranscriptionProvider {
  /**
   * Transcribe audio to text.
   *
   * @param input - The audio clip plus optional hints.
   * @returns The transcription; the promise rejects when the provider fails.
   */
  transcribe(input: TranscriptionInput): Promise<TranscriptionResult>;
  /** Check whether the provider has the credentials it needs to run. */
  isConfigured(): boolean;
}
|
||||
|
||||
/** Identifiers of the available speech-to-text provider implementations. */
export type STTProviderType = 'openai' | 'azure' | 'mock';
|
||||
|
||||
// ── Transcription request schema ────────────────────────────────
|
||||
|
||||
export const TranscribeRequestSchema = z.object({
|
||||
@ -53,3 +89,37 @@ export const SUPPORTED_AUDIO_TYPES = new Set([
|
||||
|
||||
/** Max audio file size: 25 MB (OpenAI Whisper limit). */
|
||||
export const MAX_AUDIO_SIZE_BYTES = 25 * 1024 * 1024;
|
||||
|
||||
// ── Filename helper (shared by route + providers) ─────────────
|
||||
|
||||
/** Matches a path segment that ends in a recognised audio file extension. */
const AUDIO_FILENAME_PATTERN = /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i;

/**
 * Generic filename to use for each known content type.
 *
 * Kept in a Map rather than an object literal so that header text such as
 * "constructor" or "__proto__" cannot resolve to an Object.prototype member.
 */
const FILENAME_BY_CONTENT_TYPE: ReadonlyMap<string, string> = new Map([
  ['audio/mpeg', 'audio.mp3'],
  ['audio/mp4', 'audio.m4a'],
  ['audio/mp4a-latm', 'audio.m4a'],
  ['audio/x-m4a', 'audio.m4a'],
  ['audio/wav', 'audio.wav'],
  ['audio/x-wav', 'audio.wav'],
  ['audio/webm', 'audio.webm'],
  ['audio/ogg', 'audio.ogg'],
  ['audio/flac', 'audio.flac'],
  ['video/mp4', 'audio.mp4'],
]);

/**
 * Work out a filename (with extension) for downloaded audio.
 *
 * Resolution order:
 *   1. the last path segment of `url`, when it ends in a known audio extension;
 *   2. a generic `audio.<ext>` name chosen from the response content type;
 *   3. `audio.mp3`.
 *
 * @param url - Location the audio was downloaded from; need not be a valid URL.
 * @param contentType - `Content-Type` header of the download, or `null`.
 *   Parameters after `;` are ignored and matching is case-insensitive.
 * @returns A filename that always carries an extension.
 */
export function deriveFilename(url: string, contentType: string | null): string {
  try {
    const lastSegment = new URL(url).pathname.split('/').pop();
    if (lastSegment && AUDIO_FILENAME_PATTERN.test(lastSegment)) {
      return lastSegment;
    }
  } catch {
    // URL parsing failed — use fallback
  }

  if (contentType) {
    const base = (contentType.split(';')[0] ?? '').trim().toLowerCase();
    const mapped = FILENAME_BY_CONTENT_TYPE.get(base);
    if (mapped) return mapped;
  }

  return 'audio.mp3';
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user