refactor(extraction-service): provider-agnostic transcription — OpenAI + Azure Speech + Mock

- TranscriptionProvider interface with transcribe() + isConfigured()
- OpenAITranscriptionProvider: Whisper API (existing behavior)
- AzureTranscriptionProvider: Azure Speech REST API for short audio
- MockTranscriptionProvider: deterministic results for testing
- Factory: getSTT() singleton with env-driven auto-detection
  - STT_PROVIDER=openai|azure|mock (explicit)
  - Auto-detect: AZURE_SPEECH_KEY → azure, OPENAI_API_KEY → openai, else mock
- Config: add STT_PROVIDER, AZURE_SPEECH_KEY, AZURE_SPEECH_REGION env vars
- Route refactored: audio download (common) → provider.transcribe() (swappable)
- deriveFilename() extracted to types.ts (shared by route + providers)
- 35 transcription tests (was 12), 171 total passing
- Follows same pattern as @bytelyst/llm provider abstraction
This commit is contained in:
saravanakumardb1 2026-04-06 11:30:22 -07:00
parent f8e15880d2
commit a77b3ff931
8 changed files with 642 additions and 136 deletions

View File

@ -19,9 +19,12 @@ const envSchema = z.object({
EXTRACTION_QUEUE_FILE: z.string().optional(),
EXTRACTION_QUEUE_POLL_MS: z.coerce.number().default(100),
EXTRACTION_QUEUE_LEASE_MS: z.coerce.number().default(30000),
STT_PROVIDER: z.enum(['openai', 'azure', 'mock']).optional(),
OPENAI_API_KEY: z.string().optional(),
OPENAI_BASE_URL: z.string().default('https://api.openai.com/v1'),
WHISPER_MODEL: z.string().default('whisper-1'),
AZURE_SPEECH_KEY: z.string().optional(),
AZURE_SPEECH_REGION: z.string().optional(),
});
export const config = envSchema.parse(process.env);

View File

@ -0,0 +1,72 @@
/**
* Transcription provider factory.
*
* Creates a TranscriptionProvider based on STT_PROVIDER env var.
* Auto-detects Azure vs OpenAI from available credentials if not explicit.
*
* Priority: STT_PROVIDER > auto-detect (AZURE_SPEECH_KEY + AZURE_SPEECH_REGION → azure, OPENAI_API_KEY → openai, otherwise mock)
*/
import { OpenAITranscriptionProvider } from './providers/openai.js';
import { AzureTranscriptionProvider } from './providers/azure.js';
import { MockTranscriptionProvider } from './providers/mock.js';
import type { TranscriptionProvider, STTProviderType } from './types.js';
let _provider: TranscriptionProvider | null = null;
/**
 * Resolve which provider type to use from environment variables.
 *
 * An explicit STT_PROVIDER wins; it is matched case-insensitively and
 * surrounding whitespace is ignored. When STT_PROVIDER is unset or blank the
 * type is auto-detected from credentials: Azure when both AZURE_SPEECH_KEY and
 * AZURE_SPEECH_REGION are set, then OpenAI when OPENAI_API_KEY is set,
 * otherwise the mock provider.
 *
 * @returns The provider type to instantiate.
 * @throws Error when STT_PROVIDER is set to anything other than
 *   openai, azure or mock. Failing loudly prevents a typo from silently
 *   selecting a different provider (possibly the mock).
 */
function resolveProviderType(): STTProviderType {
  const explicit = (process.env.STT_PROVIDER ?? '').trim().toLowerCase();
  if (explicit !== '') {
    if (explicit === 'openai' || explicit === 'azure' || explicit === 'mock') {
      return explicit;
    }
    throw new Error(`Unknown STT_PROVIDER: '${explicit}'. Valid: openai, azure, mock`);
  }

  // Auto-detect from available credentials
  if (process.env.AZURE_SPEECH_KEY && process.env.AZURE_SPEECH_REGION) return 'azure';
  if (process.env.OPENAI_API_KEY) return 'openai';
  return 'mock';
}
/**
 * Build a new transcription provider instance for the given type.
 *
 * Each provider is constructed with no arguments, so it picks up its own
 * settings from the environment.
 *
 * @param type - Which provider implementation to instantiate.
 * @returns A freshly constructed provider (never the cached singleton).
 * @throws Error when `type` is not one of openai, azure or mock.
 */
export function createSTTProvider(type: STTProviderType): TranscriptionProvider {
  if (type === 'openai') {
    return new OpenAITranscriptionProvider();
  }
  if (type === 'azure') {
    return new AzureTranscriptionProvider();
  }
  if (type === 'mock') {
    return new MockTranscriptionProvider();
  }
  // Reachable only when a caller bypasses the type system.
  throw new Error(`Unknown STT_PROVIDER: '${type}'. Valid: openai, azure, mock`);
}
/**
 * Return the shared transcription provider, creating it on first use.
 *
 * The provider type is resolved from the environment only when no instance
 * is cached yet; later calls return the same object.
 */
export function getSTT(): TranscriptionProvider {
  if (_provider) {
    return _provider;
  }
  const created = createSTTProvider(resolveProviderType());
  _provider = created;
  return created;
}
/**
* Replace the singleton transcription provider (for testing).
*
* The given instance is stored in the module-level cache, so getSTT()
* returns it until it is replaced again or cleared with _resetSTT().
*
* @param provider - Provider instance to install as the singleton.
*/
export function setSTT(provider: TranscriptionProvider): void {
_provider = provider;
}
/**
* @internal Reset singleton (for testing).
*
* Clears the cached provider so the next getSTT() call resolves the provider
* type from the environment again and constructs a new instance.
*/
export function _resetSTT(): void {
_provider = null;
}

View File

@ -0,0 +1,119 @@
/**
* Azure Speech Services transcription provider.
*
* Uses the Speech to text REST API for short audio (up to 60 seconds).
* Endpoint: https://<REGION>.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1
* Auth: Ocp-Apim-Subscription-Key header.
*
* Requires AZURE_SPEECH_KEY + AZURE_SPEECH_REGION env vars.
*/
import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
/** Credentials and region used by the Azure transcription provider. */
export interface AzureSTTConfig {
/** Subscription key; sent as the Ocp-Apim-Subscription-Key request header. */
speechKey: string;
/** Azure region; used as the subdomain of the stt.speech.microsoft.com endpoint. */
speechRegion: string;
}
/** Content-Type header values keyed by lower-case filename extension. */
const CONTENT_TYPE_BY_EXTENSION = new Map<string, string>([
  ['wav', 'audio/wav; codecs=audio/pcm; samplerate=16000'],
  ['ogg', 'audio/ogg; codecs=opus'],
  ['mp3', 'audio/mpeg'],
  ['mpeg', 'audio/mpeg'],
  ['mpga', 'audio/mpeg'],
  ['webm', 'audio/webm; codecs=opus'],
  ['flac', 'audio/flac'],
  ['m4a', 'audio/mp4'],
  ['mp4', 'audio/mp4'],
]);

/**
 * Pick the Content-Type header to send to Azure for an audio file.
 *
 * The decision is based solely on the text after the last `.` in the
 * filename, compared case-insensitively.
 *
 * @param filename - Name of the audio file, ideally with an extension.
 * @returns The mapped Content-Type, or `audio/wav` when the extension is
 *   missing or not recognized.
 */
function audioContentType(filename: string): string {
  const extension = filename.split('.').pop()?.toLowerCase();
  if (extension === undefined) {
    return 'audio/wav';
  }
  return CONTENT_TYPE_BY_EXTENSION.get(extension) ?? 'audio/wav';
}
/**
 * Language codes whose default Azure locale is NOT simply "<code>-<CODE>".
 * Codes absent from this table (es, fr, de, it, pt, ...) keep the doubled form.
 */
const AZURE_LOCALE_OVERRIDES: ReadonlyMap<string, string> = new Map([
  ['en', 'en-US'],
  ['ja', 'ja-JP'],
  ['ko', 'ko-KR'],
  ['zh', 'zh-CN'],
  ['ar', 'ar-SA'],
  ['hi', 'hi-IN'],
  ['sv', 'sv-SE'],
  ['da', 'da-DK'],
  ['nb', 'nb-NO'],
  ['cs', 'cs-CZ'],
  ['el', 'el-GR'],
  ['he', 'he-IL'],
  ['uk', 'uk-UA'],
  ['vi', 'vi-VN'],
]);

/**
 * Convert a language hint into the full locale Azure expects.
 *
 * @param language - Language hint; may be a bare code ("en") or a locale ("pt-BR").
 * @returns "en-US" when no hint is given; the hint unchanged when it is longer
 *   than three characters (treated as an already-complete locale); otherwise
 *   the override for that code, or "<code>-<CODE>" when there is none.
 */
function toAzureLocale(language: string | undefined): string {
  if (!language) return 'en-US';
  if (language.length > 3) return language;
  const code = language.toLowerCase();
  return AZURE_LOCALE_OVERRIDES.get(code) ?? `${code}-${code.toUpperCase()}`;
}

/**
 * Transcription provider that posts raw audio bytes to the Azure Speech
 * recognition endpoint of the configured region.
 */
export class AzureTranscriptionProvider implements TranscriptionProvider {
  private config: AzureSTTConfig;

  /**
   * @param config - Optional overrides; any missing field falls back to the
   *   AZURE_SPEECH_KEY / AZURE_SPEECH_REGION environment variables, then to ''.
   */
  constructor(config?: Partial<AzureSTTConfig>) {
    this.config = {
      speechKey: config?.speechKey ?? process.env.AZURE_SPEECH_KEY ?? '',
      speechRegion: config?.speechRegion ?? process.env.AZURE_SPEECH_REGION ?? '',
    };
  }

  /** @returns true only when both the key and the region are non-empty. */
  isConfigured(): boolean {
    return Boolean(this.config.speechKey && this.config.speechRegion);
  }

  /**
   * Transcribe audio with Azure Speech.
   *
   * @param input - Audio bytes, filename (drives the Content-Type header) and
   *   an optional language hint. `input.prompt` is not used by this provider.
   * @returns Recognized text, the locale that was requested, the duration in
   *   seconds (null when Azure reports none) and a model label containing the region.
   * @throws Error when the provider is not configured, when the HTTP response
   *   is not ok, or when RecognitionStatus is anything other than "Success".
   */
  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
    if (!this.isConfigured()) {
      throw new Error(
        'Azure Speech is not configured (missing AZURE_SPEECH_KEY or AZURE_SPEECH_REGION)'
      );
    }

    const { speechKey, speechRegion } = this.config;

    // Azure expects a full locale (e.g. "en-US"), not just ISO 639-1 "en".
    const locale = toAzureLocale(input.language);

    const url = new URL(
      `https://${speechRegion}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1`
    );
    url.searchParams.set('language', locale);
    url.searchParams.set('format', 'detailed');

    const contentType = audioContentType(input.filename);

    const response = await fetch(url.toString(), {
      method: 'POST',
      headers: {
        'Ocp-Apim-Subscription-Key': speechKey,
        'Content-Type': contentType,
        Accept: 'application/json',
      },
      body: input.audio,
      signal: AbortSignal.timeout(120_000),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Azure Speech error ${response.status}: ${errorText}`);
    }

    const data = (await response.json()) as {
      RecognitionStatus: string;
      DisplayText?: string;
      Duration?: number;
      NBest?: Array<{
        Display: string;
        Confidence: number;
      }>;
    };

    if (data.RecognitionStatus !== 'Success') {
      throw new Error(`Azure Speech recognition failed: ${data.RecognitionStatus}`);
    }

    // Duration is in 100-nanosecond ticks
    const durationSeconds = data.Duration ? data.Duration / 10_000_000 : null;

    // Prefer the first NBest entry; fall back to DisplayText, then to ''.
    const text = data.NBest?.[0]?.Display ?? data.DisplayText ?? '';

    return {
      text,
      language: locale,
      durationSeconds,
      model: `azure-speech-${speechRegion}`,
    };
  }
}

View File

@ -0,0 +1,28 @@
/**
* Mock transcription provider for testing.
*
* Returns deterministic transcription results without calling any external API.
* Always reports as configured.
*/
import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
/**
 * Stand-in provider that fabricates a transcription from the input's
 * filename and byte length, without any network access.
 */
export class MockTranscriptionProvider implements TranscriptionProvider {
  /** The mock needs no credentials, so it always reports itself as ready. */
  isConfigured(): boolean {
    return true;
  }

  /**
   * Produce a fake transcription after a short artificial delay.
   *
   * @param input - Audio bytes, filename and optional language hint.
   * @returns Placeholder text naming the file and its size in KB, the given
   *   language (or 'en'), a duration of byteLength / 16000 rounded with a
   *   minimum of 1, and the model label 'mock-stt'.
   */
  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
    // Pretend the work takes roughly 50ms.
    await new Promise<void>(done => {
      setTimeout(done, 50);
    });

    const { audio, filename, language } = input;
    const byteCount = audio.byteLength;
    const sizeInKB = Math.round(byteCount / 1024);
    const estimatedSeconds = Math.max(1, Math.round(byteCount / 16000));

    return {
      text: `[Mock transcription of ${filename} (${sizeInKB}KB)]`,
      language: language ?? 'en',
      durationSeconds: estimatedSeconds,
      model: 'mock-stt',
    };
  }
}

View File

@ -0,0 +1,74 @@
/**
* OpenAI Whisper transcription provider.
*
* Uses POST /v1/audio/transcriptions (multipart/form-data).
* Requires OPENAI_API_KEY env var.
*/
import type { TranscriptionProvider, TranscriptionInput, TranscriptionResult } from '../types.js';
/** Settings accepted by the OpenAI transcription provider. */
export interface OpenAISTTConfig {
/** API key; sent as a Bearer token in the Authorization header. */
apiKey: string;
/**
* API base URL. Trailing slashes are stripped and /audio/transcriptions is appended.
* When omitted, OPENAI_BASE_URL is used, then https://api.openai.com/v1.
*/
baseUrl?: string;
/** Model name sent with each request. When omitted, WHISPER_MODEL is used, then whisper-1. */
model?: string;
}
/**
 * Transcription provider that uploads audio as multipart/form-data to the
 * `/audio/transcriptions` endpoint under the configured base URL.
 */
export class OpenAITranscriptionProvider implements TranscriptionProvider {
  private config: Required<OpenAISTTConfig>;

  /**
   * @param config - Optional overrides. Missing fields fall back to the
   *   OPENAI_API_KEY, OPENAI_BASE_URL and WHISPER_MODEL environment variables
   *   and then to built-in defaults. Trailing slashes are removed from the base URL.
   */
  constructor(config?: Partial<OpenAISTTConfig>) {
    const apiKey = config?.apiKey ?? process.env.OPENAI_API_KEY ?? '';
    const rawBaseUrl =
      config?.baseUrl ?? process.env.OPENAI_BASE_URL ?? 'https://api.openai.com/v1';
    const model = config?.model ?? process.env.WHISPER_MODEL ?? 'whisper-1';

    this.config = {
      apiKey,
      baseUrl: rawBaseUrl.replace(/\/+$/, ''),
      model,
    };
  }

  /** @returns true when an API key is present. */
  isConfigured(): boolean {
    return !!this.config.apiKey;
  }

  /**
   * Transcribe audio through the OpenAI-compatible transcription endpoint.
   *
   * @param input - Audio bytes and upload filename, plus optional language
   *   and prompt that are forwarded only when non-empty.
   * @returns The response text, the language (response value, else the hint,
   *   else null), the duration (null when absent) and the configured model name.
   * @throws Error when no API key is configured or the HTTP response is not ok.
   */
  async transcribe(input: TranscriptionInput): Promise<TranscriptionResult> {
    if (!this.isConfigured()) {
      throw new Error('OpenAI is not configured (missing OPENAI_API_KEY)');
    }

    const { apiKey, baseUrl, model } = this.config;

    const form = new FormData();
    form.append('file', new Blob([input.audio]), input.filename);
    form.append('model', model);

    // Optional hints are sent only when they carry a value.
    const optionalFields: Array<[string, string | undefined]> = [
      ['language', input.language],
      ['prompt', input.prompt],
    ];
    for (const [field, value] of optionalFields) {
      if (value) form.append(field, value);
    }

    // verbose_json is requested so the response carries language and duration.
    form.append('response_format', 'verbose_json');

    const endpoint = `${baseUrl}/audio/transcriptions`;
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
      body: form,
      signal: AbortSignal.timeout(120_000),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`OpenAI Whisper error ${response.status}: ${errorText}`);
    }

    const payload = (await response.json()) as {
      text: string;
      language?: string;
      duration?: number;
    };

    const detectedLanguage = payload.language ?? input.language ?? null;
    const durationSeconds = payload.duration ?? null;

    return {
      text: payload.text,
      language: detectedLanguage,
      durationSeconds,
      model,
    };
  }
}

View File

@ -1,5 +1,15 @@
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, SUPPORTED_AUDIO_TYPES } from './types.js';
import {
TranscribeRequestSchema,
MAX_AUDIO_SIZE_BYTES,
SUPPORTED_AUDIO_TYPES,
deriveFilename,
} from './types.js';
import type { TranscriptionProvider } from './types.js';
import { MockTranscriptionProvider } from './providers/mock.js';
import { OpenAITranscriptionProvider } from './providers/openai.js';
import { AzureTranscriptionProvider } from './providers/azure.js';
import { createSTTProvider, _resetSTT, getSTT } from './factory.js';
// ── Schema validation tests ─────────────────────────────────────
@ -91,34 +101,219 @@ describe('transcription constants', () => {
});
});
// ── Route integration tests (mocked fetch) ──────────────────────
// ── deriveFilename tests ────────────────────────────────────────
describe('transcribe route', () => {
const originalFetch = globalThis.fetch;
describe('deriveFilename', () => {
it('extracts filename from URL path', () => {
expect(deriveFilename('https://blob.example.com/uploads/meeting.wav?sv=2024', null)).toBe(
'meeting.wav'
);
});
it('extracts filename with various extensions', () => {
expect(deriveFilename('https://x.com/a.mp3', null)).toBe('a.mp3');
expect(deriveFilename('https://x.com/b.m4a', null)).toBe('b.m4a');
expect(deriveFilename('https://x.com/c.webm', null)).toBe('c.webm');
expect(deriveFilename('https://x.com/d.flac', null)).toBe('d.flac');
});
it('falls back to content-type mapping', () => {
expect(deriveFilename('https://x.com/blob123', 'audio/mpeg')).toBe('audio.mp3');
expect(deriveFilename('https://x.com/blob123', 'audio/wav')).toBe('audio.wav');
expect(deriveFilename('https://x.com/blob123', 'audio/webm')).toBe('audio.webm');
expect(deriveFilename('https://x.com/blob123', 'audio/ogg')).toBe('audio.ogg');
});
it('handles content-type with charset parameter', () => {
expect(deriveFilename('https://x.com/blob', 'audio/mpeg; charset=utf-8')).toBe('audio.mp3');
});
it('defaults to audio.mp3 when no match', () => {
expect(deriveFilename('https://x.com/blob', null)).toBe('audio.mp3');
expect(deriveFilename('https://x.com/blob', 'application/octet-stream')).toBe('audio.mp3');
});
});
// ── MockTranscriptionProvider tests ─────────────────────────────
describe('MockTranscriptionProvider', () => {
it('is always configured', () => {
const provider = new MockTranscriptionProvider();
expect(provider.isConfigured()).toBe(true);
});
it('returns deterministic transcription with filename and size', async () => {
const provider = new MockTranscriptionProvider();
const audio = new ArrayBuffer(16000);
const result = await provider.transcribe({
audio,
filename: 'test.mp3',
});
expect(result.text).toContain('Mock transcription');
expect(result.text).toContain('test.mp3');
expect(result.text).toContain('16KB');
expect(result.model).toBe('mock-stt');
expect(result.durationSeconds).toBeGreaterThan(0);
});
it('uses provided language', async () => {
const provider = new MockTranscriptionProvider();
const result = await provider.transcribe({
audio: new ArrayBuffer(100),
filename: 'a.wav',
language: 'es',
});
expect(result.language).toBe('es');
});
it('defaults language to en', async () => {
const provider = new MockTranscriptionProvider();
const result = await provider.transcribe({
audio: new ArrayBuffer(100),
filename: 'a.wav',
});
expect(result.language).toBe('en');
});
});
// ── OpenAITranscriptionProvider tests ───────────────────────────
describe('OpenAITranscriptionProvider', () => {
it('is not configured without OPENAI_API_KEY', () => {
const provider = new OpenAITranscriptionProvider({ apiKey: '' });
expect(provider.isConfigured()).toBe(false);
});
it('is configured with apiKey', () => {
const provider = new OpenAITranscriptionProvider({ apiKey: 'sk-test' });
expect(provider.isConfigured()).toBe(true);
});
it('throws when transcribing without config', async () => {
const provider = new OpenAITranscriptionProvider({ apiKey: '' });
await expect(
provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.mp3' })
).rejects.toThrow('not configured');
});
});
// ── AzureTranscriptionProvider tests ────────────────────────────
describe('AzureTranscriptionProvider', () => {
it('is not configured without both key and region', () => {
expect(new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' }).isConfigured()).toBe(
false
);
expect(
new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: '' }).isConfigured()
).toBe(false);
expect(
new AzureTranscriptionProvider({ speechKey: '', speechRegion: 'eastus' }).isConfigured()
).toBe(false);
});
it('is configured with both key and region', () => {
const provider = new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' });
expect(provider.isConfigured()).toBe(true);
});
it('throws when transcribing without config', async () => {
const provider = new AzureTranscriptionProvider({ speechKey: '', speechRegion: '' });
await expect(
provider.transcribe({ audio: new ArrayBuffer(10), filename: 'a.wav' })
).rejects.toThrow('not configured');
});
});
// ── Factory tests ───────────────────────────────────────────────
describe('STT factory', () => {
beforeEach(() => {
vi.stubEnv('OPENAI_API_KEY', 'test-key');
_resetSTT();
});
afterEach(() => {
globalThis.fetch = originalFetch;
_resetSTT();
vi.unstubAllEnvs();
});
it('deriveFilename extracts extension from URL path', async () => {
// Import the module to test the helper indirectly via the route
// The deriveFilename is not exported, but we can verify behavior through integration
const { TranscribeRequestSchema } = await import('./types.js');
const req = TranscribeRequestSchema.parse({
audioUrl: 'https://blob.example.com/uploads/meeting.wav?sv=2024-01-01&sig=abc',
});
expect(req.audioUrl).toContain('meeting.wav');
it('createSTTProvider returns OpenAI provider', () => {
const provider = createSTTProvider('openai');
expect(provider).toBeInstanceOf(OpenAITranscriptionProvider);
});
it('schema defaults responseFormat to text', () => {
const result = TranscribeRequestSchema.parse({
audioUrl: 'https://example.com/a.mp3',
});
expect(result.responseFormat).toBe('text');
it('createSTTProvider returns Azure provider', () => {
const provider = createSTTProvider('azure');
expect(provider).toBeInstanceOf(AzureTranscriptionProvider);
});
it('createSTTProvider returns Mock provider', () => {
const provider = createSTTProvider('mock');
expect(provider).toBeInstanceOf(MockTranscriptionProvider);
});
it('createSTTProvider throws for unknown type', () => {
expect(() => createSTTProvider('invalid' as 'openai')).toThrow('Unknown STT_PROVIDER');
});
it('getSTT auto-detects mock when no credentials', () => {
vi.stubEnv('OPENAI_API_KEY', '');
vi.stubEnv('AZURE_SPEECH_KEY', '');
vi.stubEnv('AZURE_SPEECH_REGION', '');
vi.stubEnv('STT_PROVIDER', '');
_resetSTT();
const provider = getSTT();
expect(provider).toBeInstanceOf(MockTranscriptionProvider);
});
it('getSTT auto-detects openai when OPENAI_API_KEY set', () => {
vi.stubEnv('OPENAI_API_KEY', 'sk-test');
vi.stubEnv('AZURE_SPEECH_KEY', '');
vi.stubEnv('AZURE_SPEECH_REGION', '');
vi.stubEnv('STT_PROVIDER', '');
_resetSTT();
const provider = getSTT();
expect(provider).toBeInstanceOf(OpenAITranscriptionProvider);
});
it('getSTT auto-detects azure when AZURE_SPEECH_KEY + REGION set', () => {
vi.stubEnv('OPENAI_API_KEY', 'sk-test');
vi.stubEnv('AZURE_SPEECH_KEY', 'my-key');
vi.stubEnv('AZURE_SPEECH_REGION', 'eastus');
vi.stubEnv('STT_PROVIDER', '');
_resetSTT();
const provider = getSTT();
expect(provider).toBeInstanceOf(AzureTranscriptionProvider);
});
it('getSTT respects explicit STT_PROVIDER over auto-detect', () => {
vi.stubEnv('OPENAI_API_KEY', 'sk-test');
vi.stubEnv('AZURE_SPEECH_KEY', 'my-key');
vi.stubEnv('AZURE_SPEECH_REGION', 'eastus');
vi.stubEnv('STT_PROVIDER', 'mock');
_resetSTT();
const provider = getSTT();
expect(provider).toBeInstanceOf(MockTranscriptionProvider);
});
it('getSTT returns singleton', () => {
vi.stubEnv('STT_PROVIDER', 'mock');
_resetSTT();
const p1 = getSTT();
const p2 = getSTT();
expect(p1).toBe(p2);
});
it('TranscriptionProvider interface is satisfied by all providers', () => {
const providers: TranscriptionProvider[] = [
new MockTranscriptionProvider(),
new OpenAITranscriptionProvider({ apiKey: 'test' }),
new AzureTranscriptionProvider({ speechKey: 'k', speechRegion: 'eastus' }),
];
for (const p of providers) {
expect(typeof p.transcribe).toBe('function');
expect(typeof p.isConfigured).toBe('function');
}
});
});

View File

@ -1,16 +1,22 @@
/**
* Transcription routes speech-to-text via OpenAI Whisper API.
* Transcription routes — provider-agnostic speech-to-text.
*
* POST /transcribe Download audio from URL, transcribe via Whisper.
* POST /transcribe — Download audio from URL, transcribe via configured provider.
* Product-agnostic: any product backend can call this endpoint.
*
* Requires OPENAI_API_KEY environment variable.
* Provider selection: STT_PROVIDER env var (openai | azure | mock).
* Auto-detects from available credentials if not explicit.
*/
import type { FastifyInstance } from 'fastify';
import { BadRequestError } from '@bytelyst/errors';
import { config } from '../../lib/config.js';
import { TranscribeRequestSchema, MAX_AUDIO_SIZE_BYTES, type TranscribeResponse } from './types.js';
import {
TranscribeRequestSchema,
MAX_AUDIO_SIZE_BYTES,
deriveFilename,
type TranscribeResponse,
} from './types.js';
import { getSTT } from './factory.js';
export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
/**
@ -18,9 +24,10 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
*
* Flow:
* 1. Validate request (audioUrl, optional language/model/prompt)
* 2. Download audio from the provided URL
* 3. Send to OpenAI Whisper API as multipart/form-data
* 4. Return transcribed text + metadata
* 2. Check provider is configured
* 3. Download audio from the provided URL
* 4. Delegate to configured TranscriptionProvider
* 5. Return transcribed text + metadata
*/
app.post('/transcribe', async (req, reply) => {
const parsed = TranscribeRequestSchema.safeParse(req.body);
@ -28,17 +35,19 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
}
if (!config.OPENAI_API_KEY) {
const provider = getSTT();
if (!provider.isConfigured()) {
return reply.status(503).send({
error: 'Transcription not available — OPENAI_API_KEY not configured',
error: 'Transcription not available — no STT provider configured',
});
}
const { audioUrl, model, language, prompt, responseFormat, productId } = parsed.data;
const { audioUrl, language, prompt, productId } = parsed.data;
const requestId = req.headers['x-request-id'] as string | undefined;
req.log.info(
{ audioUrl: audioUrl.substring(0, 80), language, model, productId },
{ audioUrl: audioUrl.substring(0, 80), language, productId },
'transcription request'
);
@ -87,120 +96,56 @@ export async function transcribeRoutes(app: FastifyInstance): Promise<void> {
// ── Step 2: Determine filename from URL or content-type ──────
const filename = deriveFilename(audioUrl, contentType);
// ── Step 3: Call OpenAI Whisper API ──────────────────────────
const whisperModel = model || config.WHISPER_MODEL;
const baseUrl = config.OPENAI_BASE_URL.replace(/\/+$/, '');
const formData = new FormData();
formData.append('file', new Blob([audioBuffer]), filename);
formData.append('model', whisperModel);
if (language) formData.append('language', language);
if (prompt) formData.append('prompt', prompt);
formData.append('response_format', responseFormat === 'text' ? 'verbose_json' : responseFormat);
let whisperResponse: Response;
// ── Step 3: Delegate to provider ─────────────────────────────
let result: TranscribeResponse;
try {
whisperResponse = await fetch(`${baseUrl}/audio/transcriptions`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.OPENAI_API_KEY}`,
},
body: formData,
signal: AbortSignal.timeout(120_000),
const sttResult = await provider.transcribe({
audio: audioBuffer,
filename,
language,
prompt,
});
const durationMs = Date.now() - startMs;
req.log.info(
{
durationMs,
model: sttResult.model,
language: sttResult.language,
audioSizeBytes: audioBuffer.byteLength,
textLength: sttResult.text.length,
productId,
},
'transcription complete'
);
result = {
text: sttResult.text,
language: sttResult.language,
durationSeconds: sttResult.durationSeconds,
model: sttResult.model,
durationMs,
requestId,
};
} catch (err) {
const message = err instanceof Error ? err.message : 'Whisper API call failed';
req.log.error({ error: message }, 'whisper API failed');
const message = err instanceof Error ? err.message : 'Transcription failed';
req.log.error({ error: message }, 'transcription provider failed');
if (message.includes('429') || message.includes('rate limit')) {
return reply.status(429).send({
error: 'Transcription rate limit exceeded',
detail: message,
retryAfter: 60,
});
}
return reply.status(502).send({
error: 'Transcription failed',
detail: message,
});
}
if (!whisperResponse.ok) {
const errorText = await whisperResponse.text();
req.log.error({ status: whisperResponse.status, error: errorText }, 'whisper API error');
if (whisperResponse.status === 429) {
return reply.status(429).send({
error: 'Transcription rate limit exceeded',
detail: errorText,
retryAfter: 60,
});
}
return reply.status(502).send({
error: `Whisper API error ${whisperResponse.status}`,
detail: errorText,
});
}
// ── Step 4: Parse response ───────────────────────────────────
const data = (await whisperResponse.json()) as {
text: string;
language?: string;
duration?: number;
};
const durationMs = Date.now() - startMs;
req.log.info(
{
durationMs,
model: whisperModel,
language: data.language,
audioSizeBytes: audioBuffer.byteLength,
textLength: data.text.length,
productId,
},
'transcription complete'
);
const result: TranscribeResponse = {
text: data.text,
language: data.language ?? language ?? null,
durationSeconds: data.duration ?? null,
model: whisperModel,
durationMs,
requestId,
};
return reply.send(result);
});
}
// ── Helpers ───────────────────────────────────────────────────────
/**
 * Choose a filename (with extension) for downloaded audio.
 *
 * Resolution order:
 *  1. the last path segment of `url`, when it ends in a known audio extension;
 *  2. a generic name mapped from the content-type (parameters such as
 *     `; charset=...` are ignored, matching is case-insensitive);
 *  3. `audio.mp3`.
 *
 * @param url - Source URL of the audio; an unparseable URL is tolerated.
 * @param contentType - Content-Type reported for the audio, or null.
 * @returns A filename such as `meeting.wav` or `audio.m4a`; always a string.
 */
function deriveFilename(url: string, contentType: string | null): string {
  // Try to extract extension from URL path
  try {
    const pathname = new URL(url).pathname;
    const lastSegment = pathname.split('/').pop();
    if (lastSegment && /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i.test(lastSegment)) {
      return lastSegment;
    }
  } catch {
    // URL parsing failed — use fallback
  }

  // Map content-type to extension
  const extMap: Record<string, string> = {
    'audio/mpeg': 'audio.mp3',
    'audio/mp4': 'audio.m4a',
    'audio/mp4a-latm': 'audio.m4a',
    'audio/x-m4a': 'audio.m4a',
    'audio/wav': 'audio.wav',
    'audio/x-wav': 'audio.wav',
    'audio/webm': 'audio.webm',
    'audio/ogg': 'audio.ogg',
    'audio/flac': 'audio.flac',
    'video/mp4': 'audio.mp4',
  };

  if (contentType) {
    const base = (contentType.split(';')[0] ?? '').trim().toLowerCase();
    // Own-property check: the content-type comes from a remote server, and a
    // value like "constructor" or "__proto__" must not resolve to an
    // inherited Object.prototype member.
    if (Object.prototype.hasOwnProperty.call(extMap, base)) {
      const mapped = extMap[base];
      if (mapped) return mapped;
    }
  }

  return 'audio.mp3';
}

View File

@ -1,5 +1,41 @@
import { z } from 'zod';
// ── Provider interface ──────────────────────────────────────
/** Audio input passed to a transcription provider. */
export interface TranscriptionInput {
/** Raw audio bytes. */
audio: ArrayBuffer;
/**
* Filename with extension. The OpenAI provider uses it as the upload filename;
* the Azure provider derives the Content-Type header from its extension.
*/
filename: string;
/**
* Language hint, e.g. ISO 639-1 "en". The OpenAI provider forwards it as-is;
* the Azure provider expands short codes to a full locale and passes longer values through.
*/
language?: string;
/** Prompt to guide transcription style. Forwarded by the OpenAI provider; the Azure and mock providers do not read it. */
prompt?: string;
}
/** Result from a transcription provider. */
export interface TranscriptionResult {
/** The transcribed text. */
text: string;
/** Detected or specified language code; null when the provider has neither. */
language: string | null;
/** Duration of the audio in seconds; null when the provider does not report one. */
durationSeconds: number | null;
/** Provider-specific model identifier (e.g. the Whisper model name, or a label the provider builds itself). */
model: string;
}
/** Cloud-agnostic speech-to-text provider. */
export interface TranscriptionProvider {
/**
* Transcribe audio to text.
* The OpenAI and Azure providers reject with an Error when they are not
* configured or when the remote call fails.
*/
transcribe(input: TranscriptionInput): Promise<TranscriptionResult>;
/**
* Check whether the provider has the credentials it needs.
* The OpenAI and Azure providers only check that the values are non-empty;
* they do not validate them against the remote service.
*/
isConfigured(): boolean;
}
/** Identifiers of the available provider implementations; these are also the accepted STT_PROVIDER values. */
export type STTProviderType = 'openai' | 'azure' | 'mock';
// ── Transcription request schema ────────────────────────────────
export const TranscribeRequestSchema = z.object({
@ -53,3 +89,37 @@ export const SUPPORTED_AUDIO_TYPES = new Set([
/** Max audio file size: 25 MB (OpenAI Whisper limit). */
export const MAX_AUDIO_SIZE_BYTES = 25 * 1024 * 1024;
// ── Filename helper (shared by route + providers) ─────────────
/** Audio extensions accepted as-is when found on the URL's last path segment. */
const AUDIO_EXTENSION_PATTERN = /\.(mp3|m4a|wav|webm|ogg|flac|mp4|mpeg|mpga)$/i;

/**
 * Fallback filenames keyed by lower-case MIME type (parameters stripped).
 * A Map is used rather than an object literal so that a content-type such as
 * "constructor" or "__proto__" cannot resolve to an inherited prototype member.
 */
const FILENAME_BY_CONTENT_TYPE: ReadonlyMap<string, string> = new Map([
  ['audio/mpeg', 'audio.mp3'],
  ['audio/mp4', 'audio.m4a'],
  ['audio/mp4a-latm', 'audio.m4a'],
  ['audio/x-m4a', 'audio.m4a'],
  ['audio/wav', 'audio.wav'],
  ['audio/x-wav', 'audio.wav'],
  ['audio/webm', 'audio.webm'],
  ['audio/ogg', 'audio.ogg'],
  ['audio/flac', 'audio.flac'],
  ['video/mp4', 'audio.mp4'],
]);

/**
 * Choose a filename (with extension) for downloaded audio.
 *
 * Resolution order:
 *  1. the last path segment of `url`, when it ends in a known audio extension
 *     (the query string is ignored);
 *  2. a generic name mapped from the content-type (parameters such as
 *     `; charset=...` are ignored, matching is case-insensitive);
 *  3. `audio.mp3`.
 *
 * @param url - Source URL of the audio; an unparseable URL is tolerated.
 * @param contentType - Content-Type reported for the audio, or null.
 * @returns A filename such as `meeting.wav` or `audio.m4a`; always a string.
 */
export function deriveFilename(url: string, contentType: string | null): string {
  try {
    const lastSegment = new URL(url).pathname.split('/').pop();
    if (lastSegment && AUDIO_EXTENSION_PATTERN.test(lastSegment)) {
      return lastSegment;
    }
  } catch {
    // URL parsing failed — use fallback
  }

  if (contentType) {
    const base = (contentType.split(';')[0] ?? '').trim().toLowerCase();
    const mapped = FILENAME_BY_CONTENT_TYPE.get(base);
    if (mapped) return mapped;
  }

  return 'audio.mp3';
}