learning_ai_common_plat/services/extraction-service/src/lib/python-bridge.ts
saravanakumardb1 c292bb5cc1 feat(extraction): scaffold extraction-service + @bytelyst/extraction package
- extraction-service: Fastify scaffold (port 4005) with extract/tasks modules
- src/lib/: config, errors, cosmos, product-config, python-bridge
- src/modules/extract/: types (Zod schemas), routes (POST /extract, batch, models)
- src/modules/tasks/: types, repository (Cosmos CRUD), routes (CRUD endpoints)
- Python sidecar: FastAPI app, LangExtract wrapper, models, task registry
- @bytelyst/extraction package: types, client factory, index exports
- Both pnpm build pass clean
2026-02-14 13:31:40 -08:00

146 lines
3.5 KiB
TypeScript

/**
* HTTP bridge to the Python sidecar running LangExtract.
*
* The sidecar exposes FastAPI endpoints on an internal port (default 4006).
* This module provides a typed client with per-request timeouts, error
* reporting for non-2xx responses, and a startup readiness poll.
*/
import { config } from './config.js';
/**
 * Base URL of the Python sidecar, read from service configuration.
 * Endpoint paths such as `/extract` and `/health` are appended directly to
 * this value by the functions in this module.
 */
const SIDECAR_URL = config.PYTHON_SIDECAR_URL;
/**
 * Default request timeout in milliseconds, used by `sidecarExtract` and
 * `sidecarExtractBatch` when the caller does not pass `timeoutMs`.
 */
const DEFAULT_TIMEOUT_MS = 120_000;
/**
 * JSON payload for a single extraction. `sidecarExtract` sends it as the
 * request body; `sidecarExtractBatch` sends an array of these under a
 * `requests` key.
 *
 * NOTE(review): the snake_case field names presumably mirror the sidecar's
 * Python request model, and the tuning fields are presumably forwarded to
 * LangExtract — confirm against the sidecar code.
 */
export interface SidecarExtractRequest {
// The only required field: the input text for the extraction.
text: string;
// Optional task identifier — presumably selects a stored task definition; verify against the sidecar.
task_id?: string;
// Optional prompt text — presumably an inline alternative to `task_id`; verify against the sidecar.
task_prompt?: string;
// Optional example texts paired with their extractions.
examples?: SidecarExample[];
// Optional model identifier; the response metadata reports a `model_id` as well.
model_id?: string;
// Optional tuning knobs; exact semantics are defined by the sidecar — TODO confirm.
extraction_passes?: number;
max_workers?: number;
max_char_buffer?: number;
}
/**
 * One entry of `SidecarExtractRequest.examples`: a text together with a list
 * of extractions for it.
 *
 * NOTE(review): presumably used as few-shot guidance for the model — confirm.
 */
export interface SidecarExample {
text: string;
extractions: SidecarExtraction[];
}
/**
 * A single extraction item. The same shape is used both inside request
 * examples (`SidecarExample.extractions`) and in the sidecar's response
 * (`SidecarExtractResponse.extractions`).
 */
export interface SidecarExtraction {
// Class/category label of the extraction.
extraction_class: string;
// Text of the extraction — presumably the matched span from the input; confirm.
extraction_text: string;
// Optional string-to-string attributes attached to the extraction.
attributes?: Record<string, string>;
}
/**
 * Shape that a successful extract response body is cast to. The cast is a
 * compile-time assertion only; this module performs no runtime validation of
 * the sidecar's JSON.
 */
export interface SidecarExtractResponse {
extractions: SidecarExtraction[];
// Run metadata reported by the sidecar.
metadata: {
model_id: string;
// Presumably wall-clock duration in milliseconds, per the name — confirm.
duration_ms: number;
// Optional; may be absent from the response.
token_count?: number;
char_count: number;
};
}
/**
 * Shape that the `/health` response body is cast to by `sidecarHealth`
 * (no runtime validation is performed).
 */
export interface SidecarHealthResponse {
// Status string reported by the sidecar; the set of possible values is not defined here.
status: string;
// Optional version string, when the sidecar reports one.
version?: string;
}
/**
 * Turn a parsed sidecar error body into a human-readable reason.
 *
 * Prefers the body's `error` field, then `detail`, then `fallback`. A
 * non-string reason (for example a list of validation error objects) is
 * JSON-encoded so it does not render as `[object Object]`. A body that is not
 * an object (including JSON `null`) yields `fallback`.
 */
function describeSidecarFailure(body: unknown, fallback: string): string {
  if (typeof body !== 'object' || body === null) {
    return fallback;
  }
  const { error, detail } = body as { error?: unknown; detail?: unknown };
  const reason = error || detail;
  if (!reason) {
    return fallback;
  }
  return typeof reason === 'string' ? reason : JSON.stringify(reason);
}

/**
 * POST a JSON payload to a sidecar endpoint and return the parsed JSON reply.
 *
 * Shared by `sidecarExtract` and `sidecarExtractBatch` so that both build
 * headers, apply the timeout and report failures the same way.
 *
 * @param path - Endpoint path appended to the sidecar base URL (e.g. `/extract`).
 * @param payload - Value serialised with `JSON.stringify` as the request body.
 * @param errorLabel - Prefix of the thrown error message (e.g. `Sidecar error`).
 * @param requestId - When truthy, sent as the `x-request-id` header.
 * @param timeoutMs - Milliseconds after which the request is aborted.
 * @throws Error when the response status is not OK; rejections from `fetch`
 *   itself (network failure, timeout abort) propagate unchanged.
 */
async function postToSidecar<T>(
  path: string,
  payload: unknown,
  errorLabel: string,
  requestId: string | undefined,
  timeoutMs: number
): Promise<T> {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  if (requestId) {
    headers['x-request-id'] = requestId;
  }

  const res = await fetch(`${SIDECAR_URL}${path}`, {
    method: 'POST',
    headers,
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(timeoutMs),
  });

  if (!res.ok) {
    // The error body may be missing or not JSON at all; fall back to the
    // HTTP status text in that case.
    const body: unknown = await res.json().catch(() => null);
    throw new Error(
      `${errorLabel} ${res.status}: ${describeSidecarFailure(body, res.statusText)}`
    );
  }
  return (await res.json()) as T;
}

/**
 * POST /extract on the Python sidecar.
 *
 * @param req - Extraction request, sent as the JSON body.
 * @param requestId - Optional id forwarded as the `x-request-id` header.
 * @param timeoutMs - Abort the request after this many milliseconds.
 * @returns The sidecar's JSON reply, cast (not validated) to `SidecarExtractResponse`.
 * @throws Error with message `Sidecar error <status>: <reason>` on a non-OK response.
 */
export async function sidecarExtract(
  req: SidecarExtractRequest,
  requestId?: string,
  timeoutMs: number = DEFAULT_TIMEOUT_MS
): Promise<SidecarExtractResponse> {
  return postToSidecar<SidecarExtractResponse>(
    '/extract',
    req,
    'Sidecar error',
    requestId,
    timeoutMs
  );
}

/**
 * POST /extract/batch on the Python sidecar.
 *
 * @param requests - Extraction requests, sent as `{ requests: [...] }`.
 * @param requestId - Optional id forwarded as the `x-request-id` header.
 * @param timeoutMs - Abort the whole batch request after this many milliseconds.
 * @returns The sidecar's JSON reply, cast (not validated) to `SidecarExtractResponse[]`.
 * @throws Error with message `Sidecar batch error <status>: <reason>` on a non-OK response.
 */
export async function sidecarExtractBatch(
  requests: SidecarExtractRequest[],
  requestId?: string,
  timeoutMs: number = DEFAULT_TIMEOUT_MS
): Promise<SidecarExtractResponse[]> {
  return postToSidecar<SidecarExtractResponse[]>(
    '/extract/batch',
    { requests },
    'Sidecar batch error',
    requestId,
    timeoutMs
  );
}
/**
 * Probe the sidecar's `GET /health` endpoint.
 *
 * The request is aborted after five seconds.
 *
 * @returns The parsed JSON body, cast (not validated) to `SidecarHealthResponse`.
 * @throws Error carrying the HTTP status when the response is not OK;
 *   rejections from `fetch` itself propagate unchanged.
 */
export async function sidecarHealth(): Promise<SidecarHealthResponse> {
  const healthUrl = `${SIDECAR_URL}/health`;
  const probeSignal = AbortSignal.timeout(5_000);
  const response = await fetch(healthUrl, { signal: probeSignal });

  if (response.ok) {
    return response.json() as Promise<SidecarHealthResponse>;
  }
  throw new Error(`Sidecar health check failed: ${response.status}`);
}
/**
 * Poll the sidecar's health endpoint until it answers successfully.
 *
 * Makes up to `maxRetries` health checks, pausing `intervalMs` milliseconds
 * between consecutive failed checks (no pause after the final one). Health
 * check failures are swallowed; only the overall outcome is reported.
 *
 * @param maxRetries - Maximum number of health checks to attempt.
 * @param intervalMs - Delay between failed attempts, in milliseconds.
 * @returns `true` as soon as a health check succeeds, `false` once every
 *   attempt has failed.
 */
export async function waitForSidecar(
  maxRetries: number = 30,
  intervalMs: number = 2_000
): Promise<boolean> {
  const pause = (): Promise<unknown> =>
    new Promise(resolve => globalThis.setTimeout(resolve, intervalMs));

  for (let attemptsLeft = maxRetries; attemptsLeft > 0; attemptsLeft--) {
    try {
      await sidecarHealth();
      return true;
    } catch {
      // Sidecar not reachable yet; fall through to the wait-and-retry step.
    }
    if (attemptsLeft > 1) {
      await pause();
    }
  }
  return false;
}