feat(extraction): Phase 6 advanced features (6.1-6.8)

- 6.1-6.2: Entity visualization components (bar chart, pie chart, timeline) [in LysnrAI repo] - 6.3-6.4: Async job queue — POST /extract/jobs, GET /extract/jobs/:id, GET /extract/jobs - 6.5-6.6: Model registry with tier (standard/premium/free/mock) + GET /extract/models - 6.7-6.8: Multi-language detection (es/fr/de/pt/ja/zh/ko/ar) + prompt enrichment - ExtractMetadata.language field added to Python models - 46 TS tests passing, build clean
2026-02-14 14:08:02 -08:00 · 2026-02-14 14:08:02 -08:00 · 5c1744d3a4
commit 5c1744d3a4
parent b8c0a73e89
4 changed files with 259 additions and 5 deletions
--- a/services/extraction-service/python/src/extractor.py
+++ b/services/extraction-service/python/src/extractor.py
@ -16,6 +16,45 @@ logger = structlog.get_logger(__name__)
 DEFAULT_MODEL_ID = os.environ.get("DEFAULT_MODEL_ID", "gemini-2.5-flash")
 # ── Language detection ─────────────────────────────────────────────
 LANG_PATTERNS: list[tuple[str, str, list[str]]] = [
    ("es", "Spanish", ["el", "la", "los", "las", "de", "en", "que", "por", "con", "para"]),
    ("fr", "French", ["le", "la", "les", "des", "une", "est", "que", "dans", "pour", "avec"]),
    ("de", "German", ["der", "die", "das", "ein", "eine", "ist", "und", "oder", "aber", "nicht"]),
    ("pt", "Portuguese", ["o", "os", "as", "de", "em", "que", "por", "com", "para", "como"]),
    ("ja", "Japanese", []),
    ("zh", "Chinese", []),
    ("ko", "Korean", []),
 ]
 def detect_language(text: str) -> tuple[str, str, float]:
    """Detect language from text. Returns (code, name, confidence)."""
    import re
    # CJK detection via unicode ranges
    if re.search(r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]", text):
        if re.search(r"[\u3040-\u309F\u30A0-\u30FF]", text):
            return ("ja", "Japanese", 0.9)
        return ("zh", "Chinese", 0.85)
    if re.search(r"[\uAC00-\uD7AF]", text):
        return ("ko", "Korean", 0.9)
    if re.search(r"[\u0600-\u06FF]", text):
        return ("ar", "Arabic", 0.9)
    words = text.lower().split()
    for code, name, keywords in LANG_PATTERNS:
        if not keywords:
            continue
        matches = sum(1 for w in words if w in keywords)
        if matches >= 3:
            confidence = min(0.95, 0.5 + matches * 0.05)
            return (code, name, confidence)
    return ("en", "English", 0.85)
 async def extract(
    text: str,
    task_prompt: str | None = None,
@ -24,6 +63,7 @@ async def extract(
    extraction_passes: int | None = None,
    max_workers: int | None = None,
    max_char_buffer: int | None = None,
    language: str | None = None,
 ) -> ExtractResponse:
    """
    Run LangExtract on the given text.
@ -55,8 +95,17 @@ async def extract(
            "model_id": resolved_model,
        }
        # Multi-language support: detect language and enrich prompt
        lang_code, lang_name, lang_conf = detect_language(text) if not language else (language, language, 1.0)
        lang_hint = ""
        if lang_code != "en" and lang_conf >= 0.7:
            lang_hint = f"\nIMPORTANT: The input text is in {lang_name}. Extract entities in their original language but use English for class labels."
            logger.info("multilang_detected", language=lang_code, confidence=round(lang_conf, 2))
        if task_prompt:
-            lx_kwargs["prompt_description"] = task_prompt
+            lx_kwargs["prompt_description"] = task_prompt + lang_hint
        elif lang_hint:
            lx_kwargs["prompt_description"] = f"Extract structured entities from the text.{lang_hint}"
        if examples:
            lx_kwargs["examples"] = examples
@ -104,6 +153,7 @@ async def extract(
                model_id=resolved_model,
                duration_ms=round(duration_ms, 2),
                char_count=len(text),
                language=lang_code if lang_code != "en" else None,
            ),
        )
--- a/services/extraction-service/python/src/models.py
+++ b/services/extraction-service/python/src/models.py
@ -39,6 +39,7 @@ class ExtractMetadata(BaseModel):
    duration_ms: float
    token_count: int | None = None
    char_count: int
    language: str | None = None
 class ExtractResponse(BaseModel):
--- a/services/extraction-service/src/modules/extract/jobs.ts
+++ b/services/extraction-service/src/modules/extract/jobs.ts
@ -0,0 +1,95 @@
 /**
 * Async extraction job queue.
 *
 * For large batch requests, callers can submit an async job and poll for results.
 * Jobs are stored in-memory (Cosmos persistence deferred to Phase 7).
 *
 * Flow: POST /extract/jobs → { jobId } → GET /extract/jobs/:id → { status, results }
 */
 import { randomUUID } from 'node:crypto';
 import {
  sidecarExtract,
  type SidecarExtractRequest,
  type SidecarExtractResponse,
 } from '../../lib/python-bridge.js';
 export type JobStatus = 'pending' | 'processing' | 'completed' | 'failed';
 export interface ExtractionJob {
  id: string;
  status: JobStatus;
  inputs: SidecarExtractRequest[];
  results: SidecarExtractResponse[];
  errors: Array<{ index: number; error: string }>;
  progress: { completed: number; total: number };
  createdAt: string;
  completedAt?: string;
 }
 const jobStore = new Map<string, ExtractionJob>();
 /**
 * Create a new async extraction job and start processing in background.
 */
 export function createJob(inputs: SidecarExtractRequest[], requestId?: string): ExtractionJob {
  const job: ExtractionJob = {
    id: randomUUID(),
    status: 'pending',
    inputs,
    results: [],
    errors: [],
    progress: { completed: 0, total: inputs.length },
    createdAt: new Date().toISOString(),
  };
  jobStore.set(job.id, job);
  // Start processing in background (non-blocking)
  processJob(job, requestId).catch(() => {
    job.status = 'failed';
  });
  return job;
 }
 /**
 * Get job by ID.
 */
 export function getJob(jobId: string): ExtractionJob | undefined {
  return jobStore.get(jobId);
 }
 /**
 * List recent jobs (last 50).
 */
 export function listJobs(limit = 50): ExtractionJob[] {
  return [...jobStore.values()]
    .sort((a, b) => b.createdAt.localeCompare(a.createdAt))
    .slice(0, limit);
 }
 // ── Internal ─────────────────────────────────────────────────────
 async function processJob(job: ExtractionJob, requestId?: string): Promise<void> {
  job.status = 'processing';
  for (let i = 0; i < job.inputs.length; i++) {
    try {
      const result = await sidecarExtract(job.inputs[i], requestId);
      job.results.push(result);
    } catch (err) {
      const message = err instanceof Error ? err.message : 'Unknown error';
      job.errors.push({ index: i, error: message });
      // Push a placeholder so indices align
      job.results.push({
        extractions: [],
        metadata: { model_id: 'error', duration_ms: 0, char_count: 0 },
      });
    }
    job.progress.completed = i + 1;
  }
  job.status = job.errors.length === job.inputs.length ? 'failed' : 'completed';
  job.completedAt = new Date().toISOString();
 }
--- a/services/extraction-service/src/modules/extract/routes.ts
+++ b/services/extraction-service/src/modules/extract/routes.ts
@ -13,6 +13,7 @@ import { BadRequestError } from '../../lib/errors.js';
 import { checkQuota, incrementUsage, getUsageSummary } from './usage.js';
 import { recordExtraction, getMetricsSummary } from '../../lib/metrics.js';
 import { sidecarBreaker } from '../../lib/circuit-breaker.js';
 import { createJob, getJob, listJobs } from './jobs.js';
 // ── In-memory LRU cache ────────────────────────────────────────
 const CACHE_TTL_MS = parseInt(process.env.EXTRACTION_CACHE_TTL_MS || '86400000', 10); // 24h
@ -63,6 +64,29 @@ function cachePut(
  cache.set(cacheKey(text, taskId, modelId), { response, createdAt: Date.now() });
 }
 // ── Model registry ───────────────────────────────────────────────
 const MODEL_REGISTRY = [
  {
    id: 'gemini-2.5-flash',
    provider: 'google',
    description: 'Gemini 2.5 Flash (default)',
    tier: 'standard',
  },
  { id: 'gemini-2.5-pro', provider: 'google', description: 'Gemini 2.5 Pro', tier: 'premium' },
  {
    id: 'gemini-2.0-flash',
    provider: 'google',
    description: 'Gemini 2.0 Flash (legacy)',
    tier: 'standard',
  },
  {
    id: 'gemini-2.5-flash-mock',
    provider: 'mock',
    description: 'Mock extractor (no API key)',
    tier: 'free',
  },
 ];
 export async function extractRoutes(app: FastifyInstance) {
  // Rate limiting for extraction endpoints — 30 req/min per IP (configurable)
  await app.register(rateLimit, {
@ -280,15 +304,99 @@ export async function extractRoutes(app: FastifyInstance) {
    });
  });
  /**
   * POST /extract/jobs — Submit async batch extraction job.
   */
  app.post('/extract/jobs', async (req, reply) => {
    const parsed = BatchExtractRequestSchema.safeParse(req.body);
    if (!parsed.success) {
      throw new BadRequestError(parsed.error.issues.map(i => i.message).join('; '));
    }
    const { inputs, examples, modelId } = parsed.data;
    const requestId = req.headers['x-request-id'] as string | undefined;
    const sidecarRequests = inputs.map(input => ({
      text: input.text,
      task_id: input.taskId,
      task_prompt: input.taskPrompt,
      examples: examples?.map(e => ({
        text: e.text,
        extractions: e.extractions.map(ex => ({
          extraction_class: ex.extraction_class,
          extraction_text: ex.extraction_text,
          attributes: ex.attributes,
        })),
      })),
      model_id: modelId,
    }));
    const job = createJob(sidecarRequests, requestId);
    req.log.info({ jobId: job.id, inputCount: inputs.length }, 'async job created');
    return reply.status(202).send({
      jobId: job.id,
      status: job.status,
      progress: job.progress,
      createdAt: job.createdAt,
    });
  });
  /**
   * GET /extract/jobs/:id — Get async job status and results.
   */
  app.get('/extract/jobs/:id', async (req, reply) => {
    const { id } = req.params as { id: string };
    const job = getJob(id);
    if (!job) {
      return reply.status(404).send({ error: 'Job not found' });
    }
    return reply.send({
      id: job.id,
      status: job.status,
      progress: job.progress,
      results:
        job.status === 'completed' || job.status === 'failed'
          ? job.results.map(r => ({
              extractions: r.extractions,
              metadata: {
                modelId: r.metadata.model_id,
                durationMs: r.metadata.duration_ms,
                charCount: r.metadata.char_count,
              },
            }))
          : undefined,
      errors: job.errors.length > 0 ? job.errors : undefined,
      createdAt: job.createdAt,
      completedAt: job.completedAt,
    });
  });
  /**
   * GET /extract/jobs — List recent async jobs.
   */
  app.get('/extract/jobs', async (_req, reply) => {
    const jobs = listJobs();
    return reply.send({
      jobs: jobs.map(j => ({
        id: j.id,
        status: j.status,
        progress: j.progress,
        inputCount: j.inputs.length,
        createdAt: j.createdAt,
        completedAt: j.completedAt,
      })),
    });
  });
  /**
   * GET /extract/models — List available model providers.
   */
  app.get('/extract/models', async (_req, reply) => {
    return reply.send({
-      models: [
+      models: MODEL_REGISTRY,
-        { id: 'gemini-2.5-flash', provider: 'google', description: 'Gemini 2.5 Flash (default)' },
+      default: process.env.DEFAULT_MODEL_ID || 'gemini-2.5-flash',
        { id: 'gemini-2.5-pro', provider: 'google', description: 'Gemini 2.5 Pro' },
      ],
    });
  });