diff --git a/services/extraction-service/evals/compare-evals.sh b/services/extraction-service/evals/compare-evals.sh
new file mode 100755
index 00000000..36267b14
--- /dev/null
+++ b/services/extraction-service/evals/compare-evals.sh
@@ -0,0 +1,163 @@
+#!/usr/bin/env bash
+# compare-evals.sh — Run evals against both Gemini (via extraction-service) and
+# Ollama (local), then print a side-by-side pass-rate comparison.
+#
+# Usage:
+#   ./evals/compare-evals.sh
+#   OLLAMA_MODEL=qwen2.5:7b ./evals/compare-evals.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+EXTRACTION_SERVICE_URL="${EXTRACTION_SERVICE_URL:-http://localhost:4005}"
+OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}"
+OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}"
+# Native Ollama API root, derived from OLLAMA_BASE_URL by dropping the
+# OpenAI-compatible "/v1" suffix, so the checks below hit the same host as the evals.
+OLLAMA_API_URL="${OLLAMA_BASE_URL%/}"
+OLLAMA_API_URL="${OLLAMA_API_URL%/v1}"
+
+GEMINI_OUT="$SCRIPT_DIR/.results-gemini.json"
+OLLAMA_OUT="$SCRIPT_DIR/.results-ollama.json"
+# The Node report at the end reads these through process.env, so export them.
+export GEMINI_OUT OLLAMA_OUT OLLAMA_MODEL
+# Remove the temporary result files on every exit path, including failures.
+trap 'rm -f "$GEMINI_OUT" "$OLLAMA_OUT"' EXIT
+
+echo "╔══════════════════════════════════════════════════════════════╗"
+echo "║ Extraction Eval — Gemini vs Ollama ($OLLAMA_MODEL)"
+echo "╚══════════════════════════════════════════════════════════════╝"
+echo ""
+
+# ── Check extraction-service ─────────────────────────────────────
+echo "→ [1/4] Checking extraction-service at $EXTRACTION_SERVICE_URL ..."
+if ! curl -sf "$EXTRACTION_SERVICE_URL/health" > /dev/null 2>&1; then
+  echo "✗ extraction-service not running — start with: pnpm dev"
+  exit 1
+fi
+echo "✓ extraction-service up"
+
+# ── Check Ollama ─────────────────────────────────────────────────
+echo "→ [2/4] Checking Ollama at $OLLAMA_BASE_URL ..."
+if ! curl -sf "$OLLAMA_API_URL/api/tags" > /dev/null 2>&1; then
+  echo "✗ Ollama not running — start with: ollama serve"
+  exit 1
+fi
+
+# Check model is pulled. grep -F matches the name literally ("." is a regex
+# wildcard); -q is avoided because an early grep exit can SIGPIPE curl and,
+# under pipefail, report a pulled model as missing.
+if ! curl -sf "$OLLAMA_API_URL/api/tags" | grep -F -- "$OLLAMA_MODEL" > /dev/null; then
+  echo "✗ Model '$OLLAMA_MODEL' not found — pull with: ollama pull $OLLAMA_MODEL"
+  exit 1
+fi
+echo "✓ Ollama up, model '$OLLAMA_MODEL' available"
+
+# ── Run Gemini evals ─────────────────────────────────────────────
+# promptfoo's --output takes a file path, so results are written straight to
+# $GEMINI_OUT. `|| true` keeps failing test cases (non-zero exit) from aborting.
+echo ""
+echo "→ [3/4] Running Gemini evals (via extraction-service) ..."
+cd "$SERVICE_DIR"
+EXTRACTION_SERVICE_URL="$EXTRACTION_SERVICE_URL" \
+EXTRACTION_EVAL_TOKEN="${EXTRACTION_EVAL_TOKEN:-}" \
+EVAL_PRODUCT_ID="${EVAL_PRODUCT_ID:-lysnrai}" \
+  npx promptfoo eval \
+  --config "$SCRIPT_DIR/promptfoo.yaml" \
+  --output "$GEMINI_OUT" \
+  --no-cache \
+  --no-progress-bar \
+  > /dev/null 2>&1 || true
+
+# ── Run Ollama evals ─────────────────────────────────────────────
+echo "→ [4/4] Running Ollama evals ($OLLAMA_MODEL) ..."
+OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
+OLLAMA_MODEL="$OLLAMA_MODEL" \
+  npx promptfoo eval \
+  --config "$SCRIPT_DIR/promptfoo.ollama.yaml" \
+  --output "$OLLAMA_OUT" \
+  --no-cache \
+  --no-progress-bar \
+  > /dev/null 2>&1 || true
+
+# ── Compare results ──────────────────────────────────────────────
+echo ""
+echo "╔══════════════════════════════════════════════════════════════╗"
+echo "║                           RESULTS                            ║"
+echo "╠══════════════════════════════════════════════════════════════╣"
+
+node --input-type=module << 'EOF'
+import { readFileSync } from 'fs';
+
+const geminiPath = process.env.GEMINI_OUT;
+const ollamaPath = process.env.OLLAMA_OUT;
+
+// Summarise one promptfoo JSON results file: overall and per-task pass counts.
+// A missing or unparseable file yields an empty summary plus a warning on stderr.
+function parseResults(path) {
+  try {
+    const raw = JSON.parse(readFileSync(path, 'utf8'));
+    const results = raw.results?.results || [];
+    const total = results.length;
+    const passed = results.filter(r => r.success).length;
+    const byTask = {};
+    for (const r of results) {
+      const task = r.vars?.taskId || r.vars?.taskPrompt?.split(' ')[0] || 'unknown';
+      const key = task.length > 20 ? task.slice(0, 20) : task;
+      if (!byTask[key]) byTask[key] = { pass: 0, total: 0 };
+      byTask[key].total++;
+      if (r.success) byTask[key].pass++;
+    }
+    return { total, passed, byTask };
+  } catch (err) {
+    console.error(` ⚠ Could not read eval results from ${path}: ${err.message}`);
+    return { total: 0, passed: 0, byTask: {} };
+  }
+}
+
+const gemini = parseResults(geminiPath);
+const ollama = parseResults(ollamaPath);
+
+const pct = (p, t) => t === 0 ? 'N/A' : `${Math.round((p/t)*100)}%`;
+const bar = (p, t) => {
+  if (t === 0) return '░░░░░░░░░░';
+  const filled = Math.round((p/t)*10);
+  return '█'.repeat(filled) + '░'.repeat(10-filled);
+};
+
+console.log(`\n ${'Provider'.padEnd(12)} ${'Passed'.padEnd(10)} ${'Total'.padEnd(8)} ${'Rate'.padEnd(8)} Progress`);
+console.log(` ${'─'.repeat(55)}`);
+console.log(` ${'Gemini'.padEnd(12)} ${String(gemini.passed).padEnd(10)} ${String(gemini.total).padEnd(8)} ${pct(gemini.passed,gemini.total).padEnd(8)} ${bar(gemini.passed,gemini.total)}`);
+console.log(` ${'Ollama'.padEnd(12)} ${String(ollama.passed).padEnd(10)} ${String(ollama.total).padEnd(8)} ${pct(ollama.passed,ollama.total).padEnd(8)} ${bar(ollama.passed,ollama.total)}`);
+
+const allTasks = new Set([...Object.keys(gemini.byTask), ...Object.keys(ollama.byTask)]);
+if (allTasks.size > 0) {
+  console.log(`\n Per-task breakdown:`);
+  console.log(` ${'Task'.padEnd(25)} ${'Gemini'.padEnd(12)} ${'Ollama'.padEnd(12)}`);
+  console.log(` ${'─'.repeat(50)}`);
+  for (const task of allTasks) {
+    const g = gemini.byTask[task] || { pass: 0, total: 0 };
+    const o = ollama.byTask[task] || { pass: 0, total: 0 };
+    const gStr = `${g.pass}/${g.total} (${pct(g.pass,g.total)})`;
+    const oStr = `${o.pass}/${o.total} (${pct(o.pass,o.total)})`;
+    console.log(` ${task.padEnd(25)} ${gStr.padEnd(12)} ${oStr.padEnd(12)}`);
+  }
+}
+
+// Rank by pass rate; equal rates are a tie and two empty runs have no winner.
+const rate = ({ passed, total }) => (total === 0 ? 0 : passed / total);
+let winner;
+if (gemini.total === 0 && ollama.total === 0) {
+  winner = 'N/A (no results)';
+} else if (rate(gemini) === rate(ollama)) {
+  winner = 'Tie';
+} else {
+  winner = rate(gemini) > rate(ollama) ? 'Gemini' : `Ollama (${process.env.OLLAMA_MODEL})`;
+}
+console.log(`\n Winner: ${winner}`);
+EOF
+
+echo ""
+echo "╚══════════════════════════════════════════════════════════════╝"
diff --git a/services/extraction-service/evals/promptfoo.ollama.yaml b/services/extraction-service/evals/promptfoo.ollama.yaml
new file mode 100644
index 00000000..6c69687b
--- /dev/null
+++ b/services/extraction-service/evals/promptfoo.ollama.yaml
@@ -0,0 +1,311 @@
+# promptfoo eval config — Ollama (local OSS models)
+# Runs the same 19 extraction cases directly against Ollama's OpenAI-compatible API.
+#
+# Usage:
+#   pnpm eval:ollama                          # run with llama3.1:8b (default)
+#   OLLAMA_MODEL=qwen2.5:7b pnpm eval:ollama
+#   pnpm eval:compare                         # run both gemini + ollama and diff
+#
+# Prerequisites:
+#   1. ollama serve (running on localhost:11434)
+#   2. ollama pull llama3.1:8b
+#
+# NOTE: output is a raw JSON string from Ollama — every assertion uses JSON.parse(output).
+
+description: Extraction Service — LLM Output Quality Evals (Ollama / Local)
+
+providers:
+  - id: openai:chat:{{env.OLLAMA_MODEL | default('llama3.1:8b')}}
+    config:
+      apiBaseUrl: "{{env.OLLAMA_BASE_URL | default('http://localhost:11434/v1')}}"
+      apiKey: ollama
+      temperature: 0.1
+      response_format:
+        type: json_object
+
+prompts:
+  - |
+    You are a structured information extraction engine.
+
+    Task: {{taskPrompt}}
+
+    Extract entities from the text below. You MUST only use these extraction classes: {{classes}}
+
+    Return ONLY a valid JSON object in this exact format — no markdown, no explanation:
+    {
+      "extractions": [
+        {
+          "extraction_class": "",
+          "extraction_text": "",
+          "attributes": {}
+        }
+      ]
+    }
+
+    For brain_signal extractions, set attributes.brain to one of: work, home, money, health, global
+    For emotion/emotional_state extractions, set attributes.valence to: positive, negative, or neutral
+    For pattern extractions, set attributes.frequency to: recurring, occasional, or one-time
+    For severity extractions, set attributes.level to: critical, high, medium, or low
+
+    Text to extract from:
+    {{text}}
+
+defaultTest:
+  options:
+    timeoutMs: 90000
+  assert:
+    - type: latency
+      threshold: 60000
+
+tests:
+  # ── transcript-extraction ──────────────────────────────────────
+  - description: 'transcript: extracts action item and deadline'
+    vars:
+      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
+      classes: 'action_item, decision, question, deadline, person, topic'
+      text: 'John said we need to ship the feature by Friday. Sarah agreed to handle the testing.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action_item');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('deadline');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_text.toLowerCase()).some(t=>t.includes('friday')||t.includes('ship'));"
+
+  - description: 'transcript: extracts decision from meeting note'
+    vars:
+      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
+      classes: 'action_item, decision, question, deadline, person, topic'
+      text: 'The team decided to postpone the launch to Q3. Alice will notify all stakeholders by Monday.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('decision');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action_item');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('deadline');"
+
+  - description: 'transcript: extracts question from discussion'
+    vars:
+      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
+      classes: 'action_item, decision, question, deadline, person, topic'
+      text: 'Bob asked: should we use Postgres or Cosmos DB for the new service? No decision was made.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('question');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
+
+  - description: 'transcript: handles multi-person transcript'
+    vars:
+      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
+      classes: 'action_item, decision, question, deadline, person, topic'
+      text: "Maria: I finished the design mockups. Tom: Great, I'll review them by EOD. Maria: Can you also check the mobile screens?"
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action_item');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_text.toLowerCase()).some(t=>t.includes('maria')||t.includes('tom'));"
+      - type: javascript
+        value: 'const r=JSON.parse(output); return r.extractions.length>=3;'
+
+  # ── triage ─────────────────────────────────────────────────────
+  - description: 'triage: health brain signal for medical content'
+    vars:
+      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
+      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
+      text: "Remind me to call the dentist tomorrow about my appointment. I'm stressed about the cost."
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('date_reference');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotion');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='health');"
+
+  - description: 'triage: work brain signal for project content'
+    vars:
+      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
+      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
+      text: 'Need to finish the Q1 report for my manager by end of week. The presentation is on Thursday.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('date_reference');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='work');"
+
+  - description: 'triage: money brain signal for financial content'
+    vars:
+      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
+      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
+      text: "I need to pay the credit card bill before the 15th or I'll get charged interest."
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('date_reference');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='money');"
+
+  - description: 'triage: negative emotion detected'
+    vars:
+      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals.'
+      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
+      text: "Feeling really overwhelmed today. Too many things on my plate and I can't focus."
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotion');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='emotion'&&e.attributes&&e.attributes.valence==='negative');"
+
+  - description: 'triage: multiple brain signals for mixed content'
+    vars:
+      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
+      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
+      text: 'Doctor said I need to exercise more. Also need to check my 401k contributions before year end.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='health');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='money');"
+
+  # ── memory-insight ─────────────────────────────────────────────
+  - description: 'memory-insight: detects recurring pattern'
+    vars:
+      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
+      classes: 'pattern, recurring_theme, relationship, milestone'
+      text: 'Item 1: Skipped gym again. Item 2: Feeling tired at work. Item 3: Had coffee at 4pm to stay awake. Item 4: Skipped gym for the third time this week.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('pattern');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='pattern'&&e.attributes&&e.attributes.frequency==='recurring');"
+
+  - description: 'memory-insight: detects relationship between items'
+    vars:
+      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
+      classes: 'pattern, recurring_theme, relationship, milestone'
+      text: "Item 1: Stayed up until 2am coding. Item 2: Missed standup the next morning. Item 3: Felt foggy all day. Item 4: Late night again, can't stop."
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('relationship');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('pattern');"
+
+  - description: 'memory-insight: detects milestone'
+    vars:
+      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
+      classes: 'pattern, recurring_theme, relationship, milestone'
+      text: 'Item 1: Started learning Spanish 3 months ago. Item 2: Had first full conversation in Spanish today. Item 3: Completed Duolingo 90-day streak.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('milestone');"
+      - type: javascript
+        value: 'const r=JSON.parse(output); return r.extractions.length>=2;'
+
+  - description: 'memory-insight: detects recurring theme across entries'
+    vars:
+      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
+      classes: 'pattern, recurring_theme, relationship, milestone'
+      text: 'Entry 1: Anxious before the presentation. Entry 2: Nervous about the client call. Entry 3: Worried about the demo tomorrow. Entry 4: Stressed about the board meeting.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('recurring_theme');"
+      - type: javascript
+        value: 'const r=JSON.parse(output); return r.extractions.length>=1;'
+
+  # ── reflection-enrichment ──────────────────────────────────────
+  - description: 'reflection: extracts accomplishment and concern'
+    vars:
+      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
+      classes: 'emotional_state, accomplishment, concern, goal_progress'
+      text: "Good day overall. Finally finished the proposal I've been putting off. Still worried about the budget review next week."
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('concern');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotional_state');"
+
+  - description: 'reflection: positive emotional state detected'
+    vars:
+      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
+      classes: 'emotional_state, accomplishment, concern, goal_progress'
+      text: 'Had a fantastic week. Shipped the new feature, got great feedback from users, and the team celebrated together.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotional_state');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='emotional_state'&&e.attributes&&e.attributes.valence==='positive');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
+
+  - description: 'reflection: goal progress detected'
+    vars:
+      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
+      classes: 'emotional_state, accomplishment, concern, goal_progress'
+      text: "I've been trying to read more this year. This month I finished my third book — ahead of my goal of one per month."
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('goal_progress');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
+
+  - description: 'reflection: mixed positive and negative signals'
+    vars:
+      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
+      classes: 'emotional_state, accomplishment, concern, goal_progress'
+      text: "Proud of finishing the marathon training plan. But I'm really worried I won't be able to run the actual race — my knee has been acting up."
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('concern');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='emotional_state'&&e.attributes&&e.attributes.valence==='positive');"
+
+  # ── bug-report-extraction ──────────────────────────────────────
+  - description: 'bug-report: extracts all 5 fields'
+    vars:
+      taskPrompt: 'Extract steps to reproduce, expected behavior, actual behavior, affected component, and severity from the bug report.'
+      classes: 'steps_to_reproduce, expected_behavior, actual_behavior, affected_component, severity'
+      text: 'When I click the save button on the settings page, nothing happens. It should save my preferences. This is a critical issue affecting all users.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('steps_to_reproduce');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('expected_behavior');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('actual_behavior');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('affected_component');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('severity');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='severity'&&e.attributes&&e.attributes.level==='critical');"
+
+  - description: 'bug-report: extracts steps and component from login bug'
+    vars:
+      taskPrompt: 'Extract steps to reproduce, expected behavior, actual behavior, affected component, and severity from the bug report.'
+      classes: 'steps_to_reproduce, expected_behavior, actual_behavior, affected_component, severity'
+      text: 'Steps: 1) Open login page, 2) Enter valid credentials, 3) Click login. Expected: redirect to dashboard. Actual: spinner shows forever. Affects the login page on mobile.'
+    assert:
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('steps_to_reproduce');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('expected_behavior');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('actual_behavior');"
+      - type: javascript
+        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('affected_component');"