feat(extraction-service): add Ollama local model eval config and compare script

- Add evals/promptfoo.ollama.yaml: same 19 cases hitting Ollama OpenAI-compat
  API directly (no extraction-service needed); all assertions use inline
  JSON.parse(output) to handle raw string response from Ollama
- Add evals/compare-evals.sh: runs Gemini + Ollama evals back-to-back and
  prints side-by-side pass-rate comparison table
- Supports OLLAMA_MODEL env var (default: llama3.1:8b)
This commit is contained in:
saravanakumardb1 2026-02-19 12:19:24 -08:00
parent acd4c3542b
commit da9ca9dc1a
2 changed files with 453 additions and 0 deletions

View File

@ -0,0 +1,142 @@
#!/usr/bin/env bash
# compare-evals.sh — Run evals against both Gemini (via extraction-service) and
# Ollama (local), then print a side-by-side pass-rate comparison.
#
# Usage:
#   ./evals/compare-evals.sh
#   OLLAMA_MODEL=qwen2.5:7b ./evals/compare-evals.sh
#
# Environment (all optional):
#   EXTRACTION_SERVICE_URL  extraction-service base URL   (default: http://localhost:4005)
#   OLLAMA_BASE_URL         Ollama OpenAI-compatible URL  (default: http://localhost:11434/v1)
#   OLLAMA_MODEL            model tag to evaluate         (default: llama3.1:8b)
#   EXTRACTION_EVAL_TOKEN   passed through to the Gemini eval (default: empty)
#   EVAL_PRODUCT_ID         passed through to the Gemini eval (default: lysnrai)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
EXTRACTION_SERVICE_URL="${EXTRACTION_SERVICE_URL:-http://localhost:4005}"
OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}"
OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}"
GEMINI_OUT="$SCRIPT_DIR/.results-gemini.json"
OLLAMA_OUT="$SCRIPT_DIR/.results-ollama.json"

# The Node report at the end of this script reads these via process.env.
# Plain shell assignments are not inherited by child processes, so without the
# export the report would see undefined paths and always print 0 / N/A.
export GEMINI_OUT OLLAMA_OUT OLLAMA_MODEL

# Root of Ollama's native API: OLLAMA_BASE_URL without a trailing slash and
# without the OpenAI-compat "/v1" suffix, so the health check follows a
# non-default OLLAMA_BASE_URL instead of always probing localhost.
OLLAMA_API_ROOT="${OLLAMA_BASE_URL%/}"
OLLAMA_API_ROOT="${OLLAMA_API_ROOT%/v1}"

# Never read results left behind by an earlier aborted run, and never leave
# result files behind when this run exits early.
cleanup() { rm -f "$GEMINI_OUT" "$OLLAMA_OUT"; }
trap cleanup EXIT
cleanup

echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ Extraction Eval — Gemini vs Ollama ($OLLAMA_MODEL)"
echo "╚══════════════════════════════════════════════════════════════╝"
echo ""

# ── Check extraction-service ─────────────────────────────────────
echo "→ [1/4] Checking extraction-service at $EXTRACTION_SERVICE_URL ..."
if ! curl -sf "$EXTRACTION_SERVICE_URL/health" > /dev/null 2>&1; then
  echo "✗ extraction-service not running — start with: pnpm dev"
  exit 1
fi
echo "✓ extraction-service up"

# ── Check Ollama ─────────────────────────────────────────────────
echo "→ [2/4] Checking Ollama at $OLLAMA_API_ROOT ..."
# Fetch the tag list once and reuse it for both checks. Capturing the body
# (instead of `curl | grep -q`) also avoids a spurious pipeline failure under
# `pipefail` when grep exits before curl has finished writing.
if ! OLLAMA_TAGS="$(curl -sf "$OLLAMA_API_ROOT/api/tags" 2>/dev/null)"; then
  echo "✗ Ollama not running — start with: ollama serve"
  exit 1
fi
# Check model is pulled. -F matches the name literally: model tags contain "."
# which would otherwise act as a regex wildcard.
if ! grep -qF -- "$OLLAMA_MODEL" <<< "$OLLAMA_TAGS"; then
  echo "✗ Model '$OLLAMA_MODEL' not found — pull with: ollama pull $OLLAMA_MODEL"
  exit 1
fi
echo "✓ Ollama up, model '$OLLAMA_MODEL' available"

# ── Run Gemini evals ─────────────────────────────────────────────
echo ""
echo "→ [3/4] Running Gemini evals (via extraction-service) ..."
cd "$SERVICE_DIR"
# --output takes a file path (format chosen from the extension), so the JSON
# results are written straight to $GEMINI_OUT rather than scraped from stdout.
# promptfoo exits non-zero when test cases fail, which is an expected outcome
# here, hence `|| true`.
EXTRACTION_SERVICE_URL="$EXTRACTION_SERVICE_URL" \
EXTRACTION_EVAL_TOKEN="${EXTRACTION_EVAL_TOKEN:-}" \
EVAL_PRODUCT_ID="${EVAL_PRODUCT_ID:-lysnrai}" \
npx promptfoo eval \
  --config "$SCRIPT_DIR/promptfoo.yaml" \
  --output "$GEMINI_OUT" \
  --no-cache \
  --no-progress-bar \
  > /dev/null 2>&1 || true
[[ -s "$GEMINI_OUT" ]] || echo "⚠ Gemini eval produced no results file" >&2

# ── Run Ollama evals ─────────────────────────────────────────────
echo "→ [4/4] Running Ollama evals ($OLLAMA_MODEL) ..."
OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
OLLAMA_MODEL="$OLLAMA_MODEL" \
npx promptfoo eval \
  --config "$SCRIPT_DIR/promptfoo.ollama.yaml" \
  --output "$OLLAMA_OUT" \
  --no-cache \
  --no-progress-bar \
  > /dev/null 2>&1 || true
[[ -s "$OLLAMA_OUT" ]] || echo "⚠ Ollama eval produced no results file" >&2

# ── Compare results ──────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ RESULTS ║"
echo "╠══════════════════════════════════════════════════════════════╣"
node --input-type=module << 'EOF'
import { readFileSync } from 'fs';

// Paths of the promptfoo JSON result files, read from the environment.
const geminiPath = process.env.GEMINI_OUT;
const ollamaPath = process.env.OLLAMA_OUT;

/** Build a fresh zeroed summary so callers never share a mutable object. */
const emptySummary = () => ({ total: 0, passed: 0, byTask: {} });

/**
 * Load a promptfoo JSON results file and tally pass counts.
 *
 * Reads `results.results` from the file; each entry counts as passed when its
 * `success` field is truthy. Entries are grouped by `vars.taskId`, falling back
 * to the first word of `vars.taskPrompt`, then to 'unknown'; group keys are
 * capped at 20 characters.
 *
 * @param {string | undefined} path - Results file path.
 * @returns {{ total: number, passed: number, byTask: Record<string, { pass: number, total: number }> }}
 *   Zeroed summary (with a warning on stderr) when the path is unset or the
 *   file cannot be read or parsed.
 */
function parseResults(path) {
  if (!path) {
    console.error(' ⚠ results path is not set — was the variable exported?');
    return emptySummary();
  }

  let raw;
  try {
    raw = JSON.parse(readFileSync(path, 'utf8'));
  } catch (err) {
    // Best-effort report: keep going, but say why this provider shows N/A.
    console.error(` ⚠ could not load results from ${path}: ${err.message}`);
    return emptySummary();
  }

  const results = Array.isArray(raw?.results?.results) ? raw.results.results : [];
  const summary = emptySummary();
  summary.total = results.length;

  for (const r of results) {
    const task = String(r?.vars?.taskId || r?.vars?.taskPrompt?.split(' ')[0] || 'unknown');
    const key = task.slice(0, 20);
    if (!summary.byTask[key]) {
      summary.byTask[key] = { pass: 0, total: 0 };
    }
    summary.byTask[key].total++;
    if (r?.success) {
      summary.passed++;
      summary.byTask[key].pass++;
    }
  }
  return summary;
}

/** Format passed/total as a whole-number percentage, or 'N/A' when total is 0. */
const pct = (p, t) => (t === 0 ? 'N/A' : `${Math.round((p / t) * 100)}%`);

/** Render passed/total as a 10-cell bar; all cells empty when total is 0. */
const bar = (p, t) => {
  if (t === 0) return '░░░░░░░░░░';
  const filled = Math.round((p / t) * 10);
  return '█'.repeat(filled) + '░'.repeat(10 - filled);
};

/**
 * Decide which provider to report as the winner.
 *
 * Pass rates are compared by cross-multiplication, so differing totals are
 * handled without floating-point comparison.
 *
 * @param {{ passed: number, total: number }} geminiSummary
 * @param {{ passed: number, total: number }} ollamaSummary
 * @param {string} ollamaLabel - Label printed when Ollama wins.
 * @returns {string} 'Gemini', `ollamaLabel`, 'Tie', or 'N/A (missing results)'
 *   when either side has no results to compare.
 */
function pickWinner(geminiSummary, ollamaSummary, ollamaLabel) {
  if (geminiSummary.total === 0 || ollamaSummary.total === 0) {
    return 'N/A (missing results)';
  }
  const geminiScore = geminiSummary.passed * ollamaSummary.total;
  const ollamaScore = ollamaSummary.passed * geminiSummary.total;
  if (geminiScore === ollamaScore) return 'Tie';
  return geminiScore > ollamaScore ? 'Gemini' : ollamaLabel;
}

const gemini = parseResults(geminiPath);
const ollama = parseResults(ollamaPath);

console.log(`\n ${'Provider'.padEnd(12)} ${'Passed'.padEnd(10)} ${'Total'.padEnd(8)} ${'Rate'.padEnd(8)} Progress`);
console.log(` ${'─'.repeat(55)}`);
console.log(` ${'Gemini'.padEnd(12)} ${String(gemini.passed).padEnd(10)} ${String(gemini.total).padEnd(8)} ${pct(gemini.passed, gemini.total).padEnd(8)} ${bar(gemini.passed, gemini.total)}`);
console.log(` ${'Ollama'.padEnd(12)} ${String(ollama.passed).padEnd(10)} ${String(ollama.total).padEnd(8)} ${pct(ollama.passed, ollama.total).padEnd(8)} ${bar(ollama.passed, ollama.total)}`);

// Union of task keys so a task seen by only one provider still gets a row.
const allTasks = new Set([...Object.keys(gemini.byTask), ...Object.keys(ollama.byTask)]);
if (allTasks.size > 0) {
  console.log(`\n Per-task breakdown:`);
  console.log(` ${'Task'.padEnd(25)} ${'Gemini'.padEnd(12)} ${'Ollama'.padEnd(12)}`);
  console.log(` ${'─'.repeat(50)}`);
  for (const task of allTasks) {
    const g = gemini.byTask[task] || { pass: 0, total: 0 };
    const o = ollama.byTask[task] || { pass: 0, total: 0 };
    const gStr = `${g.pass}/${g.total} (${pct(g.pass, g.total)})`;
    const oStr = `${o.pass}/${o.total} (${pct(o.pass, o.total)})`;
    console.log(` ${task.padEnd(25)} ${gStr.padEnd(12)} ${oStr.padEnd(12)}`);
  }
}

const winner = pickWinner(gemini, ollama, `Ollama (${process.env.OLLAMA_MODEL})`);
console.log(`\n Winner: ${winner}`);
EOF

# Discard the intermediate promptfoo result files; the printed report is the
# only output this script keeps.
temp_results=("$GEMINI_OUT" "$OLLAMA_OUT")
rm -f "${temp_results[@]}"

# Blank spacer line, then the bottom border of the results box.
printf '\n'
printf '%s\n' "╚══════════════════════════════════════════════════════════════╝"

View File

@ -0,0 +1,311 @@
# promptfoo eval config — Ollama (local OSS models)
# Runs the same 19 extraction cases directly against Ollama's OpenAI-compatible API.
#
# Usage:
# pnpm eval:ollama # run with llama3.1:8b (default)
# OLLAMA_MODEL=qwen2.5:7b pnpm eval:ollama
# pnpm eval:compare # run both gemini + ollama and diff
#
# Prerequisites:
# 1. ollama serve (running on localhost:11434)
# 2. ollama pull llama3.1:8b
#
# NOTE: output is a raw JSON string from Ollama — every assertion uses JSON.parse(output).
description: "Extraction Service — LLM Output Quality Evals (Ollama / Local)"

# One provider: the OpenAI chat provider pointed at the URL in OLLAMA_BASE_URL.
# Model and base URL both come from the environment, with local defaults.
providers:
  - id: openai:chat:{{env.OLLAMA_MODEL | default('llama3.1:8b')}}
    config:
      apiBaseUrl: "{{env.OLLAMA_BASE_URL | default('http://localhost:11434/v1')}}"
      # NOTE(review): presumably a placeholder value for the local server — confirm.
      apiKey: ollama
      temperature: 0.1
      # Ask for a JSON object response; assertions still JSON.parse the raw output.
      response_format: { type: json_object }
# Single prompt template shared by every test case. {{taskPrompt}}, {{classes}}
# and {{text}} are filled from each test's `vars`. The JSON shape requested here
# (extractions[].extraction_class / extraction_text / attributes) is exactly what
# the javascript assertions below read, so keep the two in sync.
prompts:
  - |
    You are a structured information extraction engine.
    Task: {{taskPrompt}}
    Extract entities from the text below. You MUST only use these extraction classes: {{classes}}
    Return ONLY a valid JSON object in this exact format — no markdown, no explanation:
    {
    "extractions": [
    {
    "extraction_class": "<one of the allowed classes>",
    "extraction_text": "<verbatim text from the input>",
    "attributes": {}
    }
    ]
    }
    For brain_signal extractions, set attributes.brain to one of: work, home, money, health, global
    For emotion/emotional_state extractions, set attributes.valence to: positive, negative, or neutral
    For pattern extractions, set attributes.frequency to: recurring, occasional, or one-time
    For severity extractions, set attributes.level to: critical, high, medium, or low
    Text to extract from:
    {{text}}
# Defaults merged into every test case: a 90 s timeout option and a latency
# assertion with a 60 s threshold, in addition to each test's own assertions.
defaultTest:
  options: { timeoutMs: 90000 }
  assert:
    - { type: latency, threshold: 60000 }
# Test cases. Every javascript assertion parses the raw model output with
# JSON.parse(output) and inspects the `extractions` array; an assertion that
# throws (invalid JSON, missing `extractions`) is expected to count as a failure.
tests:
  # ── transcript-extraction ──────────────────────────────────────
  # Meeting-transcript inputs: assertions check that the expected extraction
  # classes appear and, where given, that key words appear in the extracted text.
  - description: 'transcript: extracts action item and deadline'
    vars:
      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
      classes: 'action_item, decision, question, deadline, person, topic'
      text: 'John said we need to ship the feature by Friday. Sarah agreed to handle the testing.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action_item');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('deadline');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
      # At least one extracted span must mention the deadline or the work item.
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_text.toLowerCase()).some(t=>t.includes('friday')||t.includes('ship'));"
  - description: 'transcript: extracts decision from meeting note'
    vars:
      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
      classes: 'action_item, decision, question, deadline, person, topic'
      text: 'The team decided to postpone the launch to Q3. Alice will notify all stakeholders by Monday.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('decision');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action_item');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('deadline');"
  - description: 'transcript: extracts question from discussion'
    vars:
      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
      classes: 'action_item, decision, question, deadline, person, topic'
      text: 'Bob asked: should we use Postgres or Cosmos DB for the new service? No decision was made.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('question');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
  - description: 'transcript: handles multi-person transcript'
    vars:
      taskPrompt: 'Extract action items, decisions, questions, deadlines, people, and topics from the following transcript.'
      classes: 'action_item, decision, question, deadline, person, topic'
      text: "Maria: I finished the design mockups. Tom: Great, I'll review them by EOD. Maria: Can you also check the mobile screens?"
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action_item');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('person');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_text.toLowerCase()).some(t=>t.includes('maria')||t.includes('tom'));"
      # Requires at least three extractions in total.
      - type: javascript
        value: 'const r=JSON.parse(output); return r.extractions.length>=3;'
# ── triage ─────────────────────────────────────────────────────
# User-capture inputs: besides class presence, these check attribute values —
# attributes.brain on brain_signal extractions and attributes.valence on
# emotion extractions — matching the attribute rules stated in the prompt.
  - description: 'triage: health brain signal for medical content'
    vars:
      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: "Remind me to call the dentist tomorrow about my appointment. I'm stressed about the cost."
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('date_reference');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotion');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='health');"
  - description: 'triage: work brain signal for project content'
    vars:
      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: 'Need to finish the Q1 report for my manager by end of week. The presentation is on Thursday.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('date_reference');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='work');"
  - description: 'triage: money brain signal for financial content'
    vars:
      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: "I need to pay the credit card bill before the 15th or I'll get charged interest."
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('action');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('date_reference');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='money');"
  # This case's taskPrompt omits the brain-routing sentence used by the other triage cases.
  - description: 'triage: negative emotion detected'
    vars:
      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals.'
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: "Feeling really overwhelmed today. Too many things on my plate and I can't focus."
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotion');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='emotion'&&e.attributes&&e.attributes.valence==='negative');"
  # Mixed content: expects two separate brain_signal extractions (health and money).
  - description: 'triage: multiple brain signals for mixed content'
    vars:
      taskPrompt: 'Analyze the user capture and extract topics, named entities, action items, emotional signals, date references, and brain routing signals. Brain signals should include which brain (work, home, money, health, global) the content belongs to.'
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: 'Doctor said I need to exercise more. Also need to check my 401k contributions before year end.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='health');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='money');"
# ── memory-insight ─────────────────────────────────────────────
# Multi-item inputs: assertions check for pattern / relationship / milestone /
# recurring_theme classes, plus attributes.frequency on pattern extractions.
  - description: 'memory-insight: detects recurring pattern'
    vars:
      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: 'Item 1: Skipped gym again. Item 2: Feeling tired at work. Item 3: Had coffee at 4pm to stay awake. Item 4: Skipped gym for the third time this week.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('pattern');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='pattern'&&e.attributes&&e.attributes.frequency==='recurring');"
  - description: 'memory-insight: detects relationship between items'
    vars:
      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: "Item 1: Stayed up until 2am coding. Item 2: Missed standup the next morning. Item 3: Felt foggy all day. Item 4: Late night again, can't stop."
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('relationship');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('pattern');"
  - description: 'memory-insight: detects milestone'
    vars:
      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: 'Item 1: Started learning Spanish 3 months ago. Item 2: Had first full conversation in Spanish today. Item 3: Completed Duolingo 90-day streak.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('milestone');"
      # Requires at least two extractions in total.
      - type: javascript
        value: 'const r=JSON.parse(output); return r.extractions.length>=2;'
  - description: 'memory-insight: detects recurring theme across entries'
    vars:
      taskPrompt: 'Analyze the collection of memory items and extract recurring patterns, themes, relationships between items, and milestones.'
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: 'Entry 1: Anxious before the presentation. Entry 2: Nervous about the client call. Entry 3: Worried about the demo tomorrow. Entry 4: Stressed about the board meeting.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('recurring_theme');"
      # Redundant with the check above whenever that one passes; kept as written.
      - type: javascript
        value: 'const r=JSON.parse(output); return r.extractions.length>=1;'
# ── reflection-enrichment ──────────────────────────────────────
# Journal-entry inputs: assertions check for accomplishment / concern /
# emotional_state / goal_progress classes, plus attributes.valence on
# emotional_state extractions where a polarity is expected.
  - description: 'reflection: extracts accomplishment and concern'
    vars:
      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: "Good day overall. Finally finished the proposal I've been putting off. Still worried about the budget review next week."
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('concern');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotional_state');"
  - description: 'reflection: positive emotional state detected'
    vars:
      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: 'Had a fantastic week. Shipped the new feature, got great feedback from users, and the team celebrated together.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('emotional_state');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='emotional_state'&&e.attributes&&e.attributes.valence==='positive');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
  - description: 'reflection: goal progress detected'
    vars:
      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: "I've been trying to read more this year. This month I finished my third book — ahead of my goal of one per month."
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('goal_progress');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
  # Mixed signals: only the positive emotional_state is asserted; the negative
  # side is covered through the `concern` class check.
  - description: 'reflection: mixed positive and negative signals'
    vars:
      taskPrompt: 'Analyze the reflection or journal entry and extract emotional states, accomplishments, concerns, and goal progress indicators.'
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: "Proud of finishing the marathon training plan. But I'm really worried I won't be able to run the actual race — my knee has been acting up."
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('accomplishment');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('concern');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='emotional_state'&&e.attributes&&e.attributes.valence==='positive');"
# ── bug-report-extraction ──────────────────────────────────────
# Bug-report inputs: assertions check for the structured bug-report classes,
# plus attributes.level on the severity extraction in the first case.
  - description: 'bug-report: extracts all 5 fields'
    vars:
      taskPrompt: 'Extract steps to reproduce, expected behavior, actual behavior, affected component, and severity from the bug report.'
      classes: 'steps_to_reproduce, expected_behavior, actual_behavior, affected_component, severity'
      text: 'When I click the save button on the settings page, nothing happens. It should save my preferences. This is a critical issue affecting all users.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('steps_to_reproduce');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('expected_behavior');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('actual_behavior');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('affected_component');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('severity');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.some(e=>e.extraction_class==='severity'&&e.attributes&&e.attributes.level==='critical');"
  # The input states no severity, and none is asserted for this case.
  - description: 'bug-report: extracts steps and component from login bug'
    vars:
      taskPrompt: 'Extract steps to reproduce, expected behavior, actual behavior, affected component, and severity from the bug report.'
      classes: 'steps_to_reproduce, expected_behavior, actual_behavior, affected_component, severity'
      text: 'Steps: 1) Open login page, 2) Enter valid credentials, 3) Click login. Expected: redirect to dashboard. Actual: spinner shows forever. Affects the login page on mobile.'
    assert:
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('steps_to_reproduce');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('expected_behavior');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('actual_behavior');"
      - type: javascript
        value: "const r=JSON.parse(output); return r.extractions.map(e=>e.extraction_class).includes('affected_component');"