#!/usr/bin/env bash
# compare-evals.sh — Run evals against both Gemini (via extraction-service) and
# Ollama (local), then print a side-by-side pass-rate comparison.
#
# Usage:
#   ./evals/compare-evals.sh
#   OLLAMA_MODEL=qwen2.5:7b ./evals/compare-evals.sh
#
# Environment (all optional):
#   EXTRACTION_SERVICE_URL  extraction-service base URL (default: http://localhost:4005)
#   OLLAMA_BASE_URL         Ollama base URL handed to promptfoo (default: http://localhost:11434/v1)
#   OLLAMA_MODEL            model tag to evaluate (default: llama3.1:8b)
#   EXTRACTION_EVAL_TOKEN   forwarded to the Gemini eval run (default: empty)
#   EVAL_PRODUCT_ID         forwarded to the Gemini eval run (default: lysnrai)
#
# Requires: curl, node, npx.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

EXTRACTION_SERVICE_URL="${EXTRACTION_SERVICE_URL:-http://localhost:4005}"
OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}"
OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}"

# Native Ollama endpoints (/api/...) live at the server root, so strip a
# trailing slash and a trailing "/v1" from the configured base URL instead of
# assuming localhost:11434.
OLLAMA_API_ROOT="${OLLAMA_BASE_URL%/}"
OLLAMA_API_ROOT="${OLLAMA_API_ROOT%/v1}"

GEMINI_OUT="$SCRIPT_DIR/.results-gemini.json"
OLLAMA_OUT="$SCRIPT_DIR/.results-ollama.json"

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the intermediate result files; runs on every exit path.
cleanup() {
  rm -f -- "$GEMINI_OUT" "$OLLAMA_OUT"
}
trap cleanup EXIT

# Fail early with a clear message; otherwise a missing npx would be hidden by
# the "2>/dev/null ... || true" on the eval runs below.
for tool in curl node npx; do
  command -v "$tool" > /dev/null || die "✗ required command not found: $tool"
done

echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ Extraction Eval — Gemini vs Ollama ($OLLAMA_MODEL)"
echo "╚══════════════════════════════════════════════════════════════╝"
echo ""

# ── Check extraction-service ─────────────────────────────────────
echo "→ [1/4] Checking extraction-service at $EXTRACTION_SERVICE_URL ..."
if ! curl -sf "$EXTRACTION_SERVICE_URL/health" > /dev/null 2>&1; then
  die "✗ extraction-service not running — start with: pnpm dev"
fi
echo "✓ extraction-service up"

# ── Check Ollama ─────────────────────────────────────────────────
echo "→ [2/4] Checking Ollama at $OLLAMA_BASE_URL ..."
# Fetch the tag list once and reuse it for the model check. Capturing the body
# (rather than piping curl into grep -q) avoids a spurious pipeline failure
# under pipefail when grep exits before curl has finished writing.
if ! ollama_tags="$(curl -sf "$OLLAMA_API_ROOT/api/tags" 2> /dev/null)"; then
  die "✗ Ollama not running — start with: ollama serve"
fi

# Check model is pulled. -F makes the tag a literal string (model names
# contain '.'), and -- protects against a value starting with '-'.
# This is still a substring match on the raw response, not a JSON lookup.
if ! grep -qF -- "$OLLAMA_MODEL" <<< "$ollama_tags"; then
  die "✗ Model '$OLLAMA_MODEL' not found — pull with: ollama pull $OLLAMA_MODEL"
fi
echo "✓ Ollama up, model '$OLLAMA_MODEL' available"

# ── Run Gemini evals ─────────────────────────────────────────────
echo ""
echo "→ [3/4] Running Gemini evals (via extraction-service) ..."
cd "$SERVICE_DIR"
# The exit status is ignored on purpose: the run is presumably non-zero when
# test cases fail (TODO confirm against promptfoo docs), and failures are what
# this script is meant to count.
# NOTE(review): confirm that `--output json` makes promptfoo emit the JSON
# document on stdout, which is what gets captured here.
EXTRACTION_SERVICE_URL="$EXTRACTION_SERVICE_URL" \
EXTRACTION_EVAL_TOKEN="${EXTRACTION_EVAL_TOKEN:-}" \
EVAL_PRODUCT_ID="${EVAL_PRODUCT_ID:-lysnrai}" \
  npx promptfoo eval \
    --config "$SCRIPT_DIR/promptfoo.yaml" \
    --output json \
    --no-cache \
    --no-progress-bar \
    2> /dev/null > "$GEMINI_OUT" || true
[[ -s "$GEMINI_OUT" ]] \
  || echo "⚠ Gemini eval produced no output — results will show as N/A" >&2

# ── Run Ollama evals ─────────────────────────────────────────────
echo "→ [4/4] Running Ollama evals ($OLLAMA_MODEL) ..."
# Exit status ignored for the same reason as the Gemini run above.
OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
OLLAMA_MODEL="$OLLAMA_MODEL" \
  npx promptfoo eval \
    --config "$SCRIPT_DIR/promptfoo.ollama.yaml" \
    --output json \
    --no-cache \
    --no-progress-bar \
    2> /dev/null > "$OLLAMA_OUT" || true
[[ -s "$OLLAMA_OUT" ]] \
  || echo "⚠ Ollama eval produced no output — results will show as N/A" >&2

# ── Compare results ──────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ RESULTS ║"
echo "╠══════════════════════════════════════════════════════════════╣"

# The report script reads its inputs from process.env. These are plain shell
# variables (not exported), so they must be placed in node's environment
# explicitly or the script would see `undefined` for all three.
GEMINI_OUT="$GEMINI_OUT" \
OLLAMA_OUT="$OLLAMA_OUT" \
OLLAMA_MODEL="$OLLAMA_MODEL" \
  node --input-type=module << 'EOF'
import { readFileSync } from 'fs';

const geminiPath = process.env.GEMINI_OUT;
const ollamaPath = process.env.OLLAMA_OUT;

// Load one promptfoo result file and tally passes overall and per task.
// Any read/parse problem yields an empty tally (shown as N/A) plus a warning
// on stderr, so one broken run does not abort the whole report.
function parseResults(path, label) {
  try {
    const raw = JSON.parse(readFileSync(path, 'utf8'));
    const results = raw.results?.results || [];
    const total = results.length;
    const passed = results.filter(r => r.success).length;
    const byTask = {};
    for (const r of results) {
      // Group by explicit task id, else by the first word of the prompt.
      const task = r.vars?.taskId || r.vars?.taskPrompt?.split(' ')[0] || 'unknown';
      // Truncate long names so they fit the 25-column task field.
      const key = task.length > 20 ? task.slice(0, 20) : task;
      if (!byTask[key]) byTask[key] = { pass: 0, total: 0 };
      byTask[key].total++;
      if (r.success) byTask[key].pass++;
    }
    return { total, passed, byTask };
  } catch (err) {
    console.error(`⚠ Could not read ${label} results: ${err.message}`);
    return { total: 0, passed: 0, byTask: {} };
  }
}

const gemini = parseResults(geminiPath, 'Gemini');
const ollama = parseResults(ollamaPath, 'Ollama');

// Percentage label; 'N/A' when there is nothing to divide by.
const pct = (p, t) => t === 0 ? 'N/A' : `${Math.round((p/t)*100)}%`;

// Ten-cell progress bar proportional to the pass rate.
const bar = (p, t) => {
  if (t === 0) return '░░░░░░░░░░';
  const filled = Math.round((p/t)*10);
  return '█'.repeat(filled) + '░'.repeat(10-filled);
};

// One summary row for a provider.
const summaryRow = (name, r) =>
  ` ${name.padEnd(12)} ${String(r.passed).padEnd(10)} ${String(r.total).padEnd(8)} ${pct(r.passed,r.total).padEnd(8)} ${bar(r.passed,r.total)}`;

console.log(`\n ${'Provider'.padEnd(12)} ${'Passed'.padEnd(10)} ${'Total'.padEnd(8)} ${'Rate'.padEnd(8)} Progress`);
console.log(` ${'─'.repeat(55)}`);
console.log(summaryRow('Gemini', gemini));
console.log(summaryRow('Ollama', ollama));

const allTasks = new Set([...Object.keys(gemini.byTask), ...Object.keys(ollama.byTask)]);
if (allTasks.size > 0) {
  console.log(`\n Per-task breakdown:`);
  console.log(` ${'Task'.padEnd(25)} ${'Gemini'.padEnd(12)} ${'Ollama'.padEnd(12)}`);
  console.log(` ${'─'.repeat(50)}`);
  for (const task of allTasks) {
    const g = gemini.byTask[task] || { pass: 0, total: 0 };
    const o = ollama.byTask[task] || { pass: 0, total: 0 };
    const gStr = `${g.pass}/${g.total} (${pct(g.pass,g.total)})`;
    const oStr = `${o.pass}/${o.total} (${pct(o.pass,o.total)})`;
    console.log(` ${task.padEnd(25)} ${gStr.padEnd(12)} ${oStr.padEnd(12)}`);
  }
}

// Decide the winner on pass rate, so runs with different totals compare
// fairly. A provider with no results cannot win or lose, and equal rates are
// reported as a tie rather than defaulting to Gemini.
const rateOf = (r) => (r.total === 0 ? null : r.passed / r.total);
const geminiRate = rateOf(gemini);
const ollamaRate = rateOf(ollama);

let winner;
if (geminiRate === null || ollamaRate === null) {
  winner = 'N/A (missing results)';
} else if (geminiRate === ollamaRate) {
  winner = 'Tie';
} else if (geminiRate > ollamaRate) {
  winner = 'Gemini';
} else {
  winner = `Ollama (${process.env.OLLAMA_MODEL})`;
}
console.log(`\n Winner: ${winner}`);
EOF

# Temp files are removed by the EXIT trap installed above.
echo ""
echo "╚══════════════════════════════════════════════════════════════╝"