learning_ai_common_plat/services/extraction-service/evals/compare-evals.sh
saravanakumardb1 da9ca9dc1a feat(extraction-service): add Ollama local model eval config and compare script
- Add evals/promptfoo.ollama.yaml: same 19 cases hitting Ollama OpenAI-compat
  API directly (no extraction-service needed); all assertions use inline
  JSON.parse(output) to handle raw string response from Ollama
- Add evals/compare-evals.sh: runs Gemini + Ollama evals back-to-back and
  prints side-by-side pass-rate comparison table
- Supports OLLAMA_MODEL env var (default: llama3.1:8b)
2026-02-19 12:19:24 -08:00

143 lines
6.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# compare-evals.sh — Run evals against both Gemini (via extraction-service) and
# Ollama (local), then print a side-by-side pass-rate comparison.
#
# Usage:
# ./evals/compare-evals.sh
# OLLAMA_MODEL=qwen2.5:7b ./evals/compare-evals.sh
# Strict mode: abort on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory holding this script, and of its parent
# (the service root).
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
SERVICE_DIR="$(
  cd "${SCRIPT_DIR}/.." && pwd
)"

# Endpoints and model name: a non-empty value from the environment wins,
# otherwise the default shown here is assigned.
: "${EXTRACTION_SERVICE_URL:=http://localhost:4005}"
: "${OLLAMA_BASE_URL:=http://localhost:11434/v1}"
: "${OLLAMA_MODEL:=llama3.1:8b}"

# Intermediate promptfoo result files, kept next to this script.
GEMINI_OUT="${SCRIPT_DIR}/.results-gemini.json"
OLLAMA_OUT="${SCRIPT_DIR}/.results-ollama.json"
# Title box; the Ollama model under comparison is named in the heading.
printf '%s\n' \
  "╔══════════════════════════════════════════════════════════════╗" \
  "║ Extraction Eval — Gemini vs Ollama ($OLLAMA_MODEL)" \
  "╚══════════════════════════════════════════════════════════════╝" \
  ""

# ── Step 1: extraction-service must answer on /health ────────────
# curl -f turns HTTP error statuses into a non-zero exit; all curl output is
# discarded because only the exit status matters here.
printf '%s\n' "→ [1/4] Checking extraction-service at $EXTRACTION_SERVICE_URL ..."
if curl -sf "$EXTRACTION_SERVICE_URL/health" > /dev/null 2>&1; then
  printf '%s\n' "✓ extraction-service up"
else
  printf '%s\n' "✗ extraction-service not running — start with: pnpm dev"
  exit 1
fi
# ── Check Ollama ─────────────────────────────────────────────────
# Verifies that the Ollama server is reachable and that OLLAMA_MODEL shows up
# in its model list. Exits 1 with a hint if either check fails.
echo "→ [2/4] Checking Ollama at $OLLAMA_BASE_URL ..."

# Probe the same server the evals will talk to instead of a hard-coded
# localhost address. OLLAMA_BASE_URL defaults to a ".../v1" prefix, while the
# tag listing is requested from "/api/tags" at the server root, so strip an
# optional trailing "/" and "/v1" to get that root.
ollama_api_root="${OLLAMA_BASE_URL%/}"
ollama_api_root="${ollama_api_root%/v1}"

# Fetch the tag list once and keep it in a variable. Piping curl straight into
# `grep -q` under `set -o pipefail` can report a false failure: grep exits on
# the first match and curl may then die from SIGPIPE.
if ! ollama_tags="$(curl -sf "$ollama_api_root/api/tags" 2> /dev/null)"; then
  echo "✗ Ollama not running — start with: ollama serve"
  exit 1
fi

# Check model is pulled. -F matches the name literally (the "." in e.g.
# "llama3.1" must not act as a regex wildcard); "--" protects against a name
# that starts with "-".
if ! grep -qF -- "$OLLAMA_MODEL" <<< "$ollama_tags"; then
  echo "✗ Model '$OLLAMA_MODEL' not found — pull with: ollama pull $OLLAMA_MODEL"
  exit 1
fi
echo "✓ Ollama up, model '$OLLAMA_MODEL' available"
# ── Run Gemini evals ─────────────────────────────────────────────
# Runs the promptfoo suite in promptfoo.yaml from the service directory and
# stores the JSON results in $GEMINI_OUT for the comparison step.
echo ""
echo "→ [3/4] Running Gemini evals (via extraction-service) ..."
cd "$SERVICE_DIR"

# Remove any stale file so a failed run cannot be mistaken for fresh results.
rm -f -- "$GEMINI_OUT"

# promptfoo's --output option takes a destination file path (the format is
# inferred from the extension), so hand it the results path directly rather
# than capturing stdout. Console output is discarded to keep the report clean.
# A non-zero exit is expected whenever test cases fail, so it is recorded
# instead of aborting the script under `set -e`.
gemini_status=0
EXTRACTION_SERVICE_URL="$EXTRACTION_SERVICE_URL" \
EXTRACTION_EVAL_TOKEN="${EXTRACTION_EVAL_TOKEN:-}" \
EVAL_PRODUCT_ID="${EVAL_PRODUCT_ID:-lysnrai}" \
  npx promptfoo eval \
    --config "$SCRIPT_DIR/promptfoo.yaml" \
    --output "$GEMINI_OUT" \
    --no-cache \
    --no-progress-bar \
    > /dev/null 2>&1 || gemini_status=$?

# Surface a run that produced nothing instead of silently reporting 0/0 later.
if [[ ! -s "$GEMINI_OUT" ]]; then
  echo "⚠ Gemini eval produced no results (promptfoo exit code $gemini_status)"
fi
# ── Run Ollama evals ─────────────────────────────────────────────
# Runs the promptfoo suite in promptfoo.ollama.yaml against the local Ollama
# model and stores the JSON results in $OLLAMA_OUT for the comparison step.
echo "→ [4/4] Running Ollama evals ($OLLAMA_MODEL) ..."

# Remove any stale file so a failed run cannot be mistaken for fresh results.
rm -f -- "$OLLAMA_OUT"

# promptfoo's --output option takes a destination file path (the format is
# inferred from the extension), so hand it the results path directly rather
# than capturing stdout. Console output is discarded to keep the report clean.
# A non-zero exit is expected whenever test cases fail, so it is recorded
# instead of aborting the script under `set -e`.
ollama_status=0
OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
OLLAMA_MODEL="$OLLAMA_MODEL" \
  npx promptfoo eval \
    --config "$SCRIPT_DIR/promptfoo.ollama.yaml" \
    --output "$OLLAMA_OUT" \
    --no-cache \
    --no-progress-bar \
    > /dev/null 2>&1 || ollama_status=$?

# Surface a run that produced nothing instead of silently reporting 0/0 later.
if [[ ! -s "$OLLAMA_OUT" ]]; then
  echo "⚠ Ollama eval produced no results (promptfoo exit code $ollama_status)"
fi
# ── Compare results ──────────────────────────────────────────────
# Prints the results box header, then uses an inline Node.js ES module to read
# both promptfoo result files and print overall and per-task pass rates.
echo ""
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ RESULTS ║"
echo "╠══════════════════════════════════════════════════════════════╣"

# The result paths (and a defaulted model name) are plain shell variables, not
# exported ones, so they must be placed in node's environment explicitly;
# otherwise process.env.* is undefined and every row reads 0 / N/A.
# The heredoc delimiter is quoted, so the shell leaves the JavaScript's
# ${...} template placeholders untouched.
GEMINI_OUT="$GEMINI_OUT" \
OLLAMA_OUT="$OLLAMA_OUT" \
OLLAMA_MODEL="$OLLAMA_MODEL" \
  node --input-type=module << 'EOF'
import { readFileSync } from 'fs';

const geminiPath = process.env.GEMINI_OUT;
const ollamaPath = process.env.OLLAMA_OUT;

// Reads one promptfoo JSON result file and returns
// { total, passed, byTask: { [task]: { pass, total } } }.
// An unreadable or malformed file yields all-zero counts (so the table still
// renders) and a note on stderr explaining why.
function parseResults(path) {
  try {
    const raw = JSON.parse(readFileSync(path, 'utf8'));
    const results = raw.results?.results || [];
    const total = results.length;
    const passed = results.filter(r => r.success).length;
    const byTask = {};
    for (const r of results) {
      // Group by taskId, else by the first word of taskPrompt, else 'unknown'.
      const task = r.vars?.taskId || r.vars?.taskPrompt?.split(' ')[0] || 'unknown';
      // Cap the label at 20 characters so the table columns stay aligned.
      const key = task.length > 20 ? task.slice(0, 20) : task;
      if (!byTask[key]) byTask[key] = { pass: 0, total: 0 };
      byTask[key].total++;
      if (r.success) byTask[key].pass++;
    }
    return { total, passed, byTask };
  } catch (err) {
    console.error(`  (could not read results from ${path}: ${err.message})`);
    return { total: 0, passed: 0, byTask: {} };
  }
}

const gemini = parseResults(geminiPath);
const ollama = parseResults(ollamaPath);

// Pass rate as a whole-number percentage, or 'N/A' when nothing ran.
const pct = (p, t) => t === 0 ? 'N/A' : `${Math.round((p/t)*100)}%`;

// Ten-cell progress bar proportional to the pass rate.
const bar = (p, t) => {
  if (t === 0) return '░░░░░░░░░░';
  const filled = Math.round((p/t)*10);
  return '█'.repeat(filled) + '░'.repeat(10-filled);
};

console.log(`\n ${'Provider'.padEnd(12)} ${'Passed'.padEnd(10)} ${'Total'.padEnd(8)} ${'Rate'.padEnd(8)} Progress`);
console.log(` ${'─'.repeat(55)}`);
console.log(` ${'Gemini'.padEnd(12)} ${String(gemini.passed).padEnd(10)} ${String(gemini.total).padEnd(8)} ${pct(gemini.passed,gemini.total).padEnd(8)} ${bar(gemini.passed,gemini.total)}`);
console.log(` ${'Ollama'.padEnd(12)} ${String(ollama.passed).padEnd(10)} ${String(ollama.total).padEnd(8)} ${pct(ollama.passed,ollama.total).padEnd(8)} ${bar(ollama.passed,ollama.total)}`);

// Union of task labels seen in either run, in first-seen order.
const allTasks = new Set([...Object.keys(gemini.byTask), ...Object.keys(ollama.byTask)]);
if (allTasks.size > 0) {
  console.log(`\n Per-task breakdown:`);
  console.log(` ${'Task'.padEnd(25)} ${'Gemini'.padEnd(12)} ${'Ollama'.padEnd(12)}`);
  console.log(` ${'─'.repeat(50)}`);
  for (const task of allTasks) {
    const g = gemini.byTask[task] || { pass: 0, total: 0 };
    const o = ollama.byTask[task] || { pass: 0, total: 0 };
    const gStr = `${g.pass}/${g.total} (${pct(g.pass,g.total)})`;
    const oStr = `${o.pass}/${o.total} (${pct(o.pass,o.total)})`;
    console.log(` ${task.padEnd(25)} ${gStr.padEnd(12)} ${oStr.padEnd(12)}`);
  }
}

// Do not name a winner when nothing ran, and report equal pass counts as a
// tie rather than defaulting to Gemini.
let winner;
if (gemini.total === 0 && ollama.total === 0) {
  winner = 'N/A (no results)';
} else if (gemini.passed === ollama.passed) {
  winner = 'Tie';
} else if (gemini.passed > ollama.passed) {
  winner = 'Gemini';
} else {
  winner = `Ollama (${process.env.OLLAMA_MODEL})`;
}
console.log(`\n Winner: ${winner}`);
EOF
# Discard the intermediate promptfoo result files; -f keeps this quiet when a
# file was never created.
rm -f -- "$GEMINI_OUT" "$OLLAMA_OUT"

# Blank spacer line, then the bottom edge of the results box.
printf '\n'
printf '%s\n' "╚══════════════════════════════════════════════════════════════╝"