- Add evals/promptfoo.ollama.yaml: the same 19 cases, hitting the Ollama OpenAI-compatible API directly (no extraction-service needed); all assertions use inline JSON.parse(output) to handle the raw string response from Ollama
- Add evals/compare-evals.sh: runs the Gemini and Ollama evals back-to-back and prints a side-by-side pass-rate comparison table
- Supports the OLLAMA_MODEL env var (default: llama3.1:8b)
143 lines
6.5 KiB
Bash
Executable File
143 lines
6.5 KiB
Bash
Executable File
#!/usr/bin/env bash
# compare-evals.sh — Run evals against both Gemini (via extraction-service) and
# Ollama (local), then print a side-by-side pass-rate comparison.
#
# Usage:
#   ./evals/compare-evals.sh
#   OLLAMA_MODEL=qwen2.5:7b ./evals/compare-evals.sh
#
# Optional environment overrides:
#   EXTRACTION_SERVICE_URL  extraction-service base URL (default: http://localhost:4005)
#   OLLAMA_BASE_URL         Ollama base URL             (default: http://localhost:11434/v1)
#   OLLAMA_MODEL            Ollama model to evaluate    (default: llama3.1:8b)

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# Absolute path of the directory containing this script, and of its parent.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Endpoints and model; each keeps the caller's value when one is already set.
EXTRACTION_SERVICE_URL="${EXTRACTION_SERVICE_URL:-http://localhost:4005}"
OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}"
OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}"

# Temporary per-provider result files, written next to this script.
GEMINI_OUT="$SCRIPT_DIR/.results-gemini.json"
OLLAMA_OUT="$SCRIPT_DIR/.results-ollama.json"

# Remove the temporary result files on every exit path (normal completion,
# a `set -e` abort, or an interrupt) so a failed run leaves nothing behind.
trap 'rm -f "$GEMINI_OUT" "$OLLAMA_OUT"' EXIT
|
|
|
|
# Title banner: a boxed heading naming the Ollama model under test, followed
# by one blank line.
printf '%s\n' \
  "╔══════════════════════════════════════════════════════════════╗" \
  "║ Extraction Eval — Gemini vs Ollama ($OLLAMA_MODEL)" \
  "╚══════════════════════════════════════════════════════════════╝" \
  ""
|
|
|
|
# ── Check extraction-service ─────────────────────────────────────
# Step 1/4: probe the service's /health endpoint. curl's -f turns an HTTP error
# status into a non-zero exit, so any failure to answer stops the script here.
echo "→ [1/4] Checking extraction-service at $EXTRACTION_SERVICE_URL ..."
curl -sf "$EXTRACTION_SERVICE_URL/health" > /dev/null 2>&1 || {
  echo "✗ extraction-service not running — start with: pnpm dev"
  exit 1
}
echo "✓ extraction-service up"
|
|
|
|
# ── Check Ollama ─────────────────────────────────────────────────
# Step 2/4: confirm the Ollama server answers and that the requested model is
# present in its model list.
echo "→ [2/4] Checking Ollama at $OLLAMA_BASE_URL ..."

# The model list is served from /api/tags at the server root, not under the
# /v1 prefix carried by OLLAMA_BASE_URL. Derive the root from the configured
# base URL so a non-default host/port is probed too.
ollama_root="${OLLAMA_BASE_URL%/}"
ollama_root="${ollama_root%/v1}"

# Fetch the list once and reuse it for both checks.
if ! ollama_tags="$(curl -sf "$ollama_root/api/tags" 2>/dev/null)"; then
  echo "✗ Ollama not running — start with: ollama serve"
  exit 1
fi

# Check model is pulled.
# -F matches the name literally (model names contain regex metacharacters such
# as '.'); the here-string avoids a `curl | grep -q` pipeline, where grep
# exiting early can make the pipeline fail under `set -o pipefail`.
if ! grep -qF -- "$OLLAMA_MODEL" <<< "$ollama_tags"; then
  echo "✗ Model '$OLLAMA_MODEL' not found — pull with: ollama pull $OLLAMA_MODEL"
  exit 1
fi
echo "✓ Ollama up, model '$OLLAMA_MODEL' available"
|
|
|
|
# ── Run Gemini evals ─────────────────────────────────────────────
# Step 3/4: run the promptfoo suite that goes through the extraction-service.
echo ""
echo "→ [3/4] Running Gemini evals (via extraction-service) ..."
cd "$SERVICE_DIR"

# Drop results left over from an earlier run so they can never be mistaken for
# this run's output.
rm -f "$GEMINI_OUT"

# promptfoo's --output option takes a destination file path (the format follows
# the file extension), so the results file is passed to it directly; stdout
# carries the human-readable report, not JSON.
# `|| true` keeps the run best-effort: a non-zero exit from promptfoo must not
# abort the comparison under `set -e`.
EXTRACTION_SERVICE_URL="$EXTRACTION_SERVICE_URL" \
EXTRACTION_EVAL_TOKEN="${EXTRACTION_EVAL_TOKEN:-}" \
EVAL_PRODUCT_ID="${EVAL_PRODUCT_ID:-lysnrai}" \
npx promptfoo eval \
  --config "$SCRIPT_DIR/promptfoo.yaml" \
  --output "$GEMINI_OUT" \
  --no-cache \
  --no-progress-bar \
  > /dev/null 2>&1 || true

# Surface a failed run instead of silently reporting an empty result later.
if [[ ! -s "$GEMINI_OUT" ]]; then
  echo "⚠ Gemini eval wrote no results to $GEMINI_OUT" >&2
fi
|
|
|
|
# ── Run Ollama evals ─────────────────────────────────────────────
# Step 4/4: run the promptfoo suite defined in promptfoo.ollama.yaml.
echo "→ [4/4] Running Ollama evals ($OLLAMA_MODEL) ..."

# Drop results left over from an earlier run so they can never be mistaken for
# this run's output.
rm -f "$OLLAMA_OUT"

# promptfoo's --output option takes a destination file path (the format follows
# the file extension), so the results file is passed to it directly; stdout
# carries the human-readable report, not JSON.
# `|| true` keeps the run best-effort: a non-zero exit from promptfoo must not
# abort the comparison under `set -e`.
OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
OLLAMA_MODEL="$OLLAMA_MODEL" \
npx promptfoo eval \
  --config "$SCRIPT_DIR/promptfoo.ollama.yaml" \
  --output "$OLLAMA_OUT" \
  --no-cache \
  --no-progress-bar \
  > /dev/null 2>&1 || true

# Surface a failed run instead of silently reporting an empty result later.
if [[ ! -s "$OLLAMA_OUT" ]]; then
  echo "⚠ Ollama eval wrote no results to $OLLAMA_OUT" >&2
fi
|
|
|
|
# ── Compare results ──────────────────────────────────────────────
# Blank line, then the top of the results box: upper border, heading row, and
# the divider above the table body.
printf '%s\n' \
  "" \
  "╔══════════════════════════════════════════════════════════════╗" \
  "║ RESULTS ║" \
  "╠══════════════════════════════════════════════════════════════╣"
|
|
|
|
# Render the comparison with an inline Node.js ES-module script (quoted
# heredoc, so the shell expands nothing inside it).
# GEMINI_OUT, OLLAMA_OUT and OLLAMA_MODEL are plain shell variables that are
# never exported, so they are handed to node explicitly here; without this the
# script sees process.env.* as undefined and every result parses as empty.
GEMINI_OUT="$GEMINI_OUT" \
OLLAMA_OUT="$OLLAMA_OUT" \
OLLAMA_MODEL="$OLLAMA_MODEL" \
node --input-type=module << 'EOF'
import { readFileSync } from 'fs';

// Paths of the two promptfoo result files, supplied by the invoking shell.
const geminiPath = process.env.GEMINI_OUT;
const ollamaPath = process.env.OLLAMA_OUT;
|
|
|
|
/**
 * Summarise one results file.
 *
 * Reads `results.results` from the JSON document at `path` and counts how many
 * entries have a truthy `success`, overall and per task. The task label is
 * `vars.taskId`, else the first space-separated word of `vars.taskPrompt`,
 * else 'unknown'; labels are truncated to 20 characters.
 *
 * Best-effort: when the file cannot be read or parsed, a warning is written to
 * stderr and an empty summary is returned, so the caller can still print a
 * table.
 *
 * @param {string} path - Location of the results JSON file.
 * @returns {{ total: number, passed: number, byTask: Object<string, { pass: number, total: number }> }}
 */
function parseResults(path) {
  let raw;
  try {
    raw = JSON.parse(readFileSync(path, 'utf8'));
  } catch (err) {
    // Say why the summary is empty instead of failing silently.
    console.error(` ⚠ could not load results from ${path}: ${err.message}`);
    return { total: 0, passed: 0, byTask: {} };
  }

  // Anything other than an array of entries is treated as "no results".
  const entries = raw?.results?.results;
  const results = Array.isArray(entries) ? entries : [];

  const byTask = {};
  let passed = 0;
  for (const r of results) {
    const vars = r?.vars ?? {};
    // Only split taskPrompt when it really is a string, so one odd entry
    // cannot discard the whole summary.
    const promptWord = typeof vars.taskPrompt === 'string' ? vars.taskPrompt.split(' ')[0] : '';
    const label = vars.taskId || promptWord || 'unknown';
    const key = String(label).slice(0, 20);

    if (!byTask[key]) byTask[key] = { pass: 0, total: 0 };
    byTask[key].total++;
    if (r?.success) {
      byTask[key].pass++;
      passed++;
    }
  }
  return { total: results.length, passed, byTask };
}
|
|
|
|
// Build the pass/fail summary for each provider from its results file.
const [gemini, ollama] = [geminiPath, ollamaPath].map((file) => parseResults(file));
|
|
|
|
/**
 * Format `p` out of `t` as a whole-number percentage such as '67%'.
 * Returns 'N/A' when `t` is exactly 0.
 */
const pct = (p, t) => {
  if (t === 0) {
    return 'N/A';
  }
  const rounded = Math.round((p / t) * 100);
  return `${rounded}%`;
};
|
|
/**
 * Render a text progress bar for `p` passes out of `t` attempts.
 *
 * The bar is `width` cells wide; the filled share is the pass ratio rounded to
 * the nearest cell. A total of 0 yields an all-empty bar. The ratio is clamped
 * to [0, 1] (and treated as 0 when not finite) so inconsistent counts cannot
 * make `String.prototype.repeat` throw on a negative count.
 *
 * @param {number} p - Number of passes.
 * @param {number} t - Number of attempts.
 * @param {number} [width=10] - Bar width in cells.
 * @returns {string} `width` characters of '█' followed by '░'.
 */
const bar = (p, t, width = 10) => {
  if (t === 0) return '░'.repeat(width);
  const ratio = p / t;
  const share = Number.isFinite(ratio) ? Math.min(1, Math.max(0, ratio)) : 0;
  const filled = Math.round(share * width);
  return '█'.repeat(filled) + '░'.repeat(width - filled);
};
|
|
|
|
// Overall summary: a header row, a rule, then one row per provider.
// Every row shares the same column widths (12 / 10 / 8 / 8, then a free tail).
const summaryRow = (name, passed, total, rate, tail) =>
  ` ${name.padEnd(12)} ${String(passed).padEnd(10)} ${String(total).padEnd(8)} ${rate.padEnd(8)} ${tail}`;

console.log(`\n${summaryRow('Provider', 'Passed', 'Total', 'Rate', 'Progress')}`);
console.log(` ${'─'.repeat(55)}`);
for (const [name, stats] of [['Gemini', gemini], ['Ollama', ollama]]) {
  const rate = pct(stats.passed, stats.total);
  const progress = bar(stats.passed, stats.total);
  console.log(summaryRow(name, stats.passed, stats.total, rate, progress));
}
|
|
|
|
// Per-task breakdown over the union of task keys seen by either provider.
// A task missing from one provider is shown for that provider as 0/0 (N/A).
const allTasks = new Set(Object.keys(gemini.byTask).concat(Object.keys(ollama.byTask)));
const formatScore = ({ pass, total }) => `${pass}/${total} (${pct(pass, total)})`;

if (allTasks.size > 0) {
  console.log(`\n Per-task breakdown:`);
  console.log(` ${'Task'.padEnd(25)} ${'Gemini'.padEnd(12)} ${'Ollama'.padEnd(12)}`);
  console.log(` ${'─'.repeat(50)}`);
  allTasks.forEach((task) => {
    const geminiScore = formatScore(gemini.byTask[task] || { pass: 0, total: 0 });
    const ollamaScore = formatScore(ollama.byTask[task] || { pass: 0, total: 0 });
    console.log(` ${task.padEnd(25)} ${geminiScore.padEnd(12)} ${ollamaScore.padEnd(12)}`);
  });
}
|
|
|
|
// Headline result, decided on overall pass counts.
// Equal counts are reported as a tie, and two empty result sets as "N/A",
// rather than naming Gemini the winner by default in both cases.
const ollamaLabel = `Ollama (${process.env.OLLAMA_MODEL ?? 'model not specified'})`;
let winner;
if (gemini.total === 0 && ollama.total === 0) {
  winner = 'N/A (no results)';
} else if (gemini.passed === ollama.passed) {
  winner = 'Tie';
} else {
  winner = gemini.passed > ollama.passed ? 'Gemini' : ollamaLabel;
}
console.log(`\n Winner: ${winner}`);
|
|
EOF
|
|
|
|
# Cleanup temp files, then print a blank line and the bottom border that
# closes the results box.
for results_file in "$GEMINI_OUT" "$OLLAMA_OUT"; do
  rm -f "$results_file"
done

printf '\n%s\n' "╚══════════════════════════════════════════════════════════════╝"
|