feat(extraction-service): add unattended eval runner with structured logging
- Add evals/run-ollama-evals-logged.sh: self-logging eval script that runs without babysitting; writes timestamped log to evals/logs/; includes Ollama health check, model availability check (auto-pulls if missing), JSON smoke test, cache clear, full promptfoo run, pass-rate summary, and macOS notification on completion - Update package.json scripts: add eval, eval:ci, eval:task, eval:json, eval:ollama, eval:compare
This commit is contained in:
parent
da9ca9dc1a
commit
f0accc0946
161
services/extraction-service/evals/run-ollama-evals-logged.sh
Executable file
161
services/extraction-service/evals/run-ollama-evals-logged.sh
Executable file
@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env bash
|
||||
# run-ollama-evals-logged.sh
|
||||
# Runs Ollama evals unattended and writes a structured log file.
|
||||
# You can walk away — check the log when it's done.
|
||||
#
|
||||
# Usage:
|
||||
# ./evals/run-ollama-evals-logged.sh
|
||||
# OLLAMA_MODEL=qwen2.5:7b ./evals/run-ollama-evals-logged.sh
|
||||
# OLLAMA_MODEL=qwen2.5-coder:32b ./evals/run-ollama-evals-logged.sh
|
||||
#
|
||||
# Log written to: evals/logs/ollama-eval-<model>-<timestamp>.log
|
||||
# Summary line at end of log — grep for RESULT: to get pass/fail at a glance.
|
||||
|
||||
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────

# Replace every character outside [A-Za-z0-9._-] with '-' so the value can be
# embedded in a single file-name component. Model names carry ':' before the
# tag and may carry '/' (namespaced names such as "user/model:tag"); a '/'
# left in place would be read as a directory separator and make the log path
# point into a directory that does not exist.
# Arguments: $1 - raw string
# Outputs:   sanitized string on stdout (no trailing newline)
sanitize_for_filename() {
  printf '%s' "${1//[^A-Za-z0-9._-]/-}"
}

# Directory containing this script (evals/) and its parent (the service root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
LOG_DIR="$SCRIPT_DIR/logs"

# Both settings can be overridden from the environment.
OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}"
OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}"

# Timestamp uses '-' instead of ':' in the time part so it is file-name safe.
TIMESTAMP="$(date +%Y-%m-%dT%H-%M-%S)"
SAFE_MODEL="$(sanitize_for_filename "$OLLAMA_MODEL")" # llama3.1:8b → llama3.1-8b
LOG_FILE="$LOG_DIR/ollama-eval-${SAFE_MODEL}-${TIMESTAMP}.log"
PROMPTFOO_CONFIG="$SCRIPT_DIR/promptfoo.ollama.yaml"

mkdir -p "$LOG_DIR"
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
# Print one message line to stdout and append the same line to $LOG_FILE.
# Globals:   LOG_FILE (read) - path of the log file that is appended to
# Arguments: $1 - message text; omitted or empty produces a blank line
# printf is used rather than echo so that messages such as "-n" or "-e" are
# written verbatim instead of being consumed as echo options.
log() { printf '%s\n' "${1-}" | tee -a "$LOG_FILE"; }
|
||||
# Emit a section banner through log(): a blank line, a horizontal rule, the
# title indented by one space, and a closing rule.
# Arguments: $1 - section title
log_section() {
  local rule="══════════════════════════════════════════════════"
  local banner_line
  for banner_line in "" "$rule" " $1" "$rule"; do
    log "$banner_line"
  done
}
|
||||
|
||||
# ── Header ────────────────────────────────────────────────────────────────────
# Record the run parameters at the top of the log so that each log file
# describes the run that produced it.
log_section "OLLAMA EVAL RUN"
RUN_INFO_LINES=(
  "Timestamp : $TIMESTAMP"
  "Model : $OLLAMA_MODEL"
  "Ollama URL : $OLLAMA_BASE_URL"
  "Config : $PROMPTFOO_CONFIG"
  "Log file : $LOG_FILE"
  ""
)
for RUN_INFO_LINE in "${RUN_INFO_LINES[@]}"; do
  log "$RUN_INFO_LINE"
done
|
||||
|
||||
# ── Step 1: Check Ollama is running ───────────────────────────────────────────
# Probes Ollama's native /api/tags endpoint. OLLAMA_BASE_URL is the
# OpenAI-compatible root (".../v1"), so the native root is derived by stripping
# a trailing "/" and a trailing "/v1". Appending "/../tags" does not work: curl
# collapses the dot segment and requests "/tags", which is not an Ollama route.
# The hard-coded localhost probe is kept as a fallback.
log_section "STEP 1: Ollama health check"

HEALTH_API_ROOT="${OLLAMA_BASE_URL%/}"
HEALTH_API_ROOT="${HEALTH_API_ROOT%/v1}"

if curl -sf "$HEALTH_API_ROOT/api/tags" > /dev/null 2>&1 ||
  curl -sf "http://localhost:11434/api/tags" > /dev/null 2>&1; then
  log "✅ Ollama is running at $OLLAMA_BASE_URL"
else
  log "❌ Ollama is NOT running. Start it with: ollama serve"
  log ""
  # Always leave a RESULT: line so the outcome can be grepped from the log.
  log "RESULT: FAILED — Ollama not running"
  exit 1
fi
|
||||
|
||||
# ── Step 2: Check model is available ─────────────────────────────────────────
# Looks for the exact "<name>:<tag>" in the local model list and pulls the
# model when it is missing. Matching on the name alone would accept any tag of
# the same family (llama3.1:70b would satisfy a request for llama3.1:8b), so
# the pull would be skipped and the eval would then fail on a missing model.
log_section "STEP 2: Model availability check"

# An unreachable API is treated as an empty model list, which triggers a pull.
MODELS_JSON="$(curl -sf http://localhost:11434/api/tags 2>/dev/null || echo '{}')"

WANTED_MODEL="$OLLAMA_MODEL"
# Ollama lists a model that was requested without a tag as "<name>:latest".
if [[ "$WANTED_MODEL" != *:* ]]; then
  WANTED_MODEL="${WANTED_MODEL}:latest"
fi

# Fixed-string match on the quoted name. A here-string is used instead of
# `echo | grep -q`, because grep -q may exit early and, under pipefail, the
# resulting SIGPIPE on echo would be reported as "not found".
if grep -qF -- "\"$WANTED_MODEL\"" <<<"$MODELS_JSON"; then
  log "✅ Model '$OLLAMA_MODEL' is available"
else
  log "⚠️ Model '$OLLAMA_MODEL' not found locally. Pulling now..."
  log ""
  # Handle a failed pull explicitly; otherwise `set -e` would end the run
  # without writing a RESULT: line to the log.
  if ! ollama pull "$OLLAMA_MODEL" 2>&1 | tee -a "$LOG_FILE"; then
    log ""
    log "❌ Could not pull model '$OLLAMA_MODEL'"
    log "RESULT: FAILED — model pull failed"
    exit 1
  fi
  log ""
  log "✅ Pull complete"
fi
|
||||
|
||||
# ── Step 3: Smoke test — verify model returns valid JSON ──────────────────────
# Sends one chat completion to the same OpenAI-compatible endpoint that is
# handed to promptfoo via OLLAMA_BASE_URL, so the smoke test and the eval run
# exercise the same server. With the default OLLAMA_BASE_URL this is the same
# URL as before (http://localhost:11434/v1/chat/completions).
log_section "STEP 3: Smoke test (JSON output check)"

# On any curl failure the sentinel CURL_FAILED is captured instead of a body.
SMOKE_RESPONSE="$(curl -sf "${OLLAMA_BASE_URL%/}/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"$OLLAMA_MODEL\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Return only this JSON, no other text: {\\\"extractions\\\": [{\\\"extraction_class\\\": \\\"action\\\", \\\"extraction_text\\\": \\\"test\\\"}]}\"}],
    \"response_format\": {\"type\": \"json_object\"},
    \"temperature\": 0.1
  }" 2>/dev/null || echo "CURL_FAILED")"

# Substring tests are done in-shell: piping echo into `grep -q` or `head` can
# fail spuriously under pipefail when the reader exits before echo finishes.
if [[ "$SMOKE_RESPONSE" == *extractions* ]]; then
  log "✅ Smoke test passed — model returns valid JSON with 'extractions' key"
elif [[ "$SMOKE_RESPONSE" == *CURL_FAILED* ]]; then
  log "❌ Smoke test failed — could not reach Ollama API"
  log "RESULT: FAILED — API unreachable"
  exit 1
else
  # Unexpected body: show the first lines and continue with the eval run.
  log "⚠️ Smoke test response (may still work in evals):"
  head -5 <<<"$SMOKE_RESPONSE" | tee -a "$LOG_FILE"
fi
|
||||
|
||||
# ── Step 4: Clear promptfoo cache ─────────────────────────────────────────────
# Removes the service-local .promptfoo directory and the per-user promptfoo
# cache. `rm -rf` succeeds on a path that does not exist, so existence is
# tested first; otherwise "cleared" would be reported even when nothing was
# there. A failed removal is logged and the run continues.
log_section "STEP 4: Clear promptfoo cache"

# ':?' aborts if SERVICE_DIR is empty, so rm -rf can never target "/.promptfoo".
LOCAL_CACHE_DIR="${SERVICE_DIR:?SERVICE_DIR must be set}/.promptfoo"
GLOBAL_CACHE_DIR=~/.promptfoo/cache

if [[ ! -e "$LOCAL_CACHE_DIR" ]]; then
  log "ℹ️ No local cache found"
elif rm -rf -- "$LOCAL_CACHE_DIR" 2>/dev/null; then
  log "✅ Local .promptfoo cache cleared"
else
  log "⚠️ Could not clear local .promptfoo cache"
fi

if [[ ! -e "$GLOBAL_CACHE_DIR" ]]; then
  log "ℹ️ No global cache found"
elif rm -rf -- "$GLOBAL_CACHE_DIR" 2>/dev/null; then
  log "✅ Global ~/.promptfoo/cache cleared"
else
  log "⚠️ Could not clear global ~/.promptfoo/cache"
fi
|
||||
|
||||
# ── Step 5: Run evals ─────────────────────────────────────────────────────────
# Runs promptfoo with output streamed to the terminal and appended to the log,
# and records promptfoo's own exit status in EVAL_EXIT and the wall-clock
# duration in EVAL_DURATION (seconds) for the summary steps.
log_section "STEP 5: Running promptfoo evals"
log "Model: $OLLAMA_MODEL | Started: $(date)"
log ""

EVAL_START="$(date +%s)"

# A non-zero promptfoo exit is an expected outcome (it signals failed
# assertions). With `set -e -o pipefail` active it would terminate the script
# at this pipeline, before EVAL_EXIT is captured and before the summary,
# RESULT: line and notification are produced. Errexit is therefore suspended
# for exactly this pipeline and restored once the status has been read.
set +e
OLLAMA_MODEL="$OLLAMA_MODEL" OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
  npx promptfoo eval \
  --config "$PROMPTFOO_CONFIG" \
  --no-cache \
  --no-progress-bar \
  2>&1 | tee -a "$LOG_FILE"
# PIPESTATUS must be read immediately: index 0 is npx/promptfoo, not tee.
EVAL_EXIT="${PIPESTATUS[0]}"
set -e

EVAL_END="$(date +%s)"
EVAL_DURATION=$((EVAL_END - EVAL_START))
|
||||
|
||||
# ── Step 6: Parse results ─────────────────────────────────────────────────────
# Reads the pass/fail counts back out of the log written during the eval run
# and logs a short summary. Sets PASSED, FAILED, TOTAL and PASS_RATE.
log_section "STEP 6: Results summary"
log "Duration : ${EVAL_DURATION}s"
log "Finished : $(date)"
log ""

# Last line of the log that looks like a promptfoo results line.
if ! RESULTS_LINE="$(grep -E "Results:.*passed" "$LOG_FILE" | tail -1)"; then
  RESULTS_LINE="Results line not found"
fi
log "Raw result : $RESULTS_LINE"
log ""

# Print the number that precedes the given word in RESULTS_LINE, or 0 when the
# word does not appear.
# Arguments: $1 - word following the count ("passed" or "failed")
count_in_results() {
  grep -oE "[0-9]+ $1" <<<"$RESULTS_LINE" | grep -oE '[0-9]+' || echo 0
}

PASSED="$(count_in_results passed)"
FAILED="$(count_in_results failed)"
TOTAL=$((PASSED + FAILED))

# Integer percentage; stays 0 when nothing was counted (avoids dividing by 0).
PASS_RATE=0
if ((TOTAL > 0)); then
  PASS_RATE=$((PASSED * 100 / TOTAL))
fi

log "Passed : $PASSED / $TOTAL ($PASS_RATE%)"
log "Failed : $FAILED / $TOTAL"
log ""
|
||||
|
||||
# ── Final verdict ─────────────────────────────────────────────────────────────
# Writes the single greppable RESULT: line. A zero promptfoo exit status is a
# pass regardless of the parsed numbers; otherwise the pass rate selects the
# PARTIAL (>= 70), LOW (>= 40) or FAILED wording.
if ((EVAL_EXIT == 0)); then
  VERDICT="✅ PASSED — $PASSED/$TOTAL assertions passed ($PASS_RATE%)"
elif ((PASS_RATE >= 70)); then
  VERDICT="⚠️ PARTIAL — $PASSED/$TOTAL passed ($PASS_RATE%) — some assertions failed (expected for 8B model)"
elif ((PASS_RATE >= 40)); then
  VERDICT="⚠️ LOW — $PASSED/$TOTAL passed ($PASS_RATE%) — model may need prompt tuning"
else
  VERDICT="❌ FAILED — $PASSED/$TOTAL passed ($PASS_RATE%) — check assertion errors above"
fi
log "RESULT: $VERDICT"

# Pointers for whoever reads the log afterwards.
log ""
log "Full log : $LOG_FILE"
log "View UI : npx promptfoo view"
log ""
|
||||
|
||||
# ── Notify when done (macOS) ──────────────────────────────────────────────────
# Best-effort desktop notification: skipped when osascript is not installed,
# and a failing osascript call is deliberately ignored.
if command -v osascript >/dev/null 2>&1; then
  NOTIFY_TEXT="$PASSED/$TOTAL passed ($PASS_RATE%) in ${EVAL_DURATION}s"
  NOTIFY_SCRIPT="display notification \"$NOTIFY_TEXT\" with title \"Ollama Evals Done\" subtitle \"Model: $OLLAMA_MODEL\""
  osascript -e "$NOTIFY_SCRIPT" 2>/dev/null || true
fi

# The script's exit status is promptfoo's exit status.
exit "$EVAL_EXIT"
|
||||
@ -10,7 +10,13 @@
|
||||
"start": "node dist/server.js",
|
||||
"test": "vitest run",
|
||||
"test:watch": "vitest",
|
||||
"lint": "eslint src/"
|
||||
"lint": "eslint src/",
|
||||
"eval": "bash evals/run-evals.sh",
|
||||
"eval:ci": "bash evals/run-evals.sh --ci",
|
||||
"eval:task": "bash evals/run-evals.sh --task",
|
||||
"eval:json": "bash evals/run-evals.sh --output json",
|
||||
"eval:ollama": "npx promptfoo eval --config evals/promptfoo.ollama.yaml",
|
||||
"eval:compare": "GEMINI_OUT=evals/.results-gemini.json OLLAMA_OUT=evals/.results-ollama.json bash evals/compare-evals.sh"
|
||||
},
|
||||
"dependencies": {
|
||||
"@bytelyst/auth": "workspace:*",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user