learning_ai_common_plat/services/extraction-service/evals/run-ollama-evals-logged.sh
saravanakumardb1 f0accc0946 feat(extraction-service): add unattended eval runner with structured logging
- Add evals/run-ollama-evals-logged.sh: self-logging eval script that runs
  without babysitting; writes timestamped log to evals/logs/; includes
  Ollama health check, model availability check (auto-pulls if missing),
  JSON smoke test, cache clear, full promptfoo run, pass-rate summary,
  and macOS notification on completion
- Update package.json scripts: add eval, eval:ci, eval:task, eval:json,
  eval:ollama, eval:compare
2026-02-19 12:19:34 -08:00

162 lines
7.6 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# run-ollama-evals-logged.sh
# Runs Ollama evals unattended and writes a structured log file.
# You can walk away — check the log when it's done.
#
# Usage:
# ./evals/run-ollama-evals-logged.sh
# OLLAMA_MODEL=qwen2.5:7b ./evals/run-ollama-evals-logged.sh
# OLLAMA_MODEL=qwen2.5-coder:32b ./evals/run-ollama-evals-logged.sh
#
# Log written to: evals/logs/ollama-eval-<model>-<timestamp>.log
# Summary line at end of log — grep for RESULT: to get pass/fail at a glance.
set -euo pipefail
# ── Config ────────────────────────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
LOG_DIR="$SCRIPT_DIR/logs"
OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}"
OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}"
TIMESTAMP="$(date +%Y-%m-%dT%H-%M-%S)"
SAFE_MODEL="${OLLAMA_MODEL//:/-}" # llama3.1:8b → llama3.1-8b
LOG_FILE="$LOG_DIR/ollama-eval-${SAFE_MODEL}-${TIMESTAMP}.log"
PROMPTFOO_CONFIG="$SCRIPT_DIR/promptfoo.ollama.yaml"
mkdir -p "$LOG_DIR"
# ── Helpers ───────────────────────────────────────────────────────────────────
log() { echo "$1" | tee -a "$LOG_FILE"; }
log_section() { log ""; log "══════════════════════════════════════════════════"; log " $1"; log "══════════════════════════════════════════════════"; }
# ── Header ────────────────────────────────────────────────────────────────────
log_section "OLLAMA EVAL RUN"
log "Timestamp : $TIMESTAMP"
log "Model : $OLLAMA_MODEL"
log "Ollama URL : $OLLAMA_BASE_URL"
log "Config : $PROMPTFOO_CONFIG"
log "Log file : $LOG_FILE"
log ""
# ── Step 1: Check Ollama is running ───────────────────────────────────────────
log_section "STEP 1: Ollama health check"
if curl -sf "$OLLAMA_BASE_URL/../tags" > /dev/null 2>&1 || \
curl -sf "http://localhost:11434/api/tags" > /dev/null 2>&1; then
log "✅ Ollama is running at $OLLAMA_BASE_URL"
else
log "❌ Ollama is NOT running. Start it with: ollama serve"
log ""
log "RESULT: FAILED — Ollama not running"
exit 1
fi
# ── Step 2: Check model is available ─────────────────────────────────────────
log_section "STEP 2: Model availability check"
MODELS_JSON="$(curl -sf http://localhost:11434/api/tags 2>/dev/null || echo '{}')"
MODEL_BASE="${OLLAMA_MODEL%%:*}" # llama3.1:8b → llama3.1
if echo "$MODELS_JSON" | grep -q "$MODEL_BASE"; then
log "✅ Model '$OLLAMA_MODEL' is available"
else
log "⚠️ Model '$OLLAMA_MODEL' not found locally. Pulling now..."
log ""
ollama pull "$OLLAMA_MODEL" 2>&1 | tee -a "$LOG_FILE"
log ""
log "✅ Pull complete"
fi
# ── Step 3: Smoke test — verify model returns valid JSON ──────────────────────
log_section "STEP 3: Smoke test (JSON output check)"
SMOKE_RESPONSE="$(curl -sf http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d "{
\"model\": \"$OLLAMA_MODEL\",
\"messages\": [{\"role\": \"user\", \"content\": \"Return only this JSON, no other text: {\\\"extractions\\\": [{\\\"extraction_class\\\": \\\"action\\\", \\\"extraction_text\\\": \\\"test\\\"}]}\"}],
\"response_format\": {\"type\": \"json_object\"},
\"temperature\": 0.1
}" 2>/dev/null || echo "CURL_FAILED")"
if echo "$SMOKE_RESPONSE" | grep -q "extractions"; then
log "✅ Smoke test passed — model returns valid JSON with 'extractions' key"
elif echo "$SMOKE_RESPONSE" | grep -q "CURL_FAILED"; then
log "❌ Smoke test failed — could not reach Ollama API"
log "RESULT: FAILED — API unreachable"
exit 1
else
log "⚠️ Smoke test response (may still work in evals):"
echo "$SMOKE_RESPONSE" | head -5 | tee -a "$LOG_FILE"
fi
# ── Step 4: Clear promptfoo cache ─────────────────────────────────────────────
log_section "STEP 4: Clear promptfoo cache"
rm -rf "$SERVICE_DIR/.promptfoo" 2>/dev/null && log "✅ Local .promptfoo cache cleared" || log " No local cache found"
rm -rf ~/.promptfoo/cache 2>/dev/null && log "✅ Global ~/.promptfoo/cache cleared" || log " No global cache found"
# ── Step 5: Run evals ─────────────────────────────────────────────────────────
log_section "STEP 5: Running promptfoo evals"
log "Model: $OLLAMA_MODEL | Started: $(date)"
log ""
EVAL_START="$(date +%s)"
# Run promptfoo, capture full output to log, also stream to terminal
OLLAMA_MODEL="$OLLAMA_MODEL" OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
npx promptfoo eval \
--config "$PROMPTFOO_CONFIG" \
--no-cache \
--no-progress-bar \
2>&1 | tee -a "$LOG_FILE"
EVAL_EXIT="${PIPESTATUS[0]}"
EVAL_END="$(date +%s)"
EVAL_DURATION=$(( EVAL_END - EVAL_START ))
# ── Step 6: Parse results ─────────────────────────────────────────────────────
log_section "STEP 6: Results summary"
log "Duration : ${EVAL_DURATION}s"
log "Finished : $(date)"
log ""
# Extract pass/fail line from log
RESULTS_LINE="$(grep -E "Results:.*passed" "$LOG_FILE" | tail -1 || echo "Results line not found")"
log "Raw result : $RESULTS_LINE"
log ""
# Parse numbers
PASSED="$(echo "$RESULTS_LINE" | grep -oE '[0-9]+ passed' | grep -oE '[0-9]+' || echo 0)"
FAILED="$(echo "$RESULTS_LINE" | grep -oE '[0-9]+ failed' | grep -oE '[0-9]+' || echo 0)"
TOTAL=$(( PASSED + FAILED ))
if [[ "$TOTAL" -gt 0 ]]; then
PASS_RATE=$(( PASSED * 100 / TOTAL ))
else
PASS_RATE=0
fi
log "Passed : $PASSED / $TOTAL ($PASS_RATE%)"
log "Failed : $FAILED / $TOTAL"
log ""
# ── Final verdict ─────────────────────────────────────────────────────────────
if [[ "$EVAL_EXIT" -eq 0 ]]; then
log "RESULT: ✅ PASSED — $PASSED/$TOTAL assertions passed ($PASS_RATE%)"
elif [[ "$PASS_RATE" -ge 70 ]]; then
log "RESULT: ⚠️ PARTIAL — $PASSED/$TOTAL passed ($PASS_RATE%) — some assertions failed (expected for 8B model)"
elif [[ "$PASS_RATE" -ge 40 ]]; then
log "RESULT: ⚠️ LOW — $PASSED/$TOTAL passed ($PASS_RATE%) — model may need prompt tuning"
else
log "RESULT: ❌ FAILED — $PASSED/$TOTAL passed ($PASS_RATE%) — check assertion errors above"
fi
log ""
log "Full log : $LOG_FILE"
log "View UI : npx promptfoo view"
log ""
# ── Notify when done (macOS) ──────────────────────────────────────────────────
if command -v osascript &>/dev/null; then
osascript -e "display notification \"$PASSED/$TOTAL passed ($PASS_RATE%) in ${EVAL_DURATION}s\" with title \"Ollama Evals Done\" subtitle \"Model: $OLLAMA_MODEL\"" 2>/dev/null || true
fi
exit "$EVAL_EXIT"