From f0accc09466e5d66b2b8672bb05330d982247fc1 Mon Sep 17 00:00:00 2001
From: saravanakumardb1
Date: Thu, 19 Feb 2026 12:19:34 -0800
Subject: [PATCH] feat(extraction-service): add unattended eval runner with structured logging

- Add evals/run-ollama-evals-logged.sh: self-logging eval script that runs
  without babysitting; writes timestamped log to evals/logs/; includes
  Ollama health check, model availability check (auto-pulls if missing),
  JSON smoke test, cache clear, full promptfoo run, pass-rate summary, and
  macOS notification on completion
- Update package.json scripts: add eval, eval:ci, eval:task, eval:json,
  eval:ollama, eval:compare
---
 .../evals/run-ollama-evals-logged.sh          | 210 ++++++++++++++++++
 services/extraction-service/package.json      |   8 +-
 2 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100755 services/extraction-service/evals/run-ollama-evals-logged.sh

diff --git a/services/extraction-service/evals/run-ollama-evals-logged.sh b/services/extraction-service/evals/run-ollama-evals-logged.sh
new file mode 100755
index 00000000..821246d3
--- /dev/null
+++ b/services/extraction-service/evals/run-ollama-evals-logged.sh
@@ -0,0 +1,210 @@
+#!/usr/bin/env bash
+# run-ollama-evals-logged.sh
+# Runs Ollama evals unattended and writes a structured log file.
+# You can walk away — check the log when it's done.
+#
+# Usage:
+#   ./evals/run-ollama-evals-logged.sh
+#   OLLAMA_MODEL=qwen2.5:7b ./evals/run-ollama-evals-logged.sh
+#   OLLAMA_MODEL=qwen2.5-coder:32b ./evals/run-ollama-evals-logged.sh
+#
+# Log written to: evals/logs/ollama-eval-<model>-<timestamp>.log
+# Summary line at end of log — grep for RESULT: to get pass/fail at a glance.
+# Exit status: promptfoo's exit code, or 1 when a pre-flight check fails.
+
+set -euo pipefail
+
+# ── Config ────────────────────────────────────────────────────────────────────
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+LOG_DIR="$SCRIPT_DIR/logs"
+OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}"
+OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}"
+# Native Ollama API root: the OpenAI-compatible base URL minus a trailing /v1,
+# so every pre-flight check talks to the same server promptfoo will use.
+OLLAMA_API_ROOT="${OLLAMA_BASE_URL%/}"
+OLLAMA_API_ROOT="${OLLAMA_API_ROOT%/v1}"
+TIMESTAMP="$(date +%Y-%m-%dT%H-%M-%S)"
+# ':' and '/' are unsafe in a file name: llama3.1:8b → llama3.1-8b
+SAFE_MODEL="${OLLAMA_MODEL//[\/:]/-}"
+LOG_FILE="$LOG_DIR/ollama-eval-${SAFE_MODEL}-${TIMESTAMP}.log"
+PROMPTFOO_CONFIG="$SCRIPT_DIR/promptfoo.ollama.yaml"
+
+mkdir -p "$LOG_DIR"
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+# log MESSAGE: print MESSAGE to the terminal and append it to $LOG_FILE.
+log() { printf '%s\n' "${1-}" | tee -a "$LOG_FILE"; }
+
+# log_section TITLE: log a blank line followed by a boxed section heading.
+log_section() {
+  log ""
+  log "══════════════════════════════════════════════════"
+  log " $1"
+  log "══════════════════════════════════════════════════"
+}
+
+# clear_cache PATH CLEARED_MSG MISSING_MSG: best-effort removal of a cache dir.
+# rm -rf succeeds on a missing path, so existence is tested first; a failed
+# removal is logged but never aborts the run (promptfoo also gets --no-cache).
+clear_cache() {
+  local target="$1" cleared_msg="$2" missing_msg="$3"
+  if [[ ! -e "$target" ]]; then
+    log "ℹ️ $missing_msg"
+  elif rm -rf -- "$target" 2>/dev/null; then
+    log "✅ $cleared_msg"
+  else
+    log "⚠️ Could not remove $target — continuing"
+  fi
+}
+
+# ── Header ────────────────────────────────────────────────────────────────────
+log_section "OLLAMA EVAL RUN"
+log "Timestamp : $TIMESTAMP"
+log "Model : $OLLAMA_MODEL"
+log "Ollama URL : $OLLAMA_BASE_URL"
+log "Config : $PROMPTFOO_CONFIG"
+log "Log file : $LOG_FILE"
+log ""
+
+# ── Step 1: Check Ollama is running ───────────────────────────────────────────
+log_section "STEP 1: Ollama health check"
+if curl -sf --connect-timeout 5 "$OLLAMA_API_ROOT/api/tags" > /dev/null 2>&1; then
+  log "✅ Ollama is running at $OLLAMA_BASE_URL"
+else
+  log "❌ Ollama is NOT reachable at $OLLAMA_API_ROOT. Start it with: ollama serve"
+  log ""
+  log "RESULT: FAILED — Ollama not running"
+  exit 1
+fi
+
+# ── Step 2: Check model is available ─────────────────────────────────────────
+log_section "STEP 2: Model availability check"
+MODELS_JSON="$(curl -sf --connect-timeout 5 "$OLLAMA_API_ROOT/api/tags" 2>/dev/null || echo '{}')"
+# Match the complete quoted tag rather than the family name, so an installed
+# llama3.1:70b does not satisfy a request for llama3.1:8b. Ollama lists models
+# requested without a tag as "<name>:latest".
+MODEL_TAG="$OLLAMA_MODEL"
+[[ "${MODEL_TAG##*/}" == *:* ]] || MODEL_TAG="$MODEL_TAG:latest"
+if grep -qF -- "\"$MODEL_TAG\"" <<<"$MODELS_JSON"; then
+  log "✅ Model '$OLLAMA_MODEL' is available"
+else
+  log "⚠️ Model '$OLLAMA_MODEL' not found locally. Pulling now..."
+  log ""
+  # The ollama CLI talks to OLLAMA_HOST; default it to the server checked above.
+  # A failed pull must still leave a RESULT: line in the log.
+  if ! OLLAMA_HOST="${OLLAMA_HOST:-$OLLAMA_API_ROOT}" ollama pull "$OLLAMA_MODEL" 2>&1 \
+    | tee -a "$LOG_FILE"; then
+    log ""
+    log "RESULT: FAILED — could not pull model '$OLLAMA_MODEL'"
+    exit 1
+  fi
+  log ""
+  log "✅ Pull complete"
+fi
+
+# ── Step 3: Smoke test — verify model returns valid JSON ──────────────────────
+log_section "STEP 3: Smoke test (JSON output check)"
+SMOKE_RESPONSE="$(curl -sf --connect-timeout 5 "${OLLAMA_BASE_URL%/}/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d "{
+    \"model\": \"$OLLAMA_MODEL\",
+    \"messages\": [{\"role\": \"user\", \"content\": \"Return only this JSON, no other text: {\\\"extractions\\\": [{\\\"extraction_class\\\": \\\"action\\\", \\\"extraction_text\\\": \\\"test\\\"}]}\"}],
+    \"response_format\": {\"type\": \"json_object\"},
+    \"temperature\": 0.1
+  }" 2>/dev/null || echo "CURL_FAILED")"
+
+if [[ "$SMOKE_RESPONSE" == *extractions* ]]; then
+  log "✅ Smoke test passed — model returns valid JSON with 'extractions' key"
+elif [[ "$SMOKE_RESPONSE" == *CURL_FAILED* ]]; then
+  log "❌ Smoke test failed — could not reach Ollama API"
+  log "RESULT: FAILED — API unreachable"
+  exit 1
+else
+  log "⚠️ Smoke test response (may still work in evals):"
+  # Here-string, not echo|head: head exiting early cannot SIGPIPE the producer
+  # and fail the pipeline under pipefail.
+  head -5 <<<"$SMOKE_RESPONSE" | tee -a "$LOG_FILE"
+fi
+
+# ── Step 4: Clear promptfoo cache ─────────────────────────────────────────────
+log_section "STEP 4: Clear promptfoo cache"
+clear_cache "$SERVICE_DIR/.promptfoo" "Local .promptfoo cache cleared" "No local cache found"
+clear_cache ~/.promptfoo/cache "Global ~/.promptfoo/cache cleared" "No global cache found"
+
+# ── Step 5: Run evals ─────────────────────────────────────────────────────────
+log_section "STEP 5: Running promptfoo evals"
+log "Model: $OLLAMA_MODEL | Started: $(date)"
+log ""
+
+EVAL_START="$(date +%s)"
+
+# Run promptfoo, capture full output to log, also stream to terminal.
+# promptfoo exits non-zero when assertions fail; under `set -e` + pipefail that
+# would abort the script here, before the summary and RESULT: line are written.
+# Suspend errexit for the pipeline and read promptfoo's status explicitly.
+set +e
+OLLAMA_MODEL="$OLLAMA_MODEL" OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \
+  npx promptfoo eval \
+    --config "$PROMPTFOO_CONFIG" \
+    --no-cache \
+    --no-progress-bar \
+    2>&1 | tee -a "$LOG_FILE"
+EVAL_EXIT="${PIPESTATUS[0]}"
+set -e
+EVAL_END="$(date +%s)"
+EVAL_DURATION=$(( EVAL_END - EVAL_START ))
+
+# ── Step 6: Parse results ─────────────────────────────────────────────────────
+log_section "STEP 6: Results summary"
+log "Duration : ${EVAL_DURATION}s"
+log "Finished : $(date)"
+log ""
+
+# Extract pass/fail line from log
+RESULTS_LINE="$(grep -E "Results:.*passed" "$LOG_FILE" | tail -1 || echo "Results line not found")"
+log "Raw result : $RESULTS_LINE"
+log ""
+
+# Parse numbers: first match of each, forced to base 10 so a value such as
+# "08" is not rejected as invalid octal by the arithmetic below.
+PASSED=0
+FAILED=0
+PASSED_RE='([0-9]+) passed'
+FAILED_RE='([0-9]+) failed'
+if [[ "$RESULTS_LINE" =~ $PASSED_RE ]]; then PASSED=$(( 10#${BASH_REMATCH[1]} )); fi
+if [[ "$RESULTS_LINE" =~ $FAILED_RE ]]; then FAILED=$(( 10#${BASH_REMATCH[1]} )); fi
+TOTAL=$(( PASSED + FAILED ))
+
+if [[ "$TOTAL" -gt 0 ]]; then
+  PASS_RATE=$(( PASSED * 100 / TOTAL ))
+else
+  PASS_RATE=0
+fi
+
+log "Passed : $PASSED / $TOTAL ($PASS_RATE%)"
+log "Failed : $FAILED / $TOTAL"
+log ""
+
+# ── Final verdict ─────────────────────────────────────────────────────────────
+if [[ "$EVAL_EXIT" -eq 0 ]]; then
+  log "RESULT: ✅ PASSED — $PASSED/$TOTAL assertions passed ($PASS_RATE%)"
+elif [[ "$PASS_RATE" -ge 70 ]]; then
+  log "RESULT: ⚠️ PARTIAL — $PASSED/$TOTAL passed ($PASS_RATE%) — some assertions failed (expected for 8B model)"
+elif [[ "$PASS_RATE" -ge 40 ]]; then
+  log "RESULT: ⚠️ LOW — $PASSED/$TOTAL passed ($PASS_RATE%) — model may need prompt tuning"
+else
+  log "RESULT: ❌ FAILED — $PASSED/$TOTAL passed ($PASS_RATE%) — check assertion errors above"
+fi
+
+log ""
+log "Full log : $LOG_FILE"
+log "View UI : npx promptfoo view"
+log ""
+
+# ── Notify when done (macOS) ──────────────────────────────────────────────────
+if command -v osascript &>/dev/null; then
+  osascript -e "display notification \"$PASSED/$TOTAL passed ($PASS_RATE%) in ${EVAL_DURATION}s\" with title \"Ollama Evals Done\" subtitle \"Model: $OLLAMA_MODEL\"" 2>/dev/null || true
+fi
+
+exit "$EVAL_EXIT"
diff --git a/services/extraction-service/package.json b/services/extraction-service/package.json
index c3e392e3..9da6e5f4 100644
--- a/services/extraction-service/package.json
+++ b/services/extraction-service/package.json
@@ -10,7 +10,13 @@
     "start": "node dist/server.js",
     "test": "vitest run",
     "test:watch": "vitest",
-    "lint": "eslint src/"
+    "lint": "eslint src/",
+    "eval": "bash evals/run-evals.sh",
+    "eval:ci": "bash evals/run-evals.sh --ci",
+    "eval:task": "bash evals/run-evals.sh --task",
+    "eval:json": "bash evals/run-evals.sh --output json",
+    "eval:ollama": "npx promptfoo eval --config evals/promptfoo.ollama.yaml",
+    "eval:compare": "GEMINI_OUT=evals/.results-gemini.json OLLAMA_OUT=evals/.results-ollama.json bash evals/compare-evals.sh"
   },
   "dependencies": {
     "@bytelyst/auth": "workspace:*",