#!/usr/bin/env bash # run-ollama-evals-logged.sh # Runs Ollama evals unattended and writes a structured log file. # You can walk away — check the log when it's done. # # Usage: # ./evals/run-ollama-evals-logged.sh # OLLAMA_MODEL=qwen2.5:7b ./evals/run-ollama-evals-logged.sh # OLLAMA_MODEL=qwen2.5-coder:32b ./evals/run-ollama-evals-logged.sh # # Log written to: evals/logs/ollama-eval--.log # Summary line at end of log — grep for RESULT: to get pass/fail at a glance. set -euo pipefail # ── Config ──────────────────────────────────────────────────────────────────── SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SERVICE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" LOG_DIR="$SCRIPT_DIR/logs" OLLAMA_MODEL="${OLLAMA_MODEL:-llama3.1:8b}" OLLAMA_BASE_URL="${OLLAMA_BASE_URL:-http://localhost:11434/v1}" TIMESTAMP="$(date +%Y-%m-%dT%H-%M-%S)" SAFE_MODEL="${OLLAMA_MODEL//:/-}" # llama3.1:8b → llama3.1-8b LOG_FILE="$LOG_DIR/ollama-eval-${SAFE_MODEL}-${TIMESTAMP}.log" PROMPTFOO_CONFIG="$SCRIPT_DIR/promptfoo.ollama.yaml" mkdir -p "$LOG_DIR" # ── Helpers ─────────────────────────────────────────────────────────────────── log() { echo "$1" | tee -a "$LOG_FILE"; } log_section() { log ""; log "══════════════════════════════════════════════════"; log " $1"; log "══════════════════════════════════════════════════"; } # ── Header ──────────────────────────────────────────────────────────────────── log_section "OLLAMA EVAL RUN" log "Timestamp : $TIMESTAMP" log "Model : $OLLAMA_MODEL" log "Ollama URL : $OLLAMA_BASE_URL" log "Config : $PROMPTFOO_CONFIG" log "Log file : $LOG_FILE" log "" # ── Step 1: Check Ollama is running ─────────────────────────────────────────── log_section "STEP 1: Ollama health check" if curl -sf "$OLLAMA_BASE_URL/../tags" > /dev/null 2>&1 || \ curl -sf "http://localhost:11434/api/tags" > /dev/null 2>&1; then log "✅ Ollama is running at $OLLAMA_BASE_URL" else log "❌ Ollama is NOT running. Start it with: ollama serve" log "" log "RESULT: FAILED — Ollama not running" exit 1 fi # ── Step 2: Check model is available ───────────────────────────────────────── log_section "STEP 2: Model availability check" MODELS_JSON="$(curl -sf http://localhost:11434/api/tags 2>/dev/null || echo '{}')" MODEL_BASE="${OLLAMA_MODEL%%:*}" # llama3.1:8b → llama3.1 if echo "$MODELS_JSON" | grep -q "$MODEL_BASE"; then log "✅ Model '$OLLAMA_MODEL' is available" else log "⚠️ Model '$OLLAMA_MODEL' not found locally. Pulling now..." log "" ollama pull "$OLLAMA_MODEL" 2>&1 | tee -a "$LOG_FILE" log "" log "✅ Pull complete" fi # ── Step 3: Smoke test — verify model returns valid JSON ────────────────────── log_section "STEP 3: Smoke test (JSON output check)" SMOKE_RESPONSE="$(curl -sf http://localhost:11434/v1/chat/completions \ -H "Content-Type: application/json" \ -d "{ \"model\": \"$OLLAMA_MODEL\", \"messages\": [{\"role\": \"user\", \"content\": \"Return only this JSON, no other text: {\\\"extractions\\\": [{\\\"extraction_class\\\": \\\"action\\\", \\\"extraction_text\\\": \\\"test\\\"}]}\"}], \"response_format\": {\"type\": \"json_object\"}, \"temperature\": 0.1 }" 2>/dev/null || echo "CURL_FAILED")" if echo "$SMOKE_RESPONSE" | grep -q "extractions"; then log "✅ Smoke test passed — model returns valid JSON with 'extractions' key" elif echo "$SMOKE_RESPONSE" | grep -q "CURL_FAILED"; then log "❌ Smoke test failed — could not reach Ollama API" log "RESULT: FAILED — API unreachable" exit 1 else log "⚠️ Smoke test response (may still work in evals):" echo "$SMOKE_RESPONSE" | head -5 | tee -a "$LOG_FILE" fi # ── Step 4: Clear promptfoo cache ───────────────────────────────────────────── log_section "STEP 4: Clear promptfoo cache" rm -rf "$SERVICE_DIR/.promptfoo" 2>/dev/null && log "✅ Local .promptfoo cache cleared" || log "ℹ️ No local cache found" rm -rf ~/.promptfoo/cache 2>/dev/null && log "✅ Global ~/.promptfoo/cache cleared" || log "ℹ️ No global cache found" # ── Step 5: Run evals ───────────────────────────────────────────────────────── log_section "STEP 5: Running promptfoo evals" log "Model: $OLLAMA_MODEL | Started: $(date)" log "" EVAL_START="$(date +%s)" # Run promptfoo, capture full output to log, also stream to terminal OLLAMA_MODEL="$OLLAMA_MODEL" OLLAMA_BASE_URL="$OLLAMA_BASE_URL" \ npx promptfoo eval \ --config "$PROMPTFOO_CONFIG" \ --no-cache \ --no-progress-bar \ 2>&1 | tee -a "$LOG_FILE" EVAL_EXIT="${PIPESTATUS[0]}" EVAL_END="$(date +%s)" EVAL_DURATION=$(( EVAL_END - EVAL_START )) # ── Step 6: Parse results ───────────────────────────────────────────────────── log_section "STEP 6: Results summary" log "Duration : ${EVAL_DURATION}s" log "Finished : $(date)" log "" # Extract pass/fail line from log RESULTS_LINE="$(grep -E "Results:.*passed" "$LOG_FILE" | tail -1 || echo "Results line not found")" log "Raw result : $RESULTS_LINE" log "" # Parse numbers PASSED="$(echo "$RESULTS_LINE" | grep -oE '[0-9]+ passed' | grep -oE '[0-9]+' || echo 0)" FAILED="$(echo "$RESULTS_LINE" | grep -oE '[0-9]+ failed' | grep -oE '[0-9]+' || echo 0)" TOTAL=$(( PASSED + FAILED )) if [[ "$TOTAL" -gt 0 ]]; then PASS_RATE=$(( PASSED * 100 / TOTAL )) else PASS_RATE=0 fi log "Passed : $PASSED / $TOTAL ($PASS_RATE%)" log "Failed : $FAILED / $TOTAL" log "" # ── Final verdict ───────────────────────────────────────────────────────────── if [[ "$EVAL_EXIT" -eq 0 ]]; then log "RESULT: ✅ PASSED — $PASSED/$TOTAL assertions passed ($PASS_RATE%)" elif [[ "$PASS_RATE" -ge 70 ]]; then log "RESULT: ⚠️ PARTIAL — $PASSED/$TOTAL passed ($PASS_RATE%) — some assertions failed (expected for 8B model)" elif [[ "$PASS_RATE" -ge 40 ]]; then log "RESULT: ⚠️ LOW — $PASSED/$TOTAL passed ($PASS_RATE%) — model may need prompt tuning" else log "RESULT: ❌ FAILED — $PASSED/$TOTAL passed ($PASS_RATE%) — check assertion errors above" fi log "" log "Full log : $LOG_FILE" log "View UI : npx promptfoo view" log "" # ── Notify when done (macOS) ────────────────────────────────────────────────── if command -v osascript &>/dev/null; then osascript -e "display notification \"$PASSED/$TOTAL passed ($PASS_RATE%) in ${EVAL_DURATION}s\" with title \"Ollama Evals Done\" subtitle \"Model: $OLLAMA_MODEL\"" 2>/dev/null || true fi exit "$EVAL_EXIT"