# learning_ai_common_plat/services/extraction-service/evals/promptfoo.ollama.yaml

# promptfoo eval config — Ollama (local OSS models)
# Runs the same 19 extraction cases directly against Ollama's OpenAI-compatible API.
#
# Usage:
#   pnpm eval:ollama                        # run with llama3.1:8b (default)
#   OLLAMA_MODEL=qwen2.5:7b pnpm eval:ollama
#   pnpm eval:compare                       # run both gemini + ollama and diff
#
# Prerequisites:
#   1. ollama serve  (running on localhost:11434)
#   2. ollama pull llama3.1:8b
#
# NOTE: promptfoo javascript assertions must be single expressions (no const/return).
#       Use JSON.parse(output).extractions... chained directly.

# Human-readable title of this eval suite.
description: 'Extraction Service — LLM Output Quality Evals (Ollama / Local)'

# One provider: the OpenAI chat client pointed at an OpenAI-compatible base URL.
# The model name and the base URL are read from the environment, with defaults
# for a local Ollama instance.
providers:
  - id: "openai:chat:{{env.OLLAMA_MODEL | default('llama3.1:8b')}}"
    config:
      # OLLAMA_BASE_URL overrides the localhost default.
      apiBaseUrl: "{{env.OLLAMA_BASE_URL | default('http://localhost:11434/v1')}}"
      # Placeholder value. NOTE(review): presumably not validated by Ollama — confirm.
      apiKey: 'ollama'
      temperature: 0.1
      # Ask for a JSON object so the assertions can JSON.parse the raw output.
      response_format: {type: json_object}

# Single prompt template shared by every test case. The taskPrompt, classes and
# text placeholders are filled from each test's `vars`. The response shape it
# requests (an object with an "extractions" array) and the attribute names it
# lists (brain, valence, frequency, level) are what the javascript assertions
# under `tests` read back from the model output.
prompts:
  - |
    You are a structured information extraction engine.

    Task: {{taskPrompt}}

    Extract entities from the text below. You MUST only use these extraction classes: {{classes}}

    Return ONLY a valid JSON object in this exact format — no markdown, no explanation:
    {
      "extractions": [
        {
          "extraction_class": "<one of the allowed classes>",
          "extraction_text": "<verbatim text from the input>",
          "attributes": {}
        }
      ]
    }

    For brain_signal extractions, set attributes.brain to one of: work, home, money, health, global
    For emotion/emotional_state extractions, set attributes.valence to: positive, negative, or neutral
    For pattern extractions, set attributes.frequency to: recurring, occasional, or one-time
    For severity extractions, set attributes.level to: critical, high, medium, or low

    Text to extract from:
    {{text}}

# Settings applied to every entry under `tests` in addition to its own asserts.
defaultTest:
  assert:
    # Fail any case whose response latency exceeds 60000 ms.
    - threshold: 60000
      type: latency
  options:
    # NOTE(review): presumably a per-case timeout in milliseconds; confirm that
    # promptfoo honours `timeoutMs` at this location.
    timeoutMs: 90000

tests:
  # ── transcript-extraction ──────────────────────────────────────
  - description: 'transcript: extracts action item and deadline'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Extract action items, decisions, questions, deadlines, people, and topics
        from the following transcript.
      classes: 'action_item, decision, question, deadline, person, topic'
      text: >-
        John said we need to ship the feature by Friday.
        Sarah agreed to handle the testing.
    assert:
      # At least one extraction of each class: action_item, deadline, person.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('action_item')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('deadline')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('person')
      # Some extraction text must mention "friday" or "ship" (case-insensitive).
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_text.toLowerCase()}).some(function(t){return t.includes('friday')||t.includes('ship')})

  - description: 'transcript: extracts decision from meeting note'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Extract action items, decisions, questions, deadlines, people, and topics
        from the following transcript.
      classes: 'action_item, decision, question, deadline, person, topic'
      text: >-
        The team decided to postpone the launch to Q3.
        Alice will notify all stakeholders by Monday.
    assert:
      # A decision and an action_item must both be present.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('decision')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('action_item')
      # Lenient who/what check: any listed class, or text mentioning alice/team.
      # NOTE(review): the class list contains 'action_item' and 'decision', which
      # the two checks above already require, so this passes whenever they do.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return ['person','topic','entity','action_item','decision'].includes(e.extraction_class)||e.extraction_text.toLowerCase().includes('alice')||e.extraction_text.toLowerCase().includes('team')})
      # Lenient when/what check: any listed class, or text mentioning
      # monday/q3/postpone. NOTE(review): same overlap with the first two checks.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return ['deadline','date_reference','topic','decision','action_item'].includes(e.extraction_class)||e.extraction_text.toLowerCase().includes('monday')||e.extraction_text.toLowerCase().includes('q3')||e.extraction_text.toLowerCase().includes('postpone')})

  - description: 'transcript: extracts question from discussion'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Extract action items, decisions, questions, deadlines, people, and topics
        from the following transcript.
      classes: 'action_item, decision, question, deadline, person, topic'
      text: >-
        Bob asked: should we use Postgres or Cosmos DB for the new service?
        No decision was made.
    assert:
      # A question must be extracted.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('question')
      # Lenient check: any listed class, or text mentioning bob/postgres/cosmos.
      # NOTE(review): the class list contains 'question', which the check above
      # already requires, so this passes whenever that one does.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return ['person','topic','entity','question'].includes(e.extraction_class)||e.extraction_text.toLowerCase().includes('bob')||e.extraction_text.toLowerCase().includes('postgres')||e.extraction_text.toLowerCase().includes('cosmos')})

  - description: 'transcript: handles multi-person transcript'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Extract action items, decisions, questions, deadlines, people, and topics
        from the following transcript.
      classes: 'action_item, decision, question, deadline, person, topic'
      text: >-
        Maria: I finished the design mockups.
        Tom: Great, I'll review them by EOD.
        Maria: Can you also check the mobile screens?
    assert:
      # An action_item must be extracted.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('action_item')
      # Lenient class check. NOTE(review): the list contains 'action_item', which
      # the check above already requires, so this passes whenever that one does.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return ['person','topic','entity','action_item','question'].includes(e.extraction_class)})
      # Some extraction text must mention one of the listed keywords.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_text.toLowerCase()}).some(function(t){return t.includes('maria')||t.includes('tom')||t.includes('design')||t.includes('mobile')||t.includes('review')||t.includes('mockup')})
      # At least two extractions overall.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.length >= 2

  # ── triage ─────────────────────────────────────────────────────
  - description: 'triage: health brain signal for medical content'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the user capture and extract topics, named entities, action
        items, emotional signals, date references, and brain routing signals.
        Brain signals should include which brain (work, home, money, health,
        global) the content belongs to.
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: >-
        Remind me to call the dentist tomorrow about my appointment.
        I'm stressed about the cost.
    assert:
      # At least one extraction of each class: action, date_reference, emotion.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('action')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('date_reference')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('emotion')
      # Either a brain_signal with attributes.brain === 'health', or any
      # extraction text mentioning dentist/health/appointment.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return (e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='health')||e.extraction_text.toLowerCase().includes('dentist')||e.extraction_text.toLowerCase().includes('health')||e.extraction_text.toLowerCase().includes('appointment')})

  - description: 'triage: work brain signal for project content'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the user capture and extract topics, named entities, action
        items, emotional signals, date references, and brain routing signals.
        Brain signals should include which brain (work, home, money, health,
        global) the content belongs to.
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: >-
        Need to finish the Q1 report for my manager by end of week.
        The presentation is on Thursday.
    assert:
      # An action and a date_reference must both be present.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('action')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('date_reference')
      # Either a brain_signal with attributes.brain === 'work', or any
      # extraction text mentioning report/manager/presentation.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return (e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='work')||e.extraction_text.toLowerCase().includes('report')||e.extraction_text.toLowerCase().includes('manager')||e.extraction_text.toLowerCase().includes('presentation')})

  - description: 'triage: money brain signal for financial content'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the user capture and extract topics, named entities, action
        items, emotional signals, date references, and brain routing signals.
        Brain signals should include which brain (work, home, money, health,
        global) the content belongs to.
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: >-
        I need to pay the credit card bill before the 15th
        or I'll get charged interest.
    assert:
      # An action and a date_reference must both be present.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('action')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('date_reference')
      # Either a brain_signal with attributes.brain === 'money', or any
      # extraction text mentioning credit/bill/interest.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return (e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='money')||e.extraction_text.toLowerCase().includes('credit')||e.extraction_text.toLowerCase().includes('bill')||e.extraction_text.toLowerCase().includes('interest')})

  - description: 'triage: negative emotion detected'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the user capture and extract topics, named entities, action
        items, emotional signals, date references, and brain routing signals.
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: >-
        Feeling really overwhelmed today.
        Too many things on my plate and I can't focus.
    assert:
      # An emotion must be extracted ...
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('emotion')
      # ... and at least one emotion must carry attributes.valence === 'negative'.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return e.extraction_class==='emotion'&&e.attributes&&e.attributes.valence==='negative'})

  - description: 'triage: multiple brain signals for mixed content'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the user capture and extract topics, named entities, action
        items, emotional signals, date references, and brain routing signals.
        Brain signals should include which brain (work, home, money, health,
        global) the content belongs to.
      classes: 'topic, entity, action, emotion, date_reference, brain_signal'
      text: >-
        Doctor said I need to exercise more.
        Also need to check my 401k contributions before year end.
    assert:
      # Health side: a brain_signal with brain === 'health', or text mentioning
      # doctor/exercise.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return (e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='health')||e.extraction_text.toLowerCase().includes('doctor')||e.extraction_text.toLowerCase().includes('exercise')})
      # Money side: a brain_signal with brain === 'money', or text mentioning
      # 401k/contribution.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return (e.extraction_class==='brain_signal'&&e.attributes&&e.attributes.brain==='money')||e.extraction_text.toLowerCase().includes('401k')||e.extraction_text.toLowerCase().includes('contribution')})

  # ── memory-insight ─────────────────────────────────────────────
  - description: 'memory-insight: detects recurring pattern'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the collection of memory items and extract recurring patterns,
        themes, relationships between items, and milestones.
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: >-
        Item 1: Skipped gym again.
        Item 2: Feeling tired at work.
        Item 3: Had coffee at 4pm to stay awake.
        Item 4: Skipped gym for the third time this week.
    assert:
      # At least one pattern, recurring_theme or relationship extraction.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return ['pattern','recurring_theme','relationship'].includes(e.extraction_class)})
      # At least one pattern or recurring_theme extraction.
      # attributes.frequency is deliberately NOT enforced here. The earlier form
      # of this check was `(classMatch && frequency==='recurring') || classMatch`,
      # which reduces to `classMatch`, so the frequency clause never changed the
      # result. It is written in the reduced form to make that explicit.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return ['pattern','recurring_theme'].includes(e.extraction_class)})

  - description: 'memory-insight: detects relationship between items'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the collection of memory items and extract recurring patterns,
        themes, relationships between items, and milestones.
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: >-
        Item 1: Stayed up until 2am coding.
        Item 2: Missed standup the next morning.
        Item 3: Felt foggy all day.
        Item 4: Late night again, can't stop.
    assert:
      # Both a relationship and a pattern must be extracted.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('relationship')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('pattern')

  - description: 'memory-insight: detects milestone'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the collection of memory items and extract recurring patterns,
        themes, relationships between items, and milestones.
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: >-
        Item 1: Started learning Spanish 3 months ago.
        Item 2: Had first full conversation in Spanish today.
        Item 3: Completed Duolingo 90-day streak.
    assert:
      # A milestone must be extracted.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('milestone')
      # At least two extractions overall.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.length >= 2

  - description: 'memory-insight: detects recurring theme across entries'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the collection of memory items and extract recurring patterns,
        themes, relationships between items, and milestones.
      classes: 'pattern, recurring_theme, relationship, milestone'
      text: >-
        Entry 1: Anxious before the presentation.
        Entry 2: Nervous about the client call.
        Entry 3: Worried about the demo tomorrow.
        Entry 4: Stressed about the board meeting.
    assert:
      # A recurring_theme must be extracted.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('recurring_theme')
      # At least one extraction overall. NOTE(review): already implied by the
      # class check above.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.length >= 1

  # ── reflection-enrichment ──────────────────────────────────────
  - description: 'reflection: extracts accomplishment and concern'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the reflection or journal entry and extract emotional states,
        accomplishments, concerns, and goal progress indicators.
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: >-
        Good day overall.
        Finally finished the proposal I've been putting off.
        Still worried about the budget review next week.
    assert:
      # An accomplishment and a concern must both be present.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('accomplishment')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('concern')
      # Lenient class check. NOTE(review): the list contains 'concern' and
      # 'accomplishment', which the two checks above already require, so this
      # passes whenever they do.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return ['emotional_state','emotion','concern','accomplishment'].includes(e.extraction_class)})

  - description: 'reflection: positive emotional state detected'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the reflection or journal entry and extract emotional states,
        accomplishments, concerns, and goal progress indicators.
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: >-
        Had a fantastic week.
        Shipped the new feature, got great feedback from users,
        and the team celebrated together.
    assert:
      # An emotional_state must be extracted ...
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('emotional_state')
      # ... and at least one must carry attributes.valence === 'positive'.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return e.extraction_class==='emotional_state'&&e.attributes&&e.attributes.valence==='positive'})
      # An accomplishment must be extracted as well.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('accomplishment')

  - description: 'reflection: goal progress detected'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the reflection or journal entry and extract emotional states,
        accomplishments, concerns, and goal progress indicators.
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: >-
        I've been trying to read more this year.
        This month I finished my third book — ahead of my goal of one per month.
    assert:
      # Both goal_progress and accomplishment must be extracted.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('goal_progress')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('accomplishment')

  - description: 'reflection: mixed positive and negative signals'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Analyze the reflection or journal entry and extract emotional states,
        accomplishments, concerns, and goal progress indicators.
      classes: 'emotional_state, accomplishment, concern, goal_progress'
      text: >-
        Proud of finishing the marathon training plan.
        But I'm really worried I won't be able to run the actual race — my knee
        has been acting up.
    assert:
      # An accomplishment and a concern must both be present.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('accomplishment')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('concern')
      # At least one emotional_state with attributes.valence === 'positive'.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return e.extraction_class==='emotional_state'&&e.attributes&&e.attributes.valence==='positive'})

  # ── bug-report-extraction ──────────────────────────────────────
  - description: 'bug-report: extracts all 5 fields'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Extract steps to reproduce, expected behavior, actual behavior, affected
        component, and severity from the bug report.
      classes: 'steps_to_reproduce, expected_behavior, actual_behavior, affected_component, severity'
      text: >-
        When I click the save button on the settings page, nothing happens.
        It should save my preferences.
        This is a critical issue affecting all users.
    assert:
      # One check per expected class; all five must be present.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('steps_to_reproduce')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('expected_behavior')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('actual_behavior')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('affected_component')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('severity')
      # At least one severity extraction with attributes.level === 'critical'.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.some(function(e){return e.extraction_class==='severity'&&e.attributes&&e.attributes.level==='critical'})

  - description: 'bug-report: extracts steps and component from login bug'
    vars:
      # Folded scalars (>-): the line breaks below become single spaces.
      taskPrompt: >-
        Extract steps to reproduce, expected behavior, actual behavior, affected
        component, and severity from the bug report.
      classes: 'steps_to_reproduce, expected_behavior, actual_behavior, affected_component, severity'
      text: >-
        Steps: 1) Open login page, 2) Enter valid credentials, 3) Click login.
        Expected: redirect to dashboard.
        Actual: spinner shows forever.
        Affects the login page on mobile.
    assert:
      # One check per expected class; severity is not asserted for this report.
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('steps_to_reproduce')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('expected_behavior')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('actual_behavior')
      - type: javascript
        value: |-
          JSON.parse(output).extractions.map(function(e){return e.extraction_class}).includes('affected_component')