learning_ai_common_plat/services/extraction-service/evals/promptfoo.yaml
saravanakumardb1 acd4c3542b feat(extraction-service): scaffold promptfoo eval suite with 19 test cases
- Add evals/promptfoo.yaml: HTTP provider hitting extraction-service API
  covering all 5 built-in tasks (transcript, triage, memory-insight,
  reflection-enrichment, bug-report-extraction)
- Add evals/fixtures/golden.json: machine-readable golden input/output fixtures
- Add evals/run-evals.sh: shell runner with health checks, auth token
  handling, task filtering, and CI mode
- Add evals/README.md: usage docs, prerequisites, cost estimates, CI integration
2026-02-19 12:19:16 -08:00

320 lines
13 KiB
YAML

# promptfoo eval config for extraction-service
# Docs: https://promptfoo.dev/docs/configuration/guide
#
# Usage:
#   pnpm eval              # run all evals (service must be running on port 4005)
#   pnpm eval:task triage  # run a single task suite
#   pnpm eval:ci           # CI mode — fail on any assertion failure
#
# Prerequisites:
#   1. extraction-service running: pnpm dev (port 4005)
#   2. EXTRACTION_EVAL_TOKEN set in env (any valid JWT from platform-service)
# Human-readable label for this eval configuration.
description: 'Extraction Service — LLM Output Quality Evals'
# ── Provider: the extraction-service HTTP API ────────────────────
providers:
# Single HTTP provider: every test case is sent as one POST to /api/extract.
- id: http
  config:
    # Base URL comes from EXTRACTION_SERVICE_URL, falling back to the local dev port.
    url: "{{env.EXTRACTION_SERVICE_URL | default('http://localhost:4005')}}/api/extract"
    method: POST
    headers:
      Content-Type: application/json
      # The bearer token is read from the environment so no secret is committed.
      Authorization: 'Bearer {{env.EXTRACTION_EVAL_TOKEN}}'
    body:
      # `text` and `taskId` are filled in from each test's `vars`.
      text: '{{text}}'
      taskId: '{{taskId}}'
      productId: "{{env.EVAL_PRODUCT_ID | default('lysnrai')}}"
    # Reshape the JSON response into the object the assertions read as `output`.
    # Written as a bare object expression (no `return` / `;`), the string form
    # promptfoo documents for transformResponse.
    # A response without `extractions` (e.g. an error payload) yields empty
    # lists, so assertions fail cleanly instead of the transform throwing.
    transformResponse: |
      {
        extractions: json?.extractions ?? [],
        classes: (json?.extractions ?? []).map(e => e.extraction_class),
        texts: (json?.extractions ?? []).map(e => e.extraction_text),
        durationMs: json?.metadata?.durationMs,
      }
# ── Default assertion thresholds ────────────────────────────────
# Defaults shared by the test cases in this file.
defaultTest:
  options:
    # NOTE(review): promptfoo documents per-test timeouts under
    # evaluateOptions.timeoutMs — confirm this key is honored under `options`.
    timeoutMs: 30000
  assert:
  # Latency ceiling for each request.
  # NOTE(review): promptfoo's latency assertion expects caching to be disabled
  # (e.g. --no-cache) — confirm the runner script does this.
  - type: latency
    threshold: 15000
# ── Test suites per task ─────────────────────────────────────────
tests:
# ── transcript-extraction ──────────────────────────────────────
# Expects action_item, deadline and person classes, plus at least one
# extraction whose text mentions "friday" or "ship".
- description: 'transcript: extracts action item and deadline'
  vars:
    taskId: transcript-extraction
    text: 'John said we need to ship the feature by Friday. Sarah agreed to handle the testing.'
  assert:
  # Single-line expressions are plain scalars (not `|` blocks) so no trailing
  # newline is appended to the JavaScript source, matching the other tests.
  - type: javascript
    value: output.classes.includes('action_item')
  - type: javascript
    value: output.classes.includes('deadline')
  - type: javascript
    value: output.classes.includes('person')
  - type: javascript
    value: output.texts.some(t => t.toLowerCase().includes('friday') || t.toLowerCase().includes('ship'))
# Expects decision, action_item, person and deadline classes in the output.
- description: 'transcript: extracts decision from meeting note'
  vars:
    taskId: transcript-extraction
    text: 'The team decided to postpone the launch to Q3. Alice will notify all stakeholders by Monday.'
  assert:
  - type: javascript
    value: output.classes.includes('decision')
  - type: javascript
    value: output.classes.includes('action_item')
  - type: javascript
    value: output.classes.includes('person')
  - type: javascript
    value: output.classes.includes('deadline')
# Expects question and person classes in the output.
- description: 'transcript: extracts question from discussion'
  vars:
    taskId: transcript-extraction
    text: 'Bob asked: should we use Postgres or Cosmos DB for the new service? No decision was made.'
  assert:
  - type: javascript
    value: output.classes.includes('question')
  - type: javascript
    value: output.classes.includes('person')
# Expects action_item and person classes, an extraction mentioning "maria" or
# "tom", and at least three extractions overall.
- description: 'transcript: handles multi-person transcript'
  vars:
    taskId: transcript-extraction
    text: "Maria: I finished the design mockups. Tom: Great, I'll review them by EOD. Maria: Can you also check the mobile screens?"
  assert:
  - type: javascript
    value: output.classes.includes('action_item')
  - type: javascript
    value: output.classes.includes('person')
  - type: javascript
    value: output.texts.some(t => t.toLowerCase().includes('maria') || t.toLowerCase().includes('tom'))
  - type: javascript
    value: output.extractions.length >= 3
# ── triage ─────────────────────────────────────────────────────
# Expects action, date_reference and emotion classes, plus a brain_signal
# extraction whose `brain` attribute is 'health'.
- description: 'triage: health brain signal for medical content'
  vars:
    taskId: triage
    text: "Remind me to call the dentist tomorrow about my appointment. I'm stressed about the cost."
  assert:
  - type: javascript
    value: output.classes.includes('action')
  - type: javascript
    value: output.classes.includes('date_reference')
  - type: javascript
    value: output.classes.includes('emotion')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' &&
        e.attributes?.brain === 'health'
      );
# Expects action and date_reference classes, plus a brain_signal extraction
# whose `brain` attribute is 'work'.
- description: 'triage: work brain signal for project content'
  vars:
    taskId: triage
    text: 'Need to finish the Q1 report for my manager by end of week. The presentation is on Thursday.'
  assert:
  - type: javascript
    value: output.classes.includes('action')
  - type: javascript
    value: output.classes.includes('date_reference')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' &&
        e.attributes?.brain === 'work'
      );
# Expects action and date_reference classes, plus a brain_signal extraction
# whose `brain` attribute is 'money'.
- description: 'triage: money brain signal for financial content'
  vars:
    taskId: triage
    text: "I need to pay the credit card bill before the 15th or I'll get charged interest."
  assert:
  - type: javascript
    value: output.classes.includes('action')
  - type: javascript
    value: output.classes.includes('date_reference')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' &&
        e.attributes?.brain === 'money'
      );
# Expects an emotion class, with at least one emotion extraction whose
# `valence` attribute is 'negative'.
- description: 'triage: negative emotion detected'
  vars:
    taskId: triage
    text: "Feeling really overwhelmed today. Too many things on my plate and I can't focus."
  assert:
  - type: javascript
    value: output.classes.includes('emotion')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'emotion' &&
        e.attributes?.valence === 'negative'
      );
# Expects brain_signal extractions for both 'health' and 'money'.
- description: 'triage: multiple brain signals for mixed content'
  vars:
    taskId: triage
    text: 'Doctor said I need to exercise more. Also need to check my 401k contributions before year end.'
  assert:
  # Multi-line values run as a function body, so each result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' && e.attributes?.brain === 'health'
      );
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' && e.attributes?.brain === 'money'
      );
# ── memory-insight ─────────────────────────────────────────────
# Expects a pattern class, with at least one pattern extraction whose
# `frequency` attribute is 'recurring'.
- description: 'memory-insight: detects recurring pattern'
  vars:
    taskId: memory-insight
    text: 'Item 1: Skipped gym again. Item 2: Feeling tired at work. Item 3: Had coffee at 4pm to stay awake. Item 4: Skipped gym for the third time this week.'
  assert:
  - type: javascript
    value: output.classes.includes('pattern')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'pattern' &&
        e.attributes?.frequency === 'recurring'
      );
# Expects relationship and pattern classes in the output.
- description: 'memory-insight: detects relationship between items'
  vars:
    taskId: memory-insight
    text: "Item 1: Stayed up until 2am coding. Item 2: Missed standup the next morning. Item 3: Felt foggy all day. Item 4: Late night again, can't stop."
  assert:
  - type: javascript
    value: output.classes.includes('relationship')
  - type: javascript
    value: output.classes.includes('pattern')
# Expects a milestone class and at least two extractions overall.
- description: 'memory-insight: detects milestone'
  vars:
    taskId: memory-insight
    text: 'Item 1: Started learning Spanish 3 months ago. Item 2: Had first full conversation in Spanish today. Item 3: Completed Duolingo 90-day streak.'
  assert:
  - type: javascript
    value: output.classes.includes('milestone')
  - type: javascript
    value: output.extractions.length >= 2
# Expects a recurring_theme class and at least one extraction overall.
- description: 'memory-insight: detects recurring theme across entries'
  vars:
    taskId: memory-insight
    text: 'Entry 1: Anxious before the presentation. Entry 2: Nervous about the client call. Entry 3: Worried about the demo tomorrow. Entry 4: Stressed about the board meeting.'
  assert:
  - type: javascript
    value: output.classes.includes('recurring_theme')
  - type: javascript
    value: output.extractions.length >= 1
# ── reflection-enrichment ──────────────────────────────────────
# Expects accomplishment, concern and emotional_state classes in the output.
- description: 'reflection: extracts accomplishment and concern'
  vars:
    taskId: reflection-enrichment
    text: "Good day overall. Finally finished the proposal I've been putting off. Still worried about the budget review next week."
  assert:
  - type: javascript
    value: output.classes.includes('accomplishment')
  - type: javascript
    value: output.classes.includes('concern')
  - type: javascript
    value: output.classes.includes('emotional_state')
# Expects emotional_state and accomplishment classes, with at least one
# emotional_state extraction whose `valence` attribute is 'positive'.
- description: 'reflection: positive emotional state detected'
  vars:
    taskId: reflection-enrichment
    text: 'Had a fantastic week. Shipped the new feature, got great feedback from users, and the team celebrated together.'
  assert:
  - type: javascript
    value: output.classes.includes('emotional_state')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'emotional_state' &&
        e.attributes?.valence === 'positive'
      );
  - type: javascript
    value: output.classes.includes('accomplishment')
# Expects goal_progress and accomplishment classes in the output.
- description: 'reflection: goal progress detected'
  vars:
    taskId: reflection-enrichment
    text: "I've been trying to read more this year. This month I finished my third book — ahead of my goal of one per month."
  assert:
  - type: javascript
    value: output.classes.includes('goal_progress')
  - type: javascript
    value: output.classes.includes('accomplishment')
# Expects accomplishment and concern classes, plus an emotional_state
# extraction whose `valence` attribute is 'positive'.
- description: 'reflection: mixed positive and negative signals'
  vars:
    taskId: reflection-enrichment
    text: "Proud of finishing the marathon training plan. But I'm really worried I won't be able to run the actual race — my knee has been acting up."
  assert:
  - type: javascript
    value: output.classes.includes('accomplishment')
  - type: javascript
    value: output.classes.includes('concern')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'emotional_state' &&
        e.attributes?.valence === 'positive'
      );
# ── bug-report-extraction ──────────────────────────────────────
# Expects all five bug-report classes, plus a severity extraction whose
# `level` attribute is 'critical'.
- description: 'bug-report: extracts all 5 fields'
  vars:
    taskId: bug-report-extraction
    text: 'When I click the save button on the settings page, nothing happens. It should save my preferences. This is a critical issue affecting all users.'
  assert:
  - type: javascript
    value: output.classes.includes('steps_to_reproduce')
  - type: javascript
    value: output.classes.includes('expected_behavior')
  - type: javascript
    value: output.classes.includes('actual_behavior')
  - type: javascript
    value: output.classes.includes('affected_component')
  - type: javascript
    value: output.classes.includes('severity')
  # Multi-line values run as a function body, so the result must be returned
  # explicitly; a bare expression would evaluate to undefined.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'severity' &&
        e.attributes?.level === 'critical'
      );
# Expects steps_to_reproduce, expected_behavior, actual_behavior and
# affected_component classes in the output.
- description: 'bug-report: extracts steps and component from login bug'
  vars:
    taskId: bug-report-extraction
    text: 'Steps: 1) Open login page, 2) Enter valid credentials, 3) Click login. Expected: redirect to dashboard. Actual: spinner shows forever. Affects the login page on mobile.'
  assert:
  - type: javascript
    value: output.classes.includes('steps_to_reproduce')
  - type: javascript
    value: output.classes.includes('expected_behavior')
  - type: javascript
    value: output.classes.includes('actual_behavior')
  - type: javascript
    value: output.classes.includes('affected_component')