- Add evals/promptfoo.yaml: HTTP provider hitting extraction-service API covering all 5 built-in tasks (transcript, triage, memory-insight, reflection-enrichment, bug-report-extraction) - Add evals/fixtures/golden.json: machine-readable golden input/output fixtures - Add evals/run-evals.sh: shell runner with health checks, auth token handling, task filtering, and CI mode - Add evals/README.md: usage docs, prerequisites, cost estimates, CI integration
320 lines
13 KiB
YAML
320 lines
13 KiB
YAML
# promptfoo eval config for extraction-service
# Docs: https://promptfoo.dev/docs/configuration/guide
#
# Usage:
#   pnpm eval              # run all evals (service must be running on port 4005)
#   pnpm eval:task triage  # run a single task suite
#   pnpm eval:ci           # CI mode — fail on any assertion failure
#
# Prerequisites:
#   1. extraction-service running: pnpm dev (port 4005)
#   2. EXTRACTION_EVAL_TOKEN set in env (any valid JWT from platform-service)

description: Extraction Service — LLM Output Quality Evals

# ── Provider: the extraction-service HTTP API ────────────────────
# Each test case is sent as a POST to <base URL>/api/extract. The base URL and
# product id fall back to the defaults below when their env vars are unset;
# {{text}} and {{taskId}} are filled in from the test's `vars`.
providers:
- id: http
  config:
    url: "{{env.EXTRACTION_SERVICE_URL | default('http://localhost:4005')}}/api/extract"
    method: POST
    headers:
      Content-Type: application/json
      Authorization: 'Bearer {{env.EXTRACTION_EVAL_TOKEN}}'
    body:
      text: '{{text}}'
      taskId: '{{taskId}}'
      productId: "{{env.EVAL_PRODUCT_ID | default('lysnrai')}}"
    # A string transformResponse is evaluated as a JavaScript *expression*
    # (with `json` bound to the parsed response body), so it must be a bare
    # object literal rather than a `return ...;` statement.
    #
    # The resulting object is what assertions see as `output`:
    #   extractions - the raw extraction objects from the response
    #   classes     - extraction_class of every extraction, in order
    #   texts       - extraction_text of every extraction, in order
    #   durationMs  - metadata.durationMs, or undefined if metadata is absent
    transformResponse: |
      {
        extractions: json.extractions,
        classes: json.extractions.map(e => e.extraction_class),
        texts: json.extractions.map(e => e.extraction_text),
        durationMs: json.metadata?.durationMs,
      }

# ── Default assertion thresholds ────────────────────────────────
# Settings merged into every entry under `tests`.
defaultTest:
  options:
    # NOTE(review): confirm promptfoo honours `timeoutMs` under
    # defaultTest.options for this version; it may need to live elsewhere.
    timeoutMs: 30000
  assert:
  # Every test also asserts on latency; promptfoo documents this threshold
  # as milliseconds.
  - type: latency
    threshold: 15000

# ── Test suites per task ─────────────────────────────────────────
# `output` in every javascript assertion is the object built by the provider's
# transformResponse: { extractions, classes, texts, durationMs }.
#
# Assertion values are kept on ONE line on purpose: promptfoo evaluates a
# single-line value as `return <expr>`, whereas a value containing a newline
# (any `|` block scalar does) is run as a function body and needs its own
# `return`.
tests:
# ── transcript-extraction ──────────────────────────────────────
- description: 'transcript: extracts action item and deadline'
  vars:
    taskId: transcript-extraction
    text: 'John said we need to ship the feature by Friday. Sarah agreed to handle the testing.'
  assert:
  - type: javascript
    value: output.classes.includes('action_item')
  - type: javascript
    value: output.classes.includes('deadline')
  - type: javascript
    value: output.classes.includes('person')
  # At least one extracted span must mention the deadline or the task.
  - type: javascript
    value: output.texts.some(t => t.toLowerCase().includes('friday') || t.toLowerCase().includes('ship'))

- description: 'transcript: extracts decision from meeting note'
  vars:
    taskId: transcript-extraction
    text: 'The team decided to postpone the launch to Q3. Alice will notify all stakeholders by Monday.'
  assert:
  - type: javascript
    value: output.classes.includes('decision')
  - type: javascript
    value: output.classes.includes('action_item')
  - type: javascript
    value: output.classes.includes('person')
  - type: javascript
    value: output.classes.includes('deadline')

- description: 'transcript: extracts question from discussion'
  vars:
    taskId: transcript-extraction
    text: 'Bob asked: should we use Postgres or Cosmos DB for the new service? No decision was made.'
  assert:
  - type: javascript
    value: output.classes.includes('question')
  - type: javascript
    value: output.classes.includes('person')

- description: 'transcript: handles multi-person transcript'
  vars:
    taskId: transcript-extraction
    text: "Maria: I finished the design mockups. Tom: Great, I'll review them by EOD. Maria: Can you also check the mobile screens?"
  assert:
  - type: javascript
    value: output.classes.includes('action_item')
  - type: javascript
    value: output.classes.includes('person')
  - type: javascript
    value: output.texts.some(t => t.toLowerCase().includes('maria') || t.toLowerCase().includes('tom'))
  - type: javascript
    value: output.extractions.length >= 3

# ── triage ─────────────────────────────────────────────────────
# Multi-line assertion values are executed by promptfoo as a function body,
# so each one carries an explicit `return`; without it the assertion yields
# undefined instead of a boolean.
- description: 'triage: health brain signal for medical content'
  vars:
    taskId: triage
    text: "Remind me to call the dentist tomorrow about my appointment. I'm stressed about the cost."
  assert:
  - type: javascript
    value: output.classes.includes('action')
  - type: javascript
    value: output.classes.includes('date_reference')
  - type: javascript
    value: output.classes.includes('emotion')
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' &&
        e.attributes?.brain === 'health'
      );

- description: 'triage: work brain signal for project content'
  vars:
    taskId: triage
    text: 'Need to finish the Q1 report for my manager by end of week. The presentation is on Thursday.'
  assert:
  - type: javascript
    value: output.classes.includes('action')
  - type: javascript
    value: output.classes.includes('date_reference')
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' &&
        e.attributes?.brain === 'work'
      );

- description: 'triage: money brain signal for financial content'
  vars:
    taskId: triage
    text: "I need to pay the credit card bill before the 15th or I'll get charged interest."
  assert:
  - type: javascript
    value: output.classes.includes('action')
  - type: javascript
    value: output.classes.includes('date_reference')
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' &&
        e.attributes?.brain === 'money'
      );

- description: 'triage: negative emotion detected'
  vars:
    taskId: triage
    text: "Feeling really overwhelmed today. Too many things on my plate and I can't focus."
  assert:
  - type: javascript
    value: output.classes.includes('emotion')
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'emotion' &&
        e.attributes?.valence === 'negative'
      );

# One input touching two domains must yield a brain_signal for each.
- description: 'triage: multiple brain signals for mixed content'
  vars:
    taskId: triage
    text: 'Doctor said I need to exercise more. Also need to check my 401k contributions before year end.'
  assert:
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' && e.attributes?.brain === 'health'
      );
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'brain_signal' && e.attributes?.brain === 'money'
      );

# ── memory-insight ─────────────────────────────────────────────
# Inputs are several numbered items/entries joined into one text.
- description: 'memory-insight: detects recurring pattern'
  vars:
    taskId: memory-insight
    text: 'Item 1: Skipped gym again. Item 2: Feeling tired at work. Item 3: Had coffee at 4pm to stay awake. Item 4: Skipped gym for the third time this week.'
  assert:
  - type: javascript
    value: output.classes.includes('pattern')
  # Multi-line values run as a function body in promptfoo, hence the
  # explicit `return`.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'pattern' &&
        e.attributes?.frequency === 'recurring'
      );

- description: 'memory-insight: detects relationship between items'
  vars:
    taskId: memory-insight
    text: "Item 1: Stayed up until 2am coding. Item 2: Missed standup the next morning. Item 3: Felt foggy all day. Item 4: Late night again, can't stop."
  assert:
  - type: javascript
    value: output.classes.includes('relationship')
  - type: javascript
    value: output.classes.includes('pattern')

- description: 'memory-insight: detects milestone'
  vars:
    taskId: memory-insight
    text: 'Item 1: Started learning Spanish 3 months ago. Item 2: Had first full conversation in Spanish today. Item 3: Completed Duolingo 90-day streak.'
  assert:
  - type: javascript
    value: output.classes.includes('milestone')
  - type: javascript
    value: output.extractions.length >= 2

- description: 'memory-insight: detects recurring theme across entries'
  vars:
    taskId: memory-insight
    text: 'Entry 1: Anxious before the presentation. Entry 2: Nervous about the client call. Entry 3: Worried about the demo tomorrow. Entry 4: Stressed about the board meeting.'
  assert:
  - type: javascript
    value: output.classes.includes('recurring_theme')
  - type: javascript
    value: output.extractions.length >= 1

# ── reflection-enrichment ──────────────────────────────────────
# Multi-line assertion values are executed by promptfoo as a function body,
# so each one carries an explicit `return`.
- description: 'reflection: extracts accomplishment and concern'
  vars:
    taskId: reflection-enrichment
    text: "Good day overall. Finally finished the proposal I've been putting off. Still worried about the budget review next week."
  assert:
  - type: javascript
    value: output.classes.includes('accomplishment')
  - type: javascript
    value: output.classes.includes('concern')
  - type: javascript
    value: output.classes.includes('emotional_state')

- description: 'reflection: positive emotional state detected'
  vars:
    taskId: reflection-enrichment
    text: 'Had a fantastic week. Shipped the new feature, got great feedback from users, and the team celebrated together.'
  assert:
  - type: javascript
    value: output.classes.includes('emotional_state')
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'emotional_state' &&
        e.attributes?.valence === 'positive'
      );
  - type: javascript
    value: output.classes.includes('accomplishment')

- description: 'reflection: goal progress detected'
  vars:
    taskId: reflection-enrichment
    text: "I've been trying to read more this year. This month I finished my third book — ahead of my goal of one per month."
  assert:
  - type: javascript
    value: output.classes.includes('goal_progress')
  - type: javascript
    value: output.classes.includes('accomplishment')

- description: 'reflection: mixed positive and negative signals'
  vars:
    taskId: reflection-enrichment
    text: "Proud of finishing the marathon training plan. But I'm really worried I won't be able to run the actual race — my knee has been acting up."
  assert:
  - type: javascript
    value: output.classes.includes('accomplishment')
  - type: javascript
    value: output.classes.includes('concern')
  # Only the positive side is pinned via valence; the negative side is
  # covered by the 'concern' class check above.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'emotional_state' &&
        e.attributes?.valence === 'positive'
      );

# ── bug-report-extraction ──────────────────────────────────────
- description: 'bug-report: extracts all 5 fields'
  vars:
    taskId: bug-report-extraction
    text: 'When I click the save button on the settings page, nothing happens. It should save my preferences. This is a critical issue affecting all users.'
  assert:
  - type: javascript
    value: output.classes.includes('steps_to_reproduce')
  - type: javascript
    value: output.classes.includes('expected_behavior')
  - type: javascript
    value: output.classes.includes('actual_behavior')
  - type: javascript
    value: output.classes.includes('affected_component')
  - type: javascript
    value: output.classes.includes('severity')
  # Multi-line values run as a function body in promptfoo, hence the
  # explicit `return`.
  - type: javascript
    value: |
      return output.extractions.some(e =>
        e.extraction_class === 'severity' &&
        e.attributes?.level === 'critical'
      );

- description: 'bug-report: extracts steps and component from login bug'
|
|
vars:
|
|
taskId: bug-report-extraction
|
|
text: 'Steps: 1) Open login page, 2) Enter valid credentials, 3) Click login. Expected: redirect to dashboard. Actual: spinner shows forever. Affects the login page on mobile.'
|
|
assert:
|
|
- type: javascript
|
|
value: output.classes.includes('steps_to_reproduce')
|
|
- type: javascript
|
|
value: output.classes.includes('expected_behavior')
|
|
- type: javascript
|
|
value: output.classes.includes('actual_behavior')
|
|
- type: javascript
|
|
value: output.classes.includes('affected_component')
|