# promptfoo eval config for extraction-service
# Docs: https://promptfoo.dev/docs/configuration/guide
#
# Usage:
#   pnpm eval              # run all evals (service must be running on port 4005)
#   pnpm eval:task triage  # run a single task suite
#   pnpm eval:ci           # CI mode — fail on any assertion failure
#
# Prerequisites:
#   1. extraction-service running: pnpm dev (port 4005)
#   2. EXTRACTION_EVAL_TOKEN set in env (any valid JWT from platform-service)
#
# Conventions for `type: javascript` assertions in this file:
#   - A single-line `value` is an expression; write it as a plain scalar.
#   - A multi-line `value: |` is a function body and needs an explicit `return`.

description: Extraction Service — LLM Output Quality Evals

# ── Provider: the extraction-service HTTP API ────────────────────
providers:
  - id: http
    config:
      # Base URL falls back to the local dev server when the env var is unset.
      url: "{{env.EXTRACTION_SERVICE_URL | default('http://localhost:4005')}}/api/extract"
      method: POST
      headers:
        Content-Type: application/json
        Authorization: 'Bearer {{env.EXTRACTION_EVAL_TOKEN}}'
      body:
        # `text` and `taskId` are filled from each test's `vars`.
        text: '{{text}}'
        taskId: '{{taskId}}'
        productId: "{{env.EVAL_PRODUCT_ID | default('lysnrai')}}"
      # A string transform is evaluated as a JavaScript expression with `json`
      # in scope, so this is an object literal rather than a `return` statement.
      # A missing `extractions` array falls back to [] so that a malformed
      # response fails the assertions instead of crashing the transform.
      # The resulting object is what the assertions below read as `output`.
      transformResponse: |
        {
          extractions: json?.extractions ?? [],
          classes: (json?.extractions ?? []).map((e) => e.extraction_class),
          texts: (json?.extractions ?? []).map((e) => e.extraction_text),
          durationMs: json?.metadata?.durationMs,
        }

# ── Default assertion thresholds ────────────────────────────────
# Applied to every test in addition to its own assertions.
defaultTest:
  options:
    timeoutMs: 30000
  assert:
    - type: latency
      threshold: 15000

# ── Test suites per task ─────────────────────────────────────────
tests:
  # ── transcript-extraction ──────────────────────────────────────
  - description: 'transcript: extracts action item and deadline'
    vars:
      taskId: transcript-extraction
      text: 'John said we need to ship the feature by Friday. Sarah agreed to handle the testing.'
    assert:
      - type: javascript
        value: output.classes.includes('action_item')
      - type: javascript
        value: output.classes.includes('deadline')
      - type: javascript
        value: output.classes.includes('person')
      - type: javascript
        value: output.texts.some(t => t.toLowerCase().includes('friday') || t.toLowerCase().includes('ship'))

  - description: 'transcript: extracts decision from meeting note'
    vars:
      taskId: transcript-extraction
      text: 'The team decided to postpone the launch to Q3. Alice will notify all stakeholders by Monday.'
    assert:
      - type: javascript
        value: output.classes.includes('decision')
      - type: javascript
        value: output.classes.includes('action_item')
      - type: javascript
        value: output.classes.includes('person')
      - type: javascript
        value: output.classes.includes('deadline')

  - description: 'transcript: extracts question from discussion'
    vars:
      taskId: transcript-extraction
      text: 'Bob asked: should we use Postgres or Cosmos DB for the new service? No decision was made.'
    assert:
      - type: javascript
        value: output.classes.includes('question')
      - type: javascript
        value: output.classes.includes('person')

  - description: 'transcript: handles multi-person transcript'
    vars:
      taskId: transcript-extraction
      text: "Maria: I finished the design mockups. Tom: Great, I'll review them by EOD. Maria: Can you also check the mobile screens?"
    assert:
      - type: javascript
        value: output.classes.includes('action_item')
      - type: javascript
        value: output.classes.includes('person')
      - type: javascript
        value: output.texts.some(t => t.toLowerCase().includes('maria') || t.toLowerCase().includes('tom'))
      - type: javascript
        value: output.extractions.length >= 3

  # ── triage ─────────────────────────────────────────────────────
  - description: 'triage: health brain signal for medical content'
    vars:
      taskId: triage
      text: "Remind me to call the dentist tomorrow about my appointment. I'm stressed about the cost."
    assert:
      - type: javascript
        value: output.classes.includes('action')
      - type: javascript
        value: output.classes.includes('date_reference')
      - type: javascript
        value: output.classes.includes('emotion')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'brain_signal' && e.attributes?.brain === 'health'
          );

  - description: 'triage: work brain signal for project content'
    vars:
      taskId: triage
      text: 'Need to finish the Q1 report for my manager by end of week. The presentation is on Thursday.'
    assert:
      - type: javascript
        value: output.classes.includes('action')
      - type: javascript
        value: output.classes.includes('date_reference')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'brain_signal' && e.attributes?.brain === 'work'
          );

  - description: 'triage: money brain signal for financial content'
    vars:
      taskId: triage
      text: "I need to pay the credit card bill before the 15th or I'll get charged interest."
    assert:
      - type: javascript
        value: output.classes.includes('action')
      - type: javascript
        value: output.classes.includes('date_reference')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'brain_signal' && e.attributes?.brain === 'money'
          );

  - description: 'triage: negative emotion detected'
    vars:
      taskId: triage
      text: "Feeling really overwhelmed today. Too many things on my plate and I can't focus."
    assert:
      - type: javascript
        value: output.classes.includes('emotion')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'emotion' && e.attributes?.valence === 'negative'
          );

  - description: 'triage: multiple brain signals for mixed content'
    vars:
      taskId: triage
      text: 'Doctor said I need to exercise more. Also need to check my 401k contributions before year end.'
    assert:
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'brain_signal' && e.attributes?.brain === 'health'
          );
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'brain_signal' && e.attributes?.brain === 'money'
          );

  # ── memory-insight ─────────────────────────────────────────────
  - description: 'memory-insight: detects recurring pattern'
    vars:
      taskId: memory-insight
      text: 'Item 1: Skipped gym again. Item 2: Feeling tired at work. Item 3: Had coffee at 4pm to stay awake. Item 4: Skipped gym for the third time this week.'
    assert:
      - type: javascript
        value: output.classes.includes('pattern')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'pattern' && e.attributes?.frequency === 'recurring'
          );

  - description: 'memory-insight: detects relationship between items'
    vars:
      taskId: memory-insight
      text: "Item 1: Stayed up until 2am coding. Item 2: Missed standup the next morning. Item 3: Felt foggy all day. Item 4: Late night again, can't stop."
    assert:
      - type: javascript
        value: output.classes.includes('relationship')
      - type: javascript
        value: output.classes.includes('pattern')

  - description: 'memory-insight: detects milestone'
    vars:
      taskId: memory-insight
      text: 'Item 1: Started learning Spanish 3 months ago. Item 2: Had first full conversation in Spanish today. Item 3: Completed Duolingo 90-day streak.'
    assert:
      - type: javascript
        value: output.classes.includes('milestone')
      - type: javascript
        value: output.extractions.length >= 2

  - description: 'memory-insight: detects recurring theme across entries'
    vars:
      taskId: memory-insight
      text: 'Entry 1: Anxious before the presentation. Entry 2: Nervous about the client call. Entry 3: Worried about the demo tomorrow. Entry 4: Stressed about the board meeting.'
    assert:
      - type: javascript
        value: output.classes.includes('recurring_theme')
      - type: javascript
        value: output.extractions.length >= 1

  # ── reflection-enrichment ──────────────────────────────────────
  - description: 'reflection: extracts accomplishment and concern'
    vars:
      taskId: reflection-enrichment
      text: "Good day overall. Finally finished the proposal I've been putting off. Still worried about the budget review next week."
    assert:
      - type: javascript
        value: output.classes.includes('accomplishment')
      - type: javascript
        value: output.classes.includes('concern')
      - type: javascript
        value: output.classes.includes('emotional_state')

  - description: 'reflection: positive emotional state detected'
    vars:
      taskId: reflection-enrichment
      text: 'Had a fantastic week. Shipped the new feature, got great feedback from users, and the team celebrated together.'
    assert:
      - type: javascript
        value: output.classes.includes('emotional_state')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'emotional_state' && e.attributes?.valence === 'positive'
          );
      - type: javascript
        value: output.classes.includes('accomplishment')

  - description: 'reflection: goal progress detected'
    vars:
      taskId: reflection-enrichment
      text: "I've been trying to read more this year. This month I finished my third book — ahead of my goal of one per month."
    assert:
      - type: javascript
        value: output.classes.includes('goal_progress')
      - type: javascript
        value: output.classes.includes('accomplishment')

  - description: 'reflection: mixed positive and negative signals'
    vars:
      taskId: reflection-enrichment
      text: "Proud of finishing the marathon training plan. But I'm really worried I won't be able to run the actual race — my knee has been acting up."
    assert:
      - type: javascript
        value: output.classes.includes('accomplishment')
      - type: javascript
        value: output.classes.includes('concern')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'emotional_state' && e.attributes?.valence === 'positive'
          );

  # ── bug-report-extraction ──────────────────────────────────────
  - description: 'bug-report: extracts all 5 fields'
    vars:
      taskId: bug-report-extraction
      text: 'When I click the save button on the settings page, nothing happens. It should save my preferences. This is a critical issue affecting all users.'
    assert:
      - type: javascript
        value: output.classes.includes('steps_to_reproduce')
      - type: javascript
        value: output.classes.includes('expected_behavior')
      - type: javascript
        value: output.classes.includes('actual_behavior')
      - type: javascript
        value: output.classes.includes('affected_component')
      - type: javascript
        value: output.classes.includes('severity')
      - type: javascript
        value: |
          return output.extractions.some(
            (e) => e.extraction_class === 'severity' && e.attributes?.level === 'critical'
          );

  - description: 'bug-report: extracts steps and component from login bug'
    vars:
      taskId: bug-report-extraction
      text: 'Steps: 1) Open login page, 2) Enter valid credentials, 3) Click login. Expected: redirect to dashboard. Actual: spinner shows forever. Affects the login page on mobile.'
    assert:
      - type: javascript
        value: output.classes.includes('steps_to_reproduce')
      - type: javascript
        value: output.classes.includes('expected_behavior')
      - type: javascript
        value: output.classes.includes('actual_behavior')
      - type: javascript
        value: output.classes.includes('affected_component')