- Add evals/promptfoo.yaml: HTTP provider hitting extraction-service API covering all 5 built-in tasks (transcript, triage, memory-insight, reflection-enrichment, bug-report-extraction)
- Add evals/fixtures/golden.json: machine-readable golden input/output fixtures
- Add evals/run-evals.sh: shell runner with health checks, auth token handling, task filtering, and CI mode
- Add evals/README.md: usage docs, prerequisites, cost estimates, CI integration
178 lines
7.3 KiB
JSON
178 lines
7.3 KiB
JSON
{
|
|
"description": "Golden fixtures for extraction-service evals. Each case defines the minimum expected extraction classes, required text, and extraction count for a given input. The fixtures are used by the promptfoo eval suite and can also be consumed by custom assertion scripts.",
|
|
"version": "1.0.0",
|
|
"tasks": {
|
|
"transcript-extraction": {
|
|
"expectedClasses": ["action_item", "decision", "question", "deadline", "person", "topic"],
|
|
"cases": [
|
|
{
|
|
"id": "tc-001",
|
|
"input": "John said we need to ship the feature by Friday. Sarah agreed to handle the testing.",
|
|
"mustContainClasses": ["action_item", "deadline", "person"],
|
|
"mustContainText": ["friday", "sarah", "john"],
|
|
"minExtractions": 3
|
|
},
|
|
{
|
|
"id": "tc-002",
|
|
"input": "The team decided to postpone the launch to Q3. Alice will notify all stakeholders by Monday.",
|
|
"mustContainClasses": ["decision", "action_item", "person", "deadline"],
|
|
"mustContainText": ["q3", "alice"],
|
|
"minExtractions": 4
|
|
},
|
|
{
|
|
"id": "tc-003",
|
|
"input": "Bob asked: should we use Postgres or Cosmos DB for the new service? No decision was made.",
|
|
"mustContainClasses": ["question", "person"],
|
|
"mustContainText": ["bob"],
|
|
"minExtractions": 2
|
|
},
|
|
{
|
|
"id": "tc-004",
|
|
"input": "Maria: I finished the design mockups. Tom: Great, I'll review them by EOD. Maria: Can you also check the mobile screens?",
|
|
"mustContainClasses": ["action_item", "person"],
|
|
"mustContainText": ["maria", "tom"],
|
|
"minExtractions": 3
|
|
}
|
|
]
|
|
},
|
|
"triage": {
|
|
"expectedClasses": ["topic", "entity", "action", "emotion", "date_reference", "brain_signal"],
|
|
"cases": [
|
|
{
|
|
"id": "tr-001",
|
|
"input": "Remind me to call the dentist tomorrow about my appointment. I'm stressed about the cost.",
|
|
"mustContainClasses": ["action", "date_reference", "emotion"],
|
|
"mustContainBrainSignal": { "brain": "health", "minConfidence": 0.5 },
|
|
"minExtractions": 3
|
|
},
|
|
{
|
|
"id": "tr-002",
|
|
"input": "Need to finish the Q1 report for my manager by end of week. The presentation is on Thursday.",
|
|
"mustContainClasses": ["action", "date_reference"],
|
|
"mustContainBrainSignal": { "brain": "work", "minConfidence": 0.5 },
|
|
"minExtractions": 2
|
|
},
|
|
{
|
|
"id": "tr-003",
|
|
"input": "I need to pay the credit card bill before the 15th or I'll get charged interest.",
|
|
"mustContainClasses": ["action", "date_reference"],
|
|
"mustContainBrainSignal": { "brain": "money", "minConfidence": 0.5 },
|
|
"minExtractions": 2
|
|
},
|
|
{
|
|
"id": "tr-004",
|
|
"input": "Feeling really overwhelmed today. Too many things on my plate and I can't focus.",
|
|
"mustContainClasses": ["emotion"],
|
|
"mustContainEmotionValence": "negative",
|
|
"minExtractions": 1
|
|
},
|
|
{
|
|
"id": "tr-005",
|
|
"input": "Doctor said I need to exercise more. Also need to check my 401k contributions before year end.",
|
|
"mustContainClasses": ["action"],
|
|
"mustContainBrainSignals": [
|
|
{ "brain": "health", "minConfidence": 0.4 },
|
|
{ "brain": "money", "minConfidence": 0.4 }
|
|
],
|
|
"minExtractions": 2
|
|
}
|
|
]
|
|
},
|
|
"memory-insight": {
|
|
"expectedClasses": ["pattern", "recurring_theme", "relationship", "milestone"],
|
|
"cases": [
|
|
{
|
|
"id": "mi-001",
|
|
"input": "Item 1: Skipped gym again. Item 2: Feeling tired at work. Item 3: Had coffee at 4pm to stay awake. Item 4: Skipped gym for the third time this week.",
|
|
"mustContainClasses": ["pattern"],
|
|
"mustContainPatternFrequency": "recurring",
|
|
"minExtractions": 1
|
|
},
|
|
{
|
|
"id": "mi-002",
|
|
"input": "Item 1: Stayed up until 2am coding. Item 2: Missed standup the next morning. Item 3: Felt foggy all day. Item 4: Late night again, can't stop.",
|
|
"mustContainClasses": ["pattern", "relationship"],
|
|
"minExtractions": 2
|
|
},
|
|
{
|
|
"id": "mi-003",
|
|
"input": "Item 1: Started learning Spanish 3 months ago. Item 2: Had first full conversation in Spanish today. Item 3: Completed Duolingo 90-day streak.",
|
|
"mustContainClasses": ["milestone"],
|
|
"minExtractions": 2
|
|
},
|
|
{
|
|
"id": "mi-004",
|
|
"input": "Entry 1: Anxious before the presentation. Entry 2: Nervous about the client call. Entry 3: Worried about the demo tomorrow. Entry 4: Stressed about the board meeting.",
|
|
"mustContainClasses": ["recurring_theme"],
|
|
"minExtractions": 1
|
|
}
|
|
]
|
|
},
|
|
"reflection-enrichment": {
|
|
"expectedClasses": ["emotional_state", "accomplishment", "concern", "goal_progress"],
|
|
"cases": [
|
|
{
|
|
"id": "re-001",
|
|
"input": "Good day overall. Finally finished the proposal I've been putting off. Still worried about the budget review next week.",
|
|
"mustContainClasses": ["accomplishment", "concern", "emotional_state"],
|
|
"minExtractions": 3
|
|
},
|
|
{
|
|
"id": "re-002",
|
|
"input": "Had a fantastic week. Shipped the new feature, got great feedback from users, and the team celebrated together.",
|
|
"mustContainClasses": ["emotional_state", "accomplishment"],
|
|
"mustContainEmotionValence": "positive",
|
|
"minExtractions": 2
|
|
},
|
|
{
|
|
"id": "re-003",
|
|
"input": "I've been trying to read more this year. This month I finished my third book — ahead of my goal of one per month.",
|
|
"mustContainClasses": ["goal_progress", "accomplishment"],
|
|
"minExtractions": 2
|
|
},
|
|
{
|
|
"id": "re-004",
|
|
"input": "Proud of finishing the marathon training plan. But I'm really worried I won't be able to run the actual race — my knee has been acting up.",
|
|
"mustContainClasses": ["accomplishment", "concern"],
|
|
"minExtractions": 2
|
|
}
|
|
]
|
|
},
|
|
"bug-report-extraction": {
|
|
"expectedClasses": [
|
|
"steps_to_reproduce",
|
|
"expected_behavior",
|
|
"actual_behavior",
|
|
"affected_component",
|
|
"severity"
|
|
],
|
|
"cases": [
|
|
{
|
|
"id": "br-001",
|
|
"input": "When I click the save button on the settings page, nothing happens. It should save my preferences. This is a critical issue affecting all users.",
|
|
"mustContainClasses": [
|
|
"steps_to_reproduce",
|
|
"expected_behavior",
|
|
"actual_behavior",
|
|
"affected_component",
|
|
"severity"
|
|
],
|
|
"mustContainSeverityLevel": "critical",
|
|
"minExtractions": 5
|
|
},
|
|
{
|
|
"id": "br-002",
|
|
"input": "Steps: 1) Open login page, 2) Enter valid credentials, 3) Click login. Expected: redirect to dashboard. Actual: spinner shows forever. Affects the login page on mobile.",
|
|
"mustContainClasses": [
|
|
"steps_to_reproduce",
|
|
"expected_behavior",
|
|
"actual_behavior",
|
|
"affected_component"
|
|
],
|
|
"minExtractions": 4
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|