11"""
2- Agent Evaluation Example with DeepEval v3.0
2+ Agent Evaluation Example with DeepEval
33
44This example demonstrates how to evaluate an AI agent using:
5- 1. TaskCompletionMetric - Did the agent complete the user's task?
6- 2. AnswerRelevancyMetric - Is the response relevant?
7- 3. Custom GEval - Domain-specific evaluation criteria
8- 4. Component-level evaluation with @observe
5+ 1. @observe decorators for tracing agent, tool, and LLM components
6+ 2. update_current_span for creating test cases at runtime
7+ 3. evals_iterator for running evaluations over a dataset of Goldens
8+ 4. Trace-level metrics (TaskCompletionMetric) - evaluates the entire agent
9+ 5. Span-level metrics (AnswerRelevancyMetric) - evaluates individual components
10+ 6. Custom GEval for domain-specific evaluation criteria
911
1012Run: python test_agent_eval.py
11- or: deepeval test run test_agent_eval.py
1213"""
 
 import pytest
-from deepeval import assert_test, evaluate
+from deepeval import assert_test
 from deepeval.metrics import (
     TaskCompletionMetric,
     AnswerRelevancyMetric,
     GEval,
 )
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.dataset import EvaluationDataset, Golden
 from deepeval.tracing import observe, update_current_span
 
 
-# ── Simulated Agent Components ─────────────────────────
-# Replace these with your actual agent's retriever, LLM, tools, etc.
+# ── Knowledge Base (simulates a vector DB) ─────────────
 
+KNOWLEDGE_BASE = {
+    "refund": "Refunds are available within 30 days of purchase. Items must be unused.",
+    "shipping": "Standard shipping: 5-7 days. Express: 1-2 days ($9.99).",
+    "hours": "Support hours: Mon-Fri, 9am-5pm EST.",
+    "return": "Email returns@example.com with your order number for a prepaid label.",
+}
 
-def mock_retriever(query: str) -> str:
-    """Simulates a retrieval step that fetches context."""
-    knowledge = {
-        "refund": "Refunds are available within 30 days of purchase.",
-        "shipping": "Standard shipping: 5-7 days. Express: 1-2 days ($9.99).",
-        "hours": "Support hours: Mon-Fri, 9am-5pm EST.",
-    }
-    for key, value in knowledge.items():
-        if key in query.lower():
+
+# ── Observed Agent Components ──────────────────────────
+#
+# @observe creates spans that form a trace. The nesting of
+# function calls determines the trace structure:
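+#
+# For example (an illustrative sketch, not exact DeepEval output), one
+# support_agent() call below produces a trace shaped roughly like:
+#
+#   support_agent          (agent span, the trace root)
+#   ├── retrieve_context   (tool span)
+#   └── generate_response  (span with AnswerRelevancyMetric attached)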
+
+
+@observe(type="tool")
+def retrieve_context(query: str) -> str:
+    """Search the knowledge base. Traced as a tool span."""
+    query_lower = query.lower()
+    for key, value in KNOWLEDGE_BASE.items():
+        if key in query_lower:
             return value
     return "No relevant information found."
 
 
-def mock_agent(query: str) -> str:
-    """Simulates an agent that retrieves context and generates a response."""
-    context = mock_retriever(query)
-    # In a real agent, this would be an LLM call using the context
-    return f"Based on our records: {context}"
+@observe(metrics=[AnswerRelevancyMetric(threshold=0.5)])
+def generate_response(query: str, context: str) -> str:
+    """Generate a response from retrieved context.
 
+    AnswerRelevancyMetric is attached at the span level — it evaluates
+    THIS component's output, not the entire agent trace.
 
-# ── Define Metrics ─────────────────────────────────────
+    In a real app, replace this with an actual LLM call:
+        response = openai.chat.completions.create(...)
+    """
+    response = f"Based on our records: {context}"
+
+    # Create the test case for this span's metric evaluation.
+    # update_current_span tells DeepEval what input/output to evaluate.
+    update_current_span(
+        test_case=LLMTestCase(
+            input=query,
+            actual_output=response,
+            retrieval_context=[context],
+        )
+    )
+    return response
+
+
+@observe(type="agent")
+def support_agent(query: str) -> str:
+    """Customer support agent — trace root.
+
+    TaskCompletionMetric evaluates the entire trace (all spans together).
+    It's passed to evals_iterator(), not to @observe().
+    """
+    context = retrieve_context(query)
+    response = generate_response(query, context)
+    return response
 
-task_completion = TaskCompletionMetric(threshold=0.5)
 
-answer_relevancy = AnswerRelevancyMetric(threshold=0.5)
+# ── Metrics ────────────────────────────────────────────
+
+task_completion = TaskCompletionMetric(threshold=0.5)
 
-# Custom metric: evaluate whether the response is grounded in facts (not hallucinating beyond what the retriever returned)
+# Custom GEval: is the response grounded in facts?
 groundedness = GEval(
     name="Response Groundedness",
     criteria=(
-        "Evaluate whether the response only contains information that "
-        "could be derived from a customer support knowledge base. "
-        "The response should not make up policies, prices, or deadlines "
-        "that aren't typically found in standard business documentation."
+        "Evaluate whether the response only contains information "
+        "that could be derived from a customer support knowledge base. "
+        "The response should not fabricate policies, prices, or deadlines."
     ),
     evaluation_params=[
         LLMTestCaseParams.INPUT,
@@ -70,95 +106,66 @@ def mock_agent(query: str) -> str:
 )
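+# Note: groundedness is only exercised by the end-to-end pytest tests in
+# Option 2 below; the component-level run in Option 1 scores the trace with
+# task_completion and the generate_response span with AnswerRelevancyMetric.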
 
 
-# ── Test Cases ─────────────────────────────────────────
+# ── Evaluation Dataset ─────────────────────────────────
 
-test_cases = [
-    LLMTestCase(
-        input="What is your refund policy?",
-        actual_output=mock_agent("What is your refund policy?"),
-        expected_output="Refunds are available within 30 days of purchase.",
-    ),
-    LLMTestCase(
-        input="How long does shipping take?",
-        actual_output=mock_agent("How long does shipping take?"),
-        expected_output="Standard shipping takes 5-7 days. Express is 1-2 days for $9.99.",
-    ),
-    LLMTestCase(
-        input="When is customer support available?",
-        actual_output=mock_agent("When is customer support available?"),
-        expected_output="Support is available Monday through Friday, 9am to 5pm EST.",
-    ),
+goldens = [
+    Golden(input="What is your refund policy?"),
+    Golden(input="How long does shipping take?"),
+    Golden(input="When is customer support available?"),
+    Golden(input="How do I return an item?"),
+    Golden(input="How much does express shipping cost?"),
 ]
 
+dataset = EvaluationDataset(goldens=goldens)
 
-# ── Option 1: Run with evaluate() ─────────────────────
 
+# ── Option 1: Component-level eval with evals_iterator ─
+#
+# This is the recommended approach for agent evaluation.
+# - Trace-level metrics go in evals_iterator(metrics=[...])
+# - Span-level metrics go in @observe(metrics=[...])
 
-def run_with_evaluate():
-    """Run evaluation using the evaluate() function."""
-    print("Running agent evaluation with evaluate()...")
-    evaluate(
-        test_cases=test_cases,
-        metrics=[task_completion, answer_relevancy, groundedness],
-    )
 
+def run_component_level():
+    """Run component-level evaluation using evals_iterator."""
+    print("Running component-level agent evaluation...")
+    print(f"  Dataset: {len(goldens)} goldens")
+    print("  Trace-level: TaskCompletionMetric")
+    print("  Span-level: AnswerRelevancyMetric (on generate_response)")
+    print("-" * 50)
 
-# ── Option 2: Run with pytest (deepeval test run) ─────
-
-dataset = EvaluationDataset(test_cases=test_cases)
-
-
-@pytest.mark.parametrize("test_case", dataset)
-def test_agent(test_case: LLMTestCase):
-    """Pytest-compatible test for CI/CD integration."""
-    assert_test(
-        test_case,
-        [task_completion, answer_relevancy, groundedness],
-    )
-
+    for golden in dataset.evals_iterator(metrics=[task_completion]):
+        support_agent(golden.input)
 
-# ── Option 3: Component-level evaluation with @observe ─
 
+# ── Option 2: End-to-end eval with pytest ──────────────
+#
+# For CI/CD integration using `deepeval test run`.
+# This treats the agent as a black box.
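+# Run with: deepeval test run test_agent_eval.py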
 
-@observe(metrics=[answer_relevancy])
-def traced_retriever(query: str):
-    """Evaluate the retriever component independently."""
-    result = mock_retriever(query)
-    update_current_span(
-        test_case=LLMTestCase(input=query, actual_output=result)
-    )
-    return result
+e2e_test_cases = [
+    LLMTestCase(
+        input="What is your refund policy?",
+        actual_output=support_agent("What is your refund policy?"),
+        expected_output="Refunds are available within 30 days of purchase.",
+    ),
+    LLMTestCase(
+        input="How long does shipping take?",
+        actual_output=support_agent("How long does shipping take?"),
+        expected_output="Standard shipping takes 5-7 days. Express is 1-2 days for $9.99.",
+    ),
+]
 
+e2e_dataset = EvaluationDataset(test_cases=e2e_test_cases)
 
-@observe(metrics=[task_completion])
-def traced_agent(query: str):
-    """Evaluate the full agent pipeline."""
-    context = traced_retriever(query)
-    response = f"Based on our records: {context}"
-    update_current_span(
-        test_case=LLMTestCase(
-            input=query,
-            actual_output=response,
-            expected_output="A helpful, grounded response.",
-        )
-    )
-    return response
 
+@pytest.mark.parametrize("test_case", e2e_dataset)
+def test_agent_e2e(test_case: LLMTestCase):
+    """End-to-end pytest test for CI/CD pipelines."""
+    assert_test(test_case, [task_completion, groundedness])
 
-def run_component_level():
-    """Run component-level evaluation with tracing."""
-    print("Running component-level agent evaluation...")
-    goldens = [
-        Golden(input="What is your refund policy?"),
-        Golden(input="How long does shipping take?"),
-    ]
-    evaluate(
-        observed_callback=traced_agent,
-        goldens=goldens,
-    )
 
+# ── Main ───────────────────────────────────────────────
 
 if __name__ == "__main__":
-    run_with_evaluate()
-    print("\n" + "=" * 50 + "\n")
     run_component_level()