
Commit 0d9c390

refactor: replace deprecated observed_callback with evals_iterator
- Use evals_iterator loop instead of observed_callback
- Nest @observe components to form proper trace hierarchy
- Move TaskCompletionMetric to evals_iterator (trace-level)
- Keep AnswerRelevancyMetric on @observe (span-level)
- Add update_current_span for runtime test case creation

Made-with: Cursor
1 parent 95170cf commit 0d9c390
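
In short, the commit swaps the deprecated callback-style invocation for an explicit loop over the dataset. A minimal sketch assembled from the diff below (names such as goldens, task_completion, and support_agent are defined in the changed file; exact deepeval signatures may differ across versions):

    # Before: deprecated callback-style component eval
    #   evaluate(goldens=goldens, observed_callback=traced_agent)

    # After: loop over the dataset and invoke the @observe-decorated agent directly.
    # Trace-level metrics go to evals_iterator(); span-level metrics stay on @observe.
    dataset = EvaluationDataset(goldens=goldens)
    for golden in dataset.evals_iterator(metrics=[task_completion]):
        support_agent(golden.input)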

1 file changed

Lines changed: 111 additions & 104 deletions

@@ -1,18 +1,19 @@
 """
-Agent Evaluation Example with DeepEval v3.0
+Agent Evaluation Example with DeepEval
 
 This example demonstrates how to evaluate an AI agent using:
-1. TaskCompletionMetric - Did the agent complete the user's task?
-2. AnswerRelevancyMetric - Is the response relevant?
-3. Custom GEval - Domain-specific evaluation criteria
-4. Component-level evaluation with @observe
+1. @observe decorators for tracing agent, tool, and LLM components
+2. update_current_span for creating test cases at runtime
+3. evals_iterator for running evaluations over a dataset of Goldens
+4. Trace-level metrics (TaskCompletionMetric) - evaluates the entire agent
+5. Span-level metrics (AnswerRelevancyMetric) - evaluates individual components
+6. Custom GEval for domain-specific evaluation criteria
 
 Run: python test_agent_eval.py
-  or: deepeval test run test_agent_eval.py
 """
 
 import pytest
-from deepeval import assert_test, evaluate
+from deepeval import assert_test
 from deepeval.metrics import (
     TaskCompletionMetric,
     AnswerRelevancyMetric,
@@ -23,44 +24,79 @@
 from deepeval.tracing import observe, update_current_span
 
 
-# ── Simulated Agent Components ─────────────────────────
-# Replace these with your actual agent's retriever, LLM, tools, etc.
+# ── Knowledge Base (simulates a vector DB) ─────────────
 
+KNOWLEDGE_BASE = {
+    "refund": "Refunds are available within 30 days of purchase. Items must be unused.",
+    "shipping": "Standard shipping: 5-7 days. Express: 1-2 days ($9.99).",
+    "hours": "Support hours: Mon-Fri, 9am-5pm EST.",
+    "return": "Email returns@example.com with your order number for a prepaid label.",
+}
 
-def mock_retriever(query: str) -> str:
-    """Simulates a retrieval step that fetches context."""
-    knowledge = {
-        "refund": "Refunds are available within 30 days of purchase.",
-        "shipping": "Standard shipping: 5-7 days. Express: 1-2 days ($9.99).",
-        "hours": "Support hours: Mon-Fri, 9am-5pm EST.",
-    }
-    for key, value in knowledge.items():
-        if key in query.lower():
+
+# ── Observed Agent Components ──────────────────────────
+#
+# @observe creates spans that form a trace. The nesting of
+# function calls determines the trace structure:
+
+
+@observe(type="tool")
+def retrieve_context(query: str) -> str:
+    """Search the knowledge base. Traced as a tool span."""
+    query_lower = query.lower()
+    for key, value in KNOWLEDGE_BASE.items():
+        if key in query_lower:
             return value
     return "No relevant information found."
 
 
-def mock_agent(query: str) -> str:
-    """Simulates an agent that retrieves context and generates a response."""
-    context = mock_retriever(query)
-    # In a real agent, this would be an LLM call using the context
-    return f"Based on our records: {context}"
+@observe(metrics=[AnswerRelevancyMetric(threshold=0.5)])
+def generate_response(query: str, context: str) -> str:
+    """Generate a response from retrieved context.
 
+    AnswerRelevancyMetric is attached at the span level — it evaluates
+    THIS component's output, not the entire agent trace.
 
-# ── Define Metrics ─────────────────────────────────────
+    In a real app, replace this with an actual LLM call:
+        response = openai.chat.completions.create(...)
+    """
+    response = f"Based on our records: {context}"
+
+    # Create the test case for this span's metric evaluation.
+    # update_current_span tells DeepEval what input/output to evaluate.
+    update_current_span(
+        test_case=LLMTestCase(
+            input=query,
+            actual_output=response,
+            retrieval_context=[context],
+        )
+    )
+    return response
+
+
+@observe(type="agent")
+def support_agent(query: str) -> str:
+    """Customer support agent — trace root.
+
+    TaskCompletionMetric evaluates the entire trace (all spans together).
+    It's passed to evals_iterator(), not to @observe().
+    """
+    context = retrieve_context(query)
+    response = generate_response(query, context)
+    return response
 
-task_completion = TaskCompletionMetric(threshold=0.5)
 
-answer_relevancy = AnswerRelevancyMetric(threshold=0.5)
+# ── Metrics ────────────────────────────────────────────
+
+task_completion = TaskCompletionMetric(threshold=0.5)
 
-# Custom metric: evaluate whether the response is grounded in facts (not hallucinating beyond what the retriever returned)
+# Custom GEval: is the response grounded in facts?
 groundedness = GEval(
     name="Response Groundedness",
     criteria=(
-        "Evaluate whether the response only contains information that "
-        "could be derived from a customer support knowledge base. "
-        "The response should not make up policies, prices, or deadlines "
-        "that aren't typically found in standard business documentation."
+        "Evaluate whether the response only contains information "
+        "that could be derived from a customer support knowledge base. "
+        "The response should not fabricate policies, prices, or deadlines."
     ),
     evaluation_params=[
         LLMTestCaseParams.INPUT,
@@ -70,95 +106,66 @@ def mock_agent(query: str) -> str:
 )
 
 
-# ── Test Cases ─────────────────────────────────────────
+# ── Evaluation Dataset ─────────────────────────────────
 
-test_cases = [
-    LLMTestCase(
-        input="What is your refund policy?",
-        actual_output=mock_agent("What is your refund policy?"),
-        expected_output="Refunds are available within 30 days of purchase.",
-    ),
-    LLMTestCase(
-        input="How long does shipping take?",
-        actual_output=mock_agent("How long does shipping take?"),
-        expected_output="Standard shipping takes 5-7 days. Express is 1-2 days for $9.99.",
-    ),
-    LLMTestCase(
-        input="When is customer support available?",
-        actual_output=mock_agent("When is customer support available?"),
-        expected_output="Support is available Monday through Friday, 9am to 5pm EST.",
-    ),
+goldens = [
+    Golden(input="What is your refund policy?"),
+    Golden(input="How long does shipping take?"),
+    Golden(input="When is customer support available?"),
+    Golden(input="How do I return an item?"),
+    Golden(input="How much does express shipping cost?"),
 ]
 
+dataset = EvaluationDataset(goldens=goldens)
 
-# ── Option 1: Run with evaluate() ─────────────────────
 
+# ── Option 1: Component-level eval with evals_iterator ─
+#
+# This is the recommended approach for agent evaluation.
+# - Trace-level metrics go in evals_iterator(metrics=[...])
+# - Span-level metrics go in @observe(metrics=[...])
 
-def run_with_evaluate():
-    """Run evaluation using the evaluate() function."""
-    print("Running agent evaluation with evaluate()...")
-    evaluate(
-        test_cases=test_cases,
-        metrics=[task_completion, answer_relevancy, groundedness],
-    )
 
+def run_component_level():
+    """Run component-level evaluation using evals_iterator."""
+    print("Running component-level agent evaluation...")
+    print(f"  Dataset: {len(goldens)} goldens")
+    print(f"  Trace-level: TaskCompletionMetric")
+    print(f"  Span-level: AnswerRelevancyMetric (on generate_response)")
+    print("-" * 50)
 
-# ── Option 2: Run with pytest (deepeval test run) ─────
-
-dataset = EvaluationDataset(test_cases=test_cases)
-
-
-@pytest.mark.parametrize("test_case", dataset)
-def test_agent(test_case: LLMTestCase):
-    """Pytest-compatible test for CI/CD integration."""
-    assert_test(
-        test_case,
-        [task_completion, answer_relevancy, groundedness],
-    )
-
+    for golden in dataset.evals_iterator(metrics=[task_completion]):
+        support_agent(golden.input)
 
-# ── Option 3: Component-level evaluation with @observe ─
 
+# ── Option 2: End-to-end eval with pytest ──────────────
+#
+# For CI/CD integration using `deepeval test run`.
+# This treats the agent as a black box.
 
-@observe(metrics=[answer_relevancy])
-def traced_retriever(query: str):
-    """Evaluate the retriever component independently."""
-    result = mock_retriever(query)
-    update_current_span(
-        test_case=LLMTestCase(input=query, actual_output=result)
-    )
-    return result
+e2e_test_cases = [
+    LLMTestCase(
+        input="What is your refund policy?",
+        actual_output=support_agent("What is your refund policy?"),
+        expected_output="Refunds are available within 30 days of purchase.",
+    ),
+    LLMTestCase(
+        input="How long does shipping take?",
+        actual_output=support_agent("How long does shipping take?"),
+        expected_output="Standard shipping takes 5-7 days. Express is 1-2 days for $9.99.",
+    ),
+]
 
+e2e_dataset = EvaluationDataset(test_cases=e2e_test_cases)
 
-@observe(metrics=[task_completion])
-def traced_agent(query: str):
-    """Evaluate the full agent pipeline."""
-    context = traced_retriever(query)
-    response = f"Based on our records: {context}"
-    update_current_span(
-        test_case=LLMTestCase(
-            input=query,
-            actual_output=response,
-            expected_output="A helpful, grounded response.",
-        )
-    )
-    return response
 
+@pytest.mark.parametrize("test_case", e2e_dataset)
+def test_agent_e2e(test_case: LLMTestCase):
+    """End-to-end pytest test for CI/CD pipelines."""
+    assert_test(test_case, [task_completion, groundedness])
 
-def run_component_level():
-    """Run component-level evaluation with tracing."""
-    print("Running component-level agent evaluation...")
-    goldens = [
-        Golden(input="What is your refund policy?"),
-        Golden(input="How long does shipping take?"),
-    ]
-    evaluate(
-        observed_callback=traced_agent,
-        goldens=goldens,
-    )
 
+# ── Main ───────────────────────────────────────────────
 
 if __name__ == "__main__":
-    run_with_evaluate()
-    print("\n" + "=" * 50 + "\n")
     run_component_level()

0 commit comments