Commit 38a6a96

Add integration test for live GoodMem + OpenAI RAG pipeline
Skipped by default unless GOODMEM_BASE_URL, GOODMEM_API_KEY, GOODMEM_SPACE_ID, and OPENAI_API_KEY are set.

Run with: pytest -m integration

Covers retrieval, structured chunks, metadata, top_k, and end-to-end RAG evaluation with DeepEval metrics.
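For reference, a full invocation might look like the following. This is a minimal sketch: the example base URL comes from the test docstring below, and the other values are placeholders.

    export GOODMEM_BASE_URL="https://api.goodmem.ai"      # example value from the docstring
    export GOODMEM_API_KEY="<your GoodMem API key>"        # placeholder
    export GOODMEM_SPACE_ID="<space with retrievable content>"  # placeholder
    export OPENAI_API_KEY="<your OpenAI API key>"          # placeholder
    pytest -m integration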
1 parent 2867b67 commit 38a6a96

2 files changed

Lines changed: 194 additions & 1 deletion

File tree

pyproject.toml
tests/test_integrations/test_goodmem/test_integration.py

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -76,10 +76,11 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.pytest.ini_options]
-addopts = "-m 'not skip_test'"
+addopts = "-m 'not skip_test and not integration'"
 markers = [
     "skip_test: skip the test",
     "enable_dotenv: allow this test to load .env files via autoload_dotenv()",
+    "integration: requires live services (GoodMem, OpenAI) and env vars",
 ]
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
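Note how the two changes fit together: pytest prepends addopts to the command line, and a later -m overrides an earlier one, so the default expression deselects integration-marked tests on a plain run while an explicit -m reinstates them:

    pytest                   # default: skip_test- and integration-marked tests are deselected
    pytest -m integration    # the explicit -m overrides the expression from addopts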
tests/test_integrations/test_goodmem/test_integration.py

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
"""
Integration test: GoodMem retrieval → OpenAI generation → DeepEval evaluation.

Requires a running GoodMem instance with a populated space and an OpenAI key.
Skipped automatically unless all required env vars are set.

Run with::

    pytest tests/test_integrations/test_goodmem/test_integration.py -m integration -v

Required env vars:
    GOODMEM_BASE_URL  – e.g. https://api.goodmem.ai
    GOODMEM_API_KEY   – GoodMem API key
    GOODMEM_SPACE_ID  – Space ID containing retrievable content
    OPENAI_API_KEY    – OpenAI API key for generation and metrics
"""

import os

import pytest

REQUIRED_VARS = [
    "GOODMEM_BASE_URL",
    "GOODMEM_API_KEY",
    "GOODMEM_SPACE_ID",
    "OPENAI_API_KEY",
]
_missing = [v for v in REQUIRED_VARS if not os.environ.get(v)]

pytestmark = [
    pytest.mark.integration,
    pytest.mark.skipif(
        len(_missing) > 0,
        reason=f"Missing env vars: {', '.join(_missing)}",
    ),
]


@pytest.fixture(scope="module")
def retriever():
    from deepeval.integrations.goodmem import GoodMemConfig, GoodMemRetriever

    return GoodMemRetriever(
        GoodMemConfig(
            base_url=os.environ.get("GOODMEM_BASE_URL", ""),
            api_key=os.environ.get("GOODMEM_API_KEY", ""),
            space_id=os.environ.get("GOODMEM_SPACE_ID", ""),
            top_k=3,
        )
    )


@pytest.fixture(scope="module")
def openai_client():
    from openai import OpenAI

    return OpenAI()


SYSTEM_PROMPT = (
    "Answer the question accurately based only on the provided context. "
    "If the context doesn't contain enough information, say so."
)
GENERATION_MODEL = "gpt-4o-mini"


# ---------------------------------------------------------------------------
# Retrieval tests
# ---------------------------------------------------------------------------


class TestRetrieve:
    """Verify that live retrieval returns usable results."""

    def test_retrieve_returns_strings(self, retriever):
        results = retriever.retrieve("What is energy?")
        assert isinstance(results, list)
        assert len(results) > 0
        assert all(isinstance(r, str) for r in results)

    def test_retrieve_chunks_returns_structured(self, retriever):
        from deepeval.integrations.goodmem import GoodMemChunk

        chunks = retriever.retrieve_chunks("What is energy?")
        assert len(chunks) > 0
        assert all(isinstance(c, GoodMemChunk) for c in chunks)

    def test_chunk_has_metadata(self, retriever):
        chunks = retriever.retrieve_chunks("What is energy?")
        chunk = chunks[0]
        assert chunk.content
        assert chunk.score is not None
        assert chunk.chunk_id
        assert chunk.memory_id

    def test_top_k_respected(self, retriever):
        chunks = retriever.retrieve_chunks("test query")
        assert len(chunks) <= retriever.config.top_k


# ---------------------------------------------------------------------------
# RAG pipeline test
# ---------------------------------------------------------------------------


class TestRAGPipeline:
    """End-to-end: retrieve → generate → evaluate with DeepEval metrics."""

    @staticmethod
    def _generate(client, chunks, query):
        response = client.chat.completions.create(
            model=GENERATION_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"Context:\n{chr(10).join(chunks)}\n\nQuestion: {query}",
                },
            ],
        )
        return response.choices[0].message.content

    def test_rag_answer_relevancy(self, retriever, openai_client):
        """Retrieve, generate, and verify answer relevancy score."""
        from deepeval.metrics import AnswerRelevancyMetric
        from deepeval.test_case import LLMTestCase

        query = "What are the main forms of energy in physics?"
        chunks = retriever.retrieve(query)
        answer = self._generate(openai_client, chunks, query)

        test_case = LLMTestCase(
            input=query,
            actual_output=answer,
            retrieval_context=chunks,
        )
        metric = AnswerRelevancyMetric(model="gpt-4o-mini")
        metric.measure(test_case)

        assert metric.score is not None
        assert metric.score >= 0.0

    def test_rag_contextual_relevancy(self, retriever, openai_client):
        """Retrieve, generate, and verify contextual relevancy score."""
        from deepeval.metrics import ContextualRelevancyMetric
        from deepeval.test_case import LLMTestCase

        query = "What are the main forms of energy in physics?"
        chunks = retriever.retrieve(query)
        answer = self._generate(openai_client, chunks, query)

        test_case = LLMTestCase(
            input=query,
            actual_output=answer,
            retrieval_context=chunks,
        )
        metric = ContextualRelevancyMetric(model="gpt-4o-mini")
        metric.measure(test_case)

        assert metric.score is not None
        assert metric.score >= 0.0

    def test_batch_evaluate(self, retriever, openai_client):
        """Build multiple test cases and run batch evaluation."""
        from deepeval import evaluate
        from deepeval.evaluate import AsyncConfig
        from deepeval.metrics import AnswerRelevancyMetric
        from deepeval.test_case import LLMTestCase

        queries = [
            "What are the main forms of energy in physics?",
            "Who created American Idol and when did it first air?",
        ]

        test_cases = []
        for query in queries:
            chunks = retriever.retrieve(query)
            answer = self._generate(openai_client, chunks, query)
            test_cases.append(
                LLMTestCase(
                    input=query,
                    actual_output=answer,
                    retrieval_context=chunks,
                )
            )

        results = evaluate(
            test_cases,
            [AnswerRelevancyMetric(model="gpt-4o-mini")],
            async_config=AsyncConfig(max_concurrent=2, throttle_value=1),
        )
        assert results is not None
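Outside pytest, the retrieve-then-generate half of this pipeline can be sketched as a standalone script. This is a minimal sketch assuming the same env vars and only the deepeval.integrations.goodmem and OpenAI calls already shown in the diff above:

    import os

    from deepeval.integrations.goodmem import GoodMemConfig, GoodMemRetriever
    from openai import OpenAI

    # Same configuration the module-scoped fixture builds in the test file.
    retriever = GoodMemRetriever(
        GoodMemConfig(
            base_url=os.environ["GOODMEM_BASE_URL"],
            api_key=os.environ["GOODMEM_API_KEY"],
            space_id=os.environ["GOODMEM_SPACE_ID"],
            top_k=3,
        )
    )

    query = "What are the main forms of energy in physics?"
    chunks = retriever.retrieve(query)  # list of strings, at most top_k of them

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Answer based only on the provided context."},
            {
                "role": "user",
                "content": f"Context:\n{chr(10).join(chunks)}\n\nQuestion: {query}",
            },
        ],
    )
    print(response.choices[0].message.content)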
