Skip to content

Commit 7050bb0

Browse files
bassammalikclaude
and committed
Add structured retrieval (GoodMemChunk), multi-space support, and e2e test
- New GoodMemChunk dataclass for structured retrieval with scores/IDs
- Multi-space queries via space_ids param (backward-compat space_id kept)
- retrieve_chunks() returns List[GoodMemChunk] with full metadata
- retrieve() delegates to retrieve_chunks() for single @observe span
- Comprehensive e2e test (test_real.py) against SQuAD 2.0 GoodMem space
- Updated all unit tests (32/32 passing)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8a13808 commit 7050bb0

8 files changed

Lines changed: 440 additions & 122 deletions

File tree

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
# Public surface of the deepeval GoodMem integration package.
from .retriever import GoodMemRetriever, GoodMemConfig
from .types import GoodMemChunk

__all__ = ["GoodMemRetriever", "GoodMemConfig", "GoodMemChunk"]

deepeval/integrations/goodmem/retriever.py

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,41 @@
33

44
from deepeval.tracing import observe, update_retriever_span
55

6-
from deepeval.integrations.goodmem.utils import (
7-
goodmem_retrieve,
8-
parse_chunks_to_texts,
9-
)
6+
from deepeval.integrations.goodmem.types import GoodMemChunk
7+
from deepeval.integrations.goodmem.utils import goodmem_retrieve
108

119

1210
@dataclass
1311
class GoodMemConfig:
14-
"""Configuration for connecting to a GoodMem instance."""
12+
"""Configuration for connecting to a GoodMem instance.
13+
14+
Supports both single-space and multi-space queries::
15+
16+
# Single space (backward compatible)
17+
config = GoodMemConfig(base_url=..., api_key=..., space_id="abc")
18+
19+
# Multiple spaces
20+
config = GoodMemConfig(base_url=..., api_key=..., space_ids=["abc", "def"])
21+
"""
1522

1623
base_url: str
1724
api_key: str
18-
space_id: str
25+
space_ids: List[str] = field(default_factory=list)
1926
top_k: int = 5
2027
reranker: Optional[str] = None
2128
relevance_threshold: Optional[float] = None
2229
metadata_filter: Optional[str] = None
2330
embedder: Optional[str] = None
2431

32+
# Backward compat: accept space_id= as a shorthand for a single space.
33+
space_id: Optional[str] = field(default=None, repr=False)
34+
35+
def __post_init__(self):
36+
if self.space_id and not self.space_ids:
37+
self.space_ids = [self.space_id]
38+
if not self.space_ids:
39+
raise ValueError("Provide space_id or space_ids")
40+
2541

2642
class GoodMemRetriever:
2743
"""DeepEval-integrated retriever for GoodMem.
@@ -39,39 +55,49 @@ class GoodMemRetriever:
3955
space_id="my-space",
4056
))
4157
42-
# Returns List[str] of chunk texts; traced in deepeval
58+
# Plain text list for LLMTestCase.retrieval_context
4359
chunks = retriever.retrieve("What is machine learning?")
60+
61+
# Structured chunks with scores and IDs
62+
detailed = retriever.retrieve_chunks("What is machine learning?")
4463
"""
4564

4665
def __init__(self, config: GoodMemConfig):
4766
self.config = config
4867

49-
@observe(type="retriever", name="GoodMem Retriever")
5068
def retrieve(self, query: str) -> List[str]:
    """Retrieve relevant chunks from GoodMem for a query.

    Delegates to :meth:`retrieve_chunks` (which carries the deepeval
    retriever span) and keeps only the non-empty chunk texts. Returns
    a list of strings suitable for ``LLMTestCase.retrieval_context``.
    """
    structured = self.retrieve_chunks(query)
    texts: List[str] = []
    for chunk in structured:
        if chunk.content:
            texts.append(chunk.content)
    return texts
77+
78+
@observe(type="retriever", name="GoodMem Retriever")
def retrieve_chunks(self, query: str) -> List[GoodMemChunk]:
    """Retrieve relevant chunks with full metadata.

    Returns ``GoodMemChunk`` objects carrying content, relevance
    scores, chunk IDs, memory IDs, and space IDs.
    """
    cfg = self.config
    # Record retriever metadata on the active deepeval span before
    # issuing the actual GoodMem call.
    update_retriever_span(
        embedder=cfg.embedder,
        top_k=cfg.top_k,
    )
    return goodmem_retrieve(
        base_url=cfg.base_url,
        api_key=cfg.api_key,
        space_ids=cfg.space_ids,
        query=query,
        top_k=cfg.top_k,
        reranker=cfg.reranker,
        relevance_threshold=cfg.relevance_threshold,
        metadata_filter=cfg.metadata_filter,
    )
72100

73-
return parse_chunks_to_texts(response)
74-
75101
def retrieve_as_context(self, query: str) -> List[str]:
76102
"""Alias for ``retrieve`` — returns chunks formatted for
77103
``LLMTestCase.retrieval_context``."""
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from dataclasses import dataclass
from typing import Optional


@dataclass
class GoodMemChunk:
    """A single retrieved chunk from GoodMem with metadata.

    Built by the retrieval utilities when parsing GoodMem's NDJSON
    response and returned by ``GoodMemRetriever.retrieve_chunks``.
    """

    # Plain chunk text; may be "" when the response carried no chunkText.
    content: str
    # Relevance score reported by the server; None when absent.
    score: Optional[float] = None
    # Server-side identifiers; default to "" when missing from the response.
    chunk_id: str = ""
    memory_id: str = ""
    space_id: str = ""

deepeval/integrations/goodmem/utils.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,36 @@
33

44
import requests
55

6+
from deepeval.integrations.goodmem.types import GoodMemChunk
7+
68

79
def goodmem_retrieve(
810
base_url: str,
911
api_key: str,
10-
space_id: str,
12+
space_ids: List[str],
1113
query: str,
1214
top_k: int = 5,
1315
reranker: Optional[str] = None,
1416
relevance_threshold: Optional[float] = None,
1517
metadata_filter: Optional[str] = None,
16-
) -> Dict[str, Any]:
18+
) -> List[GoodMemChunk]:
1719
"""Execute a semantic retrieval against GoodMem via raw HTTP.
1820
19-
Returns the parsed response dict with retrieved chunks.
21+
Returns a list of ``GoodMemChunk`` objects with content, scores, and IDs.
2022
"""
2123

2224
url = f"{base_url.rstrip('/')}/v1/memories:retrieve"
2325

24-
space_key: Dict[str, Any] = {"spaceId": space_id}
25-
if metadata_filter:
26-
space_key["filter"] = metadata_filter
26+
space_keys: List[Dict[str, Any]] = []
27+
for sid in space_ids:
28+
key: Dict[str, Any] = {"spaceId": sid}
29+
if metadata_filter:
30+
key["filter"] = metadata_filter
31+
space_keys.append(key)
2732

2833
body: Dict[str, Any] = {
2934
"message": query,
30-
"spaceKeys": [space_key],
35+
"spaceKeys": space_keys,
3136
"requestedSize": top_k,
3237
"fetchMemory": True,
3338
}
@@ -55,9 +60,9 @@ def goodmem_retrieve(
5560
return _parse_ndjson_response(response.text)
5661

5762

58-
def _parse_ndjson_response(text: str) -> Dict[str, Any]:
59-
"""Parse GoodMem's NDJSON streaming response into a structured dict."""
60-
chunks: List[Dict[str, Any]] = []
63+
def _parse_ndjson_response(text: str) -> List[GoodMemChunk]:
64+
"""Parse GoodMem's NDJSON streaming response into GoodMemChunk objects."""
65+
chunks: List[GoodMemChunk] = []
6166

6267
for line in text.strip().split("\n"):
6368
line = line.strip()
@@ -77,25 +82,18 @@ def _parse_ndjson_response(text: str) -> Dict[str, Any]:
7782
chunk_text = inner_chunk.get("chunkText", "")
7883
chunk_id = inner_chunk.get("chunkId", "")
7984
memory_id = inner_chunk.get("memoryId", "")
85+
space_id = inner_chunk.get("spaceId", "")
8086

8187
chunks.append(
82-
{
83-
"chunk_id": chunk_id,
84-
"memory_id": memory_id,
85-
"content": chunk_text,
86-
"relevance_score": chunk_data.get(
88+
GoodMemChunk(
89+
content=chunk_text,
90+
score=chunk_data.get(
8791
"relevanceScore", item.get("relevanceScore")
8892
),
89-
}
93+
chunk_id=chunk_id,
94+
memory_id=memory_id,
95+
space_id=space_id,
96+
)
9097
)
9198

92-
return {"chunks": chunks}
93-
94-
95-
def parse_chunks_to_texts(response: Dict[str, Any]) -> List[str]:
96-
"""Extract plain text strings from a parsed retrieval response."""
97-
return [
98-
chunk["content"]
99-
for chunk in response.get("chunks", [])
100-
if chunk.get("content")
101-
]
99+
return chunks

test_real.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
"""
2+
Comprehensive end-to-end test of the GoodMem + DeepEval integration.
3+
4+
Uses a GoodMem space pre-loaded with SQuAD 2.0 articles covering:
5+
- Energy (physics)
6+
- American Idol
7+
- FBI history
8+
- Greek diaspora
9+
- Universal Studios
10+
11+
Evaluates a full RAG pipeline: retrieve from GoodMem → generate with OpenAI → score
12+
with multiple DeepEval metrics across answerable and unanswerable queries.
13+
14+
Required env vars:
15+
GOODMEM_BASE_URL, GOODMEM_API_KEY, GOODMEM_SPACE_ID, OPENAI_API_KEY
16+
"""
17+
18+
import os
19+
20+
from deepeval import evaluate
21+
from deepeval.evaluate import AsyncConfig
22+
from deepeval.integrations.goodmem import GoodMemConfig, GoodMemChunk, GoodMemRetriever
23+
from deepeval.metrics import (
24+
AnswerRelevancyMetric,
25+
ContextualRelevancyMetric,
26+
FaithfulnessMetric,
27+
)
28+
from deepeval.test_case import LLMTestCase
29+
from openai import OpenAI
30+
31+
# ---------------------------------------------------------------------------
32+
# 1. Setup
33+
# ---------------------------------------------------------------------------
34+
retriever = GoodMemRetriever(
35+
GoodMemConfig(
36+
base_url=os.environ["GOODMEM_BASE_URL"],
37+
api_key=os.environ["GOODMEM_API_KEY"],
38+
space_id=os.environ["GOODMEM_SPACE_ID"],
39+
top_k=3,
40+
)
41+
)
42+
client = OpenAI()
43+
GENERATION_MODEL = "gpt-4o-mini"
44+
45+
SYSTEM_PROMPT = (
46+
"Answer the question accurately based only on the provided context. "
47+
"If the context doesn't contain enough information, say so."
48+
)
49+
50+
# ---------------------------------------------------------------------------
51+
# 2. Test queries — one per SQuAD article, plus an unanswerable query
52+
# ---------------------------------------------------------------------------
53+
test_queries = [
54+
# --- Energy (physics) ---
55+
{
56+
"query": "What are the main forms of energy in physics?",
57+
"expected": (
58+
"Common energy forms include kinetic energy of a moving object, "
59+
"potential energy stored by position in a force field, elastic energy, "
60+
"and other forms like chemical, thermal, and electromagnetic energy."
61+
),
62+
},
63+
# --- American Idol ---
64+
{
65+
"query": "Who created American Idol and when did it first air?",
66+
"expected": (
67+
"American Idol was created by Simon Fuller, produced by "
68+
"19 Entertainment, and first aired on Fox on June 11, 2002."
69+
),
70+
},
71+
# --- FBI ---
72+
{
73+
"query": "What was the FBI's role in enforcing civil rights laws?",
74+
"expected": (
75+
"The FBI is charged with the responsibility of enforcing compliance "
76+
"with United States Civil Rights Acts."
77+
),
78+
},
79+
# --- Greeks ---
80+
{
81+
"query": "Where have Greek colonies been historically established?",
82+
"expected": (
83+
"Greek colonies and communities have been historically established "
84+
"on the shores of the Mediterranean Sea and Black Sea, centered "
85+
"around the Aegean and Ionian seas."
86+
),
87+
},
88+
# --- Universal Studios ---
89+
{
90+
"query": "When did Carl Laemmle open Universal's production facility?",
91+
"expected": (
92+
"On March 15, 1915, Carl Laemmle opened the world's largest motion "
93+
"picture production facility, Universal City Studios."
94+
),
95+
},
96+
# --- Unanswerable (no relevant content in the space) ---
97+
{
98+
"query": "What is the capital of Mongolia?",
99+
"expected": (
100+
"The context does not contain information about the capital of Mongolia."
101+
),
102+
},
103+
]
104+
105+
# ---------------------------------------------------------------------------
106+
# 3. Demonstrate retrieve_chunks() — structured retrieval with scores/IDs
107+
# ---------------------------------------------------------------------------
108+
print("\n=== Structured Retrieval Demo (retrieve_chunks) ===")
109+
demo_chunks = retriever.retrieve_chunks(test_queries[0]["query"])
110+
print(f"\nQuery: {test_queries[0]['query']}")
111+
for i, chunk in enumerate(demo_chunks):
112+
print(f" Chunk {i + 1}: score={chunk.score:.4f} chunk_id={chunk.chunk_id[:16]}... memory_id={chunk.memory_id[:16]}...")
113+
print(f" {chunk.content[:80]}...")
114+
115+
# ---------------------------------------------------------------------------
116+
# 4. Build test cases: retrieve → generate → package
117+
# ---------------------------------------------------------------------------
118+
test_cases = []
119+
120+
for item in test_queries:
121+
query = item["query"]
122+
123+
# Retrieve context from GoodMem (plain text for LLMTestCase)
124+
chunks = retriever.retrieve(query)
125+
print(f"\n--- Query: {query} ---")
126+
print(f" Retrieved {len(chunks)} chunks")
127+
for i, c in enumerate(chunks):
128+
preview = c[:100].replace("\n", " ")
129+
print(f" Chunk {i + 1}: {preview}...")
130+
131+
# Generate answer grounded in retrieved context
132+
response = client.chat.completions.create(
133+
model=GENERATION_MODEL,
134+
messages=[
135+
{"role": "system", "content": SYSTEM_PROMPT},
136+
{
137+
"role": "user",
138+
"content": f"Context:\n{chr(10).join(chunks)}\n\nQuestion: {query}",
139+
},
140+
],
141+
)
142+
answer = response.choices[0].message.content
143+
print(f" Answer: {answer[:200]}...")
144+
145+
test_cases.append(
146+
LLMTestCase(
147+
input=query,
148+
actual_output=answer,
149+
expected_output=item["expected"],
150+
retrieval_context=chunks,
151+
)
152+
)
153+
154+
# ---------------------------------------------------------------------------
155+
# 5. Evaluate with multiple RAG metrics
156+
# ---------------------------------------------------------------------------
157+
print("\n\n=== Running DeepEval Evaluation ===\n")
158+
metrics = [
159+
AnswerRelevancyMetric(model="gpt-4o-mini"),
160+
ContextualRelevancyMetric(model="gpt-4o-mini"),
161+
# FaithfulnessMetric processes all retrieval chunks per claim and can
162+
# exceed OpenAI timeouts on lower-tier keys. Uncomment with a higher-tier key:
163+
# FaithfulnessMetric(model="gpt-4o-mini"),
164+
]
165+
results = evaluate(
166+
test_cases,
167+
metrics,
168+
async_config=AsyncConfig(max_concurrent=2, throttle_value=1),
169+
)

0 commit comments

Comments
 (0)