"""
Comprehensive end-to-end test of the GoodMem + DeepEval integration.

Uses a GoodMem space pre-loaded with SQuAD 2.0 articles covering:
    - Energy (physics)
    - American Idol
    - FBI history
    - Greek diaspora
    - Universal Studios

Evaluates a full RAG pipeline: retrieve from GoodMem → generate with OpenAI → score
with multiple DeepEval metrics across answerable and unanswerable queries.

Required env vars:
    GOODMEM_BASE_URL, GOODMEM_API_KEY, GOODMEM_SPACE_ID, OPENAI_API_KEY
"""

import os

from deepeval import evaluate
from deepeval.evaluate import AsyncConfig
from deepeval.integrations.goodmem import GoodMemConfig, GoodMemChunk, GoodMemRetriever
from deepeval.metrics import (
    AnswerRelevancyMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric,
)
from deepeval.test_case import LLMTestCase
from openai import OpenAI

# ---------------------------------------------------------------------------
# 1. Setup
# ---------------------------------------------------------------------------
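# The retriever wraps search over a single GoodMem space; top_k caps how many
# chunks each query returns.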
retriever = GoodMemRetriever(
    GoodMemConfig(
        base_url=os.environ["GOODMEM_BASE_URL"],
        api_key=os.environ["GOODMEM_API_KEY"],
        space_id=os.environ["GOODMEM_SPACE_ID"],
        top_k=3,
    )
)
client = OpenAI()
GENERATION_MODEL = "gpt-4o-mini"

SYSTEM_PROMPT = (
    "Answer the question accurately based only on the provided context. "
    "If the context doesn't contain enough information, say so."
)

# ---------------------------------------------------------------------------
# 2. Test queries — one per SQuAD article, plus an unanswerable query
# ---------------------------------------------------------------------------
test_queries = [
    # --- Energy (physics) ---
    {
        "query": "What are the main forms of energy in physics?",
        "expected": (
            "Common energy forms include kinetic energy of a moving object, "
            "potential energy stored by position in a force field, elastic energy, "
            "and other forms like chemical, thermal, and electromagnetic energy."
        ),
    },
    # --- American Idol ---
    {
        "query": "Who created American Idol and when did it first air?",
        "expected": (
            "American Idol was created by Simon Fuller, produced by "
            "19 Entertainment, and first aired on Fox on June 11, 2002."
        ),
    },
    # --- FBI ---
    {
        "query": "What was the FBI's role in enforcing civil rights laws?",
        "expected": (
            "The FBI is charged with the responsibility of enforcing compliance "
            "with United States Civil Rights Acts."
        ),
    },
    # --- Greeks ---
    {
        "query": "Where have Greek colonies been historically established?",
        "expected": (
            "Greek colonies and communities have been historically established "
            "on the shores of the Mediterranean Sea and Black Sea, centered "
            "around the Aegean and Ionian seas."
        ),
    },
    # --- Universal Studios ---
    {
        "query": "When did Carl Laemmle open Universal's production facility?",
        "expected": (
            "On March 15, 1915, Carl Laemmle opened the world's largest motion "
            "picture production facility, Universal City Studios."
        ),
    },
    # --- Unanswerable (no relevant content in the space) ---
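    # A faithful answer here must take the SYSTEM_PROMPT fallback ("say so")
    # rather than invent facts from unrelated chunks.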
    {
        "query": "What is the capital of Mongolia?",
        "expected": (
            "The context does not contain information about the capital of Mongolia."
        ),
    },
]

# ---------------------------------------------------------------------------
# 3. Demonstrate retrieve_chunks() — structured retrieval with scores/IDs
# ---------------------------------------------------------------------------
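# Unlike retrieve(), which returns plain strings for LLMTestCase, this path
# yields structured chunk objects carrying a relevance score, chunk ID, and
# memory ID.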
print("\n=== Structured Retrieval Demo (retrieve_chunks) ===")
demo_chunks: list[GoodMemChunk] = retriever.retrieve_chunks(test_queries[0]["query"])
print(f"\nQuery: {test_queries[0]['query']}")
for i, chunk in enumerate(demo_chunks):
    print(
        f"  Chunk {i + 1}: score={chunk.score:.4f} "
        f"chunk_id={chunk.chunk_id[:16]}... memory_id={chunk.memory_id[:16]}..."
    )
    print(f"    {chunk.content[:80]}...")

# ---------------------------------------------------------------------------
# 4. Build test cases: retrieve → generate → package
# ---------------------------------------------------------------------------
test_cases = []

for item in test_queries:
    query = item["query"]

    # Retrieve context from GoodMem (plain text for LLMTestCase)
    chunks = retriever.retrieve(query)
    print(f"\n--- Query: {query} ---")
    print(f"  Retrieved {len(chunks)} chunks")
    for i, c in enumerate(chunks):
        preview = c[:100].replace("\n", " ")
        print(f"  Chunk {i + 1}: {preview}...")

    # Generate answer grounded in retrieved context
    context = "\n".join(chunks)
    response = client.chat.completions.create(
        model=GENERATION_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {query}",
            },
        ],
    )
    # message.content is Optional in the OpenAI SDK; normalize to str
    answer = response.choices[0].message.content or ""
    print(f"  Answer: {answer[:200]}...")

    test_cases.append(
        LLMTestCase(
            input=query,
            actual_output=answer,
            expected_output=item["expected"],
            retrieval_context=chunks,
        )
    )

# ---------------------------------------------------------------------------
# 5. Evaluate with multiple RAG metrics
# ---------------------------------------------------------------------------
print("\n\n=== Running DeepEval Evaluation ===\n")
metrics = [
    AnswerRelevancyMetric(model="gpt-4o-mini"),
    ContextualRelevancyMetric(model="gpt-4o-mini"),
    # FaithfulnessMetric processes all retrieval chunks per claim and can
    # exceed OpenAI timeouts on lower-tier keys. Uncomment with a higher-tier key:
    # FaithfulnessMetric(model="gpt-4o-mini"),
]
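# Keep concurrency low so the judge-model calls stay under OpenAI rate
# limits: max_concurrent caps in-flight test cases and throttle_value
# spaces out their dispatch.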
results = evaluate(
    test_cases,
    metrics,
    async_config=AsyncConfig(max_concurrent=2, throttle_value=1),
)
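
# ---------------------------------------------------------------------------
# 6. Summarize per-metric scores (sketch)
# ---------------------------------------------------------------------------
# A minimal sketch, assuming the EvaluationResult shape of recent DeepEval
# releases (`test_results` holding per-case `metrics_data` with
# name/score/success); adjust the attribute names if your version differs.
for test_result in results.test_results:
    print(f"\n{test_result.input}")
    for metric_data in test_result.metrics_data or []:
        status = "PASS" if metric_data.success else "FAIL"
        print(f"  [{status}] {metric_data.name}: {metric_data.score:.2f}")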