lightspeed-core · syedriko · Apr 17, 2026 · Apr 17, 2026 · Apr 18, 2026 · are-ces
diff --git a/docs/byok_guide.md b/docs/byok_guide.md
@@ -79,10 +79,11 @@ Both modes rely on:
 
 Inline RAG additionally supports:
 - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content.
+- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score that a chunk retrieved from a BYOK vector store must reach to be kept during **Inline RAG**. Chunks scoring below the cutoff are dropped **before** `score_multiplier` is applied. The cutoff applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). The default is `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set it to `0.0` to disable filtering for BYOK inline retrieval.
 
 > [!NOTE]
 > OKP and BYOK scores are not directly comparable (different scoring systems), so
-> `score_multiplier` does not apply to OKP results. To control the amount of retrieved
+> `score_multiplier` and `relevance_cutoff_score` do not apply to OKP results. To control the amount of retrieved
 > context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py`
 > (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10).
 
@@ -280,19 +281,26 @@ registered_resources:
 > section of `lightspeed-stack.yaml`. The lightspeed-stack service automatically generates the required configuration
 > at startup.
 >
+> Preferred shape: an object with `entries` (the list of stores) and an optional `relevance_cutoff_score`:
+>
 > ```yaml
 > byok_rag:
->   - rag_id: my-docs           # Unique identifier for this knowledge source
->     rag_type: inline::faiss
->     embedding_model: sentence-transformers/all-mpnet-base-v2
->     embedding_dimension: 768
->     vector_db_id: your-index-id  # Llama Stack vector store ID (from index generation)
->     db_path: /path/to/vector_db/faiss_store.db
->     score_multiplier: 1.0       # Optional: weight results when mixing multiple sources
+>   relevance_cutoff_score: 0.3   # Optional; min raw score per BYOK store before score_multiplier (BYOK only)
+>   entries:
+>     - rag_id: my-docs           # Unique identifier for this knowledge source
+>       rag_type: inline::faiss
+>       embedding_model: sentence-transformers/all-mpnet-base-v2
+>       embedding_dimension: 768
+>       vector_db_id: your-index-id  # Llama Stack vector store ID (from index generation)
+>       db_path: /path/to/vector_db/faiss_store.db
+>       score_multiplier: 1.0       # Optional: weight results when mixing multiple sources
 > ```
 >
+> Legacy: a bare list is still accepted and is treated as `entries` (same fields as each list item above).
+>
 > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of
 > each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it.
+> `relevance_cutoff_score` filters by raw retrieval score first; weighting applies only to chunks that pass the cutoff.
 
 ### Step 5: Configure RAG Strategy
 
@@ -319,10 +327,10 @@ okp:
 
 Both modes can be enabled simultaneously. Choose based on your latency and control preferences:
 
-| Mode | When context is fetched | Tool call needed | score_multiplier |
-|------|------------------------|------------------|-----------------|
-| Inline RAG | With every query | No | Yes (BYOK only) |
-| Tool RAG | On LLM demand | Yes | No |
+| Mode | When context is fetched | Tool call needed | score_multiplier | relevance_cutoff_score |
+|------|------------------------|------------------|------------------|------------------------|
+| Inline RAG | With every query | No | Yes (BYOK only) | Yes (BYOK only) |
+| Tool RAG | On LLM demand | Yes | No | No |
 
 > [!TIP]
 > A ready-to-use example combining BYOK and OKP is available at

diff --git a/src/app/endpoints/rags.py b/src/app/endpoints/rags.py
@@ -163,7 +163,7 @@ async def get_rag_endpoint_handler(
 
     # Resolve user-facing rag_id to llama-stack vector_db_id
     vector_db_id = _resolve_rag_id_to_vector_db_id(
-        rag_id, configuration.configuration.byok_rag
+        rag_id, configuration.configuration.byok_rag.entries
     )
 
     try:

diff --git a/src/client.py b/src/client.py
@@ -85,7 +85,7 @@ def _enrich_library_config(self, input_config_path: str) -> str:
         config = configuration.configuration
 
         # Enrichment: BYOK RAG
-        enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag])
+        enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag.entries])
 
         # Enrichment: Solr - enabled when "okp" appears in either inline or tool list
         enrich_solr(ls_config, config.rag.model_dump(), config.okp.model_dump())

diff --git a/src/configuration.py b/src/configuration.py
@@ -479,7 +479,8 @@ def rag_id_mapping(self) -> dict[str, str]:
         if self._configuration is None:
             raise LogicError("logic error: configuration is not loaded")
         byok_mapping = {
-            brag.vector_db_id: brag.rag_id for brag in self._configuration.byok_rag
+            brag.vector_db_id: brag.rag_id
+            for brag in self._configuration.byok_rag.entries
         }
 
         rag = self._configuration.rag
@@ -505,7 +506,7 @@ def score_multiplier_mapping(self) -> dict[str, float]:
             raise LogicError("logic error: configuration is not loaded")
         return {
             brag.vector_db_id: brag.score_multiplier
-            for brag in self._configuration.byok_rag
+            for brag in self._configuration.byok_rag.entries
         }
 
     @property

diff --git a/src/constants.py b/src/constants.py
@@ -186,10 +186,13 @@
 
 # Inline RAG constants
 BYOK_RAG_MAX_CHUNKS = 10  # retrieved from BYOK RAG
+# Default minimum raw similarity for BYOK vector stores only (``byok_rag.relevance_cutoff_score``)
+DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE = 0.3
 OKP_RAG_MAX_CHUNKS = 5  # retrieved from OKP RAG
 
 # Solr OKP constants
 SOLR_VECTOR_SEARCH_DEFAULT_K = 5
+# Default score_threshold in vector_io.query params for the OKP/Solr vector store
 SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.3
 SOLR_VECTOR_SEARCH_DEFAULT_MODE = "hybrid"
 

diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py
@@ -41,6 +41,29 @@ def increase_indent(self, flow: bool = False, indentless: bool = False) -> None:
         return super().increase_indent(flow, False)
 
 
+def _raw_byok_rag_store_list(raw_byok_rag: Any) -> list[Any]:
+    """Return BYOK store definitions from raw Lightspeed YAML (list or section dict).
+
+    Always returns a ``list`` suitable for :func:`enrich_byok_rag` (each item is
+    expected to be a mapping with ``.get``); the items themselves are not checked.
+
+    Legacy form ``byok_rag: [ ... ]``: the list itself is returned unchanged.
+    Section form ``byok_rag: { entries: ..., relevance_cutoff_score: ... }``: only
+    ``entries`` is read; a ``list`` is returned as-is and a single ``dict`` is
+    wrapped in a one-element list. Any other ``entries`` value or input gives ``[]``.
+    """
+    if isinstance(raw_byok_rag, list):  # legacy bare-list form
+        return raw_byok_rag
+    if isinstance(raw_byok_rag, dict):  # section form; sibling keys are ignored
+        entries = raw_byok_rag.get("entries")
+        if isinstance(entries, list):
+            return entries
+        if isinstance(entries, dict):  # a lone store mapping instead of a list
+            return [entries]
+        return []  # ``entries`` missing, null, or of an unsupported type
+    return []  # neither list nor dict (e.g. ``None`` or a string)
+
+
 # =============================================================================
 # Enrichment: Azure Entra ID
 # =============================================================================
@@ -619,7 +642,7 @@ def generate_configuration(
     setup_azure_entra_id_token(config.get("azure_entra_id"), env_file)
 
     # Enrichment: BYOK RAG
-    enrich_byok_rag(ls_config, config.get("byok_rag", []))
+    enrich_byok_rag(ls_config, _raw_byok_rag_store_list(config.get("byok_rag", [])))
 
     # Enrichment: Solr - enabled when "okp" appears in either inline or tool list
     enrich_solr(ls_config, config.get("rag", {}), config.get("okp", {}))

diff --git a/src/models/config.py b/src/models/config.py
@@ -7,14 +7,15 @@
 from functools import cached_property
 from pathlib import Path
 from re import Pattern
-from typing import Any, Literal, Optional, Self
+from typing import Annotated, Any, Literal, Optional, Self
 
 import jsonpath_ng
 import yaml
 from jsonpath_ng.exceptions import JSONPathError
 from pydantic import (
     AnyHttpUrl,
     BaseModel,
+    BeforeValidator,
     ConfigDict,
     Field,
     FilePath,
@@ -1622,6 +1623,55 @@ class ByokRag(ConfigurationBase):
     )
 
 
+def _normalize_byok_rag_input(value: Any) -> Any:
+    """Allow legacy ``byok_rag: [ ... ]`` YAML alongside the section object form.
+
+    ``None`` (explicit YAML null) becomes ``{}`` so :class:`ByokRagSection` field
+    defaults apply; a list becomes ``{"entries": value}``; all else is returned as-is.
+    """
+    if value is None:  # ``byok_rag: null`` -> fall back to the section defaults
+        return {}
+    if isinstance(value, list):  # legacy bare list of stores
+        return {"entries": value}
+    return value  # left untouched for the model's own validation
+
+
+class ByokRagSection(ConfigurationBase):
+    """BYOK RAG configuration: registered BYOK stores and optional raw-score cutoff.
+
+    Settings here apply only to bring-your-own-knowledge vector stores listed in
+    ``entries`` (an empty list by default). They do not affect OKP (Solr) inline
+    RAG, which uses separate query parameters and defaults.
+    """
+
+    entries: list[ByokRag] = Field(
+        default_factory=list,  # a fresh empty list per instance
+        title="BYOK RAG stores",
+        description="Registered bring-your-own-knowledge vector stores.",
+    )
+
+    relevance_cutoff_score: float = Field(
+        constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+        ge=0,  # negative cutoffs are rejected; no upper bound is enforced
+        title="BYOK inline RAG relevance cutoff",
+        description="Minimum raw similarity score from each **BYOK** vector store "
+        "before per-store score_multiplier weighting. Chunks below this threshold "
+        "are dropped immediately after retrieval from those stores only. Does not "
+        "apply to OKP/Solr. Set to 0.0 to disable filtering for BYOK.",
+    )
+
+
+def _default_byok_rag_section() -> ByokRagSection:
+    """Return default BYOK RAG section; delegates to :class:`ByokRagSection` field defaults."""
+    return ByokRagSection.model_validate({})  # empty mapping -> every field default
+
+
+ByokRagSectionValidated = Annotated[
+    ByokRagSection,
+    BeforeValidator(_normalize_byok_rag_input),  # reshapes raw input before validation
+]
+
+
 class QuotaLimiterConfiguration(ConfigurationBase):
     """Configuration for one quota limiter.
 
@@ -1908,11 +1958,14 @@ class Configuration(ConfigurationBase):
         description="Conversation history configuration.",
     )
 
-    byok_rag: list[ByokRag] = Field(
-        default_factory=list,
+    byok_rag: ByokRagSectionValidated = Field(
+        default_factory=_default_byok_rag_section,
         title="BYOK RAG configuration",
         description="BYOK RAG configuration. This configuration can be used to "
-        "reconfigure Llama Stack through its run.yaml configuration file",
+        "reconfigure Llama Stack through its run.yaml configuration file. "
+        "You may use the legacy form ``byok_rag: [ ... ]`` (a list of stores) or "
+        "an object with ``entries`` and optional ``relevance_cutoff_score`` "
+        "(BYOK inline RAG only; not used for OKP/Solr).",
     )
 
     a2a_state: A2AStateConfiguration = Field(

diff --git a/src/utils/responses.py b/src/utils/responses.py
@@ -236,7 +236,7 @@ async def prepare_tools(  # pylint: disable=too-many-arguments,too-many-position
     #      If rag.inline is configured, but not rag.tool, tool RAG is disabled.
     #   3. All registered vector DBs: fallback when neither rag.tool nor rag.inline are configured.
     #      IDs fetched from llama-stack are already internal and need no translation.
-    byok_rags = configuration.configuration.byok_rag
+    byok_rags = configuration.configuration.byok_rag.entries
 
     is_tool_rag_enabled = len(configuration.configuration.rag.tool) > 0
     is_inline_rag_enabled = len(configuration.configuration.rag.inline) > 0
@@ -1708,7 +1708,7 @@ async def _resolve_client_tools(
     # Per-request override of vector stores (user-facing rag_ids)
     vector_store_ids = extract_vector_store_ids_from_tools(tools) or None
     # Translate user-facing rag_ids to llama-stack vector_store_ids in each file_search tool
-    byok_rags = configuration.configuration.byok_rag
+    byok_rags = configuration.configuration.byok_rag.entries
     prepared_tools = translate_tools_vector_store_ids(tools, byok_rags)
     prepared_tools = apply_mcp_headers_to_explicit_tools(
         prepared_tools, token, mcp_headers, request_headers
@@ -1803,7 +1803,7 @@ async def resolve_tool_choice(
         )
     else:
         # Pass tools explicitly configured for this request
-        byok_rags = configuration.configuration.byok_rag
+        byok_rags = configuration.configuration.byok_rag.entries
         prepared_tools = translate_tools_vector_store_ids(tools, byok_rags)
         prepared_tools = apply_mcp_headers_to_explicit_tools(
             prepared_tools, token, mcp_headers, request_headers