diff --git a/deepeval/integrations/langchain/callback.py b/deepeval/integrations/langchain/callback.py index 1ecd03219b..281636403d 100644 --- a/deepeval/integrations/langchain/callback.py +++ b/deepeval/integrations/langchain/callback.py @@ -539,4 +539,4 @@ def on_retriever_error( with self._ctx(run_id=run_id, parent_run_id=parent_run_id): retriever_span.status = TraceSpanStatus.ERRORED retriever_span.error = str(error) - exit_current_context(uuid_str=uuid_str) \ No newline at end of file + exit_current_context(uuid_str=uuid_str) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000000..177a766138 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,72 @@ +try: + import sys + import pysqlite3 as sqlite3 # type: ignore + + sys.modules["sqlite3"] = sqlite3 + sys.modules["sqlite3.dbapi2"] = sqlite3.dbapi2 +except Exception: + pass + +import pytest + +from typing import TYPE_CHECKING +from pathlib import Path + +from deepeval.tracing.tracing import trace_manager +from deepeval.config.settings import get_settings, reset_settings + + +if TYPE_CHECKING: + pass + + +# Silence telemetry for all tests so we don't have to deal with the noise +@pytest.fixture(autouse=True) +def _telemetry_opt_out(monkeypatch): + monkeypatch.setenv("DEEPEVAL_TELEMETRY_OPT_OUT", "1") + yield + + +@pytest.fixture(autouse=True) +def _ensure_hidden_store_dir(tmp_path: Path): + d = tmp_path / ".deepeval" + d.mkdir(exist_ok=True) + # some code expects the file to be there after a run, + # but at minimum the directory must exist to avoid FileNotFoundError + yield + + +@pytest.fixture +def hidden_store_dir(tmp_path: Path) -> Path: + d = tmp_path / ".deepeval" + d.mkdir(parents=True, exist_ok=True) + return d + + +@pytest.fixture() +def settings(): + settings = get_settings() + yield settings + + +@pytest.fixture() +def enable_dotenv(monkeypatch): + monkeypatch.setenv("DEEPEVAL_DISABLE_DOTENV", "0") + # rebuild Settings after changing the env + reset_settings(reload_dotenv=False) + + +@pytest.fixture(autouse=True) +def _reset_tracing_state(): + trace_manager.clear_traces() + trace_manager.traces_to_evaluate_order.clear() + trace_manager.traces_to_evaluate.clear() + trace_manager.integration_traces_to_evaluate.clear() + trace_manager.trace_uuid_to_golden.clear() + try: + trace_manager.task_bindings.clear() + except Exception: + pass + trace_manager.evaluating = False + trace_manager.evaluation_loop = False + yield diff --git a/tests/test_core/conftest.py b/tests/test_core/conftest.py index f475517b8b..1744309206 100644 --- a/tests/test_core/conftest.py +++ b/tests/test_core/conftest.py @@ -14,7 +14,6 @@ from typing import TYPE_CHECKING from pathlib import Path -from deepeval.tracing.tracing import trace_manager from deepeval.config.settings import get_settings, reset_settings, Settings @@ -29,36 +28,6 @@ } -@pytest.fixture(autouse=True) -def _ensure_hidden_store_dir(tmp_path: Path): - d = tmp_path / ".deepeval" - d.mkdir(exist_ok=True) - # some code expects the file to be there after a run, - # but at minimum the directory must exist to avoid FileNotFoundError - yield - - -@pytest.fixture -def hidden_store_dir(tmp_path: Path) -> Path: - d = tmp_path / ".deepeval" - d.mkdir(parents=True, exist_ok=True) - return d - - -# Silence telemetry for all tests so we don't have to deal with the noise -@pytest.fixture(autouse=True) -def _telemetry_opt_out(monkeypatch): - monkeypatch.setenv("DEEPEVAL_TELEMETRY_OPT_OUT", "1") - yield - - -# Run every test in its own temp CWD so .deepeval/.deepeval is sandboxed 
-@pytest.fixture(autouse=True) -def _isolate_cwd(tmp_path: Path, monkeypatch): - monkeypatch.chdir(tmp_path) - yield - - # Default dotenv path most tests can reuse; override in tests as needed @pytest.fixture def env_path(monkeypatch, tmp_path: Path) -> Path: @@ -72,17 +41,6 @@ def env_dir(monkeypatch, tmp_path: Path) -> Path: return tmp_path -@pytest.fixture(autouse=True) -def no_sleep(monkeypatch): - monkeypatch.setattr(tenacity.nap, "sleep", lambda _: None, raising=True) - - -@pytest.fixture() -def settings(): - settings = get_settings() - yield settings - - @pytest.fixture(scope="session") def _session_env_baseline(): # capture the environment as it existed when pytest started @@ -120,6 +78,9 @@ def _env_sandbox(_session_env_baseline, request, monkeypatch): for k, v in preserved.items(): monkeypatch.setenv(k, v) + # Always silence telemetry in tests + monkeypatch.setenv("DEEPEVAL_TELEMETRY_OPT_OUT", "1") + # Never open the Confident AI browser UI during tests monkeypatch.setenv("CONFIDENT_OPEN_BROWSER", "0") @@ -166,13 +127,6 @@ def _core_mode_no_confident( yield -@pytest.fixture() -def enable_dotenv(monkeypatch): - monkeypatch.setenv("DEEPEVAL_DISABLE_DOTENV", "0") - # rebuild Settings after changing the env - reset_settings(reload_dotenv=False) - - @pytest.fixture(autouse=False) def unpatch_openai_after(): from deepeval.openai.patch import unpatch_openai_classes @@ -181,17 +135,13 @@ def unpatch_openai_after(): unpatch_openai_classes() +# Run every test in its own temp CWD so .deepeval/.deepeval is sandboxed @pytest.fixture(autouse=True) -def _reset_tracing_state(): - trace_manager.clear_traces() - trace_manager.traces_to_evaluate_order.clear() - trace_manager.traces_to_evaluate.clear() - trace_manager.integration_traces_to_evaluate.clear() - trace_manager.trace_uuid_to_golden.clear() - try: - trace_manager.task_bindings.clear() - except Exception: - pass - trace_manager.evaluating = False - trace_manager.evaluation_loop = False +def _isolate_cwd(tmp_path: Path, monkeypatch): + monkeypatch.chdir(tmp_path) yield + + +@pytest.fixture(autouse=True) +def no_sleep(monkeypatch): + monkeypatch.setattr(tenacity.nap, "sleep", lambda _: None, raising=True) diff --git a/tests/test_core/test_end_to_end/__init__.py b/tests/test_core/test_end_to_end/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_core/test_end_to_end/conftest.py b/tests/test_core/test_end_to_end/conftest.py new file mode 100644 index 0000000000..03b68f8334 --- /dev/null +++ b/tests/test_core/test_end_to_end/conftest.py @@ -0,0 +1,10 @@ +import pytest + + +@pytest.fixture(autouse=True) +def _offline_deterministic_env(monkeypatch: pytest.MonkeyPatch): + # Prevent dotenv loading (could pull real API keys/configs) and avoid browser open. + monkeypatch.setenv("DEEPEVAL_DISABLE_DOTENV", "1") + monkeypatch.setenv("CONFIDENT_OPEN_BROWSER", "0") + # Keep stable even if unset. 
+ monkeypatch.setenv("DEEPEVAL_RESULTS_FOLDER", "") diff --git a/tests/test_core/test_end_to_end/helpers.py b/tests/test_core/test_end_to_end/helpers.py new file mode 100644 index 0000000000..5ac2612bdd --- /dev/null +++ b/tests/test_core/test_end_to_end/helpers.py @@ -0,0 +1,436 @@ +from __future__ import annotations + +import csv +import json +from pathlib import Path +from typing import Callable, List, Tuple + +from deepeval.dataset import Golden, EvaluationDataset +from deepeval.dataset.golden import ConversationalGolden +from deepeval.metrics import BaseMetric, BaseConversationalMetric +from deepeval.test_case import ( + LLMTestCase, + LLMTestCaseParams, + ConversationalTestCase, + Turn, +) + + +def deterministic_llm_app(user_input: str) -> Tuple[str, List[str]]: + """ + Deterministic stand-in for "your_llm_app" from the docs. + + The docs show: + res, text_chunks = your_llm_app(golden.input) + + We return: + - res: deterministic output based solely on input + - text_chunks: deterministic retrieval_context + """ + normalized = user_input.strip().lower() + if "name" in normalized: + return "My name is DeepEval.", ["ctx: identity", "ctx: greeting"] + if "number" in normalized: + return "42", ["ctx: numbers", "ctx: preferences"] + return "OK", ["ctx: default"] + + +def build_single_turn_dataset() -> EvaluationDataset: + # Mirrors the docs pattern (goldens list -> EvaluationDataset(goldens)) + goldens = [ + Golden( + input="What is your name?", expected_output="My name is DeepEval." + ), + Golden(input="Choose a number between 1 to 100", expected_output="42"), + ] + return EvaluationDataset(goldens) + + +def build_llm_test_cases_from_goldens( + dataset: EvaluationDataset, + llm_app: Callable[[str], Tuple[str, List[str]]] = deterministic_llm_app, +) -> List[LLMTestCase]: + test_cases: List[LLMTestCase] = [] + for golden in dataset.goldens: + res, text_chunks = llm_app(golden.input) + test_cases.append( + LLMTestCase( + input=golden.input, + actual_output=res, + expected_output=golden.expected_output, + retrieval_context=text_chunks, + ) + ) + return test_cases + + +class DeterministicContainsExpectedOutputMetric(BaseMetric): + """ + Tiny deterministic metric for offline CI. + Avoid asserting exact metric scores in tests; we only need stable behavior. + """ + + _required_params = [ + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicContainsExpectedOutputMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + expected = (test_case.expected_output or "").strip() + actual = (test_case.actual_output or "").strip() + passed = (expected != "") and (expected in actual) + self.score = 1.0 if passed else 0.0 + self.reason = ( + "expected_output is contained in actual_output" + if passed + else "expected_output not found in actual_output" + ) + self.success = self.is_successful() + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return bool(self.score is not None and self.score >= self.threshold) + + +class DeterministicFailingMetric(BaseMetric): + """ + Deterministic metric that always fails. + Used to verify that evaluate() correctly propagates failures. 
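+
+    Contract:
+        metric.measure(test_case)   # -> 0.0
+        metric.is_successful()      # -> False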
+ """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicFailingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + self.score = 0.0 + self.reason = "This metric always fails for testing purposes" + self.success = False + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return False + + +class DeterministicPassingMetric(BaseMetric): + """ + Deterministic metric that always passes. + Used to verify that evaluate() correctly propagates success. + """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicPassingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + self.score = 1.0 + self.reason = "This metric always passes for testing purposes" + self.success = True + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return True + + +class DeterministicRequiresRetrievalContextMetric(BaseMetric): + """ + Deterministic metric that requires RETRIEVAL_CONTEXT. + Used to test skip_on_missing_params behavior when test case lacks retrieval_context. + """ + + _required_params = [ + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.RETRIEVAL_CONTEXT, + ] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicRequiresRetrievalContextMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + # This will only be called if retrieval_context is present + # The check_llm_test_case_params in BaseMetric handles the validation + from deepeval.metrics.utils import check_llm_test_case_params + + check_llm_test_case_params( + test_case=test_case, + test_case_params=self._required_params, + input_image_count=None, + actual_output_image_count=None, + metric=self, + ) + self.score = 1.0 + self.reason = "Retrieval context was present" + self.success = True + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return bool(self.score is not None and self.score >= self.threshold) + + +class DeterministicRaisingMetric(BaseMetric): + """ + Deterministic metric that always raises an exception. + Used to test ignore_errors behavior. 
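+
+    With ErrorConfig(ignore_errors=True), evaluate() should capture the
+    RuntimeError into the metric's error field instead of propagating it;
+    with ignore_errors=False (the default) the exception bubbles up.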
+ """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicRaisingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + raise RuntimeError("This metric always raises for testing purposes") + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + raise RuntimeError("This metric always raises for testing purposes") + + def is_successful(self) -> bool: + return False + + +def save_dataset_as_json_and_load( + dataset: EvaluationDataset, directory: Path, file_name: str +) -> list: + """ + Option A artifact: dataset.save_as(file_type="json", directory=..., file_name=...) + Returns parsed JSON content (a list of records). + """ + full_path = dataset.save_as( + file_type="json", + directory=str(directory), + file_name=file_name, + include_test_cases=False, + ) + with open(full_path, "r", encoding="utf-8") as f: + return json.load(f) + + +def save_dataset_as_csv_and_load( + dataset: EvaluationDataset, directory: Path, file_name: str +) -> List[dict]: + """ + Option A artifact: dataset.save_as(file_type="csv", directory=..., file_name=...) + Returns parsed CSV content as a list of dicts. + """ + full_path = dataset.save_as( + file_type="csv", + directory=str(directory), + file_name=file_name, + include_test_cases=False, + ) + with open(full_path, "r", encoding="utf-8") as f: + return list(csv.DictReader(f)) + + +# =========================================================================== +# Multi-turn / Conversational helpers +# =========================================================================== + + +def deterministic_chatbot_callback( + input: str, + turns: List[Turn] = None, + thread_id: str = None, +) -> Turn: + """ + Deterministic chatbot callback for offline testing. + + Mirrors the doc pattern: + async def model_callback(input: str, turns: List[Turn], thread_id: str) -> Turn: + response = await your_chatbot(input, turns, thread_id) + return Turn(role="assistant", content=response) + + This sync version returns deterministic responses based on input. + """ + normalized = input.strip().lower() + if ( + "ticket" in normalized + or "buy" in normalized + or "purchase" in normalized + ): + return Turn( + role="assistant", + content="I can help you purchase a ticket. What event are you interested in?", + ) + if "coldplay" in normalized or "concert" in normalized: + return Turn( + role="assistant", + content="Great choice! We have VIP and standard tickets available for Coldplay.", + ) + if "vip" in normalized: + return Turn( + role="assistant", + content="VIP ticket selected. That will be $250. Shall I proceed with the purchase?", + ) + if ( + "yes" in normalized + or "proceed" in normalized + or "confirm" in normalized + ): + return Turn( + role="assistant", + content="Purchase confirmed! Your VIP ticket has been booked successfully.", + ) + return Turn(role="assistant", content="How can I assist you today?") + + +def build_multi_turn_dataset() -> EvaluationDataset: + """ + Build a multi-turn dataset using ConversationalGolden. + + Mirrors the docs pattern: + goldens = [ + ConversationalGolden( + scenario="Andy Byron wants to purchase a VIP ticket to a Coldplay concert.", + expected_outcome="Successful purchase of a ticket.", + user_description="Andy Byron is the CEO of Astronomer.", + ), + ... 
+ ] + dataset = EvaluationDataset(goldens) + """ + goldens = [ + ConversationalGolden( + scenario="Andy Byron wants to purchase a VIP ticket to a Coldplay concert.", + expected_outcome="Successful purchase of a ticket.", + user_description="Andy Byron is the CEO of Astronomer.", + ), + ConversationalGolden( + scenario="A customer wants to ask about concert dates.", + expected_outcome="Customer receives concert date information.", + user_description="A general user looking for event info.", + ), + ] + return EvaluationDataset(goldens) + + +def build_conversational_test_cases_manually( + dataset: EvaluationDataset, + chatbot_callback: Callable = deterministic_chatbot_callback, + max_turns: int = 4, +) -> List[ConversationalTestCase]: + """ + Manually build ConversationalTestCase objects without using ConversationSimulator. + + ConversationSimulator requires a simulator_model which needs network access. + This helper creates deterministic test cases for offline testing. + """ + test_cases = [] + for golden in dataset.goldens: + # Simulate a basic conversation flow + turns = [] + user_inputs = [ + "Hello, I want to buy a ticket", + "I'm interested in Coldplay", + "I'd like a VIP ticket please", + "Yes, please proceed", + ] + + for i, user_input in enumerate(user_inputs[:max_turns]): + # User turn + turns.append(Turn(role="user", content=user_input)) + # Assistant response via callback + assistant_turn = chatbot_callback( + user_input, turns, f"thread-{id(golden)}" + ) + turns.append(assistant_turn) + + test_case = ConversationalTestCase( + turns=turns, + scenario=golden.scenario, + expected_outcome=golden.expected_outcome, + user_description=golden.user_description, + ) + test_cases.append(test_case) + + return test_cases + + +class DeterministicConversationalMetric(BaseConversationalMetric): + """ + Deterministic conversational metric for offline testing. + Evaluates whether the conversation reached an expected outcome. 
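+
+    Scoring rule: 1.0 if any assistant turn contains one of the keywords
+    "confirmed", "booked", "success", or "complete"; 0.0 otherwise.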
+ """ + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicConversationalMetric" + + def measure( + self, test_case: ConversationalTestCase, *args, **kwargs + ) -> float: + # Check if any assistant turn contains expected outcome keywords + outcome_keywords = ["confirmed", "booked", "success", "complete"] + has_positive_outcome = any( + any(kw in turn.content.lower() for kw in outcome_keywords) + for turn in test_case.turns + if turn.role == "assistant" + ) + self.score = 1.0 if has_positive_outcome else 0.0 + self.reason = ( + "Conversation reached a positive outcome" + if has_positive_outcome + else "Conversation did not reach expected outcome" + ) + self.success = self.is_successful() + return self.score + + async def a_measure( + self, test_case: ConversationalTestCase, *args, **kwargs + ) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return bool(self.score is not None and self.score >= self.threshold) diff --git a/tests/test_core/test_end_to_end/test_evaluate.py b/tests/test_core/test_end_to_end/test_evaluate.py new file mode 100644 index 0000000000..3e999654d0 --- /dev/null +++ b/tests/test_core/test_end_to_end/test_evaluate.py @@ -0,0 +1,447 @@ +import json +import os +from pathlib import Path +import subprocess + +import pytest + +from deepeval import evaluate +from deepeval.dataset import EvaluationDataset +from deepeval.evaluate.configs import ( + AsyncConfig, + CacheConfig, + DisplayConfig, +) +from deepeval.evaluate.types import EvaluationResult, TestResult + +from .helpers import ( + DeterministicContainsExpectedOutputMetric, + DeterministicConversationalMetric, + DeterministicFailingMetric, + DeterministicPassingMetric, + build_llm_test_cases_from_goldens, + build_single_turn_dataset, + build_multi_turn_dataset, + build_conversational_test_cases_manually, + save_dataset_as_csv_and_load, + save_dataset_as_json_and_load, +) + +PROJECT_ROOT = Path(__file__).resolve().parents[3] + + +def test_single_turn_evaluate_returns_result_and_dataset_json_schema( + tmp_path, +): + """ + End-to-end: dataset -> test cases -> evaluate() -> EvaluationResult. + Also validates the saved dataset JSON schema. + + Deterministic/offline (no network): uses deterministic custom metrics. 
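+
+    Flow under test:
+        goldens -> build_llm_test_cases_from_goldens() -> evaluate()
+        dataset.save_as(file_type="json", ...) -> schema assertions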
+ """ + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicContainsExpectedOutputMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Verify result is the expected type with correct structure + assert isinstance(result, EvaluationResult) + assert isinstance(result.test_results, list) + assert len(result.test_results) == len(test_cases) + assert result.confident_link is None or isinstance( + result.confident_link, str + ) + assert result.test_run_id is None or isinstance(result.test_run_id, str) + + for tr in result.test_results: + assert isinstance(tr, TestResult) + assert isinstance(tr.name, str) and tr.name != "" + assert isinstance(tr.success, bool) + assert tr.conversational is False + + # Single-turn results should have input/output + assert tr.input is not None + assert tr.actual_output is not None + + # Metrics data should be present + assert isinstance(tr.metrics_data, list) + assert len(tr.metrics_data) >= 1 + + # Test JSON artifact schema + json_records = save_dataset_as_json_and_load( + dataset, directory=tmp_path, file_name="dataset" + ) + assert isinstance(json_records, list) + assert len(json_records) >= 1 + + required_keys = { + "input", + "actual_output", + "expected_output", + "retrieval_context", + "context", + "name", + "comments", + "source_file", + "tools_called", + "expected_tools", + "additional_metadata", + "custom_column_key_values", + } + + for rec in json_records: + assert isinstance(rec, dict) + assert required_keys.issubset(rec.keys()) + assert isinstance(rec["input"], str) and rec["input"] != "" + + # Optional fields can be None or their expected type + assert rec["expected_output"] is None or isinstance( + rec["expected_output"], str + ) + assert rec["actual_output"] is None or isinstance( + rec["actual_output"], str + ) + assert rec["retrieval_context"] is None or isinstance( + rec["retrieval_context"], list + ) + assert rec["tools_called"] is None or isinstance( + rec["tools_called"], list + ) + assert rec["expected_tools"] is None or isinstance( + rec["expected_tools"], list + ) + + +# =========================================================================== +# Checklist Item 2: Multi-turn E2E evaluation (doc-driven, offline) +# =========================================================================== + + +def test_multi_turn_evaluate_returns_conversational_result_shape(tmp_path): + """ + End-to-end: multi-turn/conversational evaluation returns conversational TestResult + objects and metrics_data. + + Deterministic/offline (no network): constructs ConversationalTestCase objects + locally and uses deterministic conversational metrics. 
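+
+    Conversations are scripted: build_conversational_test_cases_manually()
+    alternates fixed user turns with deterministic_chatbot_callback replies,
+    so no ConversationSimulator (and no simulator model) is required.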
+ """ + dataset = build_multi_turn_dataset() + assert len(dataset.goldens) >= 1 + + conversational_test_cases = build_conversational_test_cases_manually( + dataset, max_turns=4 + ) + assert len(conversational_test_cases) == len(dataset.goldens) + + result = evaluate( + test_cases=conversational_test_cases, + metrics=[DeterministicConversationalMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Verify result is the expected type with correct structure + assert isinstance(result, EvaluationResult) + assert isinstance(result.test_results, list) + assert len(result.test_results) == len(conversational_test_cases) + assert result.confident_link is None or isinstance( + result.confident_link, str + ) + assert result.test_run_id is None or isinstance(result.test_run_id, str) + + # Verify test result structure for conversational test cases + for tr in result.test_results: + assert isinstance(tr, TestResult) + assert isinstance(tr.name, str) and tr.name != "" + assert isinstance(tr.success, bool) + assert tr.conversational is True # Multi-turn should be conversational + + # Metrics data should be present + assert isinstance(tr.metrics_data, list) + assert len(tr.metrics_data) >= 1 + + +# =========================================================================== +# Checklist Item 3: JSON artifact schema for multi-turn dataset +# =========================================================================== + + +def test_multi_turn_dataset_json_schema(tmp_path): + """Multi-turn dataset JSON export includes expected keys for conversational goldens.""" + dataset = build_multi_turn_dataset() + + json_records = save_dataset_as_json_and_load( + dataset, directory=tmp_path, file_name="multi_turn_dataset" + ) + + assert isinstance(json_records, list) + assert len(json_records) >= 1 + + # Multi-turn JSON schema (from dataset.py save_as implementation) + required_keys = { + "scenario", + "turns", + "expected_outcome", + "user_description", + "context", + "name", + "comments", + "additional_metadata", + "custom_column_key_values", + } + + for rec in json_records: + assert isinstance(rec, dict) + assert required_keys.issubset( + rec.keys() + ), f"Missing keys: {required_keys - set(rec.keys())}" + + # Optional fields can be None or their expected type + assert rec["scenario"] is None or isinstance(rec["scenario"], str) + assert rec["turns"] is None or isinstance(rec["turns"], list) + assert rec["expected_outcome"] is None or isinstance( + rec["expected_outcome"], str + ) + assert rec["user_description"] is None or isinstance( + rec["user_description"], str + ) + + +def test_single_turn_dataset_csv_schema(tmp_path): + """Single-turn dataset CSV export includes expected columns.""" + dataset = build_single_turn_dataset() + + csv_records = save_dataset_as_csv_and_load( + dataset, directory=tmp_path, file_name="single_turn_dataset" + ) + + assert isinstance(csv_records, list) + assert len(csv_records) >= 1 + + # CSV should have "input" column at minimum + for rec in csv_records: + assert isinstance(rec, dict) + assert "input" in rec.keys(), "CSV must have 'input' column" + assert rec["input"] is not None and rec["input"] != "" + + +def test_evaluate_propagates_metric_failure(): + """When a metric fails, TestResult.success is False and metrics_data reflects failure.""" + dataset = 
build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicFailingMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All test results should be failures since the metric always fails + for tr in result.test_results: + assert ( + tr.success is False + ), "Test result should be False when metric fails" + assert len(tr.metrics_data) >= 1 + for md in tr.metrics_data: + assert md.success is False + + +def test_evaluate_propagates_metric_success(): + """When a metric passes, TestResult.success is True and metrics_data reflects success.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All test results should be successes since the metric always passes + for tr in result.test_results: + assert ( + tr.success is True + ), "Test result should be True when metric passes" + assert len(tr.metrics_data) >= 1 + for md in tr.metrics_data: + assert md.success is True + + +def test_evaluate_with_multiple_metrics(): + """When multiple metrics are provided, each result appears in TestResult.metrics_data.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + metrics = [ + DeterministicPassingMetric(), + DeterministicContainsExpectedOutputMetric(), + ] + + result = evaluate( + test_cases=test_cases, + metrics=metrics, + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + for tr in result.test_results: + # Should have results from all metrics + assert len(tr.metrics_data) == len( + metrics + ), f"Expected {len(metrics)} metric results, got {len(tr.metrics_data)}" + + # Verify each metric result has a non-empty name + for md in tr.metrics_data: + assert isinstance(md.name, str) and md.name != "" + + +def test_dataset_add_goldens_from_json_file_flow(tmp_path): + """End-to-end: write minimal JSON -> dataset.add_goldens_from_json_file -> evaluate.""" + # Write a minimal JSON file directly (not via save_as, to avoid round-trip issues) + json_path = tmp_path / "test_goldens.json" + goldens_data = [ + { + "input": "What is your name?", + "expected_output": "My name is DeepEval.", + }, + {"input": "Choose a number between 1 to 100", "expected_output": "42"}, + ] + with open(json_path, "w", encoding="utf-8") as f: + json.dump(goldens_data, f) + + # Load into a new dataset (as shown in docs) + loaded_dataset = EvaluationDataset() + 
loaded_dataset.add_goldens_from_json_file( + file_path=str(json_path), + input_key_name="input", + ) + + # Verify goldens were loaded correctly + assert len(loaded_dataset.goldens) == len(goldens_data) + + for orig, loaded in zip(goldens_data, loaded_dataset.goldens): + assert loaded.input == orig["input"] + assert loaded.expected_output == orig["expected_output"] + + # Now evaluate using the loaded dataset (completing the documented flow) + test_cases = build_llm_test_cases_from_goldens(loaded_dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicContainsExpectedOutputMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All should pass since our deterministic LLM app returns expected outputs + for tr in result.test_results: + assert ( + tr.success is True + ), f"Test case '{tr.input}' failed unexpectedly after JSON load" + + +def test_evaluate_accepts_hyperparameters(): + """evaluate() accepts hyperparameters without affecting evaluation execution.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + # Test with various hyperparameter types as documented + hyperparameters = { + "model": "gpt-4.1", + "system_prompt": "You are a helpful assistant.", + "temperature": 0.7, + "max_tokens": 1000, + } + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters=hyperparameters, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Should complete without error + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + +@pytest.mark.skipif( + not os.getenv("OPENAI_API_KEY"), + reason="OPENAI_API_KEY not set; skipping networked CLI smoke test", +) +def test_cli_smoke_test_networked(tmp_path): + """ + CLI smoke test for `deepeval test run`. + + This test requires OPENAI_API_KEY to be set and will be skipped otherwise. + It creates a minimal test file and runs `poetry run deepeval test run` on it. 
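+
+    Note: AnswerRelevancyMetric is LLM-judged, so this run makes real
+    OpenAI calls and may incur token costs.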
+ """ + + # Create a minimal test file that uses DeepEval CLI + # Note: file must start with "test_" prefix for deepeval CLI + test_file = tmp_path / "test_cli_smoke.py" + test_file.write_text( + ''' +import deepeval +from deepeval import assert_test +from deepeval.test_case import LLMTestCase +from deepeval.metrics import AnswerRelevancyMetric + +def test_cli_smoke(): + """Minimal test case for CLI smoke test.""" + test_case = LLMTestCase( + input="What is 2+2?", + actual_output="4", + ) + assert_test(test_case, metrics=[AnswerRelevancyMetric(threshold=0.5)]) +''' + ) + + # Run the CLI via subprocess through Poetry + proc = subprocess.run( + ["poetry", "run", "deepeval", "test", "run", str(test_file)], + capture_output=True, + cwd=PROJECT_ROOT, + text=True, + timeout=120, # 2 minute timeout for network calls + ) + + # Assert CLI completed successfully + assert proc.returncode == 0, ( + f"CLI smoke test failed with return code {proc.returncode}.\n" + f"STDOUT:\n{proc.stdout}\n" + f"STDERR:\n{proc.stderr}" + ) diff --git a/tests/test_core/test_end_to_end/test_evaluate_cache.py b/tests/test_core/test_end_to_end/test_evaluate_cache.py new file mode 100644 index 0000000000..417ad4d184 --- /dev/null +++ b/tests/test_core/test_end_to_end/test_evaluate_cache.py @@ -0,0 +1,179 @@ +import os + +from deepeval import evaluate +from deepeval.evaluate.configs import AsyncConfig, CacheConfig, DisplayConfig +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.test_run.test_run import LATEST_TEST_RUN_FILE_PATH +from deepeval.test_run.cache import CACHE_FILE_NAME + +from .helpers import ( + build_single_turn_dataset, + build_llm_test_cases_from_goldens, + DeterministicPassingMetric, +) + + +# Module-level counter for tracking metric executions +_metric_call_counter = {"calls": 0} + + +def _reset_metric_counter(): + """Reset the metric call counter to zero.""" + _metric_call_counter["calls"] = 0 + + +class CountingMetric(BaseMetric): + """ + A deterministic metric that counts how many times it is executed. + Used to verify cache hit/miss behavior. + """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "CountingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + _metric_call_counter["calls"] += 1 + self.score = 1.0 + self.reason = "Counting metric always passes" + self.success = True + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return True + + +def test_write_cache_creates_artifacts_on_disk(): + """ + Verify that write_cache=True creates cache files on disk. 
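+
+    Both constants appear to be CWD-relative paths; the autouse _isolate_cwd
+    fixture in tests/test_core/conftest.py sandboxes the CWD to a per-test
+    tmp dir, so these assertions never touch the real working directory.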
+ + When evaluate() runs with write_cache=True, it should persist: + - Metric cache to CACHE_FILE_NAME (.deepeval-cache.json) + - Latest test run data to LATEST_TEST_RUN_FILE_PATH (.latest_test_run.json) + """ + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Cache artifacts should exist after evaluation + assert os.path.isfile(CACHE_FILE_NAME), ( + f"Expected CACHE_FILE_NAME ({CACHE_FILE_NAME}) to exist after " + "evaluate() with write_cache=True" + ) + assert os.path.isfile(LATEST_TEST_RUN_FILE_PATH), ( + f"Expected LATEST_TEST_RUN_FILE_PATH ({LATEST_TEST_RUN_FILE_PATH}) " + "to exist after evaluate() with write_cache=True" + ) + + +def test_use_cache_true_reuses_cached_results(): + """ + Verify that use_cache=True reuses cached metric results. + + When evaluate() runs twice with identical inputs and use_cache=True, + the second run should use cached results and NOT re-execute the metric. + """ + _reset_metric_counter() + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + num_test_cases = len(test_cases) + + # First run: metrics should execute + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=True), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_first_run = _metric_call_counter["calls"] + assert calls_after_first_run == num_test_cases, ( + f"Expected {num_test_cases} metric calls after first run, " + f"got {calls_after_first_run}" + ) + + # Second run with identical inputs: should use cache, no new metric calls + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=True), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_second_run = _metric_call_counter["calls"] + assert calls_after_second_run == calls_after_first_run, ( + f"Expected no additional metric calls on second run with use_cache=True. " + f"Calls after first run: {calls_after_first_run}, " + f"calls after second run: {calls_after_second_run}" + ) + + +def test_use_cache_false_recomputes_metrics(): + """ + Verify that use_cache=False recomputes metrics even when cache exists. + + When evaluate() runs twice with identical inputs but use_cache=False, + metrics should be re-executed on the second run. 
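+    The CountingMetric call count should therefore double to exactly
+    2x the number of test cases.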
+ """ + _reset_metric_counter() + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + num_test_cases = len(test_cases) + + # First run: metrics should execute + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_first_run = _metric_call_counter["calls"] + assert calls_after_first_run == num_test_cases, ( + f"Expected {num_test_cases} metric calls after first run, " + f"got {calls_after_first_run}" + ) + + # Second run with use_cache=False: should recompute, more metric calls + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_second_run = _metric_call_counter["calls"] + expected_calls = num_test_cases * 2 + assert calls_after_second_run == expected_calls, ( + f"Expected {expected_calls} total metric calls after second run " + f"with use_cache=False (recomputation), got {calls_after_second_run}" + ) diff --git a/tests/test_core/test_end_to_end/test_evaluate_configs.py b/tests/test_core/test_end_to_end/test_evaluate_configs.py new file mode 100644 index 0000000000..27520c27be --- /dev/null +++ b/tests/test_core/test_end_to_end/test_evaluate_configs.py @@ -0,0 +1,461 @@ +import pytest + +from deepeval import evaluate +from deepeval.evaluate.configs import ( + AsyncConfig, + CacheConfig, + DisplayConfig, + ErrorConfig, +) +from deepeval.evaluate.types import EvaluationResult +from deepeval.errors import MissingTestCaseParamsError +from deepeval.test_case import LLMTestCase + +from .helpers import ( + DeterministicFailingMetric, + DeterministicPassingMetric, + DeterministicRaisingMetric, + DeterministicRequiresRetrievalContextMetric, + build_llm_test_cases_from_goldens, + build_single_turn_dataset, +) + +# =========================================================================== +# ErrorConfig: missing params behavior +# =========================================================================== + + +def test_error_config_missing_params_raises_by_default(): + """By default, missing required test case params raises MissingTestCaseParamsError.""" + # Create a test case without retrieval_context + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=None, # Missing required param + ) + + with pytest.raises(MissingTestCaseParamsError): + evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, print_results=False + ), + error_config=ErrorConfig(skip_on_missing_params=False), + ) + + +def test_error_config_skip_on_missing_params_skips_metric(): + """When skip_on_missing_params=True, metrics with missing required params are skipped.""" + # Create a test case without retrieval_context + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=None, # Missing required param + ) 
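+    # Expected: the metric is skipped entirely, so metrics_data stays empty.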
+ + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(skip_on_missing_params=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + # When metric is skipped, it should not appear in metrics_data + assert ( + len(tr.metrics_data) == 0 + ), "Skipped metric should not appear in metrics_data" + + +def test_error_config_skip_on_missing_params_does_not_skip_when_complete(): + """When skip_on_missing_params=True, a complete test case is still evaluated.""" + # Create a test case with retrieval_context + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=["context chunk 1", "context chunk 2"], + ) + + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(skip_on_missing_params=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + # Metric should be evaluated and present + assert len(tr.metrics_data) == 1 + assert tr.metrics_data[0].success is True + + +def test_error_config_skip_on_missing_params_takes_precedence_over_ignore_errors(): + """skip_on_missing_params=True takes precedence over ignore_errors=True when params are missing.""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=None, # Missing required param + ) + + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig( + skip_on_missing_params=True, + ignore_errors=True, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + # skip_on_missing_params takes precedence: metric should be skipped entirely + # (not present in metrics_data), rather than showing as ignored error + assert len(tr.metrics_data) == 0, ( + "skip_on_missing_params should take precedence: metric should be " + "skipped (absent from metrics_data), not shown as ignored error" + ) + + +# ----------------------------------------------------------------------------- +# ErrorConfig: ignore_errors behavior +# ----------------------------------------------------------------------------- + + +def test_error_config_ignore_errors_raises_by_default(): + """By default, exceptions raised by metrics propagate (ignore_errors=False).""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + ) + + with pytest.raises(RuntimeError, match="always raises"): + evaluate( + test_cases=[test_case], + metrics=[DeterministicRaisingMetric()], + hyperparameters={"model": "offline-stub"}, + 
async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, print_results=False + ), + error_config=ErrorConfig(ignore_errors=False), + ) + + +def test_error_config_ignore_errors_captures_metric_exception(): + """When ignore_errors=True, metric exceptions are captured and surfaced in the result.""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + ) + + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRaisingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(ignore_errors=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + assert len(tr.metrics_data) == 1 + + md = tr.metrics_data[0] + # When error is ignored, metric should be marked as failed + assert md.success is False + # Error message should be captured + assert md.error is not None + assert "always raises" in md.error + + +def test_error_config_ignore_errors_does_not_affect_passing_metrics(): + """ignore_errors=True only affects metrics that raise; passing metrics remain unaffected.""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + ) + + result = evaluate( + test_cases=[test_case], + metrics=[ + DeterministicPassingMetric(), + DeterministicRaisingMetric(), + ], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(ignore_errors=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + assert len(tr.metrics_data) == 2 + + # Find each metric's data by name + passing_md = next( + md for md in tr.metrics_data if md.name == "DeterministicPassingMetric" + ) + raising_md = next( + md for md in tr.metrics_data if md.name == "DeterministicRaisingMetric" + ) + + # Passing metric should succeed + assert passing_md.success is True + assert passing_md.error is None + + # Raising metric should fail with captured error + assert raising_md.success is False + assert raising_md.error is not None + + +# ----------------------------------------------------------------------------- +# AsyncConfig behavior +# ----------------------------------------------------------------------------- + + +def test_async_config_sync_and_async_produce_equivalent_results(): + """Sync and async evaluation produce equivalent results for deterministic metrics.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + # Run with run_async=False (sync) + result_sync = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Run with run_async=True (async) + result_async = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + 
async_config=AsyncConfig( + run_async=True, max_concurrent=1, throttle_value=0 + ), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Both should return valid results + assert isinstance(result_sync, EvaluationResult) + assert isinstance(result_async, EvaluationResult) + + # Same number of test results + assert len(result_sync.test_results) == len(result_async.test_results) + + # Each test result should have same success status + for tr_sync, tr_async in zip( + result_sync.test_results, result_async.test_results + ): + assert tr_sync.success == tr_async.success + assert len(tr_sync.metrics_data) == len(tr_async.metrics_data) + + # Metric names and success should match + for md_sync, md_async in zip( + tr_sync.metrics_data, tr_async.metrics_data + ): + assert md_sync.name == md_async.name + assert md_sync.success == md_async.success + + +def test_async_config_max_concurrent_must_be_positive(): + """max_concurrent must be >= 1.""" + with pytest.raises(ValueError, match="max_concurrent"): + AsyncConfig(max_concurrent=0) + + +def test_async_config_throttle_value_must_be_non_negative(): + """throttle_value must be >= 0.""" + with pytest.raises(ValueError, match="throttle_value"): + AsyncConfig(throttle_value=-1) + + +# ----------------------------------------------------------------------------- +# CacheConfig behavior +# ----------------------------------------------------------------------------- + + +def test_cache_config_disabled_does_not_break_evaluate(): + """Evaluation succeeds when caching is fully disabled.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + for tr in result.test_results: + assert tr.success is True + assert len(tr.metrics_data) >= 1 + + +# ----------------------------------------------------------------------------- +# DisplayConfig behavior +# ----------------------------------------------------------------------------- + + +def test_display_config_all_does_not_break_evaluate(): + """display_option=ALL does not affect evaluation execution.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.ALL, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + +def test_display_config_failing_does_not_break_evaluate(): + """display_option=FAILING does not affect evaluation execution.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + 
test_cases=test_cases, + metrics=[DeterministicFailingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.FAILING, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All results should be failures (we used DeterministicFailingMetric) + for tr in result.test_results: + assert tr.success is False + + +def test_display_config_passing_does_not_break_evaluate(): + """display_option=PASSING does not affect evaluation execution.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.PASSING, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All results should be successes (we used DeterministicPassingMetric) + for tr in result.test_results: + assert tr.success is True + + +def test_display_config_does_not_affect_evaluation_results(): + """DisplayConfig options should not affect evaluation outcomes, only printing.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + # Run with display="all" + result_all = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.ALL, + ), + ) + + # Run with display="passing" + result_passing = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.PASSING, + ), + ) + + # Results should be identical regardless of display option + assert len(result_all.test_results) == len(result_passing.test_results) + + for tr_all, tr_pass in zip( + result_all.test_results, result_passing.test_results + ): + assert tr_all.success == tr_pass.success + assert len(tr_all.metrics_data) == len(tr_pass.metrics_data) diff --git a/tests/test_integrations/test_langgraph/apps/main.py b/tests/test_integrations/test_langgraph/apps/main.py index dfcda0ac52..23c3df4863 100644 --- a/tests/test_integrations/test_langgraph/apps/main.py +++ b/tests/test_integrations/test_langgraph/apps/main.py @@ -17,38 +17,56 @@ def separator(title: str): def main(): # 1. Simple App separator("1. 
SIMPLE APP - Single Tool (Weather)") - from tests.test_integrations.test_langgraph.apps.langgraph_simple_app import app as simple_app - + from tests.test_integrations.test_langgraph.apps.langgraph_simple_app import ( + app as simple_app, + ) + callback = CallbackHandler( name="demo-simple", tags=["demo", "simple"], metadata={"app": "simple"}, ) result = simple_app.invoke( - {"messages": [HumanMessage(content="What's the weather in San Francisco?")]}, + { + "messages": [ + HumanMessage(content="What's the weather in San Francisco?") + ] + }, config={"callbacks": [callback]}, ) print(f"Response: {result['messages'][-1].content}") # 2. Multiple Tools App - separator("2. MULTIPLE TOOLS APP - Weather, Population, Timezone, Calculator") - from tests.test_integrations.test_langgraph.apps.langgraph_multiple_tools_app import app as multiple_tools_app - + separator( + "2. MULTIPLE TOOLS APP - Weather, Population, Timezone, Calculator" + ) + from tests.test_integrations.test_langgraph.apps.langgraph_multiple_tools_app import ( + app as multiple_tools_app, + ) + callback = CallbackHandler( name="demo-multiple-tools", tags=["demo", "multiple-tools"], metadata={"app": "multiple_tools"}, ) result = multiple_tools_app.invoke( - {"messages": [HumanMessage(content="Tell me about Tokyo - weather, population, and timezone. Also calculate 15 * 23.")]}, + { + "messages": [ + HumanMessage( + content="Tell me about Tokyo - weather, population, and timezone. Also calculate 15 * 23." + ) + ] + }, config={"callbacks": [callback]}, ) print(f"Response: {result['messages'][-1].content}") # 3. Streaming App (Sync) separator("3. STREAMING APP - Stock Price Tools") - from tests.test_integrations.test_langgraph.apps.langgraph_streaming_app import sync_app as streaming_app - + from tests.test_integrations.test_langgraph.apps.langgraph_streaming_app import ( + sync_app as streaming_app, + ) + callback = CallbackHandler( name="demo-streaming", tags=["demo", "streaming"], @@ -60,7 +78,7 @@ def main(): config={"callbacks": [callback]}, ): print(f" Chunk: {list(chunk.keys())}") - + # Also get final result callback = CallbackHandler( name="demo-streaming-invoke", @@ -75,8 +93,10 @@ def main(): # 4. Conditional App separator("4. CONDITIONAL APP - Intent-Based Routing") - from tests.test_integrations.test_langgraph.apps.langgraph_conditional_app import app as conditional_app - + from tests.test_integrations.test_langgraph.apps.langgraph_conditional_app import ( + app as conditional_app, + ) + # Research route callback = CallbackHandler( name="demo-conditional-research", @@ -88,7 +108,7 @@ def main(): config={"callbacks": [callback]}, ) print(f"Research Response: {result['messages'][-1].content}") - + # Fact check route callback = CallbackHandler( name="demo-conditional-factcheck", @@ -103,23 +123,33 @@ def main(): # 5. Parallel Tools App separator("5. PARALLEL TOOLS APP - Multiple Parallel Tool Calls") - from tests.test_integrations.test_langgraph.apps.langgraph_parallel_tools_app import sync_app as parallel_app - + from tests.test_integrations.test_langgraph.apps.langgraph_parallel_tools_app import ( + sync_app as parallel_app, + ) + callback = CallbackHandler( name="demo-parallel", tags=["demo", "parallel"], metadata={"app": "parallel"}, ) result = parallel_app.invoke( - {"messages": [HumanMessage(content="Get weather for Tokyo, New York, and London.")]}, + { + "messages": [ + HumanMessage( + content="Get weather for Tokyo, New York, and London." 
+ ) + ] + }, config={"callbacks": [callback]}, ) print(f"Response: {result['messages'][-1].content}") # 6. Async App (run synchronously for demo) separator("6. ASYNC APP - Database Search & Translation") - from tests.test_integrations.test_langgraph.apps.langgraph_async_app import app as async_app - + from tests.test_integrations.test_langgraph.apps.langgraph_async_app import ( + app as async_app, + ) + async def run_async(): callback = CallbackHandler( name="demo-async", @@ -127,21 +157,27 @@ async def run_async(): metadata={"app": "async"}, ) result = await async_app.ainvoke( - {"messages": [HumanMessage(content="Search for information about Python")]}, + { + "messages": [ + HumanMessage(content="Search for information about Python") + ] + }, config={"callbacks": [callback]}, ) return result - + result = asyncio.run(run_async()) print(f"Response: {result['messages'][-1].content}") # 7. Multi-Turn App separator("7. MULTI-TURN APP - Shopping Cart with Memory") - from tests.test_integrations.test_langgraph.apps.langgraph_multi_turn_app import get_app_with_memory - + from tests.test_integrations.test_langgraph.apps.langgraph_multi_turn_app import ( + get_app_with_memory, + ) + app = get_app_with_memory() thread_id = "demo-session-001" - + # Turn 1 callback = CallbackHandler( name="demo-multi-turn-1", @@ -152,10 +188,13 @@ async def run_async(): ) result = app.invoke( {"messages": [HumanMessage(content="Add 2 apples to my cart")]}, - config={"callbacks": [callback], "configurable": {"thread_id": thread_id}}, + config={ + "callbacks": [callback], + "configurable": {"thread_id": thread_id}, + }, ) print(f"Turn 1: {result['messages'][-1].content}") - + # Turn 2 callback = CallbackHandler( name="demo-multi-turn-2", @@ -166,10 +205,13 @@ async def run_async(): ) result = app.invoke( {"messages": [HumanMessage(content="Also add 3 oranges")]}, - config={"callbacks": [callback], "configurable": {"thread_id": thread_id}}, + config={ + "callbacks": [callback], + "configurable": {"thread_id": thread_id}, + }, ) print(f"Turn 2: {result['messages'][-1].content}") - + # Turn 3 callback = CallbackHandler( name="demo-multi-turn-3", @@ -180,7 +222,10 @@ async def run_async(): ) result = app.invoke( {"messages": [HumanMessage(content="What's in my cart?")]}, - config={"callbacks": [callback], "configurable": {"thread_id": thread_id}}, + config={ + "callbacks": [callback], + "configurable": {"thread_id": thread_id}, + }, ) print(f"Turn 3: {result['messages'][-1].content}") @@ -189,4 +234,3 @@ async def run_async(): if __name__ == "__main__": main() - diff --git a/tests/test_integrations/test_langgraph/test_async.py b/tests/test_integrations/test_langgraph/test_async.py index d25b350e05..1820dad24f 100644 --- a/tests/test_integrations/test_langgraph/test_async.py +++ b/tests/test_integrations/test_langgraph/test_async.py @@ -35,7 +35,11 @@ # Set to True to generate schemas, False to assert against existing schemas # Can be overridden via environment variable: GENERATE_SCHEMAS=true pytest ... 
-GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ("true", "1", "yes") +GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ( + "true", + "1", + "yes", +) _current_dir = os.path.dirname(os.path.abspath(__file__)) _schemas_dir = os.path.join(_current_dir, "schemas") diff --git a/tests/test_integrations/test_langgraph/test_sync.py b/tests/test_integrations/test_langgraph/test_sync.py index 5cc64debaa..94fde20aa4 100644 --- a/tests/test_integrations/test_langgraph/test_sync.py +++ b/tests/test_integrations/test_langgraph/test_sync.py @@ -39,7 +39,11 @@ # Set to True to generate schemas, False to assert against existing schemas # Can be overridden via environment variable: GENERATE_SCHEMAS=true pytest ... -GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ("true", "1", "yes") +GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ( + "true", + "1", + "yes", +) _current_dir = os.path.dirname(os.path.abspath(__file__)) _schemas_dir = os.path.join(_current_dir, "schemas")
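
Note on the DisplayConfig tests: they reference DeterministicPassingMetric and DeterministicFailingMetric, which this diff does not define. A minimal sketch of what those stubs might look like, assuming deepeval's documented BaseMetric interface for custom metrics; the real fixtures presumably live alongside the tests, so treat the names and bodies below as a reconstruction, not the actual code.

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class DeterministicPassingMetric(BaseMetric):
    """Hypothetical stub: always scores 1.0, no model or network calls."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = 1.0
        self.success = True
        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        # Deterministic, so the async path can simply reuse measure().
        return self.measure(test_case)

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "DeterministicPassing"


class DeterministicFailingMetric(DeterministicPassingMetric):
    """Hypothetical stub: always scores 0.0 so every test case fails."""

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = 0.0
        self.success = False
        return self.score

    @property
    def __name__(self):
        return "DeterministicFailing"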
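
Note on the multi-turn demo: reusing one thread_id across the three invoke calls only preserves the cart because the compiled graph carries a checkpointer. A minimal sketch of how get_app_with_memory could be wired, assuming LangGraph's MemorySaver and MessagesState; the real graph in langgraph_multi_turn_app.py is more elaborate, and the node below is purely illustrative.

from langchain_core.messages import AIMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import StateGraph, MessagesState, START, END


def respond(state: MessagesState):
    # Illustrative node: the message count grows across turns because the
    # checkpointer replays prior state for the same thread_id.
    return {"messages": [AIMessage(content=f"turn {len(state['messages'])}")]}


builder = StateGraph(MessagesState)
builder.add_node("respond", respond)
builder.add_edge(START, "respond")
builder.add_edge("respond", END)


def get_app_with_memory():
    # MemorySaver keys persisted state by config["configurable"]["thread_id"],
    # which is why the demo passes the same thread_id on every turn.
    return builder.compile(checkpointer=MemorySaver())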
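
Note on GENERATE_MODE: after this change the same truthiness tuple is duplicated verbatim in test_async.py and test_sync.py. A small shared helper (hypothetical; it could live in a common conftest or utils module for the langgraph tests) would keep the two files from drifting:

import os


def env_flag(name: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean flag."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("true", "1", "yes")


# Both test modules could then share one definition:
GENERATE_MODE = env_flag("GENERATE_SCHEMAS")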