diff --git a/deepeval/integrations/langchain/callback.py b/deepeval/integrations/langchain/callback.py index 1ecd03219b..281636403d 100644 --- a/deepeval/integrations/langchain/callback.py +++ b/deepeval/integrations/langchain/callback.py @@ -539,4 +539,4 @@ def on_retriever_error( with self._ctx(run_id=run_id, parent_run_id=parent_run_id): retriever_span.status = TraceSpanStatus.ERRORED retriever_span.error = str(error) - exit_current_context(uuid_str=uuid_str) \ No newline at end of file + exit_current_context(uuid_str=uuid_str) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000000..177a766138 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,72 @@ +try: + import sys + import pysqlite3 as sqlite3 # type: ignore + + sys.modules["sqlite3"] = sqlite3 + sys.modules["sqlite3.dbapi2"] = sqlite3.dbapi2 +except Exception: + pass + +import pytest + +from typing import TYPE_CHECKING +from pathlib import Path + +from deepeval.tracing.tracing import trace_manager +from deepeval.config.settings import get_settings, reset_settings + + +if TYPE_CHECKING: + pass + + +# Silence telemetry for all tests so we don't have to deal with the noise +@pytest.fixture(autouse=True) +def _telemetry_opt_out(monkeypatch): + monkeypatch.setenv("DEEPEVAL_TELEMETRY_OPT_OUT", "1") + yield + + +@pytest.fixture(autouse=True) +def _ensure_hidden_store_dir(tmp_path: Path): + d = tmp_path / ".deepeval" + d.mkdir(exist_ok=True) + # some code expects the file to be there after a run, + # but at minimum the directory must exist to avoid FileNotFoundError + yield + + +@pytest.fixture +def hidden_store_dir(tmp_path: Path) -> Path: + d = tmp_path / ".deepeval" + d.mkdir(parents=True, exist_ok=True) + return d + + +@pytest.fixture() +def settings(): + settings = get_settings() + yield settings + + +@pytest.fixture() +def enable_dotenv(monkeypatch): + monkeypatch.setenv("DEEPEVAL_DISABLE_DOTENV", "0") + # rebuild Settings after changing the env + reset_settings(reload_dotenv=False) + + +@pytest.fixture(autouse=True) +def _reset_tracing_state(): + trace_manager.clear_traces() + trace_manager.traces_to_evaluate_order.clear() + trace_manager.traces_to_evaluate.clear() + trace_manager.integration_traces_to_evaluate.clear() + trace_manager.trace_uuid_to_golden.clear() + try: + trace_manager.task_bindings.clear() + except Exception: + pass + trace_manager.evaluating = False + trace_manager.evaluation_loop = False + yield diff --git a/tests/test_core/conftest.py b/tests/test_core/conftest.py index f475517b8b..1744309206 100644 --- a/tests/test_core/conftest.py +++ b/tests/test_core/conftest.py @@ -14,7 +14,6 @@ from typing import TYPE_CHECKING from pathlib import Path -from deepeval.tracing.tracing import trace_manager from deepeval.config.settings import get_settings, reset_settings, Settings @@ -29,36 +28,6 @@ } -@pytest.fixture(autouse=True) -def _ensure_hidden_store_dir(tmp_path: Path): - d = tmp_path / ".deepeval" - d.mkdir(exist_ok=True) - # some code expects the file to be there after a run, - # but at minimum the directory must exist to avoid FileNotFoundError - yield - - -@pytest.fixture -def hidden_store_dir(tmp_path: Path) -> Path: - d = tmp_path / ".deepeval" - d.mkdir(parents=True, exist_ok=True) - return d - - -# Silence telemetry for all tests so we don't have to deal with the noise -@pytest.fixture(autouse=True) -def _telemetry_opt_out(monkeypatch): - monkeypatch.setenv("DEEPEVAL_TELEMETRY_OPT_OUT", "1") - yield - - -# Run every test in its own temp CWD so .deepeval/.deepeval is sandboxed 
-@pytest.fixture(autouse=True) -def _isolate_cwd(tmp_path: Path, monkeypatch): - monkeypatch.chdir(tmp_path) - yield - - # Default dotenv path most tests can reuse; override in tests as needed @pytest.fixture def env_path(monkeypatch, tmp_path: Path) -> Path: @@ -72,17 +41,6 @@ def env_dir(monkeypatch, tmp_path: Path) -> Path: return tmp_path -@pytest.fixture(autouse=True) -def no_sleep(monkeypatch): - monkeypatch.setattr(tenacity.nap, "sleep", lambda _: None, raising=True) - - -@pytest.fixture() -def settings(): - settings = get_settings() - yield settings - - @pytest.fixture(scope="session") def _session_env_baseline(): # capture the environment as it existed when pytest started @@ -120,6 +78,9 @@ def _env_sandbox(_session_env_baseline, request, monkeypatch): for k, v in preserved.items(): monkeypatch.setenv(k, v) + # Always silence telemetry in tests + monkeypatch.setenv("DEEPEVAL_TELEMETRY_OPT_OUT", "1") + # Never open the Confident AI browser UI during tests monkeypatch.setenv("CONFIDENT_OPEN_BROWSER", "0") @@ -166,13 +127,6 @@ def _core_mode_no_confident( yield -@pytest.fixture() -def enable_dotenv(monkeypatch): - monkeypatch.setenv("DEEPEVAL_DISABLE_DOTENV", "0") - # rebuild Settings after changing the env - reset_settings(reload_dotenv=False) - - @pytest.fixture(autouse=False) def unpatch_openai_after(): from deepeval.openai.patch import unpatch_openai_classes @@ -181,17 +135,13 @@ def unpatch_openai_after(): unpatch_openai_classes() +# Run every test in its own temp CWD so .deepeval/.deepeval is sandboxed @pytest.fixture(autouse=True) -def _reset_tracing_state(): - trace_manager.clear_traces() - trace_manager.traces_to_evaluate_order.clear() - trace_manager.traces_to_evaluate.clear() - trace_manager.integration_traces_to_evaluate.clear() - trace_manager.trace_uuid_to_golden.clear() - try: - trace_manager.task_bindings.clear() - except Exception: - pass - trace_manager.evaluating = False - trace_manager.evaluation_loop = False +def _isolate_cwd(tmp_path: Path, monkeypatch): + monkeypatch.chdir(tmp_path) yield + + +@pytest.fixture(autouse=True) +def no_sleep(monkeypatch): + monkeypatch.setattr(tenacity.nap, "sleep", lambda _: None, raising=True) diff --git a/tests/test_core/test_end_to_end/__init__.py b/tests/test_core/test_end_to_end/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_core/test_end_to_end/conftest.py b/tests/test_core/test_end_to_end/conftest.py new file mode 100644 index 0000000000..03b68f8334 --- /dev/null +++ b/tests/test_core/test_end_to_end/conftest.py @@ -0,0 +1,10 @@ +import pytest + + +@pytest.fixture(autouse=True) +def _offline_deterministic_env(monkeypatch: pytest.MonkeyPatch): + # Prevent dotenv loading (could pull real API keys/configs) and avoid browser open. + monkeypatch.setenv("DEEPEVAL_DISABLE_DOTENV", "1") + monkeypatch.setenv("CONFIDENT_OPEN_BROWSER", "0") + # Keep stable even if unset. 
+ monkeypatch.setenv("DEEPEVAL_RESULTS_FOLDER", "") diff --git a/tests/test_core/test_end_to_end/helpers.py b/tests/test_core/test_end_to_end/helpers.py new file mode 100644 index 0000000000..5ac2612bdd --- /dev/null +++ b/tests/test_core/test_end_to_end/helpers.py @@ -0,0 +1,436 @@ +from __future__ import annotations + +import csv +import json +from pathlib import Path +from typing import Callable, List, Tuple + +from deepeval.dataset import Golden, EvaluationDataset +from deepeval.dataset.golden import ConversationalGolden +from deepeval.metrics import BaseMetric, BaseConversationalMetric +from deepeval.test_case import ( + LLMTestCase, + LLMTestCaseParams, + ConversationalTestCase, + Turn, +) + + +def deterministic_llm_app(user_input: str) -> Tuple[str, List[str]]: + """ + Deterministic stand-in for "your_llm_app" from the docs. + + The docs show: + res, text_chunks = your_llm_app(golden.input) + + We return: + - res: deterministic output based solely on input + - text_chunks: deterministic retrieval_context + """ + normalized = user_input.strip().lower() + if "name" in normalized: + return "My name is DeepEval.", ["ctx: identity", "ctx: greeting"] + if "number" in normalized: + return "42", ["ctx: numbers", "ctx: preferences"] + return "OK", ["ctx: default"] + + +def build_single_turn_dataset() -> EvaluationDataset: + # Mirrors the docs pattern (goldens list -> EvaluationDataset(goldens)) + goldens = [ + Golden( + input="What is your name?", expected_output="My name is DeepEval." + ), + Golden(input="Choose a number between 1 to 100", expected_output="42"), + ] + return EvaluationDataset(goldens) + + +def build_llm_test_cases_from_goldens( + dataset: EvaluationDataset, + llm_app: Callable[[str], Tuple[str, List[str]]] = deterministic_llm_app, +) -> List[LLMTestCase]: + test_cases: List[LLMTestCase] = [] + for golden in dataset.goldens: + res, text_chunks = llm_app(golden.input) + test_cases.append( + LLMTestCase( + input=golden.input, + actual_output=res, + expected_output=golden.expected_output, + retrieval_context=text_chunks, + ) + ) + return test_cases + + +class DeterministicContainsExpectedOutputMetric(BaseMetric): + """ + Tiny deterministic metric for offline CI. + Avoid asserting exact metric scores in tests; we only need stable behavior. + """ + + _required_params = [ + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicContainsExpectedOutputMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + expected = (test_case.expected_output or "").strip() + actual = (test_case.actual_output or "").strip() + passed = (expected != "") and (expected in actual) + self.score = 1.0 if passed else 0.0 + self.reason = ( + "expected_output is contained in actual_output" + if passed + else "expected_output not found in actual_output" + ) + self.success = self.is_successful() + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return bool(self.score is not None and self.score >= self.threshold) + + +class DeterministicFailingMetric(BaseMetric): + """ + Deterministic metric that always fails. + Used to verify that evaluate() correctly propagates failures. 
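+
+    Contract:
+        metric.measure(test_case)   # -> 0.0
+        metric.is_successful()      # -> False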
+ """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicFailingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + self.score = 0.0 + self.reason = "This metric always fails for testing purposes" + self.success = False + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return False + + +class DeterministicPassingMetric(BaseMetric): + """ + Deterministic metric that always passes. + Used to verify that evaluate() correctly propagates success. + """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicPassingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + self.score = 1.0 + self.reason = "This metric always passes for testing purposes" + self.success = True + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return True + + +class DeterministicRequiresRetrievalContextMetric(BaseMetric): + """ + Deterministic metric that requires RETRIEVAL_CONTEXT. + Used to test skip_on_missing_params behavior when test case lacks retrieval_context. + """ + + _required_params = [ + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.RETRIEVAL_CONTEXT, + ] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicRequiresRetrievalContextMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + # This will only be called if retrieval_context is present + # The check_llm_test_case_params in BaseMetric handles the validation + from deepeval.metrics.utils import check_llm_test_case_params + + check_llm_test_case_params( + test_case=test_case, + test_case_params=self._required_params, + input_image_count=None, + actual_output_image_count=None, + metric=self, + ) + self.score = 1.0 + self.reason = "Retrieval context was present" + self.success = True + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return bool(self.score is not None and self.score >= self.threshold) + + +class DeterministicRaisingMetric(BaseMetric): + """ + Deterministic metric that always raises an exception. + Used to test ignore_errors behavior. 
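+
+    With ErrorConfig(ignore_errors=True), evaluate() should capture the
+    RuntimeError into the metric's error field instead of propagating it;
+    with ignore_errors=False (the default) the exception bubbles up.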
+ """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicRaisingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + raise RuntimeError("This metric always raises for testing purposes") + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + raise RuntimeError("This metric always raises for testing purposes") + + def is_successful(self) -> bool: + return False + + +def save_dataset_as_json_and_load( + dataset: EvaluationDataset, directory: Path, file_name: str +) -> list: + """ + Option A artifact: dataset.save_as(file_type="json", directory=..., file_name=...) + Returns parsed JSON content (a list of records). + """ + full_path = dataset.save_as( + file_type="json", + directory=str(directory), + file_name=file_name, + include_test_cases=False, + ) + with open(full_path, "r", encoding="utf-8") as f: + return json.load(f) + + +def save_dataset_as_csv_and_load( + dataset: EvaluationDataset, directory: Path, file_name: str +) -> List[dict]: + """ + Option A artifact: dataset.save_as(file_type="csv", directory=..., file_name=...) + Returns parsed CSV content as a list of dicts. + """ + full_path = dataset.save_as( + file_type="csv", + directory=str(directory), + file_name=file_name, + include_test_cases=False, + ) + with open(full_path, "r", encoding="utf-8") as f: + return list(csv.DictReader(f)) + + +# =========================================================================== +# Multi-turn / Conversational helpers +# =========================================================================== + + +def deterministic_chatbot_callback( + input: str, + turns: List[Turn] = None, + thread_id: str = None, +) -> Turn: + """ + Deterministic chatbot callback for offline testing. + + Mirrors the doc pattern: + async def model_callback(input: str, turns: List[Turn], thread_id: str) -> Turn: + response = await your_chatbot(input, turns, thread_id) + return Turn(role="assistant", content=response) + + This sync version returns deterministic responses based on input. + """ + normalized = input.strip().lower() + if ( + "ticket" in normalized + or "buy" in normalized + or "purchase" in normalized + ): + return Turn( + role="assistant", + content="I can help you purchase a ticket. What event are you interested in?", + ) + if "coldplay" in normalized or "concert" in normalized: + return Turn( + role="assistant", + content="Great choice! We have VIP and standard tickets available for Coldplay.", + ) + if "vip" in normalized: + return Turn( + role="assistant", + content="VIP ticket selected. That will be $250. Shall I proceed with the purchase?", + ) + if ( + "yes" in normalized + or "proceed" in normalized + or "confirm" in normalized + ): + return Turn( + role="assistant", + content="Purchase confirmed! Your VIP ticket has been booked successfully.", + ) + return Turn(role="assistant", content="How can I assist you today?") + + +def build_multi_turn_dataset() -> EvaluationDataset: + """ + Build a multi-turn dataset using ConversationalGolden. + + Mirrors the docs pattern: + goldens = [ + ConversationalGolden( + scenario="Andy Byron wants to purchase a VIP ticket to a Coldplay concert.", + expected_outcome="Successful purchase of a ticket.", + user_description="Andy Byron is the CEO of Astronomer.", + ), + ... 
+ ] + dataset = EvaluationDataset(goldens) + """ + goldens = [ + ConversationalGolden( + scenario="Andy Byron wants to purchase a VIP ticket to a Coldplay concert.", + expected_outcome="Successful purchase of a ticket.", + user_description="Andy Byron is the CEO of Astronomer.", + ), + ConversationalGolden( + scenario="A customer wants to ask about concert dates.", + expected_outcome="Customer receives concert date information.", + user_description="A general user looking for event info.", + ), + ] + return EvaluationDataset(goldens) + + +def build_conversational_test_cases_manually( + dataset: EvaluationDataset, + chatbot_callback: Callable = deterministic_chatbot_callback, + max_turns: int = 4, +) -> List[ConversationalTestCase]: + """ + Manually build ConversationalTestCase objects without using ConversationSimulator. + + ConversationSimulator requires a simulator_model which needs network access. + This helper creates deterministic test cases for offline testing. + """ + test_cases = [] + for golden in dataset.goldens: + # Simulate a basic conversation flow + turns = [] + user_inputs = [ + "Hello, I want to buy a ticket", + "I'm interested in Coldplay", + "I'd like a VIP ticket please", + "Yes, please proceed", + ] + + for i, user_input in enumerate(user_inputs[:max_turns]): + # User turn + turns.append(Turn(role="user", content=user_input)) + # Assistant response via callback + assistant_turn = chatbot_callback( + user_input, turns, f"thread-{id(golden)}" + ) + turns.append(assistant_turn) + + test_case = ConversationalTestCase( + turns=turns, + scenario=golden.scenario, + expected_outcome=golden.expected_outcome, + user_description=golden.user_description, + ) + test_cases.append(test_case) + + return test_cases + + +class DeterministicConversationalMetric(BaseConversationalMetric): + """ + Deterministic conversational metric for offline testing. + Evaluates whether the conversation reached an expected outcome. 
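+
+    Scoring rule: 1.0 if any assistant turn contains one of the keywords
+    "confirmed", "booked", "success", or "complete"; 0.0 otherwise.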
+ """ + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "DeterministicConversationalMetric" + + def measure( + self, test_case: ConversationalTestCase, *args, **kwargs + ) -> float: + # Check if any assistant turn contains expected outcome keywords + outcome_keywords = ["confirmed", "booked", "success", "complete"] + has_positive_outcome = any( + any(kw in turn.content.lower() for kw in outcome_keywords) + for turn in test_case.turns + if turn.role == "assistant" + ) + self.score = 1.0 if has_positive_outcome else 0.0 + self.reason = ( + "Conversation reached a positive outcome" + if has_positive_outcome + else "Conversation did not reach expected outcome" + ) + self.success = self.is_successful() + return self.score + + async def a_measure( + self, test_case: ConversationalTestCase, *args, **kwargs + ) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return bool(self.score is not None and self.score >= self.threshold) diff --git a/tests/test_core/test_end_to_end/test_evaluate.py b/tests/test_core/test_end_to_end/test_evaluate.py new file mode 100644 index 0000000000..3e999654d0 --- /dev/null +++ b/tests/test_core/test_end_to_end/test_evaluate.py @@ -0,0 +1,447 @@ +import json +import os +from pathlib import Path +import subprocess + +import pytest + +from deepeval import evaluate +from deepeval.dataset import EvaluationDataset +from deepeval.evaluate.configs import ( + AsyncConfig, + CacheConfig, + DisplayConfig, +) +from deepeval.evaluate.types import EvaluationResult, TestResult + +from .helpers import ( + DeterministicContainsExpectedOutputMetric, + DeterministicConversationalMetric, + DeterministicFailingMetric, + DeterministicPassingMetric, + build_llm_test_cases_from_goldens, + build_single_turn_dataset, + build_multi_turn_dataset, + build_conversational_test_cases_manually, + save_dataset_as_csv_and_load, + save_dataset_as_json_and_load, +) + +PROJECT_ROOT = Path(__file__).resolve().parents[3] + + +def test_single_turn_evaluate_returns_result_and_dataset_json_schema( + tmp_path, +): + """ + End-to-end: dataset -> test cases -> evaluate() -> EvaluationResult. + Also validates the saved dataset JSON schema. + + Deterministic/offline (no network): uses deterministic custom metrics. 
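+
+    Flow under test:
+        goldens -> build_llm_test_cases_from_goldens() -> evaluate()
+        dataset.save_as(file_type="json", ...) -> schema assertions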
+ """ + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicContainsExpectedOutputMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Verify result is the expected type with correct structure + assert isinstance(result, EvaluationResult) + assert isinstance(result.test_results, list) + assert len(result.test_results) == len(test_cases) + assert result.confident_link is None or isinstance( + result.confident_link, str + ) + assert result.test_run_id is None or isinstance(result.test_run_id, str) + + for tr in result.test_results: + assert isinstance(tr, TestResult) + assert isinstance(tr.name, str) and tr.name != "" + assert isinstance(tr.success, bool) + assert tr.conversational is False + + # Single-turn results should have input/output + assert tr.input is not None + assert tr.actual_output is not None + + # Metrics data should be present + assert isinstance(tr.metrics_data, list) + assert len(tr.metrics_data) >= 1 + + # Test JSON artifact schema + json_records = save_dataset_as_json_and_load( + dataset, directory=tmp_path, file_name="dataset" + ) + assert isinstance(json_records, list) + assert len(json_records) >= 1 + + required_keys = { + "input", + "actual_output", + "expected_output", + "retrieval_context", + "context", + "name", + "comments", + "source_file", + "tools_called", + "expected_tools", + "additional_metadata", + "custom_column_key_values", + } + + for rec in json_records: + assert isinstance(rec, dict) + assert required_keys.issubset(rec.keys()) + assert isinstance(rec["input"], str) and rec["input"] != "" + + # Optional fields can be None or their expected type + assert rec["expected_output"] is None or isinstance( + rec["expected_output"], str + ) + assert rec["actual_output"] is None or isinstance( + rec["actual_output"], str + ) + assert rec["retrieval_context"] is None or isinstance( + rec["retrieval_context"], list + ) + assert rec["tools_called"] is None or isinstance( + rec["tools_called"], list + ) + assert rec["expected_tools"] is None or isinstance( + rec["expected_tools"], list + ) + + +# =========================================================================== +# Checklist Item 2: Multi-turn E2E evaluation (doc-driven, offline) +# =========================================================================== + + +def test_multi_turn_evaluate_returns_conversational_result_shape(tmp_path): + """ + End-to-end: multi-turn/conversational evaluation returns conversational TestResult + objects and metrics_data. + + Deterministic/offline (no network): constructs ConversationalTestCase objects + locally and uses deterministic conversational metrics. 
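+
+    Conversations are scripted: build_conversational_test_cases_manually()
+    alternates fixed user turns with deterministic_chatbot_callback replies,
+    so no ConversationSimulator (and no simulator model) is required.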
+ """ + dataset = build_multi_turn_dataset() + assert len(dataset.goldens) >= 1 + + conversational_test_cases = build_conversational_test_cases_manually( + dataset, max_turns=4 + ) + assert len(conversational_test_cases) == len(dataset.goldens) + + result = evaluate( + test_cases=conversational_test_cases, + metrics=[DeterministicConversationalMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Verify result is the expected type with correct structure + assert isinstance(result, EvaluationResult) + assert isinstance(result.test_results, list) + assert len(result.test_results) == len(conversational_test_cases) + assert result.confident_link is None or isinstance( + result.confident_link, str + ) + assert result.test_run_id is None or isinstance(result.test_run_id, str) + + # Verify test result structure for conversational test cases + for tr in result.test_results: + assert isinstance(tr, TestResult) + assert isinstance(tr.name, str) and tr.name != "" + assert isinstance(tr.success, bool) + assert tr.conversational is True # Multi-turn should be conversational + + # Metrics data should be present + assert isinstance(tr.metrics_data, list) + assert len(tr.metrics_data) >= 1 + + +# =========================================================================== +# Checklist Item 3: JSON artifact schema for multi-turn dataset +# =========================================================================== + + +def test_multi_turn_dataset_json_schema(tmp_path): + """Multi-turn dataset JSON export includes expected keys for conversational goldens.""" + dataset = build_multi_turn_dataset() + + json_records = save_dataset_as_json_and_load( + dataset, directory=tmp_path, file_name="multi_turn_dataset" + ) + + assert isinstance(json_records, list) + assert len(json_records) >= 1 + + # Multi-turn JSON schema (from dataset.py save_as implementation) + required_keys = { + "scenario", + "turns", + "expected_outcome", + "user_description", + "context", + "name", + "comments", + "additional_metadata", + "custom_column_key_values", + } + + for rec in json_records: + assert isinstance(rec, dict) + assert required_keys.issubset( + rec.keys() + ), f"Missing keys: {required_keys - set(rec.keys())}" + + # Optional fields can be None or their expected type + assert rec["scenario"] is None or isinstance(rec["scenario"], str) + assert rec["turns"] is None or isinstance(rec["turns"], list) + assert rec["expected_outcome"] is None or isinstance( + rec["expected_outcome"], str + ) + assert rec["user_description"] is None or isinstance( + rec["user_description"], str + ) + + +def test_single_turn_dataset_csv_schema(tmp_path): + """Single-turn dataset CSV export includes expected columns.""" + dataset = build_single_turn_dataset() + + csv_records = save_dataset_as_csv_and_load( + dataset, directory=tmp_path, file_name="single_turn_dataset" + ) + + assert isinstance(csv_records, list) + assert len(csv_records) >= 1 + + # CSV should have "input" column at minimum + for rec in csv_records: + assert isinstance(rec, dict) + assert "input" in rec.keys(), "CSV must have 'input' column" + assert rec["input"] is not None and rec["input"] != "" + + +def test_evaluate_propagates_metric_failure(): + """When a metric fails, TestResult.success is False and metrics_data reflects failure.""" + dataset = 
build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicFailingMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All test results should be failures since the metric always fails + for tr in result.test_results: + assert ( + tr.success is False + ), "Test result should be False when metric fails" + assert len(tr.metrics_data) >= 1 + for md in tr.metrics_data: + assert md.success is False + + +def test_evaluate_propagates_metric_success(): + """When a metric passes, TestResult.success is True and metrics_data reflects success.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All test results should be successes since the metric always passes + for tr in result.test_results: + assert ( + tr.success is True + ), "Test result should be True when metric passes" + assert len(tr.metrics_data) >= 1 + for md in tr.metrics_data: + assert md.success is True + + +def test_evaluate_with_multiple_metrics(): + """When multiple metrics are provided, each result appears in TestResult.metrics_data.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + metrics = [ + DeterministicPassingMetric(), + DeterministicContainsExpectedOutputMetric(), + ] + + result = evaluate( + test_cases=test_cases, + metrics=metrics, + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + for tr in result.test_results: + # Should have results from all metrics + assert len(tr.metrics_data) == len( + metrics + ), f"Expected {len(metrics)} metric results, got {len(tr.metrics_data)}" + + # Verify each metric result has a non-empty name + for md in tr.metrics_data: + assert isinstance(md.name, str) and md.name != "" + + +def test_dataset_add_goldens_from_json_file_flow(tmp_path): + """End-to-end: write minimal JSON -> dataset.add_goldens_from_json_file -> evaluate.""" + # Write a minimal JSON file directly (not via save_as, to avoid round-trip issues) + json_path = tmp_path / "test_goldens.json" + goldens_data = [ + { + "input": "What is your name?", + "expected_output": "My name is DeepEval.", + }, + {"input": "Choose a number between 1 to 100", "expected_output": "42"}, + ] + with open(json_path, "w", encoding="utf-8") as f: + json.dump(goldens_data, f) + + # Load into a new dataset (as shown in docs) + loaded_dataset = EvaluationDataset() + 
loaded_dataset.add_goldens_from_json_file( + file_path=str(json_path), + input_key_name="input", + ) + + # Verify goldens were loaded correctly + assert len(loaded_dataset.goldens) == len(goldens_data) + + for orig, loaded in zip(goldens_data, loaded_dataset.goldens): + assert loaded.input == orig["input"] + assert loaded.expected_output == orig["expected_output"] + + # Now evaluate using the loaded dataset (completing the documented flow) + test_cases = build_llm_test_cases_from_goldens(loaded_dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicContainsExpectedOutputMetric()], + hyperparameters={"model": "offline-stub", "system_prompt": "offline"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All should pass since our deterministic LLM app returns expected outputs + for tr in result.test_results: + assert ( + tr.success is True + ), f"Test case '{tr.input}' failed unexpectedly after JSON load" + + +def test_evaluate_accepts_hyperparameters(): + """evaluate() accepts hyperparameters without affecting evaluation execution.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + # Test with various hyperparameter types as documented + hyperparameters = { + "model": "gpt-4.1", + "system_prompt": "You are a helpful assistant.", + "temperature": 0.7, + "max_tokens": 1000, + } + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters=hyperparameters, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Should complete without error + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + +@pytest.mark.skipif( + not os.getenv("OPENAI_API_KEY"), + reason="OPENAI_API_KEY not set; skipping networked CLI smoke test", +) +def test_cli_smoke_test_networked(tmp_path): + """ + CLI smoke test for `deepeval test run`. + + This test requires OPENAI_API_KEY to be set and will be skipped otherwise. + It creates a minimal test file and runs `poetry run deepeval test run` on it. 
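+
+    Note: AnswerRelevancyMetric is LLM-judged, so this run makes real
+    OpenAI calls and may incur token costs.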
+ """ + + # Create a minimal test file that uses DeepEval CLI + # Note: file must start with "test_" prefix for deepeval CLI + test_file = tmp_path / "test_cli_smoke.py" + test_file.write_text( + ''' +import deepeval +from deepeval import assert_test +from deepeval.test_case import LLMTestCase +from deepeval.metrics import AnswerRelevancyMetric + +def test_cli_smoke(): + """Minimal test case for CLI smoke test.""" + test_case = LLMTestCase( + input="What is 2+2?", + actual_output="4", + ) + assert_test(test_case, metrics=[AnswerRelevancyMetric(threshold=0.5)]) +''' + ) + + # Run the CLI via subprocess through Poetry + proc = subprocess.run( + ["poetry", "run", "deepeval", "test", "run", str(test_file)], + capture_output=True, + cwd=PROJECT_ROOT, + text=True, + timeout=120, # 2 minute timeout for network calls + ) + + # Assert CLI completed successfully + assert proc.returncode == 0, ( + f"CLI smoke test failed with return code {proc.returncode}.\n" + f"STDOUT:\n{proc.stdout}\n" + f"STDERR:\n{proc.stderr}" + ) diff --git a/tests/test_core/test_end_to_end/test_evaluate_cache.py b/tests/test_core/test_end_to_end/test_evaluate_cache.py new file mode 100644 index 0000000000..417ad4d184 --- /dev/null +++ b/tests/test_core/test_end_to_end/test_evaluate_cache.py @@ -0,0 +1,179 @@ +import os + +from deepeval import evaluate +from deepeval.evaluate.configs import AsyncConfig, CacheConfig, DisplayConfig +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.test_run.test_run import LATEST_TEST_RUN_FILE_PATH +from deepeval.test_run.cache import CACHE_FILE_NAME + +from .helpers import ( + build_single_turn_dataset, + build_llm_test_cases_from_goldens, + DeterministicPassingMetric, +) + + +# Module-level counter for tracking metric executions +_metric_call_counter = {"calls": 0} + + +def _reset_metric_counter(): + """Reset the metric call counter to zero.""" + _metric_call_counter["calls"] = 0 + + +class CountingMetric(BaseMetric): + """ + A deterministic metric that counts how many times it is executed. + Used to verify cache hit/miss behavior. + """ + + _required_params = [LLMTestCaseParams.ACTUAL_OUTPUT] + + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + self.async_mode = False + self.include_reason = True + + @property + def __name__(self) -> str: + return "CountingMetric" + + def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + _metric_call_counter["calls"] += 1 + self.score = 1.0 + self.reason = "Counting metric always passes" + self.success = True + return self.score + + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + return self.measure(test_case, *args, **kwargs) + + def is_successful(self) -> bool: + return True + + +def test_write_cache_creates_artifacts_on_disk(): + """ + Verify that write_cache=True creates cache files on disk. 
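+
+    Both constants appear to be CWD-relative paths; the autouse _isolate_cwd
+    fixture in tests/test_core/conftest.py sandboxes the CWD to a per-test
+    tmp dir, so these assertions never touch the real working directory.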
+ + When evaluate() runs with write_cache=True, it should persist: + - Metric cache to CACHE_FILE_NAME (.deepeval-cache.json) + - Latest test run data to LATEST_TEST_RUN_FILE_PATH (.latest_test_run.json) + """ + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Cache artifacts should exist after evaluation + assert os.path.isfile(CACHE_FILE_NAME), ( + f"Expected CACHE_FILE_NAME ({CACHE_FILE_NAME}) to exist after " + "evaluate() with write_cache=True" + ) + assert os.path.isfile(LATEST_TEST_RUN_FILE_PATH), ( + f"Expected LATEST_TEST_RUN_FILE_PATH ({LATEST_TEST_RUN_FILE_PATH}) " + "to exist after evaluate() with write_cache=True" + ) + + +def test_use_cache_true_reuses_cached_results(): + """ + Verify that use_cache=True reuses cached metric results. + + When evaluate() runs twice with identical inputs and use_cache=True, + the second run should use cached results and NOT re-execute the metric. + """ + _reset_metric_counter() + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + num_test_cases = len(test_cases) + + # First run: metrics should execute + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=True), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_first_run = _metric_call_counter["calls"] + assert calls_after_first_run == num_test_cases, ( + f"Expected {num_test_cases} metric calls after first run, " + f"got {calls_after_first_run}" + ) + + # Second run with identical inputs: should use cache, no new metric calls + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=True), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_second_run = _metric_call_counter["calls"] + assert calls_after_second_run == calls_after_first_run, ( + f"Expected no additional metric calls on second run with use_cache=True. " + f"Calls after first run: {calls_after_first_run}, " + f"calls after second run: {calls_after_second_run}" + ) + + +def test_use_cache_false_recomputes_metrics(): + """ + Verify that use_cache=False recomputes metrics even when cache exists. + + When evaluate() runs twice with identical inputs but use_cache=False, + metrics should be re-executed on the second run. 
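+    The CountingMetric call count should therefore double to exactly
+    2x the number of test cases.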
+ """ + _reset_metric_counter() + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + num_test_cases = len(test_cases) + + # First run: metrics should execute + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_first_run = _metric_call_counter["calls"] + assert calls_after_first_run == num_test_cases, ( + f"Expected {num_test_cases} metric calls after first run, " + f"got {calls_after_first_run}" + ) + + # Second run with use_cache=False: should recompute, more metric calls + evaluate( + test_cases=test_cases, + metrics=[CountingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=True, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + calls_after_second_run = _metric_call_counter["calls"] + expected_calls = num_test_cases * 2 + assert calls_after_second_run == expected_calls, ( + f"Expected {expected_calls} total metric calls after second run " + f"with use_cache=False (recomputation), got {calls_after_second_run}" + ) diff --git a/tests/test_core/test_end_to_end/test_evaluate_configs.py b/tests/test_core/test_end_to_end/test_evaluate_configs.py new file mode 100644 index 0000000000..27520c27be --- /dev/null +++ b/tests/test_core/test_end_to_end/test_evaluate_configs.py @@ -0,0 +1,461 @@ +import pytest + +from deepeval import evaluate +from deepeval.evaluate.configs import ( + AsyncConfig, + CacheConfig, + DisplayConfig, + ErrorConfig, +) +from deepeval.evaluate.types import EvaluationResult +from deepeval.errors import MissingTestCaseParamsError +from deepeval.test_case import LLMTestCase + +from .helpers import ( + DeterministicFailingMetric, + DeterministicPassingMetric, + DeterministicRaisingMetric, + DeterministicRequiresRetrievalContextMetric, + build_llm_test_cases_from_goldens, + build_single_turn_dataset, +) + +# =========================================================================== +# ErrorConfig: missing params behavior +# =========================================================================== + + +def test_error_config_missing_params_raises_by_default(): + """By default, missing required test case params raises MissingTestCaseParamsError.""" + # Create a test case without retrieval_context + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=None, # Missing required param + ) + + with pytest.raises(MissingTestCaseParamsError): + evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, print_results=False + ), + error_config=ErrorConfig(skip_on_missing_params=False), + ) + + +def test_error_config_skip_on_missing_params_skips_metric(): + """When skip_on_missing_params=True, metrics with missing required params are skipped.""" + # Create a test case without retrieval_context + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=None, # Missing required param + ) 
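+    # Expected: the metric is skipped entirely, so metrics_data stays empty.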
+ + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(skip_on_missing_params=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + # When metric is skipped, it should not appear in metrics_data + assert ( + len(tr.metrics_data) == 0 + ), "Skipped metric should not appear in metrics_data" + + +def test_error_config_skip_on_missing_params_does_not_skip_when_complete(): + """When skip_on_missing_params=True, a complete test case is still evaluated.""" + # Create a test case with retrieval_context + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=["context chunk 1", "context chunk 2"], + ) + + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(skip_on_missing_params=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + # Metric should be evaluated and present + assert len(tr.metrics_data) == 1 + assert tr.metrics_data[0].success is True + + +def test_error_config_skip_on_missing_params_takes_precedence_over_ignore_errors(): + """skip_on_missing_params=True takes precedence over ignore_errors=True when params are missing.""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + retrieval_context=None, # Missing required param + ) + + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRequiresRetrievalContextMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig( + skip_on_missing_params=True, + ignore_errors=True, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + # skip_on_missing_params takes precedence: metric should be skipped entirely + # (not present in metrics_data), rather than showing as ignored error + assert len(tr.metrics_data) == 0, ( + "skip_on_missing_params should take precedence: metric should be " + "skipped (absent from metrics_data), not shown as ignored error" + ) + + +# ----------------------------------------------------------------------------- +# ErrorConfig: ignore_errors behavior +# ----------------------------------------------------------------------------- + + +def test_error_config_ignore_errors_raises_by_default(): + """By default, exceptions raised by metrics propagate (ignore_errors=False).""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + ) + + with pytest.raises(RuntimeError, match="always raises"): + evaluate( + test_cases=[test_case], + metrics=[DeterministicRaisingMetric()], + hyperparameters={"model": "offline-stub"}, + 
async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, print_results=False + ), + error_config=ErrorConfig(ignore_errors=False), + ) + + +def test_error_config_ignore_errors_captures_metric_exception(): + """When ignore_errors=True, metric exceptions are captured and surfaced in the result.""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + ) + + result = evaluate( + test_cases=[test_case], + metrics=[DeterministicRaisingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(ignore_errors=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + assert len(tr.metrics_data) == 1 + + md = tr.metrics_data[0] + # When error is ignored, metric should be marked as failed + assert md.success is False + # Error message should be captured + assert md.error is not None + assert "always raises" in md.error + + +def test_error_config_ignore_errors_does_not_affect_passing_metrics(): + """ignore_errors=True only affects metrics that raise; passing metrics remain unaffected.""" + test_case = LLMTestCase( + input="What is your name?", + actual_output="My name is DeepEval.", + ) + + result = evaluate( + test_cases=[test_case], + metrics=[ + DeterministicPassingMetric(), + DeterministicRaisingMetric(), + ], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + error_config=ErrorConfig(ignore_errors=True), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == 1 + + tr = result.test_results[0] + assert len(tr.metrics_data) == 2 + + # Find each metric's data by name + passing_md = next( + md for md in tr.metrics_data if md.name == "DeterministicPassingMetric" + ) + raising_md = next( + md for md in tr.metrics_data if md.name == "DeterministicRaisingMetric" + ) + + # Passing metric should succeed + assert passing_md.success is True + assert passing_md.error is None + + # Raising metric should fail with captured error + assert raising_md.success is False + assert raising_md.error is not None + + +# ----------------------------------------------------------------------------- +# AsyncConfig behavior +# ----------------------------------------------------------------------------- + + +def test_async_config_sync_and_async_produce_equivalent_results(): + """Sync and async evaluation produce equivalent results for deterministic metrics.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + # Run with run_async=False (sync) + result_sync = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Run with run_async=True (async) + result_async = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + 
async_config=AsyncConfig( + run_async=True, max_concurrent=1, throttle_value=0 + ), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + # Both should return valid results + assert isinstance(result_sync, EvaluationResult) + assert isinstance(result_async, EvaluationResult) + + # Same number of test results + assert len(result_sync.test_results) == len(result_async.test_results) + + # Each test result should have same success status + for tr_sync, tr_async in zip( + result_sync.test_results, result_async.test_results + ): + assert tr_sync.success == tr_async.success + assert len(tr_sync.metrics_data) == len(tr_async.metrics_data) + + # Metric names and success should match + for md_sync, md_async in zip( + tr_sync.metrics_data, tr_async.metrics_data + ): + assert md_sync.name == md_async.name + assert md_sync.success == md_async.success + + +def test_async_config_max_concurrent_must_be_positive(): + """max_concurrent must be >= 1.""" + with pytest.raises(ValueError, match="max_concurrent"): + AsyncConfig(max_concurrent=0) + + +def test_async_config_throttle_value_must_be_non_negative(): + """throttle_value must be >= 0.""" + with pytest.raises(ValueError, match="throttle_value"): + AsyncConfig(throttle_value=-1) + + +# ----------------------------------------------------------------------------- +# CacheConfig behavior +# ----------------------------------------------------------------------------- + + +def test_cache_config_disabled_does_not_break_evaluate(): + """Evaluation succeeds when caching is fully disabled.""" + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig(show_indicator=False, print_results=False), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + for tr in result.test_results: + assert tr.success is True + assert len(tr.metrics_data) >= 1 + + +# ----------------------------------------------------------------------------- +# DisplayConfig behavior +# ----------------------------------------------------------------------------- + + +def test_display_config_all_does_not_break_evaluate(): + """display_option=ALL does not affect evaluation execution.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.ALL, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + +def test_display_config_failing_does_not_break_evaluate(): + """display_option=FAILING does not affect evaluation execution.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + 
test_cases=test_cases, + metrics=[DeterministicFailingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.FAILING, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All results should be failures (we used DeterministicFailingMetric) + for tr in result.test_results: + assert tr.success is False + + +def test_display_config_passing_does_not_break_evaluate(): + """display_option=PASSING does not affect evaluation execution.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + result = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.PASSING, + ), + ) + + assert isinstance(result, EvaluationResult) + assert len(result.test_results) == len(test_cases) + + # All results should be successes (we used DeterministicPassingMetric) + for tr in result.test_results: + assert tr.success is True + + +def test_display_config_does_not_affect_evaluation_results(): + """DisplayConfig options should not affect evaluation outcomes, only printing.""" + from deepeval.test_run.test_run import TestRunResultDisplay + + dataset = build_single_turn_dataset() + test_cases = build_llm_test_cases_from_goldens(dataset) + + # Run with display="all" + result_all = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.ALL, + ), + ) + + # Run with display="passing" + result_passing = evaluate( + test_cases=test_cases, + metrics=[DeterministicPassingMetric()], + hyperparameters={"model": "offline-stub"}, + async_config=AsyncConfig(run_async=False), + cache_config=CacheConfig(write_cache=False, use_cache=False), + display_config=DisplayConfig( + show_indicator=False, + print_results=False, + display_option=TestRunResultDisplay.PASSING, + ), + ) + + # Results should be identical regardless of display option + assert len(result_all.test_results) == len(result_passing.test_results) + + for tr_all, tr_pass in zip( + result_all.test_results, result_passing.test_results + ): + assert tr_all.success == tr_pass.success + assert len(tr_all.metrics_data) == len(tr_pass.metrics_data) diff --git a/tests/test_integrations/test_langgraph/apps/main.py b/tests/test_integrations/test_langgraph/apps/main.py index dfcda0ac52..23c3df4863 100644 --- a/tests/test_integrations/test_langgraph/apps/main.py +++ b/tests/test_integrations/test_langgraph/apps/main.py @@ -17,38 +17,56 @@ def separator(title: str): def main(): # 1. Simple App separator("1. 
SIMPLE APP - Single Tool (Weather)") - from tests.test_integrations.test_langgraph.apps.langgraph_simple_app import app as simple_app - + from tests.test_integrations.test_langgraph.apps.langgraph_simple_app import ( + app as simple_app, + ) + callback = CallbackHandler( name="demo-simple", tags=["demo", "simple"], metadata={"app": "simple"}, ) result = simple_app.invoke( - {"messages": [HumanMessage(content="What's the weather in San Francisco?")]}, + { + "messages": [ + HumanMessage(content="What's the weather in San Francisco?") + ] + }, config={"callbacks": [callback]}, ) print(f"Response: {result['messages'][-1].content}") # 2. Multiple Tools App - separator("2. MULTIPLE TOOLS APP - Weather, Population, Timezone, Calculator") - from tests.test_integrations.test_langgraph.apps.langgraph_multiple_tools_app import app as multiple_tools_app - + separator( + "2. MULTIPLE TOOLS APP - Weather, Population, Timezone, Calculator" + ) + from tests.test_integrations.test_langgraph.apps.langgraph_multiple_tools_app import ( + app as multiple_tools_app, + ) + callback = CallbackHandler( name="demo-multiple-tools", tags=["demo", "multiple-tools"], metadata={"app": "multiple_tools"}, ) result = multiple_tools_app.invoke( - {"messages": [HumanMessage(content="Tell me about Tokyo - weather, population, and timezone. Also calculate 15 * 23.")]}, + { + "messages": [ + HumanMessage( + content="Tell me about Tokyo - weather, population, and timezone. Also calculate 15 * 23." + ) + ] + }, config={"callbacks": [callback]}, ) print(f"Response: {result['messages'][-1].content}") # 3. Streaming App (Sync) separator("3. STREAMING APP - Stock Price Tools") - from tests.test_integrations.test_langgraph.apps.langgraph_streaming_app import sync_app as streaming_app - + from tests.test_integrations.test_langgraph.apps.langgraph_streaming_app import ( + sync_app as streaming_app, + ) + callback = CallbackHandler( name="demo-streaming", tags=["demo", "streaming"], @@ -60,7 +78,7 @@ def main(): config={"callbacks": [callback]}, ): print(f" Chunk: {list(chunk.keys())}") - + # Also get final result callback = CallbackHandler( name="demo-streaming-invoke", @@ -75,8 +93,10 @@ def main(): # 4. Conditional App separator("4. CONDITIONAL APP - Intent-Based Routing") - from tests.test_integrations.test_langgraph.apps.langgraph_conditional_app import app as conditional_app - + from tests.test_integrations.test_langgraph.apps.langgraph_conditional_app import ( + app as conditional_app, + ) + # Research route callback = CallbackHandler( name="demo-conditional-research", @@ -88,7 +108,7 @@ def main(): config={"callbacks": [callback]}, ) print(f"Research Response: {result['messages'][-1].content}") - + # Fact check route callback = CallbackHandler( name="demo-conditional-factcheck", @@ -103,23 +123,33 @@ def main(): # 5. Parallel Tools App separator("5. PARALLEL TOOLS APP - Multiple Parallel Tool Calls") - from tests.test_integrations.test_langgraph.apps.langgraph_parallel_tools_app import sync_app as parallel_app - + from tests.test_integrations.test_langgraph.apps.langgraph_parallel_tools_app import ( + sync_app as parallel_app, + ) + callback = CallbackHandler( name="demo-parallel", tags=["demo", "parallel"], metadata={"app": "parallel"}, ) result = parallel_app.invoke( - {"messages": [HumanMessage(content="Get weather for Tokyo, New York, and London.")]}, + { + "messages": [ + HumanMessage( + content="Get weather for Tokyo, New York, and London." 
+ ) + ] + }, config={"callbacks": [callback]}, ) print(f"Response: {result['messages'][-1].content}") # 6. Async App (run synchronously for demo) separator("6. ASYNC APP - Database Search & Translation") - from tests.test_integrations.test_langgraph.apps.langgraph_async_app import app as async_app - + from tests.test_integrations.test_langgraph.apps.langgraph_async_app import ( + app as async_app, + ) + async def run_async(): callback = CallbackHandler( name="demo-async", @@ -127,21 +157,27 @@ async def run_async(): metadata={"app": "async"}, ) result = await async_app.ainvoke( - {"messages": [HumanMessage(content="Search for information about Python")]}, + { + "messages": [ + HumanMessage(content="Search for information about Python") + ] + }, config={"callbacks": [callback]}, ) return result - + result = asyncio.run(run_async()) print(f"Response: {result['messages'][-1].content}") # 7. Multi-Turn App separator("7. MULTI-TURN APP - Shopping Cart with Memory") - from tests.test_integrations.test_langgraph.apps.langgraph_multi_turn_app import get_app_with_memory - + from tests.test_integrations.test_langgraph.apps.langgraph_multi_turn_app import ( + get_app_with_memory, + ) + app = get_app_with_memory() thread_id = "demo-session-001" - + # Turn 1 callback = CallbackHandler( name="demo-multi-turn-1", @@ -152,10 +188,13 @@ async def run_async(): ) result = app.invoke( {"messages": [HumanMessage(content="Add 2 apples to my cart")]}, - config={"callbacks": [callback], "configurable": {"thread_id": thread_id}}, + config={ + "callbacks": [callback], + "configurable": {"thread_id": thread_id}, + }, ) print(f"Turn 1: {result['messages'][-1].content}") - + # Turn 2 callback = CallbackHandler( name="demo-multi-turn-2", @@ -166,10 +205,13 @@ async def run_async(): ) result = app.invoke( {"messages": [HumanMessage(content="Also add 3 oranges")]}, - config={"callbacks": [callback], "configurable": {"thread_id": thread_id}}, + config={ + "callbacks": [callback], + "configurable": {"thread_id": thread_id}, + }, ) print(f"Turn 2: {result['messages'][-1].content}") - + # Turn 3 callback = CallbackHandler( name="demo-multi-turn-3", @@ -180,7 +222,10 @@ async def run_async(): ) result = app.invoke( {"messages": [HumanMessage(content="What's in my cart?")]}, - config={"callbacks": [callback], "configurable": {"thread_id": thread_id}}, + config={ + "callbacks": [callback], + "configurable": {"thread_id": thread_id}, + }, ) print(f"Turn 3: {result['messages'][-1].content}") @@ -189,4 +234,3 @@ async def run_async(): if __name__ == "__main__": main() - diff --git a/tests/test_integrations/test_langgraph/test_async.py b/tests/test_integrations/test_langgraph/test_async.py index d25b350e05..1820dad24f 100644 --- a/tests/test_integrations/test_langgraph/test_async.py +++ b/tests/test_integrations/test_langgraph/test_async.py @@ -35,7 +35,11 @@ # Set to True to generate schemas, False to assert against existing schemas # Can be overridden via environment variable: GENERATE_SCHEMAS=true pytest ... 
-GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ("true", "1", "yes") +GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ( + "true", + "1", + "yes", +) _current_dir = os.path.dirname(os.path.abspath(__file__)) _schemas_dir = os.path.join(_current_dir, "schemas") diff --git a/tests/test_integrations/test_langgraph/test_sync.py b/tests/test_integrations/test_langgraph/test_sync.py index 5cc64debaa..94fde20aa4 100644 --- a/tests/test_integrations/test_langgraph/test_sync.py +++ b/tests/test_integrations/test_langgraph/test_sync.py @@ -39,7 +39,11 @@ # Set to True to generate schemas, False to assert against existing schemas # Can be overridden via environment variable: GENERATE_SCHEMAS=true pytest ... -GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ("true", "1", "yes") +GENERATE_MODE = os.environ.get("GENERATE_SCHEMAS", "").lower() in ( + "true", + "1", + "yes", +) _current_dir = os.path.dirname(os.path.abspath(__file__)) _schemas_dir = os.path.join(_current_dir, "schemas")
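
Note on the DisplayConfig tests: they reference DeterministicPassingMetric and DeterministicFailingMetric, which this diff does not define. A minimal sketch of what those stubs might look like, assuming deepeval's documented BaseMetric interface for custom metrics; the real fixtures presumably live alongside the tests, so treat the names and bodies below as a reconstruction, not the actual code.

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class DeterministicPassingMetric(BaseMetric):
    """Hypothetical stub: always scores 1.0, no model or network calls."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = 1.0
        self.success = True
        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        # Deterministic, so the async path can simply reuse measure().
        return self.measure(test_case)

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "DeterministicPassing"


class DeterministicFailingMetric(DeterministicPassingMetric):
    """Hypothetical stub: always scores 0.0 so every test case fails."""

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = 0.0
        self.success = False
        return self.score

    @property
    def __name__(self):
        return "DeterministicFailing"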
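
Note on the multi-turn demo: reusing one thread_id across the three invoke calls only preserves the cart because the compiled graph carries a checkpointer. A minimal sketch of how get_app_with_memory could be wired, assuming LangGraph's MemorySaver and MessagesState; the real graph in langgraph_multi_turn_app.py is more elaborate, and the node below is purely illustrative.

from langchain_core.messages import AIMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import StateGraph, MessagesState, START, END


def respond(state: MessagesState):
    # Illustrative node: the message count grows across turns because the
    # checkpointer replays prior state for the same thread_id.
    return {"messages": [AIMessage(content=f"turn {len(state['messages'])}")]}


builder = StateGraph(MessagesState)
builder.add_node("respond", respond)
builder.add_edge(START, "respond")
builder.add_edge("respond", END)


def get_app_with_memory():
    # MemorySaver keys persisted state by config["configurable"]["thread_id"],
    # which is why the demo passes the same thread_id on every turn.
    return builder.compile(checkpointer=MemorySaver())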
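
Note on GENERATE_MODE: after this change the same truthiness tuple is duplicated verbatim in test_async.py and test_sync.py. A small shared helper (hypothetical; it could live in a common conftest or utils module for the langgraph tests) would keep the two files from drifting:

import os


def env_flag(name: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean flag."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("true", "1", "yes")


# Both test modules could then share one definition:
GENERATE_MODE = env_flag("GENERATE_SCHEMAS")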