style: fix pre-existing black formatting violations

faridun-ag2 · faridun-ag2 · commit bd521a97b836 · 2026-04-03T10:49:20.000-07:00
These 20 files were already failing `black --check` on the main branch
before this PR. They are reformatted here, with no functional changes, so that CI passes.
diff --git a/deepeval/metrics/arena_g_eval/template.py b/deepeval/metrics/arena_g_eval/template.py
@@ -46,8 +46,7 @@ def generate_arena_winner(
             "Be specific and grounded in the evaluation steps."
         )
 
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             You are a judge. Given the following evaluation steps, select the single contestant that best aligns with the evaluation steps.
 
             {ArenaGEvalTemplate.multimodal_rules if multimodal else ""}
@@ -88,16 +87,14 @@ def generate_arena_winner(
             }}
 
             JSON:
-        """
-        )
+        """)
 
     @staticmethod
     def rewrite_reason(
         reason: str,
         dummy_to_real_names: Dict[str, str],
     ):
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             Given the following reason that explains which contestant is the winner, rewrite the reason to REPLACE all contestant names with their real names.
 
             The contestant names are wrapped in $name$ format (e.g., $Alice$, $Bob$, $Charlie$).
@@ -129,5 +126,4 @@ def rewrite_reason(
             }}
 
             JSON:
-            """
-        )
+            """)
diff --git a/deepeval/metrics/argument_correctness/template.py b/deepeval/metrics/argument_correctness/template.py
@@ -19,8 +19,7 @@ def generate_verdicts(
 
         stringified_tools_called = repr(tools_called)
 
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             For the provided list of tool calls, determine whether each tool call input parameter is relevantly and correctly addresses the input.
 
             Please generate a list of JSON with two keys: `verdict` and `reason`.
@@ -99,8 +98,7 @@ def generate_verdicts(
             {stringified_tools_called}
 
             JSON:
-            """
-        )
+            """)
 
     @staticmethod
     def generate_reason(
diff --git a/deepeval/metrics/base_metric.py b/deepeval/metrics/base_metric.py
@@ -37,6 +37,7 @@ class BaseMetric:
     def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
         from deepeval.tracing.internal import observe_methods
+
         observe_methods(cls)
 
     @abstractmethod
@@ -85,6 +86,7 @@ class BaseConversationalMetric:
     def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
         from deepeval.tracing.internal import observe_methods
+
         observe_methods(cls)
 
     @abstractmethod
@@ -131,6 +133,7 @@ class BaseArenaMetric:
     def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
         from deepeval.tracing.internal import observe_methods
+
         observe_methods(cls)
 
     @abstractmethod
diff --git a/deepeval/metrics/contextual_relevancy/template.py b/deepeval/metrics/contextual_relevancy/template.py
@@ -55,12 +55,10 @@ def generate_verdicts(
         # Conditional instructions based on mode
         extraction_instructions = ""
         if multimodal:
-            extraction_instructions = textwrap.dedent(
-                """
+            extraction_instructions = textwrap.dedent("""
                 If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
                 If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
-                """
-            ).strip()
+                """).strip()
         else:
             extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement."
 
diff --git a/deepeval/metrics/conversational_dag/templates.py b/deepeval/metrics/conversational_dag/templates.py
@@ -73,8 +73,7 @@ def generate_task_output(instructions: str, text: str):
 class ConversationalBinaryJudgementTemplate:
     @staticmethod
     def generate_binary_verdict(criteria: str, text: str):
-        return dedent(
-            f"""{criteria}
+        return dedent(f"""{criteria}
 
                 Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.
 
@@ -95,17 +94,15 @@ def generate_binary_verdict(criteria: str, text: str):
                 }}
                 **
                 JSON:
-            """
-        )
+            """)
 
 
 class ConversationalNonBinaryJudgementTemplate:
     @staticmethod
     def generate_non_binary_verdict(
         criteria: str, text: str, options: List[str]
     ):
-        return dedent(
-            f"""{criteria}
+        return dedent(f"""{criteria}
 
                 You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.
 
@@ -128,5 +125,4 @@ def generate_non_binary_verdict(
                 }}
                 **
                 JSON:
-            """
-        )
+            """)
diff --git a/deepeval/metrics/faithfulness/template.py b/deepeval/metrics/faithfulness/template.py
@@ -93,8 +93,7 @@ def generate_verdicts(
     ):
         example_section = ""
         if multimodal:
-            example_section = textwrap.dedent(
-                """
+            example_section = textwrap.dedent("""
                 Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
                 Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]
 
@@ -123,11 +122,9 @@ def generate_verdicts(
                     ]  
                 }}
                 ===== END OF EXAMPLE ======
-                """
-            )
+                """)
 
-        format_instruction = textwrap.dedent(
-            """
+        format_instruction = textwrap.dedent("""
             Expected JSON format:
             {{
                 "verdicts": [
@@ -144,31 +141,26 @@ def generate_verdicts(
                     }}
                 ]  
             }}
-            """
-        )
+            """)
 
         guidelines = ""
         if multimodal:
-            guidelines = textwrap.dedent(
-                """
+            guidelines = textwrap.dedent("""
                 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
                 You DON'T have to provide a reason if the answer is 'yes'.
                 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
                 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
                 Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
                 If there are clear contradictions or any data or images that's not mentioned in the retrieval context, just provide 'no'.
-                """
-            )
+                """)
         else:
-            guidelines = textwrap.dedent(
-                """
+            guidelines = textwrap.dedent("""
                 Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
                 No 'reason' needed for 'yes' verdicts.
                 Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
                 Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
                 Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
-                """
-            )
+                """)
 
         return textwrap.dedent(
             f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
diff --git a/deepeval/metrics/multimodal_metrics/image_coherence/template.py b/deepeval/metrics/multimodal_metrics/image_coherence/template.py
@@ -5,8 +5,7 @@ class ImageCoherenceTemplate:
 
     @staticmethod
     def evaluate_image_coherence(context_above, context_below):
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             # Task Description
             You are a multi-modal document evaluation assistant. You will receive an image and its textual context. 
             Your task is to evaluate the coherence between the image and the text (context above and below) it accompanies.
@@ -40,5 +39,4 @@ def evaluate_image_coherence(context_above, context_below):
             
             # Image
             [Insert Image Here]
-            """
-        )
+            """)
diff --git a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
@@ -280,15 +280,13 @@ def is_successful(self) -> bool:
     def _generate_reason(
         self,
     ) -> str:
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)} 
             and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the 
             overall effectiveness and quality of the AI-generated image(s).
             Reason for Semantic Consistency score: {self.SC_reasoning}
             Reason for Perceptual Quality score: {self.PQ_reasoning}
-        """
-        )
+        """)
 
     @property
     def __name__(self):
diff --git a/deepeval/metrics/multimodal_metrics/image_editing/template.py b/deepeval/metrics/multimodal_metrics/image_editing/template.py
@@ -3,8 +3,7 @@
 
 class ImageEditingTemplate:
 
-    context = textwrap.dedent(
-        """
+    context = textwrap.dedent("""
         You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
         All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.
                               
@@ -13,13 +12,11 @@ class ImageEditingTemplate:
             "score" : [...],
             "reasoning" : "..."
         }
-    """
-    )
+    """)
 
     @staticmethod
     def generate_semantic_consistency_evaluation_results(text_prompt: str):
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             {ImageEditingTemplate.context}
 
             RULES:
@@ -33,13 +30,11 @@ def generate_semantic_consistency_evaluation_results(text_prompt: str):
             Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
 
             Editing instruction: {text_prompt}
-        """
-        )
+        """)
 
     @staticmethod
     def generate_perceptual_quality_evaluation_results():
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             {ImageEditingTemplate.context}
 
             RULES:
@@ -59,5 +54,4 @@ def generate_perceptual_quality_evaluation_results():
                 10 indicates the image has no artifacts.
             )
             Put the score in a list such that output score = [naturalness, artifacts]
-        """
-        )
+        """)
diff --git a/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py b/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py
@@ -5,8 +5,7 @@ class ImageHelpfulnessTemplate:
 
     @staticmethod
     def evaluate_image_helpfulness(context_above, context_below):
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             # Task Description
             You are a multi-modal document evaluation assistant. You will receive an image and its textual context.
             Your task is to evaluate the helpfulness of the image in enabling human readers to comprehend the text (context above and below) it accompanies.
@@ -40,5 +39,4 @@ def evaluate_image_helpfulness(context_above, context_below):
 
             # Image
             [Insert Image Here]
-            """
-        )
+            """)
diff --git a/deepeval/metrics/multimodal_metrics/image_reference/template.py b/deepeval/metrics/multimodal_metrics/image_reference/template.py
@@ -5,8 +5,7 @@ class ImageReferenceTemplate:
 
     @staticmethod
     def evaluate_image_reference(context_above, context_below):
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             # Task Description
             You are a multi-modal document quality assessment assistant. You will receive an image and its accompanying textual context.
             Your task is to determine whether the image is explicitly referenced or explained within the surrounding text (both above and below the image).
@@ -39,5 +38,4 @@ def evaluate_image_reference(context_above, context_below):
 
             # Image
             [Insert Image Here]
-            """
-        )
+            """)
diff --git a/deepeval/metrics/multimodal_metrics/text_to_image/template.py b/deepeval/metrics/multimodal_metrics/text_to_image/template.py
@@ -3,8 +3,7 @@
 
 class TextToImageTemplate:
 
-    context = textwrap.dedent(
-        """
+    context = textwrap.dedent("""
         You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
         All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.
                               
@@ -13,13 +12,11 @@ class TextToImageTemplate:
             "score" : [...],
             "reasoning" : "..."
         }
-    """
-    )
+    """)
 
     @staticmethod
     def generate_semantic_consistency_evaluation_results(text_prompt: str):
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             {TextToImageTemplate.context}
 
             RULES:
@@ -34,13 +31,11 @@ def generate_semantic_consistency_evaluation_results(text_prompt: str):
             Put the score in a list such that output score = [score].
 
             Text Prompt: {text_prompt}
-        """
-        )
+        """)
 
     @staticmethod
     def generate_perceptual_quality_evaluation_results():
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             {TextToImageTemplate.context}
 
             RULES:
@@ -60,5 +55,4 @@ def generate_perceptual_quality_evaluation_results():
                 10 indicates the image has no artifacts.
             )
             Put the score in a list such that output score = [naturalness, artifacts]
-        """
-        )
+        """)
diff --git a/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py b/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py
@@ -282,15 +282,13 @@ def is_successful(self) -> bool:
         return self.success
 
     def _generate_reason(self) -> str:
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)} 
             and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the 
             overall effectiveness and quality of the AI-generated image(s).
             Reason for Semantic Consistency score: {self.SC_reasoning}
             Reason for Perceptual Quality score: {self.PQ_reasoning}
-        """
-        )
+        """)
 
     @property
     def __name__(self):
diff --git a/deepeval/metrics/topic_adherence/template.py b/deepeval/metrics/topic_adherence/template.py
@@ -79,8 +79,7 @@ def get_qa_pair_verdict(
         question: str,
         response: str,
     ) -> str:
-        return textwrap.dedent(
-            f"""You are given:
+        return textwrap.dedent(f"""You are given:
                 - A list of **relevant topics**
                 - A **user question**
                 - An **assistant response**
@@ -138,8 +137,7 @@ def get_qa_pair_verdict(
                 Response: {response}
 
                 JSON:
-            """
-        )
+            """)
 
     @staticmethod
     def generate_reason(success, score, threshold, TP, TN, FP, FN) -> str:
diff --git a/deepeval/metrics/turn_contextual_relevancy/template.py b/deepeval/metrics/turn_contextual_relevancy/template.py
diff --git a/deepeval/metrics/turn_faithfulness/template.py b/deepeval/metrics/turn_faithfulness/template.py
diff --git a/deepeval/scorer/scorer.py b/deepeval/scorer/scorer.py
diff --git a/deepeval/simulator/template.py b/deepeval/simulator/template.py
diff --git a/deepeval/synthesizer/templates/template.py b/deepeval/synthesizer/templates/template.py
diff --git a/deepeval/synthesizer/templates/template_prompt.py b/deepeval/synthesizer/templates/template_prompt.py