openai · 14H034160212 · Feb 16, 2026 · Feb 17, 2026
@@ -13,6 +13,7 @@
 from evals.elsuite.make_me_say.defaults import DEFAULT_JUDGE_PREAMBLE_FN
 
 nltk.download("punkt")
+nltk.download("punkt_tab")
 nltk.download("wordnet")
 wn.ensure_loaded()  # needed to support threading
 

@@ -0,0 +1,9 @@
+logic_stress_v2:
+  id: logic_stress_v2.v0
+  description: "Logical reasoning stress test: Variant 2 (Essential rule removal). Evaluates if models incorrectly conclude 'True' when a necessary inferential link is missing."
+  metrics: [accuracy]
+
+logic_stress_v2.v0:
+  class: evals.elsuite.basic.match:Match
+  args:
+    samples_jsonl: logic_stress/logic_stress_v2.jsonl
@@ -0,0 +1,9 @@
+logic_stress_v3:
+  id: logic_stress_v3.v0
+  description: "Logical reasoning stress test: Variant 3 (Logic Inertia / Contradiction). Evaluates if models can detect explicit contradictions and halt deduction, or if they blindly follow rules (Inertia)."
+  metrics: [accuracy]
+
+logic_stress_v3.v0:
+  class: evals.elsuite.basic.match:Match
+  args:
+    samples_jsonl: logic_stress/logic_stress_v3.jsonl
@@ -97,10 +97,10 @@ def _convert_msgs_to_anthropic_format(msgs: list[Message]) -> list[MessageParam]
         """
         # enforce valid roles; convert to Anthropic message type
         anth_msgs = [
-            MessageParam(
-                role=oai_to_anthropic_role[msg.role],
-                content=[ContentBlock(text=msg.content, type="text")],
-            )
+            {
+                "role": oai_to_anthropic_role[msg.role],
+                "content": [{"text": msg.content, "type": "text"}],
+            }
             for msg in msgs
         ]
         # enforce alternating roles by merging consecutive messages with the same role

@@ -79,29 +79,21 @@ def test_message_format():
     anth_msgs = AnthropicSolver._convert_msgs_to_anthropic_format(msgs)
 
     expected = [
-        MessageParam(
-            role="user",
-            content=[
-                ContentBlock(text="What is 2 + 2?", type="text"),
-                ContentBlock(text="reason step by step", type="text"),
+        {
+            "role": "user",
+            "content": [
+                {"text": "What is 2 + 2?", "type": "text"},
+                {"text": "reason step by step", "type": "text"},
             ],
-        ),
-        MessageParam(
-            role="assistant",
-            content=[
-                ContentBlock(
-                    text="I don't need to reason for this, 2+2 is just 4", type="text"
-                ),
-            ],
-        ),
-        MessageParam(
-            role="user",
-            content=[
-                ContentBlock(
-                    text="now, given your reasoning, provide the answer", type="text"
-                ),
-            ],
-        ),
+        },
+        {
+            "role": "assistant",
+            "content": [{"text": "I don't need to reason for this, 2+2 is just 4", "type": "text"}],
+        },
+        {
+            "role": "user",
+            "content": [{"text": "now, given your reasoning, provide the answer", "type": "text"}],
+        },
     ]
 
     assert anth_msgs == expected, f"Expected {expected}, but got {anth_msgs}"