Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions evals/elsuite/make_me_say/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from evals.elsuite.make_me_say.defaults import DEFAULT_JUDGE_PREAMBLE_FN

# Fetch the NLTK resources used by this module when it is imported:
# the "punkt" and "punkt_tab" tokenizer data and the "wordnet" corpus.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")
# Load WordNet eagerly here, before any worker threads touch it.
wn.ensure_loaded() # needed to support threading

Expand Down
400 changes: 400 additions & 0 deletions evals/registry/data/logic_stress/logic_stress_v2.jsonl

Large diffs are not rendered by default.

400 changes: 400 additions & 0 deletions evals/registry/data/logic_stress/logic_stress_v3.jsonl

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions evals/registry/evals/logic_stress_v2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Base eval name: points at the versioned spec below and lists the reported metrics.
logic_stress_v2:
  id: logic_stress_v2.v0
  description: "Logical reasoning stress test: Variant 2 (Essential rule removal). Evaluates if models incorrectly conclude 'True' when a necessary inferential link is missing."
  metrics: [accuracy]

# Versioned spec: the eval class to run and the samples file passed to it.
logic_stress_v2.v0:
  class: evals.elsuite.basic.match:Match
  args:
    samples_jsonl: logic_stress/logic_stress_v2.jsonl
9 changes: 9 additions & 0 deletions evals/registry/evals/logic_stress_v3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Base eval name: points at the versioned spec below and lists the reported metrics.
logic_stress_v3:
  id: logic_stress_v3.v0
  description: "Logical reasoning stress test: Variant 3 (Logic Inertia / Contradiction). Evaluates if models can detect explicit contradictions and halt deduction, or if they blindly follow rules (Inertia)."
  metrics: [accuracy]

# Versioned spec: the eval class to run and the samples file passed to it.
logic_stress_v3.v0:
  class: evals.elsuite.basic.match:Match
  args:
    samples_jsonl: logic_stress/logic_stress_v3.jsonl
8 changes: 4 additions & 4 deletions evals/solvers/providers/anthropic/anthropic_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ def _convert_msgs_to_anthropic_format(msgs: list[Message]) -> list[MessageParam]
"""
# enforce valid roles; convert to Anthropic message type
anth_msgs = [
MessageParam(
role=oai_to_anthropic_role[msg.role],
content=[ContentBlock(text=msg.content, type="text")],
)
{
"role": oai_to_anthropic_role[msg.role],
"content": [{"text": msg.content, "type": "text"}],
}
for msg in msgs
]
# enforce alternating roles by merging consecutive messages with the same role
Expand Down
36 changes: 14 additions & 22 deletions evals/solvers/providers/anthropic/anthropic_solver_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,29 +79,21 @@ def test_message_format():
anth_msgs = AnthropicSolver._convert_msgs_to_anthropic_format(msgs)

expected = [
MessageParam(
role="user",
content=[
ContentBlock(text="What is 2 + 2?", type="text"),
ContentBlock(text="reason step by step", type="text"),
{
"role": "user",
"content": [
{"text": "What is 2 + 2?", "type": "text"},
{"text": "reason step by step", "type": "text"},
],
),
MessageParam(
role="assistant",
content=[
ContentBlock(
text="I don't need to reason for this, 2+2 is just 4", type="text"
),
],
),
MessageParam(
role="user",
content=[
ContentBlock(
text="now, given your reasoning, provide the answer", type="text"
),
],
),
},
{
"role": "assistant",
"content": [{"text": "I don't need to reason for this, 2+2 is just 4", "type": "text"}],
},
{
"role": "user",
"content": [{"text": "now, given your reasoning, provide the answer", "type": "text"}],
},
]

assert anth_msgs == expected, f"Expected {expected}, but got {anth_msgs}"
Expand Down