diff --git a/slime/rollout/rm_hub/deepscaler.py b/slime/rollout/rm_hub/deepscaler.py
index 39d4de383a..ebe73e8220 100644
--- a/slime/rollout/rm_hub/deepscaler.py
+++ b/slime/rollout/rm_hub/deepscaler.py
@@ -5,7 +5,7 @@ def get_deepscaler_rule_based_reward(response, label):
     if "</think>" in response:
         model_solution = response.split("</think>")[-1]
     elif "###Response" in response:
-        model_solution = response.split("###Response")[1]
+        model_solution = response.split("###Response")[-1]
     else:
         return 0
 
diff --git a/tests/test_rm_deepscaler.py b/tests/test_rm_deepscaler.py
index 0883a60622..a925ff105f 100644
--- a/tests/test_rm_deepscaler.py
+++ b/tests/test_rm_deepscaler.py
@@ -35,6 +35,30 @@ def test_response_split_on_response_marker_grades_tail():
     assert get_deepscaler_rule_based_reward(response, "42") == 1
 
 
+@pytest.mark.unit
+def test_response_split_on_response_marker_grades_last_segment():
+    """When ``###Response`` appears more than once, the *final* segment is the
+    answer — same rule the ``</think>`` branch already follows with ``[-1]``.
+    A model that echoes the ``###Response`` marker inside its scratch work
+    (then emits the real answer after the last one) must still be graded on
+    that last segment. The earlier ``[1]`` indexing graded the *second*
+    segment instead, silently scoring a correct answer as 0."""
+    response = r"reasoning \boxed{1}###Response\boxed{2}###Response\boxed{42}"
+    assert get_deepscaler_rule_based_reward(response, "42") == 1
+    # The wrong intermediate answer in the second segment must not be graded.
+    assert get_deepscaler_rule_based_reward(response, "2") == 0
+
+
+@pytest.mark.unit
+def test_think_and_response_markers_agree_on_last_segment():
+    """The two marker branches must pick the same (final) segment so the
+    reward does not depend on which separator the chat template emits."""
+    think = r"junk \boxed{0}</think>mid \boxed{1}</think>\boxed{42}"
+    resp = r"junk \boxed{0}###Response mid \boxed{1}###Response\boxed{42}"
+    assert get_deepscaler_rule_based_reward(think, "42") == 1
+    assert get_deepscaler_rule_based_reward(resp, "42") == 1
+
+
 @pytest.mark.unit
 def test_response_without_any_marker_returns_zero():
     """No ``</think>`` AND no ``###Response`` → fall through to 0