verl-project · aoshen02 · Jun 15, 2026 · Jun 15, 2026 · gemini-code-assist · Jun 15, 2026
diff --git a/uni_agent/reward/swe_bench.py b/uni_agent/reward/swe_bench.py
@@ -16,13 +16,63 @@
 from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
 from swebench.harness.test_spec.python import get_test_directives
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from uni_agent.async_logging import get_logger
-from uni_agent.interaction import AgentEnv
 from uni_agent.reward.base import AbstractRewardSpec
+
+if TYPE_CHECKING:
+    from uni_agent.interaction import AgentEnv
 from uni_agent.reward.registry import register_reward_spec
 from uni_agent.utils import auto_await
 
 
+def make_eval_script(metadata: dict, workdir: str, env_name: str = "testbed") -> str:
+    """Assemble the bash evaluation script for one SWE-bench instance.
+
+    The ``repo``/``version`` pair in ``metadata`` selects the entry of
+    ``MAP_REPO_VERSION_TO_SPECS`` that is handed to
+    :func:`_make_eval_script_list`; ``workdir`` is passed through as the
+    repository directory and ``env_name`` as the environment name.
+
+    Returns the complete script text: a ``#!/bin/bash`` shebang, a
+    ``set -uxo pipefail`` line, the generated commands, and a trailing
+    newline. The caller writes it to a file and executes it; the captured
+    output is meant to be fed to :func:`parse_eval_output`.
+    """
+    repo = metadata["repo"]
+    version = metadata["version"]
+    instance_specs = MAP_REPO_VERSION_TO_SPECS[repo][version]
+
+    preamble = ["#!/bin/bash", "set -uxo pipefail"]
+    commands = _make_eval_script_list(
+        instance=metadata,
+        specs=instance_specs,
+        env_name=env_name,
+        repo_directory=workdir,
+        base_commit=metadata.get("base_commit", ""),
+        test_patch=metadata["test_patch"],
+    )
+    return "\n".join(preamble + commands) + "\n"
+
+
+def parse_eval_output(metadata: dict, eval_output: str) -> tuple[bool, dict]:
+    """Parse raw eval output and return ``(solved, report)``.
+
+    Pure function — no env/sandbox dependency. Reusable by any caller that
+    ran the script from :func:`make_eval_script`.
+
+    Args:
+        metadata: Instance record. ``repo`` and ``instance_id`` are required;
+            ``FAIL_TO_PASS`` / ``PASS_TO_PASS`` may each be either a
+            JSON-encoded string or an already-decoded value, and default to
+            an empty list when absent.
+        eval_output: Captured output of the eval script. Empty/``None``
+            output, or output missing either test-output marker, is reported
+            as unresolved instead of raising.
+
+    Returns:
+        A tuple ``(solved, report)`` where ``report`` has the keys
+        ``resolved``, ``found_eval_status`` and ``test_status``, and
+        ``solved`` equals ``report["resolved"]``.
+    """
+    repo = metadata["repo"]
+    report = {"resolved": False, "found_eval_status": False, "test_status": None}
+
+    # Without both markers there is no test section to parse.
+    if not eval_output or START_TEST_OUTPUT not in eval_output or END_TEST_OUTPUT not in eval_output:
+        return False, report
+
+    # Only the text between the first start marker and the next end marker is parsed.
+    test_content = eval_output.split(START_TEST_OUTPUT)[1].split(END_TEST_OUTPUT)[0]
+    status_map = MAP_REPO_TO_PARSER[repo](test_content, None)
+    report["found_eval_status"] = True
+
+    # The test lists may arrive JSON-encoded or already decoded; only decode strings.
+    fail_to_pass = metadata.get("FAIL_TO_PASS", "[]")
+    pass_to_pass = metadata.get("PASS_TO_PASS", "[]")
+    eval_ref = {
+        "instance_id": metadata["instance_id"],
+        "FAIL_TO_PASS": json.loads(fail_to_pass) if isinstance(fail_to_pass, str) else fail_to_pass,
+        "PASS_TO_PASS": json.loads(pass_to_pass) if isinstance(pass_to_pass, str) else pass_to_pass,
+    }
+    eval_type = EvalType.FAIL_ONLY if repo in FAIL_ONLY_REPOS else EvalType.PASS_AND_FAIL
+    eval_tests_report = get_eval_tests_report(status_map, eval_ref, eval_type=eval_type)
+    report["test_status"] = eval_tests_report
+    report["resolved"] = get_resolution_status(eval_tests_report) == ResolvedStatus.FULL.value
+    return report["resolved"], report
+
+
 # fix: https://github.com/SWE-bench/SWE-bench/issues/518
 def _make_eval_script_list(instance, specs, env_name, repo_directory, base_commit, test_patch):
     """
@@ -91,52 +141,23 @@ async def compute_reward(self, **kwargs) -> tuple[dict | None, bool]:
             "resolved": False,
         }
 
-        # 1. eval script
-        instance = self.metadata
-        repo = instance["repo"]
-        version = instance.get("version")
-        specs = MAP_REPO_VERSION_TO_SPECS[repo][version]
-        env_name = "testbed"
-        repo_directory = f"/{env_name}"
-        base_commit = instance["base_commit"]
-        test_patch = instance["test_patch"]
-        eval_script_list = _make_eval_script_list(
-            instance=instance,
-            specs=specs,
-            env_name=env_name,
-            repo_directory=repo_directory,
-            base_commit=base_commit,
-            test_patch=test_patch,
-        )
-        eval_script = "\n".join(["#!/bin/bash", "set -uxo pipefail"] + eval_script_list) + "\n"
-
         try:
-            # write eval script to container
+            eval_script = make_eval_script(self.metadata, workdir="/testbed")
             eval_script_container = Path(f"/tmp/eval_script_{uuid.uuid4()}.sh")
             await self.env.write_file(eval_script_container, eval_script)
 
             execution_t0 = time.perf_counter()
-
-            cmd_str = f"bash {eval_script_container}"
-            output = await self.env.communicate(cmd_str, timeout=self.eval_timeout, check="ignore")
-
-            # cmd_str = f"bash {eval_script_container} 2>&1"
-            # from swerex.runtime.abstract import Command
-            # r = await self.env.deployment.runtime.execute(
-            #     Command(command=["bash", "-c", cmd_str], timeout=self.eval_timeout)
-            # )
-            # output = r.stdout
+            output = await self.env.communicate(
+                f"bash {eval_script_container}", timeout=self.eval_timeout, check="ignore")
             execution_time = time.perf_counter() - execution_t0
             result["eval_completed"] = True
             result["eval_execution_time"] = execution_time
 
-            # Remove ANSI escape codes and \r
             output = re.sub(r"\x1b\[[0-9;]*m|\r", "", output)
-
-            eval_report = self._get_eval_report(output)
+            solved, eval_report = parse_eval_output(self.metadata, output)
             result["eval_report"] = eval_report
             self.logger.info(f"Eval report: {eval_report}")
-            result["resolved"] = eval_report["resolved"]
+            result["resolved"] = solved
         except Exception as e:
             self.logger.error(f"Failed to evaluate: {e}")
         return result["resolved"], result
@@ -180,41 +201,7 @@ async def _apply_patch(self, patch: str) -> None:
                 continue
         raise RuntimeError("Failed to apply patch with any command") from last_error
 
-    def _get_logs_eval(self, eval_output: str):
-        instance = self.metadata
-        repo = instance["repo"]
-        log_parser = MAP_REPO_TO_PARSER[repo]
-        if START_TEST_OUTPUT in eval_output and END_TEST_OUTPUT in eval_output:
-            test_content = eval_output.split(START_TEST_OUTPUT)[1].split(END_TEST_OUTPUT)[0]
-            status_map = log_parser(test_content, None)
-            return status_map, True
-        else:
-            status_map = {}
-            return status_map, False
-
     def _get_eval_report(self, eval_output: str):
-        eval_report = {
-            "resolved": False,
-            "found_eval_status": False,
-            "test_status": None,
-        }
-
-        # step 1: get logs eval
-        status_map, found = self._get_logs_eval(eval_output)
-        eval_report["found_eval_status"] = found
-        if not found:
-            return eval_report
-
-        # step 2: get eval tests report
-        eval_ref = {
-            "instance_id": self.metadata["instance_id"],
-            "FAIL_TO_PASS": json.loads(self.metadata.get("FAIL_TO_PASS", "[]")),
-            "PASS_TO_PASS": json.loads(self.metadata.get("PASS_TO_PASS", "[]")),
-        }
-        repo = self.metadata["repo"]
-        eval_type = EvalType.FAIL_ONLY if repo in FAIL_ONLY_REPOS else EvalType.PASS_AND_FAIL
-        report = get_eval_tests_report(status_map, eval_ref, eval_type=eval_type)
-        eval_report["test_status"] = report
-        if get_resolution_status(report) == ResolvedStatus.FULL.value:
-            eval_report["resolved"] = True
-        return eval_report
+        """Return only the report dict for ``eval_output``.
+
+        Thin wrapper over the module-level :func:`parse_eval_output`, called
+        with ``self.metadata``; the ``solved`` flag it also returns is dropped.
+        """
+        parsed = parse_eval_output(self.metadata, eval_output)
+        return parsed[1]