Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 59 additions & 72 deletions uni_agent/reward/swe_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,63 @@
from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
from swebench.harness.test_spec.python import get_test_directives

from __future__ import annotations

from typing import TYPE_CHECKING

from uni_agent.async_logging import get_logger
from uni_agent.interaction import AgentEnv
from uni_agent.reward.base import AbstractRewardSpec

if TYPE_CHECKING:
from uni_agent.interaction import AgentEnv
from uni_agent.reward.registry import register_reward_spec
from uni_agent.utils import auto_await


def make_eval_script(metadata: dict, workdir: str, env_name: str = "testbed") -> str:
    """Assemble the bash evaluation script for one SWE-bench instance.

    The ``repo``/``version`` pair in ``metadata`` selects an entry of
    ``MAP_REPO_VERSION_TO_SPECS``. The command list produced by
    ``_make_eval_script_list`` is prefixed with a shebang line and
    ``set -uxo pipefail``, then joined with newlines.

    Args:
        metadata: Instance record; ``repo``, ``version`` and ``test_patch``
            are required keys, ``base_commit`` falls back to ``""``.
        workdir: Passed through as the repository directory.
        env_name: Passed through as the environment name.

    Returns:
        The complete, newline-terminated script text. The caller writes it
        to a file and runs it; the captured output is meant for
        :func:`parse_eval_output`.
    """
    repo = metadata["repo"]
    version = metadata["version"]
    script_body = _make_eval_script_list(
        instance=metadata,
        specs=MAP_REPO_VERSION_TO_SPECS[repo][version],
        env_name=env_name,
        repo_directory=workdir,
        base_commit=metadata.get("base_commit", ""),
        test_patch=metadata["test_patch"],
    )
    preamble = ["#!/bin/bash", "set -uxo pipefail"]
    return "\n".join(preamble + script_body) + "\n"


def parse_eval_output(metadata: dict, eval_output: str) -> tuple[bool, dict]:
"""Parse raw eval output and return (solved, report).

Pure function — no env/sandbox dependency. Reusable by any caller that
ran the script from :func:`make_eval_script`.
"""
repo = metadata["repo"]
report = {"resolved": False, "found_eval_status": False, "test_status": None}

if START_TEST_OUTPUT not in eval_output or END_TEST_OUTPUT not in eval_output:
return False, report

test_content = eval_output.split(START_TEST_OUTPUT)[1].split(END_TEST_OUTPUT)[0]
status_map = MAP_REPO_TO_PARSER[repo](test_content, None)
report["found_eval_status"] = True

eval_ref = {
"instance_id": metadata["instance_id"],
"FAIL_TO_PASS": json.loads(metadata.get("FAIL_TO_PASS", "[]")),
"PASS_TO_PASS": json.loads(metadata.get("PASS_TO_PASS", "[]")),
}
Comment on lines +57 to +68

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To make this public helper more robust for external callers (e.g., when integrated with Hugging Face datasets or when the evaluation fails/times out), we should:

  1. Defensively handle cases where eval_output is None or empty to avoid TypeError during substring checks.
  2. Handle cases where FAIL_TO_PASS and PASS_TO_PASS are already parsed as lists/dicts rather than JSON-serialized strings.
Suggested change
if START_TEST_OUTPUT not in eval_output or END_TEST_OUTPUT not in eval_output:
return False, report
test_content = eval_output.split(START_TEST_OUTPUT)[1].split(END_TEST_OUTPUT)[0]
status_map = MAP_REPO_TO_PARSER[repo](test_content, None)
report["found_eval_status"] = True
eval_ref = {
"instance_id": metadata["instance_id"],
"FAIL_TO_PASS": json.loads(metadata.get("FAIL_TO_PASS", "[]")),
"PASS_TO_PASS": json.loads(metadata.get("PASS_TO_PASS", "[]")),
}
if not eval_output or START_TEST_OUTPUT not in eval_output or END_TEST_OUTPUT not in eval_output:
return False, report
test_content = eval_output.split(START_TEST_OUTPUT)[1].split(END_TEST_OUTPUT)[0]
status_map = MAP_REPO_TO_PARSER[repo](test_content, None)
report["found_eval_status"] = True
fail_to_pass = metadata.get("FAIL_TO_PASS", "[]")
pass_to_pass = metadata.get("PASS_TO_PASS", "[]")
eval_ref = {
"instance_id": metadata["instance_id"],
"FAIL_TO_PASS": json.loads(fail_to_pass) if isinstance(fail_to_pass, str) else fail_to_pass,
"PASS_TO_PASS": json.loads(pass_to_pass) if isinstance(pass_to_pass, str) else pass_to_pass,
}

eval_type = EvalType.FAIL_ONLY if repo in FAIL_ONLY_REPOS else EvalType.PASS_AND_FAIL
eval_tests_report = get_eval_tests_report(status_map, eval_ref, eval_type=eval_type)
report["test_status"] = eval_tests_report
report["resolved"] = get_resolution_status(eval_tests_report) == ResolvedStatus.FULL.value
return report["resolved"], report


# fix: https://github.com/SWE-bench/SWE-bench/issues/518
def _make_eval_script_list(instance, specs, env_name, repo_directory, base_commit, test_patch):
"""
Expand Down Expand Up @@ -91,52 +141,23 @@ async def compute_reward(self, **kwargs) -> tuple[dict | None, bool]:
"resolved": False,
}

# 1. eval script
instance = self.metadata
repo = instance["repo"]
version = instance.get("version")
specs = MAP_REPO_VERSION_TO_SPECS[repo][version]
env_name = "testbed"
repo_directory = f"/{env_name}"
base_commit = instance["base_commit"]
test_patch = instance["test_patch"]
eval_script_list = _make_eval_script_list(
instance=instance,
specs=specs,
env_name=env_name,
repo_directory=repo_directory,
base_commit=base_commit,
test_patch=test_patch,
)
eval_script = "\n".join(["#!/bin/bash", "set -uxo pipefail"] + eval_script_list) + "\n"

try:
# write eval script to container
eval_script = make_eval_script(self.metadata, workdir="/testbed")
eval_script_container = Path(f"/tmp/eval_script_{uuid.uuid4()}.sh")
await self.env.write_file(eval_script_container, eval_script)

execution_t0 = time.perf_counter()

cmd_str = f"bash {eval_script_container}"
output = await self.env.communicate(cmd_str, timeout=self.eval_timeout, check="ignore")

# cmd_str = f"bash {eval_script_container} 2>&1"
# from swerex.runtime.abstract import Command
# r = await self.env.deployment.runtime.execute(
# Command(command=["bash", "-c", cmd_str], timeout=self.eval_timeout)
# )
# output = r.stdout
output = await self.env.communicate(
f"bash {eval_script_container}", timeout=self.eval_timeout, check="ignore")
execution_time = time.perf_counter() - execution_t0
result["eval_completed"] = True
result["eval_execution_time"] = execution_time

# Remove ANSI escape codes and \r
output = re.sub(r"\x1b\[[0-9;]*m|\r", "", output)

eval_report = self._get_eval_report(output)
solved, eval_report = parse_eval_output(self.metadata, output)
result["eval_report"] = eval_report
self.logger.info(f"Eval report: {eval_report}")
result["resolved"] = eval_report["resolved"]
result["resolved"] = solved
except Exception as e:
self.logger.error(f"Failed to evaluate: {e}")
return result["resolved"], result
Expand Down Expand Up @@ -180,41 +201,7 @@ async def _apply_patch(self, patch: str) -> None:
continue
raise RuntimeError("Failed to apply patch with any command") from last_error

def _get_logs_eval(self, eval_output: str):
    """Extract the per-test status map from raw eval output.

    The log parser registered for this instance's repo is looked up in
    ``MAP_REPO_TO_PARSER`` and applied to the text between the first
    ``START_TEST_OUTPUT`` marker and the following ``END_TEST_OUTPUT``
    marker.

    Returns:
        ``(status_map, found)``. ``found`` is ``False`` and ``status_map``
        is an empty dict when either marker is missing from the output.
    """
    # The parser lookup happens before the marker check, so an unknown repo
    # raises regardless of the output's content.
    parse_log = MAP_REPO_TO_PARSER[self.metadata["repo"]]
    if START_TEST_OUTPUT not in eval_output or END_TEST_OUTPUT not in eval_output:
        return {}, False
    after_start = eval_output.split(START_TEST_OUTPUT)[1]
    test_content = after_start.split(END_TEST_OUTPUT)[0]
    return parse_log(test_content, None), True

def _get_eval_report(self, eval_output: str):
    """Build the eval report for this instance from raw eval output.

    Delegates to module-level :func:`parse_eval_output` so the parsing
    logic lives in one place, and drops the boolean that function also
    returns.

    Returns:
        The report dict with keys ``resolved``, ``found_eval_status`` and
        ``test_status``.
    """
    _, report = parse_eval_output(self.metadata, eval_output)
    return report
Loading