Patch summary: wire SWE-bench harness grading into the coding_agent_rl example.

  docker/Dockerfile
    Appends an "Agentic rollout dependencies" section that pip-installs
    uni-agent from GitHub at the ref named by the UNIAGENT_COMMIT build arg
    (default "main").

  examples/coding_agent_rl/generate.py
    _metadata() adds a "swebench_metadata" entry read via m.get(...), and the
    call inside generate() that already passes diff_text / swepro / eval_cmd
    now also passes swebench_metadata=md.get("swebench_metadata").

  examples/coding_agent_rl/sandbox.py
    evaluate() gains an optional swebench_metadata parameter (default None).
    The "nothing to grade" early return (reward 0.0) now triggers only when
    swepro, eval_cmd and swebench_metadata are all falsy. Dispatch order is
    swepro, then swebench_metadata, then eval_cmd.
    New helper _run_swebench_eval() builds a script with
    uni_agent.reward.swe_bench.make_eval_script(metadata, workdir), writes it
    to /tmp/_swebench_eval.sh, runs it with bash as user "root" with stderr
    merged into stdout (2>&1), strips ANSI colour sequences and carriage
    returns from the captured output, and passes the result to
    parse_eval_output(metadata, output). It returns (1.0, solved) when the
    first value from parse_eval_output is truthy, otherwise (0.0, solved).

  NOTE(review): UNIAGENT_COMMIT defaults to a branch name, so image builds
  presumably track whatever "main" points at when the layer is built; confirm
  whether a pinned commit is wanted.
  NOTE(review): this copy of the patch had lost its line breaks and leading
  whitespace. Indentation of context lines and the wrapping of the evaluate()
  docstring below are reconstructed; verify against the target files before
  applying.

diff --git a/docker/Dockerfile b/docker/Dockerfile
index af1f1a3d94..bb4c168bd6 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -125,3 +125,9 @@ RUN git clone https://github.com/THUDM/slime.git /root/slime && \
 
 RUN cd /root/slime/slime/backends/megatron_utils/kernels/int4_qat && \
     pip install . --no-build-isolation
+
+# ====================================== Agentic rollout dependencies ================================
+
+# uni-agent: reward specs (swe_bench.py) used by coding_agent_rl
+ARG UNIAGENT_COMMIT=main
+RUN pip install "git+https://github.com/verl-project/uni-agent.git@${UNIAGENT_COMMIT}"
diff --git a/examples/coding_agent_rl/generate.py b/examples/coding_agent_rl/generate.py
index 64fb0d6660..9c052b44da 100644
--- a/examples/coding_agent_rl/generate.py
+++ b/examples/coding_agent_rl/generate.py
@@ -246,6 +246,7 @@ async def generate(args, sample: Sample, sampling_params: dict[str, Any]):
             diff_text=diff_text,
             swepro=md["swepro"],
             eval_cmd=md["eval_cmd"],
+            swebench_metadata=md.get("swebench_metadata"),
             pre_commands=md["pre_commands"],
             timeout_sec=SWE_EVAL_TIMEOUT_SEC,
         )
@@ -329,6 +330,7 @@ def _metadata(sample: Sample) -> dict[str, Any]:
         "problem_statement": m.get("problem_statement") or _coerce_prompt(sample.prompt),
         "swepro": m.get("swepro"),
         "eval_cmd": m.get("eval_cmd") or _wrap_f2p_script(rem.get("f2p_script")),
+        "swebench_metadata": m.get("swebench_metadata"),
         "pre_commands": m.get("pre_commands") or rem.get("pre_commands"),
     }
 
diff --git a/examples/coding_agent_rl/sandbox.py b/examples/coding_agent_rl/sandbox.py
index 4cff30f260..ee95f64efe 100644
--- a/examples/coding_agent_rl/sandbox.py
+++ b/examples/coding_agent_rl/sandbox.py
@@ -301,6 +301,7 @@ async def evaluate(
     diff_text: str,
     swepro: dict | None = None,
     eval_cmd: str | None = None,
+    swebench_metadata: dict | None = None,
     pre_commands: list[str] | str | None = None,
     timeout_sec: int = 600,
 ) -> tuple[float, bool, bool]:
@@ -308,8 +309,8 @@ async def evaluate(
 
     No-test-cheating guarantee: the eval sandbox is built from the same image but starts
     CLEAN, so only the model-produced diff affects reward."""
-    if not (swepro or eval_cmd):
-        logger.warning("[e2b.evaluate] no swepro/eval_cmd; reward=0")
+    if not (swepro or eval_cmd or swebench_metadata):
+        logger.warning("[e2b.evaluate] no swepro/eval_cmd/swebench_metadata; reward=0")
         return 0.0, False, True
 
     async with E2BSandbox(image) as ev:
@@ -327,6 +328,9 @@ async def evaluate(
         if swepro:
             r, s = await _run_swepro(ev, workdir, swepro, timeout_sec)
             return r, s, True
+        if swebench_metadata:
+            r, s = await _run_swebench_eval(ev, workdir, swebench_metadata, timeout_sec)
+            return r, s, True
         r, s = await _run_eval_cmd(ev, workdir, eval_cmd, timeout_sec)
         return r, s, True
 
@@ -395,6 +399,19 @@ async def _run_swepro(ev: Sandbox, workdir: str, swepro: dict, timeout: int) ->
     return (1.0 if solved else 0.0), solved
 
 
+async def _run_swebench_eval(ev: Sandbox, workdir: str, metadata: dict, timeout: int) -> tuple[float, bool]:
+    """SWE-bench harness grading — delegates to uni_agent.reward.swe_bench."""
+    import re as _re
+    from uni_agent.reward.swe_bench import make_eval_script, parse_eval_output
+
+    eval_script = make_eval_script(metadata, workdir)
+    await ev.write_file("/tmp/_swebench_eval.sh", eval_script, user="root")
+    _, stdout, _ = await ev.exec("bash /tmp/_swebench_eval.sh 2>&1", user="root", check=False, timeout=timeout)
+    output = _re.sub(r"\x1b\[[0-9;]*m|\r", "", stdout or "")  # drop ANSI colour codes and CRs before parsing
+    solved, _ = parse_eval_output(metadata, output)
+    return (1.0 if solved else 0.0), solved
+
+
 async def _run_eval_cmd(ev: Sandbox, workdir: str, cmd: str, timeout: int) -> tuple[float, bool]:
     ec, _, _ = await ev.exec(f"cd {workdir} && {cmd}", user="agent", check=False, timeout=timeout)
     return (1.0 if ec == 0 else 0.0), ec == 0