From 45281ae881c224f96aeb7bc904cfa30d3cd064a6 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 1 Jun 2026 22:59:43 -0400
Subject: [PATCH 01/13] Use in-memory event history for condenser replay

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/condensation_sft.py | 57 ++++++++++++++++++++----
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/agents/openhands_sdk/condensation_sft.py b/agents/openhands_sdk/condensation_sft.py
index d85a9094..dc14d5e8 100644
--- a/agents/openhands_sdk/condensation_sft.py
+++ b/agents/openhands_sdk/condensation_sft.py
@@ -15,6 +15,8 @@
 from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.context.condenser.utils import get_total_token_count
 from openhands.sdk.context.view import View
+from openhands.sdk.event import LLMConvertibleEvent as SDKEvent
+from openhands.sdk.event import MessageEvent, SystemPromptEvent
 from openhands.sdk.event.condenser import Condensation
 from openhands.sdk.llm.llm_response import LLMResponse
 from openhands.sdk.tool import ToolDefinition
@@ -98,6 +100,21 @@ def format_messages(llm: LLM, messages: list[Message]) -> list[dict[str, Any]]:
     return normalize_message_content(llm.format_messages_for_llm(messages))
 
 
+class TrackingSDKEventBuilder(SDKEventBuilder):  # also records every appended event
+    def __init__(
+        self,
+        conversation: Conversation,
+        metadata: Any,
+        event_history: list[SDKEvent],  # list supplied by the caller, mutated in place
+    ) -> None:
+        super().__init__(conversation, metadata)
+        self.event_history = event_history  # stored by reference, not copied
+
+    def append(self, event: SDKEvent) -> None:
+        self.event_history.append(event)  # record first, then delegate to the base builder
+        super().append(event)
+
+
 def token_count(view: View, llm: LLM) -> int:
     return get_total_token_count(view.events, llm)
 
@@ -168,7 +185,7 @@ def make_trajectory_record_from_conversation(
 
 def condensation_prompt_record_if_needed(
     *,
-    conversation: Conversation,
+    events: list[SDKEvent],
     condenser: LLMSummarizingCondenser,
     agent_llm: LLM,
     condenser_llm: PromptCapturingLLM,
@@ -177,7 +194,7 @@ def condensation_prompt_record_if_needed(
     max_tokens: int,
     condensation_index: int,
 ) -> tuple[Condensation, dict[str, Any]] | None:
-    view = View.from_events(conversation.state.events)
+    view = View.from_events(events)
     prompt_token_count = token_count(view, condenser.llm)
     before_prompt_count = len(condenser_llm.captured_messages)
     condensation_result = condenser.condense(view, agent_llm=agent_llm)
@@ -213,7 +230,28 @@ def append_standardized_events_with_condensation(
     include_trajectories: bool,
 ) -> list[dict[str, Any]]:
     metadata = load_dataset_metadata(dataset_name, required=True)
-    builder = SDKEventBuilder(conversation, metadata)
+    event_history: list[SDKEvent] = [
+        SystemPromptEvent(
+            system_prompt=TextContent(text=conversation.agent.static_system_message),
+            tools=list(conversation.agent.tools_map.values()),
+        )
+    ]
+    builder = TrackingSDKEventBuilder(conversation, metadata, event_history)
+    first_event = trajectory.content[0]
+    if not isinstance(first_event, TextObservation) or first_event.source != "user":
+        raise ValueError(
+            "OpenHands SDK condensation conversion expects the first event to be a "
+            "user TextObservation"
+        )
+    builder.append(
+        MessageEvent(
+            source="user",
+            llm_message=Message(
+                role="user",
+                content=[TextContent(text=first_event.content)],
+            ),
+        )
+    )
     condenser_llm = PromptCapturingLLM(
         usage_id="openhands-sdk-condensation-sft-condenser",
         model=model,
@@ -231,18 +269,18 @@ def append_standardized_events_with_condensation(
     condensation_index = 1
     index = start_index
     batch_number = 0
-    last_safe_events = list(conversation.state.events)
+    last_safe_events = list(event_history)
 
     def update_last_safe_events() -> None:
         nonlocal last_safe_events
-        view = View.from_events(conversation.state.events)
+        view = View.from_events(event_history)
         if token_count(view, conversation.agent.llm) <= max_tokens:
-            last_safe_events = list(conversation.state.events)
+            last_safe_events = list(event_history)
 
     def emit_condensation_boundary_if_needed() -> None:
         nonlocal segment_index, condensation_index, last_safe_events
         result = condensation_prompt_record_if_needed(
-            conversation=conversation,
+            events=event_history,
             condenser=condenser,
             agent_llm=conversation.agent.llm,
             condenser_llm=condenser_llm,
@@ -266,8 +304,9 @@ def emit_condensation_boundary_if_needed() -> None:
             )
             segment_index += 1
         records.append(prompt_record)
+        event_history.append(condensation)
         conversation.state.events.append(condensation)
-        last_safe_events = list(conversation.state.events)
+        last_safe_events = list(event_history)
         condensation_index += 1
 
     while index < len(trajectory.content):
@@ -342,7 +381,7 @@ def process_row(
     with tempfile.TemporaryDirectory(prefix="openhands-sdk-condensation-sft-") as tmpdir:
         conversation = Conversation(agent=agent, workspace=tmpdir, visualizer=None)
         try:
-            conversation.send_message(first_event.content)
+            conversation._ensure_agent_ready()
             return append_standardized_events_with_condensation(
                 conversation=conversation,
                 trajectory=trajectory,

From 1c1b7d1b1eae712c2e2e5e2d633ee1e969c8a4c6 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 1 Jun 2026 23:21:38 -0400
Subject: [PATCH 02/13] Add concurrent condenser SFT generation

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/condensation_sft.py | 93 ++++++++++++++++++++----
 1 file changed, 78 insertions(+), 15 deletions(-)

diff --git a/agents/openhands_sdk/condensation_sft.py b/agents/openhands_sdk/condensation_sft.py
index dc14d5e8..80b44774 100644
--- a/agents/openhands_sdk/condensation_sft.py
+++ b/agents/openhands_sdk/condensation_sft.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
 import argparse
+import asyncio
 import json
 import os
 import sys
 import tempfile
-from collections.abc import Sequence
+from collections.abc import Iterator, Sequence
 from typing import Any
 
 os.environ.setdefault("OPENHANDS_SUPPRESS_BANNER", "1")
@@ -397,6 +398,60 @@ def process_row(
             conversation.close()
 
 
+def iter_input_chunks(chunk_size: int) -> Iterator[list[str]]:  # reads sys.stdin lazily
+    chunk: list[str] = []
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:  # blank lines are skipped and do not count toward the chunk
+            continue
+        chunk.append(line)
+        if len(chunk) >= chunk_size:
+            yield chunk
+            chunk = []  # rebind so the list already yielded is not mutated
+    if chunk:  # flush the final, possibly short, chunk
+        yield chunk
+
+
+async def process_line(
+    line: str,
+    *,
+    args: argparse.Namespace,
+    semaphore: asyncio.Semaphore,
+) -> list[dict[str, Any]]:
+    async with semaphore:
+        return await asyncio.to_thread(
+            process_row,
+            line,
+            max_tokens=args.max_tokens,
+            model=args.model,
+            include_trajectories=args.include_trajectories == "yes",
+            max_size=args.max_size,
+            keep_first=args.keep_first,
+        )
+
+
+async def process_stream(args: argparse.Namespace) -> None:
+    from tqdm import tqdm
+
+    semaphore = asyncio.Semaphore(args.concurrency)
+    progress = tqdm(
+        desc="condensation_sft",
+        unit="row",
+        dynamic_ncols=True,
+        disable=args.no_progress,
+    )
+    try:
+        for chunk in iter_input_chunks(args.chunk_size):
+            tasks = [process_line(line, args=args, semaphore=semaphore) for line in chunk]
+            chunk_records = await asyncio.gather(*tasks)
+            for records in chunk_records:
+                for record in records:
+                    print(json.dumps(record, ensure_ascii=False), flush=False)
+            progress.update(len(chunk))
+    finally:
+        progress.close()
+
+
 def main() -> None:
     parser = argparse.ArgumentParser(
         description=(
@@ -414,21 +469,29 @@ def main() -> None:
         default="yes",
         help="Whether to emit the original OpenHands SDK trajectory record before summaries.",
     )
+    parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=1,
+        help="Number of input trajectories to process concurrently.",
+    )
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=100,
+        help="Number of input rows to schedule per async batch.",
+    )
+    parser.add_argument(
+        "--no-progress",
+        action="store_true",
+        help="Disable tqdm progress output on stderr.",
+    )
     args = parser.parse_args()
-    for line in sys.stdin:
-        line = line.strip()
-        if not line:
-            continue
-        records = process_row(
-            line,
-            max_tokens=args.max_tokens,
-            model=args.model,
-            include_trajectories=args.include_trajectories == "yes",
-            max_size=args.max_size,
-            keep_first=args.keep_first,
-        )
-        for record in records:
-            print(json.dumps(record, ensure_ascii=False))
+    if args.concurrency < 1:
+        raise ValueError("--concurrency must be at least 1")
+    if args.chunk_size < 1:
+        raise ValueError("--chunk-size must be at least 1")
+    asyncio.run(process_stream(args))
 
 
 if __name__ == "__main__":

From 1b3d43f3a4a80052157d47e831a0aae017c9f2bd Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 1 Jun 2026 23:44:59 -0400
Subject: [PATCH 03/13] Stream concurrent condenser outputs as rows complete

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/condensation_sft.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/agents/openhands_sdk/condensation_sft.py b/agents/openhands_sdk/condensation_sft.py
index 80b44774..d1ab46ea 100644
--- a/agents/openhands_sdk/condensation_sft.py
+++ b/agents/openhands_sdk/condensation_sft.py
@@ -442,12 +442,15 @@ async def process_stream(args: argparse.Namespace) -> None:
     )
     try:
         for chunk in iter_input_chunks(args.chunk_size):
-            tasks = [process_line(line, args=args, semaphore=semaphore) for line in chunk]
-            chunk_records = await asyncio.gather(*tasks)
-            for records in chunk_records:
+            tasks = [
+                asyncio.create_task(process_line(line, args=args, semaphore=semaphore))
+                for line in chunk
+            ]
+            for task in asyncio.as_completed(tasks):
+                records = await task
                 for record in records:
-                    print(json.dumps(record, ensure_ascii=False), flush=False)
-            progress.update(len(chunk))
+                    print(json.dumps(record, ensure_ascii=False), flush=True)
+                progress.update(1)
     finally:
         progress.close()
 

From b483744ea542c49ed9092e273669ab5e520eb875 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 00:14:38 -0400
Subject: [PATCH 04/13] Continue condenser generation after row errors

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/condensation_sft.py | 45 +++++++++++++++++++-----
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/agents/openhands_sdk/condensation_sft.py b/agents/openhands_sdk/condensation_sft.py
index d1ab46ea..ad0cfc79 100644
--- a/agents/openhands_sdk/condensation_sft.py
+++ b/agents/openhands_sdk/condensation_sft.py
@@ -418,16 +418,38 @@ async def process_line(
     args: argparse.Namespace,
     semaphore: asyncio.Semaphore,
 ) -> list[dict[str, Any]]:
-    async with semaphore:
-        return await asyncio.to_thread(
-            process_row,
-            line,
-            max_tokens=args.max_tokens,
-            model=args.model,
-            include_trajectories=args.include_trajectories == "yes",
-            max_size=args.max_size,
-            keep_first=args.keep_first,
+    try:
+        async with semaphore:
+            return await asyncio.to_thread(
+                process_row,
+                line,
+                max_tokens=args.max_tokens,
+                model=args.model,
+                include_trajectories=args.include_trajectories == "yes",
+                max_size=args.max_size,
+                keep_first=args.keep_first,
+            )
+    except Exception as exc:
+        if not args.continue_on_error:
+            raise
+        row_id = None
+        try:
+            row_id = json.loads(line).get("id")
+        except Exception:
+            pass
+        print(
+            json.dumps(
+                {
+                    "id": row_id,
+                    "error_type": type(exc).__name__,
+                    "error": str(exc),
+                },
+                ensure_ascii=False,
+            ),
+            file=sys.stderr,
+            flush=True,
         )
+        return []
 
 
 async def process_stream(args: argparse.Namespace) -> None:
@@ -489,6 +511,11 @@ def main() -> None:
         action="store_true",
         help="Disable tqdm progress output on stderr.",
     )
+    parser.add_argument(
+        "--continue-on-error",
+        action="store_true",
+        help="Log per-row conversion errors to stderr and continue processing remaining rows.",
+    )
     args = parser.parse_args()
     if args.concurrency < 1:
         raise ValueError("--concurrency must be at least 1")

From ea2cbfbaed91e26a1ba02ef876a2676800874c38 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 01:07:12 -0400
Subject: [PATCH 05/13] Count condensed views when selecting safe trajectory
 snapshots

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/condensation_sft.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/agents/openhands_sdk/condensation_sft.py b/agents/openhands_sdk/condensation_sft.py
index ad0cfc79..54803406 100644
--- a/agents/openhands_sdk/condensation_sft.py
+++ b/agents/openhands_sdk/condensation_sft.py
@@ -120,6 +120,12 @@ def token_count(view: View, llm: LLM) -> int:
     return get_total_token_count(view.events, llm)
 
 
+def formatted_token_count(events: Sequence[SDKEvent], llm: LLM) -> int:
+    view = View.from_events(events)  # count the view's events, not the raw history
+    messages = SDKEvent.events_to_messages(view.events)  # SDKEvent aliases LLMConvertibleEvent
+    return llm.get_token_count(messages)
+
+
 def make_condensation_prompt_record(
     *,
     trajectory_id: str,
@@ -274,8 +280,7 @@ def append_standardized_events_with_condensation(
 
     def update_last_safe_events() -> None:
         nonlocal last_safe_events
-        view = View.from_events(event_history)
-        if token_count(view, conversation.agent.llm) <= max_tokens:
+        if formatted_token_count(event_history, conversation.agent.llm) <= max_tokens:
             last_safe_events = list(event_history)
 
     def emit_condensation_boundary_if_needed() -> None:

From 473a1cb549a653a3bf9f3c897a46ccc672f6b7d2 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 07:00:35 -0400
Subject: [PATCH 06/13] Truncate dataset tool observations for SDK SFT
 conversion

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/std_to_sft.py         |  8 +++++++-
 tests/test_openhands_sdk_sft_conversion.py | 12 ++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/agents/openhands_sdk/std_to_sft.py b/agents/openhands_sdk/std_to_sft.py
index 0438b3d9..5817476d 100644
--- a/agents/openhands_sdk/std_to_sft.py
+++ b/agents/openhands_sdk/std_to_sft.py
@@ -33,6 +33,7 @@
 from openhands.tools.file_editor import FileEditorTool
 from openhands.tools.task_tracker import TaskTrackerTool
 from openhands.tools.terminal import TerminalTool
+from openhands.tools.terminal.definition import MAX_CMD_OUTPUT_SIZE, maybe_truncate
 from pydantic import SecretStr
 
 from schema.action.api import ApiAction
@@ -86,7 +87,12 @@ class DatasetToolObservation(SDKObservation):
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        return [TextContent(text=self.output)]
+        output = maybe_truncate(
+            content=self.output,
+            truncate_after=MAX_CMD_OUTPUT_SIZE,
+            tool_prefix="dataset_tool",
+        )
+        return [TextContent(text=output)]
 
 
 class DatasetToolExecutor(ToolExecutor):
diff --git a/tests/test_openhands_sdk_sft_conversion.py b/tests/test_openhands_sdk_sft_conversion.py
index 6d497a2f..54611133 100644
--- a/tests/test_openhands_sdk_sft_conversion.py
+++ b/tests/test_openhands_sdk_sft_conversion.py
@@ -437,3 +437,15 @@ def test_openhands_sdk_converter_rejects_mysql_code_action():
 
     with pytest.raises(ValueError, match="mysql"):
         std_to_sft.map_code_action(action)
+
+
+def test_openhands_sdk_dataset_tool_observation_truncates_large_outputs():
+    from openhands.tools.terminal.definition import MAX_CMD_OUTPUT_SIZE
+
+    from agents.openhands_sdk.std_to_sft import DatasetToolObservation
+
+    observation = DatasetToolObservation(output="A" * (MAX_CMD_OUTPUT_SIZE + 1000))
+    [content] = observation.to_llm_content  # unpacking fails unless exactly one item
+
+    assert len(content.text) == MAX_CMD_OUTPUT_SIZE  # output is 1000 chars over the limit
+    assert "response clipped" in content.text  # presumably the marker maybe_truncate inserts

From df710f9db3749167d913972d5d63a3c556f3e3cb Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 07:35:54 -0400
Subject: [PATCH 07/13] Fix GAIR daVinci JSONL extraction

Co-authored-by: openhands <openhands@all-hands.dev>
---
 datasets/gair_davinci_dev/extract_raw.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/datasets/gair_davinci_dev/extract_raw.py b/datasets/gair_davinci_dev/extract_raw.py
index 3c60b89a..8f7dedbb 100644
--- a/datasets/gair_davinci_dev/extract_raw.py
+++ b/datasets/gair_davinci_dev/extract_raw.py
@@ -5,15 +5,15 @@
 from datasets import load_dataset
 
 DATASET_NAME = "GAIR/daVinci-Dev"
-CONFIG_NAME = "env_native"
+DATA_FILE = "hf://datasets/GAIR/daVinci-Dev/env-native.jsonl"
 SPLIT = "train"
 
 
 def main():
     token = os.getenv("HF_TOKEN") or None
     dataset = load_dataset(
-        DATASET_NAME,
-        CONFIG_NAME,
+        "json",
+        data_files=DATA_FILE,
         split=SPLIT,
         streaming=True,
         token=token,
@@ -27,7 +27,7 @@ def main():
         main()
     except Exception as exc:
         print(
-            f"Failed to stream {DATASET_NAME}/{CONFIG_NAME}. The dataset is gated; "
+            f"Failed to stream {DATA_FILE}. The dataset is gated; "
             "authenticate with Hugging Face and ensure access has been granted.",
             file=sys.stderr,
         )

From 2ec045a8bf4df3a51e115edcc58e1adc1a9419ae Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 10:48:56 -0400
Subject: [PATCH 08/13] Require dataset metadata files

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/check_api_docstrings.yml    |  14 +-
 agents/openhands_v0/api.py                    | 123 +++---
 agents/openhands_v0/convert_api_to_mcp.py     |  20 +-
 agents/sweagent/api.py                        | 121 +++---
 datasets/CharlieDreemur_OpenManus-RL/api.py   | 125 ------
 .../CharlieDreemur_OpenManus-RL/metadata.json | 273 +++++++++++++
 datasets/SALT-NLP_SWE-chat/api.py             |  50 ---
 datasets/SALT-NLP_SWE-chat/metadata.json      |  32 ++
 datasets/agenttuning_alfworld/api.py          | 184 ---------
 datasets/agenttuning_db/metadata.json         |   5 +
 datasets/agenttuning_kg/api.py                |  78 ----
 datasets/agenttuning_webshop/api.py           |  25 --
 datasets/allenai_Sera-4.6-Lite-T2/api.py      |  14 -
 .../allenai_Sera-4.6-Lite-T2/metadata.json    |   7 +
 datasets/android_in_the_wild/api.py           |  45 ---
 datasets/android_in_the_wild/metadata.json    |  37 ++
 datasets/androidcontrol/api.py                |  58 ---
 datasets/androidcontrol/metadata.json         |  91 +++++
 datasets/code_feedback/metadata.json          |   7 +
 datasets/codeactinstruct/api.py               | 119 ------
 datasets/codeactinstruct/metadata.json        | 249 ++++++++++++
 datasets/coderforge_preview/api.py            |  34 --
 datasets/coderforge_preview/metadata.json     |   7 +
 datasets/codescout/metadata.json              |   7 +
 datasets/cognitivekernel_pro_sft/api.py       |  51 ---
 .../cognitivekernel_pro_sft/metadata.json     | 123 ++++++
 datasets/dolci_instruct_sft_tool_use/api.py   | 121 ------
 .../dolci_instruct_sft_tool_use/metadata.json | 297 ++++++++++++++
 datasets/eto/metadata.json                    |   5 +
 datasets/gair_davinci_dev/api.py              |  26 --
 datasets/gair_davinci_dev/metadata.json       |   7 +
 datasets/go-browse-wa/api.py                  | 180 ---------
 datasets/go-browse-wa/metadata.json           |  96 +++++
 datasets/hybrid-gym/api.py                    |  32 --
 datasets/hybrid-gym/metadata.json             |  55 +++
 datasets/jupyter-agent-dataset/metadata.json  |   7 +
 .../metadata.json                             |   7 +
 datasets/llava_plus/api.py                    |  20 -
 datasets/llava_plus/metadata.json             |  44 +++
 datasets/logicstar_swe-star/api.py            |  34 --
 datasets/logicstar_swe-star/metadata.json     |  65 ++++
 datasets/mind2web/api.py                      |  44 ---
 datasets/mind2web/metadata.json               |  90 +++++
 datasets/mini-coder/metadata.json             |   7 +
 datasets/miroverse_v0_1/api.py                | 356 -----------------
 datasets/miroverse_v0_1/metadata.json         | 321 +++++++++++++++
 datasets/nebius_SWE-agent-trajectories/api.py | 121 ------
 .../api.py                                    |  49 ---
 .../metadata.json                             |  80 ++++
 .../nemotron_terminal_corpus/metadata.json    |   7 +
 datasets/nnetnav-live/api.py                  | 149 -------
 datasets/nnetnav-live/metadata.json           |  97 +++++
 datasets/nnetnav-wa/api.py                    | 149 -------
 datasets/nnetnav-wa/metadata.json             |  93 +++++
 .../api.py                                    |  34 --
 .../metadata.json                             |  57 +++
 datasets/omniact/metadata.json                |   7 +
 datasets/openhands/api.py                     | 366 ------------------
 datasets/openhands/metadata.json              | 207 ++++++++++
 datasets/openresearcher/api.py                |  20 -
 datasets/openresearcher/metadata.json         |  78 ++++
 datasets/openthoughts_tb_dev/metadata.json    |   7 +
 datasets/orca_agentinstruct/metadata.json     |   5 +
 datasets/scale_swe_distilled/api.py           |  38 --
 datasets/scale_swe_distilled/metadata.json    |  67 ++++
 datasets/screenagent/metadata.json            |   5 +
 .../api.py                                    |  23 --
 .../metadata.json                             |   7 +
 datasets/swe-play-trajectories/api.py         |  66 ----
 datasets/swe-play-trajectories/metadata.json  |  62 +++
 datasets/swe-smith/api.py                     |  31 --
 datasets/swe-smith/metadata.json              |   7 +
 datasets/synatra/api.py                       |  82 ----
 datasets/synatra/metadata.json                |  25 ++
 datasets/toolmind/api.py                      |  33 --
 datasets/toolmind/metadata.json               | 100 +++++
 datasets/toucan_1_5m/api.py                   | 165 --------
 datasets/toucan_1_5m/metadata.json            | 135 +++++++
 datasets/turkingbench/api.py                  |  56 ---
 datasets/turkingbench/metadata.json           |  44 +++
 datasets/webarena_successful/api.py           | 114 ------
 datasets/webarena_successful/metadata.json    |  25 ++
 datasets/weblinx/api.py                       |  67 ----
 datasets/weblinx/metadata.json                | 113 ++++++
 datasets/wonderbread/api.py                   |  47 ---
 datasets/wonderbread/metadata.json            |  71 ++++
 tests/test_dataset_structure.py               |  17 +
 tests/test_standardized_schemas.py            |  57 ++-
 88 files changed, 3308 insertions(+), 3388 deletions(-)
 delete mode 100644 datasets/CharlieDreemur_OpenManus-RL/api.py
 create mode 100644 datasets/CharlieDreemur_OpenManus-RL/metadata.json
 delete mode 100644 datasets/SALT-NLP_SWE-chat/api.py
 create mode 100644 datasets/SALT-NLP_SWE-chat/metadata.json
 delete mode 100644 datasets/agenttuning_alfworld/api.py
 create mode 100644 datasets/agenttuning_db/metadata.json
 delete mode 100644 datasets/agenttuning_kg/api.py
 delete mode 100644 datasets/agenttuning_webshop/api.py
 delete mode 100644 datasets/allenai_Sera-4.6-Lite-T2/api.py
 create mode 100644 datasets/allenai_Sera-4.6-Lite-T2/metadata.json
 delete mode 100644 datasets/android_in_the_wild/api.py
 create mode 100644 datasets/android_in_the_wild/metadata.json
 delete mode 100644 datasets/androidcontrol/api.py
 create mode 100644 datasets/androidcontrol/metadata.json
 create mode 100644 datasets/code_feedback/metadata.json
 delete mode 100644 datasets/codeactinstruct/api.py
 create mode 100644 datasets/codeactinstruct/metadata.json
 delete mode 100644 datasets/coderforge_preview/api.py
 create mode 100644 datasets/coderforge_preview/metadata.json
 create mode 100644 datasets/codescout/metadata.json
 delete mode 100644 datasets/cognitivekernel_pro_sft/api.py
 create mode 100644 datasets/cognitivekernel_pro_sft/metadata.json
 delete mode 100644 datasets/dolci_instruct_sft_tool_use/api.py
 create mode 100644 datasets/dolci_instruct_sft_tool_use/metadata.json
 create mode 100644 datasets/eto/metadata.json
 delete mode 100644 datasets/gair_davinci_dev/api.py
 create mode 100644 datasets/gair_davinci_dev/metadata.json
 delete mode 100644 datasets/go-browse-wa/api.py
 create mode 100644 datasets/go-browse-wa/metadata.json
 delete mode 100644 datasets/hybrid-gym/api.py
 create mode 100644 datasets/hybrid-gym/metadata.json
 create mode 100644 datasets/jupyter-agent-dataset/metadata.json
 create mode 100644 datasets/kwai-klear_swe-smith-mini_swe_agent_plus-trajectories-66k/metadata.json
 delete mode 100644 datasets/llava_plus/api.py
 create mode 100644 datasets/llava_plus/metadata.json
 delete mode 100644 datasets/logicstar_swe-star/api.py
 create mode 100644 datasets/logicstar_swe-star/metadata.json
 delete mode 100644 datasets/mind2web/api.py
 create mode 100644 datasets/mind2web/metadata.json
 create mode 100644 datasets/mini-coder/metadata.json
 delete mode 100644 datasets/miroverse_v0_1/api.py
 create mode 100644 datasets/miroverse_v0_1/metadata.json
 delete mode 100644 datasets/nebius_SWE-agent-trajectories/api.py
 delete mode 100644 datasets/nebius_SWE-rebench-openhands-trajectories/api.py
 create mode 100644 datasets/nebius_SWE-rebench-openhands-trajectories/metadata.json
 create mode 100644 datasets/nemotron_terminal_corpus/metadata.json
 delete mode 100644 datasets/nnetnav-live/api.py
 create mode 100644 datasets/nnetnav-live/metadata.json
 delete mode 100644 datasets/nnetnav-wa/api.py
 create mode 100644 datasets/nnetnav-wa/metadata.json
 delete mode 100644 datasets/nvidia_SWE-Zero-openhands-trajectories/api.py
 create mode 100644 datasets/nvidia_SWE-Zero-openhands-trajectories/metadata.json
 create mode 100644 datasets/omniact/metadata.json
 delete mode 100644 datasets/openhands/api.py
 create mode 100644 datasets/openhands/metadata.json
 delete mode 100644 datasets/openresearcher/api.py
 create mode 100644 datasets/openresearcher/metadata.json
 create mode 100644 datasets/openthoughts_tb_dev/metadata.json
 create mode 100644 datasets/orca_agentinstruct/metadata.json
 delete mode 100644 datasets/scale_swe_distilled/api.py
 create mode 100644 datasets/scale_swe_distilled/metadata.json
 create mode 100644 datasets/screenagent/metadata.json
 delete mode 100644 datasets/swe-gym_openhands_sampled_trajectories/api.py
 create mode 100644 datasets/swe-gym_openhands_sampled_trajectories/metadata.json
 delete mode 100644 datasets/swe-play-trajectories/api.py
 create mode 100644 datasets/swe-play-trajectories/metadata.json
 delete mode 100644 datasets/swe-smith/api.py
 create mode 100644 datasets/swe-smith/metadata.json
 delete mode 100644 datasets/synatra/api.py
 create mode 100644 datasets/synatra/metadata.json
 delete mode 100644 datasets/toolmind/api.py
 create mode 100644 datasets/toolmind/metadata.json
 delete mode 100644 datasets/toucan_1_5m/api.py
 create mode 100644 datasets/toucan_1_5m/metadata.json
 delete mode 100644 datasets/turkingbench/api.py
 create mode 100644 datasets/turkingbench/metadata.json
 delete mode 100644 datasets/webarena_successful/api.py
 create mode 100644 datasets/webarena_successful/metadata.json
 delete mode 100644 datasets/weblinx/api.py
 create mode 100644 datasets/weblinx/metadata.json
 delete mode 100644 datasets/wonderbread/api.py
 create mode 100644 datasets/wonderbread/metadata.json

diff --git a/.github/workflows/check_api_docstrings.yml b/.github/workflows/check_api_docstrings.yml
index e58ca872..89747588 100644
--- a/.github/workflows/check_api_docstrings.yml
+++ b/.github/workflows/check_api_docstrings.yml
@@ -1,4 +1,4 @@
-name: Check Docstrings
+name: Check Dataset Metadata
 
 on:
   push:
@@ -9,7 +9,7 @@ on:
       - main
 
 jobs:
-  check_docstrings:
+  check_dataset_metadata:
     runs-on: ubuntu-latest
 
     steps:
@@ -21,10 +21,12 @@ jobs:
       with:
         python-version: '3.12'
 
-    - name: Install ruff
+    - name: Install dependencies
       run: |
-        python -m pip install ruff
+        python -m pip install --upgrade pip
+        pip install pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
 
-    - name: Check for docstrings
+    - name: Check dataset metadata
       run: |
-        ruff check datasets/*/api.py --select D  --ignore D100,D203,D213
+        pytest tests/test_dataset_structure.py
diff --git a/agents/openhands_v0/api.py b/agents/openhands_v0/api.py
index 59087174..d58bc06c 100644
--- a/agents/openhands_v0/api.py
+++ b/agents/openhands_v0/api.py
@@ -1,6 +1,4 @@
-import importlib.util
-import inspect
-import os
+from schema.dataset_metadata import custom_tool_map, load_dataset_metadata
 
 openhands_v0_default_tools = {
     "execute_bash": {"required": ["command"], "optional": ["is_input"]},
@@ -77,6 +75,20 @@ def check_exclude_tools(name: str, required: list, optional: list, exclude_apis:
     return True
 
 
+def _schema_signature(tool) -> tuple[str, list[str], list[str]]:
+    parameters = tool.function.parameters or {}  # falsy parameters -> treated as empty
+    properties = parameters.get("properties", {}) or {}
+    required = list(parameters.get("required", []) or [])
+    optional = [name for name in properties if name not in required]  # keeps properties order
+    args = [*required, *(f"{name}=None" for name in optional)]  # optional -> name=None
+    return f"({', '.join(args)})", required, optional  # (signature text, required, optional)
+
+
+def _tool_docstring(tool) -> str:
+    description = tool.function.description or ""  # falsy description -> ""
+    return "\n" + description  # result always starts with a newline
+
+
 def get_api_tool_description(
     dataset, exclude_apis=None, env="execute_ipython_cell", include_apis=None
 ):
@@ -91,71 +103,48 @@ def get_api_tool_description(
     else:
         include_api_names = None
 
-    api_file_path = os.path.expanduser(f"datasets/{dataset}/api.py")
-    API_TOOL_DESCRIPTION = ""
-    if os.path.exists(api_file_path):
-        spec = importlib.util.spec_from_file_location("api", api_file_path)
-        api_module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(api_module)
-        functions = inspect.getmembers(api_module, inspect.isfunction)
-        if include_api_names is not None:
-            api_names = {name for name, _ in functions}
-            missing_api_names = sorted(include_api_names - api_names)
-            if missing_api_names:
-                raise ValueError(
-                    f"available_apis contains functions not found in {api_file_path}: "
-                    f"{missing_api_names}"
-                )
-        sigs = {}
-        for name, func in functions:
-            if include_api_names is not None and name not in include_api_names:
-                continue
-            docstring = "\n" + (inspect.getdoc(func) or "")
-            sig = inspect.signature(func)
-            required = []
-            optional = []
-            for arg_name, param in sig.parameters.items():
-                if param.default is inspect.Parameter.empty:
-                    if arg_name == "xpath" or arg_name == "element_id":
-                        arg_name = "bid"
-                    if arg_name not in required:
-                        required.append(arg_name)
-                else:
-                    optional.append(arg_name)
-            if name in openhands_v0_default_tools and check_exclude_openhands_v0_default_tools(
-                name, sig, required, optional
-            ):
-                # print(f"excluded {name}", file=sys.stderr)
-                continue
-            if name in exclude_apis and check_exclude_tools(name, required, optional, exclude_apis):
-                # print(f"excluded {name}", file=sys.stderr)
-                continue
-            docstring = f"{name}{sig}" + docstring.replace("\n", "\n    ") + "\n\n"
-            API_TOOL_DESCRIPTION += docstring
-            sigs[name] = {"required": required, "optional": optional}
-        if not API_TOOL_DESCRIPTION:
-            return "", {}
-        if exclude_apis:
-            also = "also "
-        else:
-            also = ""
-        prefixes = [
-            f"The following pre-defined functions are {also}available in {env}. ",
-            f"The environment {env} {also}provides the following pre-defined functions: ",
-            f"In {env}, you can {also}use the following pre-defined functions: ",
-            f"Available functions in {env}: ",
-            f"The following functions are {also}defined and ready for use in {env}: ",
-            f"Note that {env} {also}supports the following pre-defined functions: ",
-            f"Below is a list of functions you can {also}use in the {env} environment. ",
-            f"The toolkit for {env} {also}contains the following functions. ",
-        ]
-        API_TOOL_DESCRIPTION = prefixes[0] + "\n\n" + API_TOOL_DESCRIPTION
-        API_TOOL_DESCRIPTION = API_TOOL_DESCRIPTION.replace("xpath", "bid").replace(
-            "element_id", "bid"
-        )
-        return API_TOOL_DESCRIPTION, sigs
-    else:
+    metadata = load_dataset_metadata(dataset)
+    tools = custom_tool_map(metadata)
+    if include_api_names is not None:
+        missing_api_names = sorted(include_api_names - set(tools))
+        if missing_api_names:
+            raise ValueError(
+                f"available_apis contains functions not found in metadata.json for "
+                f"{dataset}: {missing_api_names}"
+            )
+
+    api_tool_description = ""
+    sigs = {}
+    for name, tool in sorted(tools.items()):
+        if include_api_names is not None and name not in include_api_names:
+            continue
+        sig, required, optional = _schema_signature(tool)
+        if name in openhands_v0_default_tools and check_exclude_openhands_v0_default_tools(
+            name, sig, required, optional
+        ):
+            continue
+        if name in exclude_apis and check_exclude_tools(name, required, optional, exclude_apis):
+            continue
+        docstring = f"{name}{sig}" + _tool_docstring(tool).replace("\n", "\n    ") + "\n\n"
+        api_tool_description += docstring
+        sigs[name] = {"required": required, "optional": optional}
+
+    if not api_tool_description:
         return "", {}
+    also = "also " if exclude_apis else ""
+    prefixes = [
+        f"The following pre-defined functions are {also}available in {env}. ",
+        f"The environment {env} {also}provides the following pre-defined functions: ",
+        f"In {env}, you can {also}use the following pre-defined functions: ",
+        f"Available functions in {env}: ",
+        f"The following functions are {also}defined and ready for use in {env}: ",
+        f"Note that {env} {also}supports the following pre-defined functions: ",
+        f"Below is a list of functions you can {also}use in the {env} environment. ",
+        f"The toolkit for {env} {also}contains the following functions. ",
+    ]
+    api_tool_description = prefixes[0] + "\n\n" + api_tool_description
+    api_tool_description = api_tool_description.replace("xpath", "bid").replace("element_id", "bid")
+    return api_tool_description, sigs
 
 
 def get_language_descriptions(languages):
diff --git a/agents/openhands_v0/convert_api_to_mcp.py b/agents/openhands_v0/convert_api_to_mcp.py
index 1b64984b..501145dd 100644
--- a/agents/openhands_v0/convert_api_to_mcp.py
+++ b/agents/openhands_v0/convert_api_to_mcp.py
@@ -1,6 +1,4 @@
-import importlib.util
 import inspect
-import os
 import textwrap
 from typing import (
     Any,
@@ -10,6 +8,8 @@
 
 from pydantic import TypeAdapter
 
+from schema.dataset_metadata import custom_tool_map, load_dataset_metadata
+
 
 def json_type_from_py(py_t: Any) -> dict:
     """Generate JSON schema from Python type using Pydantic's TypeAdapter."""
@@ -121,18 +121,10 @@ def tool_from_function(
 
 
 def get_api_tools(dataset) -> dict:
-    api_file_path = os.path.expanduser(f"datasets/{dataset}/api.py")
-    if os.path.exists(api_file_path):
-        api_tools = {}
-        spec = importlib.util.spec_from_file_location("api", api_file_path)
-        api_module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(api_module)
-        functions = inspect.getmembers(api_module, inspect.isfunction)
-        for name, func in functions:
-            api_tools[name] = tool_from_function(func)
-        return api_tools
-    else:
-        return {}
+    # Dump each entry of the dataset's custom tool map (assumed name -> pydantic model),
+    # leaving out fields whose value is None.
+    tools = custom_tool_map(load_dataset_metadata(dataset))
+    return {name: tool.model_dump(exclude_none=True) for name, tool in tools.items()}
 
 
 def language_tool_placeholder(code: str):
diff --git a/agents/sweagent/api.py b/agents/sweagent/api.py
index 9d31073d..92f22a2b 100644
--- a/agents/sweagent/api.py
+++ b/agents/sweagent/api.py
@@ -1,6 +1,4 @@
-import importlib.util
-import inspect
-import os
+from schema.dataset_metadata import custom_tool_map, load_dataset_metadata
 
 sweagent_default_tools = {
     "bash": {"required": ["command"], "optional": []},
@@ -52,6 +50,20 @@ def check_exclude_tools(name: str, required: list, optional: list, exclude_apis:
     return True
 
 
+def _schema_signature(tool) -> tuple[str, list[str], list[str]]:
+    """Return (signature text, required names, optional names) for a tool schema."""
+    schema = tool.function.parameters or {}
+    required = list(schema.get("required") or [])
+    optional = [key for key in (schema.get("properties") or {}) if key not in required]
+    rendered = ", ".join(required + [f"{key}=None" for key in optional])
+    return "(" + rendered + ")", required, optional
+
+
+def _tool_docstring(tool) -> str:
+    """Return the tool description behind a leading newline (empty text when unset)."""
+    return "\n" + (tool.function.description or "")
+
+
 def get_api_tool_description(dataset, exclude_apis=None, env="bash", include_apis=None):
     if exclude_apis is None:
         exclude_apis = {}
@@ -64,66 +76,45 @@ def get_api_tool_description(dataset, exclude_apis=None, env="bash", include_api
     else:
         include_api_names = None
 
-    api_file_path = os.path.expanduser(f"datasets/{dataset}/api.py")
-    API_TOOL_DESCRIPTION = ""
-    if os.path.exists(api_file_path):
-        spec = importlib.util.spec_from_file_location("api", api_file_path)
-        api_module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(api_module)
-        functions = inspect.getmembers(api_module, inspect.isfunction)
-        if include_api_names is not None:
-            api_names = {name for name, _ in functions}
-            missing_api_names = sorted(include_api_names - api_names)
-            if missing_api_names:
-                raise ValueError(
-                    f"available_apis contains functions not found in {api_file_path}: "
-                    f"{missing_api_names}"
-                )
-        sigs = {}
-        for name, func in functions:
-            if include_api_names is not None and name not in include_api_names:
-                continue
-            docstring = "\n" + (inspect.getdoc(func) or "")
-            sig = inspect.signature(func)
-            required = []
-            optional = []
-            for arg_name, param in sig.parameters.items():
-                if param.default is inspect.Parameter.empty:
-                    if arg_name not in required:
-                        required.append(arg_name)
-                else:
-                    optional.append(arg_name)
-            if name in sweagent_default_tools and check_exclude_sweagent_default_tools(
-                name, sig, required, optional
-            ):
-                # print(f"excluded {name}")
-                continue
-            if name in exclude_apis and check_exclude_tools(name, required, optional, exclude_apis):
-                # print(f"excluded {name}")
-                continue
-            docstring = f"{name}{sig}" + docstring.replace("\n", "\n    ") + "\n\n"
-            API_TOOL_DESCRIPTION += docstring
-            sigs[name] = {"required": required, "optional": optional}
-        if not API_TOOL_DESCRIPTION:
-            return "", {}
-        if exclude_apis:
-            also = "also "
-        else:
-            also = ""
-        prefixes = [
-            f"The following pre-defined functions are {also}available in {env}. ",
-            f"The environment {env} {also}provides the following pre-defined functions: ",
-            f"In {env}, you can {also}use the following pre-defined functions: ",
-            f"Available functions in {env}: ",
-            f"The following functions are {also}defined and ready for use in {env}: ",
-            f"Note that {env} {also}supports the following pre-defined functions: ",
-            f"Below is a list of functions you can {also}use in the {env} environment. ",
-            f"The toolkit for {env} {also}contains the following functions. ",
-        ]
-        API_TOOL_DESCRIPTION = prefixes[0] + "\n\n" + API_TOOL_DESCRIPTION
-        API_TOOL_DESCRIPTION = API_TOOL_DESCRIPTION.replace("xpath", "bid").replace(
-            "element_id", "bid"
-        )
-        return API_TOOL_DESCRIPTION, sigs
-    else:
+    metadata = load_dataset_metadata(dataset)
+    tools = custom_tool_map(metadata)
+    if include_api_names is not None:
+        missing_api_names = sorted(include_api_names - set(tools))
+        if missing_api_names:
+            raise ValueError(
+                f"available_apis contains functions not found in metadata.json for "
+                f"{dataset}: {missing_api_names}"
+            )
+
+    api_tool_description = ""
+    sigs = {}
+    for name, tool in sorted(tools.items()):
+        if include_api_names is not None and name not in include_api_names:
+            continue
+        sig, required, optional = _schema_signature(tool)
+        if name in sweagent_default_tools and check_exclude_sweagent_default_tools(
+            name, sig, required, optional
+        ):
+            continue
+        if name in exclude_apis and check_exclude_tools(name, required, optional, exclude_apis):
+            continue
+        docstring = f"{name}{sig}" + _tool_docstring(tool).replace("\n", "\n    ") + "\n\n"
+        api_tool_description += docstring
+        sigs[name] = {"required": required, "optional": optional}
+
+    if not api_tool_description:
         return "", {}
+    also = "also " if exclude_apis else ""
+    prefixes = [
+        f"The following pre-defined functions are {also}available in {env}. ",
+        f"The environment {env} {also}provides the following pre-defined functions: ",
+        f"In {env}, you can {also}use the following pre-defined functions: ",
+        f"Available functions in {env}: ",
+        f"The following functions are {also}defined and ready for use in {env}: ",
+        f"Note that {env} {also}supports the following pre-defined functions: ",
+        f"Below is a list of functions you can {also}use in the {env} environment. ",
+        f"The toolkit for {env} {also}contains the following functions. ",
+    ]
+    api_tool_description = prefixes[0] + "\n\n" + api_tool_description
+    api_tool_description = api_tool_description.replace("xpath", "bid").replace("element_id", "bid")
+    return api_tool_description, sigs
diff --git a/datasets/CharlieDreemur_OpenManus-RL/api.py b/datasets/CharlieDreemur_OpenManus-RL/api.py
deleted file mode 100644
index ab649c57..00000000
--- a/datasets/CharlieDreemur_OpenManus-RL/api.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from typing import Any
-
-
-def perform_action(action: str) -> dict:
-    """Execute a text action in an interactive environment.
-
-    Args:
-    ----
-        action: The environment action to perform, such as "go to desk 1".
-
-    """
-    pass
-
-
-def get_search_movie(movie_name: Any) -> dict:
-    """Search for a movie by name and return basic details."""
-    pass
-
-
-def get_movie_details(movie_id: Any) -> dict:
-    """Get detailed information about a movie by ID."""
-    pass
-
-
-def get_movie_production_companies(movie_id: Any) -> dict:
-    """Get the production companies of a movie by its ID."""
-    pass
-
-
-def get_movie_production_countries(movie_id: Any) -> dict:
-    """Get the production countries of a movie by its ID."""
-    pass
-
-
-def get_movie_cast(movie_id: Any) -> dict:
-    """Retrieve the top cast members from a movie by its ID."""
-    pass
-
-
-def get_movie_crew(movie_id: Any) -> dict:
-    """Retrieve crew members from a movie by its ID."""
-    pass
-
-
-def get_movie_keywords(movie_id: Any) -> dict:
-    """Get the keywords associated with a movie by ID."""
-    pass
-
-
-def get_search_person(person_name: Any) -> dict:
-    """Search for a person by name."""
-    pass
-
-
-def get_person_details(person_id: Any) -> dict:
-    """Get detailed information about a person by ID."""
-    pass
-
-
-def get_person_cast(person_id: Any) -> dict:
-    """Retrieve movie cast roles for a person by their ID."""
-    pass
-
-
-def get_person_crew(person_id: Any) -> dict:
-    """Retrieve movie crew roles for a person by their ID."""
-    pass
-
-
-def get_person_external_ids(person_id: Any) -> dict:
-    """Get the external IDs for a person by ID."""
-    pass
-
-
-def get_movie_alternative_titles(movie_id: Any) -> dict:
-    """Get alternative titles for a movie by ID."""
-    pass
-
-
-def get_movie_translation(movie_id: Any) -> dict:
-    """Get description translations for a movie by ID."""
-    pass
-
-
-def check_valid_actions() -> dict:
-    """Get supported actions for the current tool."""
-    pass
-
-
-def weather_get_120_hour_forecast_for_weather(
-    lat: Any,
-    lon: Any,
-    lang: Any = None,
-    hours: Any = None,
-    units: Any = None,
-) -> dict:
-    """Return a weather forecast for up to 120 hours.
-
-    Original tool name: weather.get_120_hour_forecast_for_weather.
-    """
-    pass
-
-
-def pharmacies_de_garde_nc_health_for_pharmacies_de_garde_nc() -> dict:
-    """Return the health status of the Pharmacies de garde NC application.
-
-    Original tool name: pharmacies_de_garde_nc.health_for_pharmacies_de_garde_nc.
-    """
-    pass
-
-
-def pharmacies_de_garde_nc_all_for_pharmacies_de_garde_nc() -> dict:
-    """Return pharmacies de garde in Nouvelle-Calédonie.
-
-    Original tool name: pharmacies_de_garde_nc.all_for_pharmacies_de_garde_nc.
-    """
-    pass
-
-
-def app_store_new_free_ios_apps_for_app_store() -> dict:
-    """Get a list of new free iOS apps.
-
-    Original tool name: app_store.new_free_ios_apps_for_app_store.
-    """
-    pass
diff --git a/datasets/CharlieDreemur_OpenManus-RL/metadata.json b/datasets/CharlieDreemur_OpenManus-RL/metadata.json
new file mode 100644
index 00000000..dab281a2
--- /dev/null
+++ b/datasets/CharlieDreemur_OpenManus-RL/metadata.json
@@ -0,0 +1,273 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_production_companies",
+        "description": "Get the production companies of a movie by its ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "movie_id": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "movie_id"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_search_movie",
+        "description": "Search for a movie by name and return basic details.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "movie_name": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "movie_name"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "perform_action",
+        "description": "Execute a text action in an interactive environment.\n\nArgs:\n----\n    action: The environment action to perform, such as \"go to desk 1\".",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "action": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "action"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "pharmacies_de_garde_nc_all_for_pharmacies_de_garde_nc",
+        "description": "Return pharmacies de garde in Nouvelle-Calédonie.\n\nOriginal tool name: pharmacies_de_garde_nc.all_for_pharmacies_de_garde_nc.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "pharmacies_de_garde_nc_health_for_pharmacies_de_garde_nc",
+        "description": "Return the health status of the Pharmacies de garde NC application.\n\nOriginal tool name: pharmacies_de_garde_nc.health_for_pharmacies_de_garde_nc.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "weather_get_120_hour_forecast_for_weather",
+        "description": "Return a weather forecast for up to 120 hours.\n\nOriginal tool name: weather.get_120_hour_forecast_for_weather.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "lat": {},
+            "lon": {},
+            "lang": {},
+            "hours": {},
+            "units": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "lat",
+            "lon"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "app_store_new_free_ios_apps_for_app_store",
+        "description": "Get a list of new free iOS apps.\n\nOriginal tool name: app_store.new_free_ios_apps_for_app_store.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "check_valid_actions",
+        "description": "Get supported actions for the current tool.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_alternative_titles",
+        "description": "Get alternative titles for a movie by ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_cast",
+        "description": "Retrieve the top cast members from a movie by its ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_crew",
+        "description": "Retrieve crew members from a movie by its ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_details",
+        "description": "Get detailed information about a movie by ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_keywords",
+        "description": "Get the keywords associated with a movie by ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_production_countries",
+        "description": "Get the production countries of a movie by its ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_movie_translation",
+        "description": "Get description translations for a movie by ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_person_cast",
+        "description": "Retrieve movie cast roles for a person by their ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_person_crew",
+        "description": "Retrieve movie crew roles for a person by their ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_person_details",
+        "description": "Get detailed information about a person by ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_person_external_ids",
+        "description": "Get the external IDs for a person by ID.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_search_person",
+        "description": "Search for a person by name.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/SALT-NLP_SWE-chat/api.py b/datasets/SALT-NLP_SWE-chat/api.py
deleted file mode 100644
index 47fcfff2..00000000
--- a/datasets/SALT-NLP_SWE-chat/api.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from typing import Any
-
-
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str | None = None,
-    old_str: str | None = None,
-    new_str: str | None = None,
-    insert_line: int | None = None,
-    view_range: list | None = None,
-) -> None:
-    """View, create, and edit files with a custom editing tool.
-
-    Args:
-    ----
-        command: One of `view`, `create`, `str_replace`, `insert`, or `undo_edit`.
-        path: Absolute path to the target file or directory.
-        file_text: Content for `create` commands.
-        old_str: Existing text for `str_replace` commands.
-        new_str: Replacement text or inserted text.
-        insert_line: Line after which to insert text.
-        view_range: Optional `[start_line, end_line]` range to view.
-
-    """
-    pass
-
-
-def think(thought: str) -> None:
-    """Record a private reasoning step.
-
-    Args:
-    ----
-        thought: The model's reasoning trace.
-
-    """
-    pass
-
-
-def generic_tool(tool_name: str, tool_input: dict[str, Any], content: str | None = None) -> None:
-    """Represent a source-specific coding-agent tool call.
-
-    Args:
-    ----
-        tool_name: Original SWE-chat tool name.
-        tool_input: Parsed tool input parameters.
-        content: Raw tool-call content when no structured input is available.
-
-    """
-    pass
diff --git a/datasets/SALT-NLP_SWE-chat/metadata.json b/datasets/SALT-NLP_SWE-chat/metadata.json
new file mode 100644
index 00000000..f724f548
--- /dev/null
+++ b/datasets/SALT-NLP_SWE-chat/metadata.json
@@ -0,0 +1,32 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "generic_tool",
+        "description": "Represent a source-specific coding-agent tool call.\n\nArgs:\n----\n    tool_name: Original SWE-chat tool name.\n    tool_input: Parsed tool input parameters.\n    content: Raw tool-call content when no structured input is available.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "tool_name": {
+              "type": "string"
+            },
+            "tool_input": {
+              "type": "object"
+            },
+            "content": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "tool_name",
+            "tool_input"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/agenttuning_alfworld/api.py b/datasets/agenttuning_alfworld/api.py
deleted file mode 100644
index 39c26889..00000000
--- a/datasets/agenttuning_alfworld/api.py
+++ /dev/null
@@ -1,184 +0,0 @@
-def go(location: str):
-    """Move to the specified location.
-
-    Args:
-        location (str): The target location to move to.
-
-    Example:
-        go("bed 1")
-
-    """
-    pass
-
-
-def take(item: str, source: str):
-    """Pick up an item from a specified source location.
-
-    Args:
-        item (str): The item to pick up.
-        source (str): The location from which to take the item.
-
-    Example:
-        take("laptop 1", "diningtable 1")
-
-    """
-    pass
-
-
-def put(item: str, target: str):
-    """Place an item onto or into a specified target.
-
-    Args:
-        item (str): The item to place.
-        target (str): The surface or container to place the item in/on.
-
-    Example:
-        put("laptop 1", "bed 1")
-
-    """
-    pass
-
-
-def open(obj: str):
-    """Open a specified container or object.
-
-    Args:
-        obj (str): The object to open (e.g., drawer, door).
-
-    Example:
-        open("drawer 1")
-
-    """
-    pass
-
-
-def heat(item: str, appliance: str):
-    """Heat an item using a specified appliance.
-
-    Args:
-        item (str): The item to be heated (e.g., "plate 1").
-        appliance (str): The appliance to use for heating (e.g., "microwave 1").
-
-    Example:
-        heat("plate 1", "microwave 1")
-
-    """
-    pass
-
-
-def examine(obj: str):
-    """Inspect or look closely at an object in the environment.
-
-    Args:
-        obj (str): The object to examine (e.g., "dresser 1").
-
-    Example:
-        examine("dresser 1")
-
-    """
-    pass
-
-
-def cool(item: str, appliance: str):
-    """Cool an item using a specified appliance.
-
-    Args:
-        item (str): The item to be cooled (e.g., "plate 1").
-        appliance (str): The appliance to use for cooling (e.g., "fridge 1").
-
-    Example:
-        cool("plate 1", "fridge 1")
-
-    """
-    pass
-
-
-def use(obj: str):
-    """Use or activate a specified object or appliance.
-
-    Args:
-        obj (str): The object to use (e.g., "desklamp 1").
-
-    Example:
-        use("desklamp 1")
-
-    """
-    pass
-
-
-def close(obj: str):
-    """Close a specified container or object.
-
-    Args:
-        obj (str): The object to close (e.g., "fridge 1").
-
-    Example:
-        close("fridge 1")
-
-    """
-    pass
-
-
-def clean(item: str, appliance: str):
-    """Clean an item using a specified appliance.
-
-    Args:
-        item (str): The item to be cleaned (e.g., "ladle 2").
-        appliance (str): The appliance used for cleaning (e.g., "sinkbasin 1").
-
-    Example:
-        clean("ladle 2", "sinkbasin 1")
-
-    """
-    pass
-
-
-def report_problem(obj: str):
-    """Report an issue with a specified object in the environment.
-
-    Args:
-        obj (str): The object with a problem (e.g., "toilet 1").
-
-    Example:
-        report_problem("toilet 1")
-
-    """
-    pass
-
-
-def inventory():
-    """Check currently held items.
-
-    This function retrieves and lists the objects currently in possession.
-
-    Example:
-        inventory()
-
-    """
-    pass
-
-
-def look():
-    """Survey the surroundings to get a description of the current environment.
-
-    This function allows the agent to observe visible objects and locations nearby.
-
-    Example:
-        look()
-
-    """
-    pass
-
-
-def look_at_under(item: str, reference: str):
-    """Look closely at an item that is located under a specified object.
-
-    Args:
-        item (str): The item to examine (e.g., "cellphone 1").
-        reference (str): The object under which the item is located (e.g., "desklamp 1").
-
-    Example:
-        look_at_under("cellphone 1", "desklamp 1")
-
-    """
-    pass
diff --git a/datasets/agenttuning_db/metadata.json b/datasets/agenttuning_db/metadata.json
new file mode 100644
index 00000000..dfa26e78
--- /dev/null
+++ b/datasets/agenttuning_db/metadata.json
@@ -0,0 +1,5 @@
+{
+  "custom_tools": [],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/agenttuning_kg/api.py b/datasets/agenttuning_kg/api.py
deleted file mode 100644
index 2a82ca87..00000000
--- a/datasets/agenttuning_kg/api.py
+++ /dev/null
@@ -1,78 +0,0 @@
-def get_relations(variable: str):
-    """Get all relations connected to an entity or variable in the knowledge base.
-
-    This function helps to explore the knowledge graph by retrieving all relations
-    (i.e., edges) that are associated with the given variable, which can be either
-    a concrete entity (e.g., "Barack Obama") or a variable placeholder (e.g., "#0").
-
-    Example: get_relations("Barack Obama")
-    """
-    pass
-
-
-def get_neighbors(variable: str, relation: str):
-    """Get all entities connected to a variable via a specific relation.
-
-    This function retrieves a new variable containing all entities that are
-    connected to the input variable by the given relation. This is typically
-    used after get_relations to determine which relation to follow.
-
-    Example: get_neighbors("Barack Obama", "people.person.profession")
-    """
-    pass
-
-
-def intersection(variable1: str, variable2: str):
-    """Compute the intersection of two variables.
-
-    This function returns a new variable that includes only the entities
-    shared between the two input variables. The input variables must be
-    of the same type.
-
-    Example: intersection("#1", "#2")
-    """
-    pass
-
-
-def get_attributes(variable: str):
-    """Get all numerical attributes of a variable.
-
-    This function helps to identify which attributes can be used in a
-    superlative query (e.g., max/min age). Only use this when a question
-    involves ranking or finding extremums.
-
-    Example: get_attributes("#3")
-    """
-    pass
-
-
-def argmax(variable: str, attribute: str):
-    """Return the entity with the maximum value of the given attribute.
-
-    Use this function to find the entity with the highest value for the
-    specified attribute within a variable. Requires attributes to be known.
-
-    Example: argmax("#2", "age")
-    """
-    pass
-
-
-def argmin(variable: str, attribute: str):
-    """Return the entity with the minimum value of the given attribute.
-
-    Use this function to find the entity with the lowest value for the
-    specified attribute within a variable. Requires attributes to be known.
-
-    Example: argmin("#2", "age")
-    """
-    pass
-
-
-def count(variable: str):
-    """Count the number of entities in a variable.
-
-    Returns the number of distinct entities represented by the variable.
-
-    Example: count("#4")
-    """
-    pass
diff --git a/datasets/agenttuning_webshop/api.py b/datasets/agenttuning_webshop/api.py
deleted file mode 100644
index b1aab5eb..00000000
--- a/datasets/agenttuning_webshop/api.py
+++ /dev/null
@@ -1,25 +0,0 @@
-def search(keywords: str):
-    """Perform a search on the web interface using the specified keywords.
-
-    Args:
-        keywords (str): The search query string.
-
-    Example:
-        search("3.25 ounce (pack of 3) protein serving jerky price < 50.00")
-
-    """
-    pass
-
-
-def click(element: str):
-    """Click an element on the webpage by its visible label or ID.
-
-    Args:
-        element (str): The label or identifier of the clickable item.
-
-    Example:
-        click("B0977H69D1")
-        click("Buy Now")
-
-    """
-    pass
diff --git a/datasets/allenai_Sera-4.6-Lite-T2/api.py b/datasets/allenai_Sera-4.6-Lite-T2/api.py
deleted file mode 100644
index d98981f0..00000000
--- a/datasets/allenai_Sera-4.6-Lite-T2/api.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from typing import List, Literal, Optional
-
-
-def str_replace_editor(
-    command: Literal["view", "create", "str_replace", "insert", "undo_edit"],
-    path: str,
-    file_text: Optional[str] = None,
-    old_str: Optional[str] = None,
-    new_str: Optional[str] = None,
-    insert_line: Optional[int] = None,
-    view_range: Optional[List[int]] = None,
-) -> None:
-    """View, create, and edit files with the OpenHands editor tool."""
-    pass
diff --git a/datasets/allenai_Sera-4.6-Lite-T2/metadata.json b/datasets/allenai_Sera-4.6-Lite-T2/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/allenai_Sera-4.6-Lite-T2/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/android_in_the_wild/api.py b/datasets/android_in_the_wild/api.py
deleted file mode 100644
index 9031f58b..00000000
--- a/datasets/android_in_the_wild/api.py
+++ /dev/null
@@ -1,45 +0,0 @@
-def touch_and_lift(x0: float, y0: float, x1: float, y1: float) -> None:
-    """Touch at the given x0, y0 coordinates and lift at x1, y1.
-
-    Args:
-    ----
-        x0 (float): The x coordinate to touch.
-        y0 (float): The y coordinate to touch.
-        x1 (float): The x coordinate to lift.
-        y1 (float): The y coordinate to lift.
-
-    """
-    pass
-
-
-def type(text: str):
-    """Type given text through keyboard.
-
-    Args:
-    ----
-        text (str): the text to input.
-
-    """
-    pass
-
-
-def press(key_name: str):
-    """Press a special key according the key name.
-
-    Args:
-    ----
-        key_name (str): go_back | go_home | enter, the key to press
-
-    """
-    pass
-
-
-def end(succeeds: bool):
-    """Claim the end of the task with whether it is successfully completed.
-
-    Args:
-    ----
-        succeeds (bool): if the task is successful
-
-    """
-    pass
diff --git a/datasets/android_in_the_wild/metadata.json b/datasets/android_in_the_wild/metadata.json
new file mode 100644
index 00000000..1cbbdf2b
--- /dev/null
+++ b/datasets/android_in_the_wild/metadata.json
@@ -0,0 +1,37 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "touch_and_lift",
+        "description": "Touch at the given x0, y0 coordinates and lift at x1, y1.\n\nArgs:\n----\n    x0 (float): The x coordinate to touch.\n    y0 (float): The y coordinate to touch.\n    x1 (float): The x coordinate to lift.\n    y1 (float): The y coordinate to lift.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "x0": {
+              "type": "number"
+            },
+            "y0": {
+              "type": "number"
+            },
+            "x1": {
+              "type": "number"
+            },
+            "y1": {
+              "type": "number"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "x0",
+            "y0",
+            "x1",
+            "y1"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/androidcontrol/api.py b/datasets/androidcontrol/api.py
deleted file mode 100644
index 23be529d..00000000
--- a/datasets/androidcontrol/api.py
+++ /dev/null
@@ -1,58 +0,0 @@
-def click(x: int, y: int) -> None:
-    """Click at the specified coordinates.
-
-    Args:
-    ----
-        x (int): The x coordinate to click.
-        y (int): The y coordinate to click.
-
-    """
-    pass
-
-
-def scroll(direction: str) -> None:
-    """Scroll in the specified direction.
-
-    Args:
-    ----
-        direction (str): The direction to scroll.
-
-    """
-    pass
-
-
-def input_text(text: str) -> None:
-    """Input text.
-
-    Args:
-    ----
-        text (str): The text to input.
-
-    """
-    pass
-
-
-def navigate_home() -> None:
-    """Navigate to the home screen."""
-    pass
-
-
-def back() -> None:
-    """Navigate back."""
-    pass
-
-
-def open_app(app_name: str) -> None:
-    """Open the specified app.
-
-    Args:
-    ----
-        app_name (str): The name of the app to open.
-
-    """
-    pass
-
-
-def wait() -> None:
-    """Wait for a short period of time."""
-    pass
diff --git a/datasets/androidcontrol/metadata.json b/datasets/androidcontrol/metadata.json
new file mode 100644
index 00000000..5267f6ed
--- /dev/null
+++ b/datasets/androidcontrol/metadata.json
@@ -0,0 +1,91 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "back",
+        "description": "Navigate back.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click at the specified coordinates.\n\nArgs:\n----\n    x (int): The x coordinate to click.\n    y (int): The y coordinate to click.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "x": {
+              "type": "integer"
+            },
+            "y": {
+              "type": "integer"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "x",
+            "y"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "open_app",
+        "description": "Open the specified app.\n\nArgs:\n----\n    app_name (str): The name of the app to open.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "app_name": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "app_name"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "scroll",
+        "description": "Scroll in the specified direction.\n\nArgs:\n----\n    direction (str): The direction to scroll.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "direction": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "direction"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "wait",
+        "description": "Wait for a short period of time.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": false
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/code_feedback/metadata.json b/datasets/code_feedback/metadata.json
new file mode 100644
index 00000000..71eea4f8
--- /dev/null
+++ b/datasets/code_feedback/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "python"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/codeactinstruct/api.py b/datasets/codeactinstruct/api.py
deleted file mode 100644
index 7f331889..00000000
--- a/datasets/codeactinstruct/api.py
+++ /dev/null
@@ -1,119 +0,0 @@
-def wikipedia_search(query: str) -> str:
-    """Search Wikipedia for a given query.
-
-    This tool provides access to a vast collection of articles covering a wide range of topics.
-    It can be used to retrieve accurate and comprehensive information about specific keywords or topics.
-
-    For example: wikipedia_search("Photosynthesis")
-    """
-    pass
-
-
-def put(object: str, receptacle: str) -> str:
-    """Put an object in/on a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: put("mug 1", "desk 2")
-    """
-    pass
-
-
-def goto(receptacle: str) -> str:
-    """Go to a location of the receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: goto("drawer 1")
-    """
-    pass
-
-
-def take_from(object: str, receptacle: str) -> str:
-    """Take an object from a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: take_from("mug 1", "shelf 2")
-    """
-    pass
-
-
-def open_receptacle(receptacle: str) -> str:
-    """Open a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: open_receptacle("fridge 1")
-    """
-    pass
-
-
-def toggle(object_or_receptacle: str) -> str:
-    """Toggle an object or receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: toggle("light 2")
-    """
-    pass
-
-
-def close_receptacle(receptacle: str) -> str:
-    """Close a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: close_receptacle("microwave 1")
-    """
-    pass
-
-
-def clean(object: str, receptacle: str) -> str:
-    """Clean an object with a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: clean("cloth 1", "sinkbasin 1")
-    """
-    pass
-
-
-def heat(object: str, receptacle: str) -> str:
-    """Heat an object with a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: heat("egg 1", "microwave 1")
-    """
-    pass
-
-
-def cool(object: str, receptacle: str) -> str:
-    """Cool an object with a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: cool("bottle 1", "fridge 1")
-    """
-    pass
-
-
-def use(receptacle: str) -> str:
-    """Use a receptacle.
-
-    This is used for interacting with a household environment.
-
-    For example: use("lamp 1")
-    """
-    pass
-
-
-def look() -> str:
-    """Look around. It will return what you see in the room.
-
-    This is used for interacting with a household environment.
-
-    For example: look()
-    """
-    pass
diff --git a/datasets/codeactinstruct/metadata.json b/datasets/codeactinstruct/metadata.json
new file mode 100644
index 00000000..360ce38f
--- /dev/null
+++ b/datasets/codeactinstruct/metadata.json
@@ -0,0 +1,249 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "clean",
+        "description": "Clean an object with a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: clean(\"cloth 1\", \"sinkbasin 1\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "object": {
+              "type": "string"
+            },
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "object",
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "close_receptacle",
+        "description": "Close a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: close_receptacle(\"microwave 1\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "cool",
+        "description": "Cool an object with a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: cool(\"bottle 1\", \"fridge 1\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "object": {
+              "type": "string"
+            },
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "object",
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "goto",
+        "description": "Go to a location of the receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: goto(\"drawer 1\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "heat",
+        "description": "Heat an object with a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: heat(\"egg 1\", \"microwave 1\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "object": {
+              "type": "string"
+            },
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "object",
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "look",
+        "description": "Look around. It will return what you see in the room.\n\nThis is used for interacting with a household environment.\n\nFor example: look()",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "open_receptacle",
+        "description": "Open a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: open_receptacle(\"fridge 1\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "put",
+        "description": "Put an object in/on a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: put(\"mug 1\", \"desk 2\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "object": {
+              "type": "string"
+            },
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "object",
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "take_from",
+        "description": "Take an object from a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: take_from(\"mug 1\", \"shelf 2\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "object": {
+              "type": "string"
+            },
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "object",
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "toggle",
+        "description": "Toggle an object or receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: toggle(\"light 2\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "object_or_receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "object_or_receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "use",
+        "description": "Use a receptacle.\n\nThis is used for interacting with a household environment.\n\nFor example: use(\"lamp 1\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "receptacle": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "receptacle"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "wikipedia_search",
+        "description": "Search Wikipedia for a given query.\n\nThis tool provides access to a vast collection of articles covering a wide range of topics.\nIt can be used to retrieve accurate and comprehensive information about specific keywords or topics.\n\nFor example: wikipedia_search(\"Photosynthesis\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "query": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "query"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "python"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/coderforge_preview/api.py b/datasets/coderforge_preview/api.py
deleted file mode 100644
index 981748a8..00000000
--- a/datasets/coderforge_preview/api.py
+++ /dev/null
@@ -1,34 +0,0 @@
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str = None,
-    old_str: str = None,
-    new_str: str = None,
-    insert_line: int = None,
-    view_range: list = None,
-) -> None:
-    """View, create, and edit files with this custom editing tool.
-
-    Args:
-    ----
-        command (str): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
-        path (str): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
-        file_text (str): Required parameter of `create` command, with the content of the file to be created.
-        old_str (str): Required parameter of `str_replace` command containing the string in `path` to replace.
-        new_str (str): Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
-        insert_line (int): Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
-        view_range (list): Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.
-
-    """
-    pass
-
-
-def think(thought: str):
-    """Log a thought for reasoning.
-
-    Args:
-    ----
-        thought (str): The thought to log.
-
-    """
-    pass
diff --git a/datasets/coderforge_preview/metadata.json b/datasets/coderforge_preview/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/coderforge_preview/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/codescout/metadata.json b/datasets/codescout/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/codescout/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/cognitivekernel_pro_sft/api.py b/datasets/cognitivekernel_pro_sft/api.py
deleted file mode 100644
index 92e95ab3..00000000
--- a/datasets/cognitivekernel_pro_sft/api.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from typing import Any
-
-
-def web_agent(task: str) -> dict:
-    """Use a web browser agent to complete a web task."""
-    pass
-
-
-def file_agent(task: str, file_path_dict: dict | None = None) -> dict:
-    """Use a file-analysis agent to answer a task over local files."""
-    pass
-
-
-def stop(
-    output: Any = None,
-    log: Any = None,
-    answer: Any = None,
-    summary: Any = None,
-) -> dict:
-    """Finalize a task with either source-specific final-answer signature."""
-    pass
-
-
-def ask_llm(query: str) -> str:
-    """Ask a language model for tasks that need no external tools."""
-    pass
-
-
-def simple_web_search(query: str) -> str:
-    """Run a quick web search for straightforward information needs."""
-    pass
-
-
-def load_file(file_name: str) -> str:
-    """Load a local file into the CognitiveKernel file environment."""
-    pass
-
-
-def read_text(file_name: str, page_id_list: list) -> str:
-    """Read selected file pages as text."""
-    pass
-
-
-def read_screenshot(file_name: str, page_id_list: list) -> str:
-    """Read selected file pages with screenshot-based processing."""
-    pass
-
-
-def search(file_name: str, key_word_list: list) -> str:
-    """Search a file for keywords and return matching pages."""
-    pass
diff --git a/datasets/cognitivekernel_pro_sft/metadata.json b/datasets/cognitivekernel_pro_sft/metadata.json
new file mode 100644
index 00000000..723d99e5
--- /dev/null
+++ b/datasets/cognitivekernel_pro_sft/metadata.json
@@ -0,0 +1,123 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "load_file",
+        "description": "Load a local file into the CognitiveKernel file environment.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "file_name": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "file_name"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "simple_web_search",
+        "description": "Run a quick web search for straightforward information needs.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "query": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "query"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "web_agent",
+        "description": "Use a web browser agent to complete a web task.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "task": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "task"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "ask_llm",
+        "description": "Dataset tool ask_llm.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "file_agent",
+        "description": "Dataset tool file_agent.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "read_screenshot",
+        "description": "Dataset tool read_screenshot.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "read_text",
+        "description": "Dataset tool read_text.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "search",
+        "description": "Dataset tool search.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/dolci_instruct_sft_tool_use/api.py b/datasets/dolci_instruct_sft_tool_use/api.py
deleted file mode 100644
index 55affa70..00000000
--- a/datasets/dolci_instruct_sft_tool_use/api.py
+++ /dev/null
@@ -1,121 +0,0 @@
-from typing import Any
-
-
-def cell_density(dilution: Any = None, od: Any = None) -> dict:
-    """Provide a placeholder for the `cell_density` tool in the committed Dolci sample."""
-    pass
-
-
-def combinatorics_permutation_count(k: Any = None, n: Any = None) -> dict:
-    """Provide a placeholder for the `combinatorics_permutation_count` tool in the committed Dolci sample."""
-    pass
-
-
-def get_all_predictions(sort: Any = None) -> dict:
-    """Provide a placeholder for the `get_all_predictions` tool in the committed Dolci sample."""
-    pass
-
-
-def get_city_from_zipcode(zipcode: Any = None) -> dict:
-    """Provide a placeholder for the `get_city_from_zipcode` tool in the committed Dolci sample."""
-    pass
-
-
-def get_matches_on_a_specific_date(date: Any = None, utc_offset: Any = None) -> dict:
-    """Provide a placeholder for the `get_matches_on_a_specific_date` tool in the committed Dolci sample."""
-    pass
-
-
-def is_power_of_two(num: Any = None) -> dict:
-    """Provide a placeholder for the `is_power_of_two` tool in the committed Dolci sample."""
-    pass
-
-
-def laliga_standings(season: Any = None) -> dict:
-    """Provide a placeholder for the `laliga_standings` tool in the committed Dolci sample."""
-    pass
-
-
-def leaguepowerrankingrounds(seasonid: Any = None, tournamentid: Any = None) -> dict:
-    """Provide a placeholder for the `leaguepowerrankingrounds` tool in the committed Dolci sample."""
-    pass
-
-
-def match_details_by_id(match_id: Any = None) -> dict:
-    """Provide a placeholder for the `match_details_by_id` tool in the committed Dolci sample."""
-    pass
-
-
-def physics_final_velocity(
-    acceleration: Any = None, initial_velocity: Any = None, time: Any = None
-) -> dict:
-    """Provide a placeholder for the `physics_final_velocity` tool in the committed Dolci sample."""
-    pass
-
-
-def reserve_hotel_room(
-    checkin_date: Any = None,
-    checkout_date: Any = None,
-    guest_id: Any = None,
-    nightly_rate: Any = None,
-    room_type: Any = None,
-) -> dict:
-    """Provide a placeholder for the `reserve_hotel_room` tool in the committed Dolci sample."""
-    pass
-
-
-def schools(identifier: Any = None) -> dict:
-    """Provide a placeholder for the `schools` tool in the committed Dolci sample."""
-    pass
-
-
-def select_race_based_on_race_number(race_no: Any = None) -> dict:
-    """Provide a placeholder for the `select_race_based_on_race_number` tool in the committed Dolci sample."""
-    pass
-
-
-def weather_forecast_weather_api(days: Any = None, q: Any = None) -> dict:
-    """Provide a placeholder for the `weather_forecast_weather_api` tool in the committed Dolci sample."""
-    pass
-
-
-def calculate_calorie_intake(
-    weight_kg: Any = None,
-    height_cm: Any = None,
-    age: Any = None,
-    sex: Any = None,
-    activity_level: Any = None,
-    goal: Any = None,
-) -> dict:
-    """Stub for the advertised Dolci tool."""
-    pass
-
-
-def can_attend_all_meetings(intervals: Any = None) -> dict:
-    """Stub for the advertised Dolci tool."""
-    pass
-
-
-def daily_match_list_all(date: Any = None) -> dict:
-    """Stub for the advertised Dolci tool."""
-    pass
-
-
-def get_ip_zipcode(ip: Any = None) -> dict:
-    """Stub for the advertised Dolci tool."""
-    pass
-
-
-def get_pokemon_move_info(pokemon_name: Any = None, move_name: Any = None) -> dict:
-    """Stub for the advertised Dolci tool."""
-    pass
-
-
-def predict_evolution_rate(species: Any = None, years: Any = None, model: Any = None) -> dict:
-    """Stub for the advertised Dolci tool."""
-    pass
-
-
-def weather_getweatherforecast() -> dict:
-    """Stub for the advertised Dolci tool."""
-    pass
diff --git a/datasets/dolci_instruct_sft_tool_use/metadata.json b/datasets/dolci_instruct_sft_tool_use/metadata.json
new file mode 100644
index 00000000..d54d65fc
--- /dev/null
+++ b/datasets/dolci_instruct_sft_tool_use/metadata.json
@@ -0,0 +1,297 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "cell_density",
+        "description": "Provide a placeholder for the `cell_density` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "dilution": {},
+            "od": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "combinatorics_permutation_count",
+        "description": "Provide a placeholder for the `combinatorics_permutation_count` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "k": {},
+            "n": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_all_predictions",
+        "description": "Provide a placeholder for the `get_all_predictions` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "sort": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_city_from_zipcode",
+        "description": "Provide a placeholder for the `get_city_from_zipcode` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "zipcode": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_matches_on_a_specific_date",
+        "description": "Provide a placeholder for the `get_matches_on_a_specific_date` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "date": {},
+            "utc_offset": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "is_power_of_two",
+        "description": "Provide a placeholder for the `is_power_of_two` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "num": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "laliga_standings",
+        "description": "Provide a placeholder for the `laliga_standings` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "season": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "leaguepowerrankingrounds",
+        "description": "Provide a placeholder for the `leaguepowerrankingrounds` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "seasonid": {},
+            "tournamentid": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "match_details_by_id",
+        "description": "Provide a placeholder for the `match_details_by_id` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "match_id": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "physics_final_velocity",
+        "description": "Provide a placeholder for the `physics_final_velocity` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "acceleration": {},
+            "initial_velocity": {},
+            "time": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "reserve_hotel_room",
+        "description": "Provide a placeholder for the `reserve_hotel_room` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "checkin_date": {},
+            "checkout_date": {},
+            "guest_id": {},
+            "nightly_rate": {},
+            "room_type": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "schools",
+        "description": "Provide a placeholder for the `schools` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "identifier": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "select_race_based_on_race_number",
+        "description": "Provide a placeholder for the `select_race_based_on_race_number` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "race_no": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "weather_forecast_weather_api",
+        "description": "Provide a placeholder for the `weather_forecast_weather_api` tool in the committed Dolci sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "days": {},
+            "q": {}
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "calculate_calorie_intake",
+        "description": "Dataset tool calculate_calorie_intake.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "can_attend_all_meetings",
+        "description": "Dataset tool can_attend_all_meetings.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "daily_match_list_all",
+        "description": "Dataset tool daily_match_list_all.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_ip_zipcode",
+        "description": "Dataset tool get_ip_zipcode.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_pokemon_move_info",
+        "description": "Dataset tool get_pokemon_move_info.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "predict_evolution_rate",
+        "description": "Dataset tool predict_evolution_rate.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "weather_getweatherforecast",
+        "description": "Dataset tool weather_getweatherforecast.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/eto/metadata.json b/datasets/eto/metadata.json
new file mode 100644
index 00000000..dfa26e78
--- /dev/null
+++ b/datasets/eto/metadata.json
@@ -0,0 +1,5 @@
+{
+  "custom_tools": [],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/gair_davinci_dev/api.py b/datasets/gair_davinci_dev/api.py
deleted file mode 100644
index b3dabf9e..00000000
--- a/datasets/gair_davinci_dev/api.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from typing import Optional
-
-
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: Optional[str] = None,
-    old_str: Optional[str] = None,
-    new_str: Optional[str] = None,
-    insert_line: Optional[int] = None,
-    view_range: Optional[list[int]] = None,
-):
-    """View, create, and edit files with a custom editing tool.
-
-    Args:
-    ----
-        command: Editor command. Allowed values include view, create, str_replace, insert, and undo_edit.
-        path: Absolute path to a file or directory.
-        file_text: File content for create commands.
-        old_str: Exact string to replace for str_replace commands.
-        new_str: Replacement or inserted string.
-        insert_line: Line number after which to insert new_str.
-        view_range: Optional line range to view.
-
-    """
-    pass
diff --git a/datasets/gair_davinci_dev/metadata.json b/datasets/gair_davinci_dev/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/gair_davinci_dev/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/go-browse-wa/api.py b/datasets/go-browse-wa/api.py
deleted file mode 100644
index c0d7edf3..00000000
--- a/datasets/go-browse-wa/api.py
+++ /dev/null
@@ -1,180 +0,0 @@
-from typing import List, Literal, Union
-
-
-def noop(wait_ms: float = 1000) -> None:
-    """Do nothing, and optionally wait for the given time (in milliseconds).
-
-    Args:
-    ----
-        wait_ms (float): Time to wait in milliseconds. Defaults to 1000ms.
-
-    """
-    pass
-
-
-def scroll(delta_x: float, delta_y: float) -> None:
-    """Scroll horizontally and vertically.
-
-    Amounts in pixels, positive for right or down scrolling,
-    negative for left or up scrolling. Dispatches a wheel event.
-
-    Args:
-    ----
-        delta_x (float): The distance to scroll horizontally in pixels.
-        delta_y (float): The distance to scroll vertically in pixels.
-
-    """
-    pass
-
-
-def fill(bid: str, value: str) -> None:
-    """Fill out a form field.
-
-    It focuses the element and triggers an input event with the entered text.
-    Works for <input>, <textarea> and [contenteditable] elements.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to fill.
-        value (str): The text to enter into the field.
-
-    """
-    pass
-
-
-def select_option(bid: str, options: Union[str, List[str]]) -> None:
-    """Select one or multiple options in a <select> element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the select element.
-        options (Union[str, List[str]]): The option value(s) or label(s) to select.
-
-    """
-    pass
-
-
-def click(
-    bid: str,
-    button: Literal["left", "middle", "right"] = "left",
-    modifiers: List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = None,
-) -> None:
-    """Click an element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to click.
-        button (Literal["left", "middle", "right"]): The mouse button to use. Defaults to "left".
-        modifiers (List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]]):
-            List of modifier keys to hold while clicking. Defaults to None.
-
-    """
-    pass
-
-
-def dblclick(
-    bid: str,
-    button: Literal["left", "middle", "right"] = "left",
-    modifiers: List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = None,
-) -> None:
-    """Double click an element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to double click.
-        button (Literal["left", "middle", "right"]): The mouse button to use. Defaults to "left".
-        modifiers (List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]]):
-            List of modifier keys to hold while double clicking. Defaults to None.
-
-    """
-    pass
-
-
-def hover(bid: str) -> None:
-    """Hover over an element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to hover over.
-
-    """
-    pass
-
-
-def press(bid: str, key_comb: str) -> None:
-    """Focus the matching element and press a combination of keys.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to focus.
-        key_comb (str): The key combination to press (e.g., "Backspace", "ControlOrMeta+a").
-
-    """
-    pass
-
-
-def focus(bid: str) -> None:
-    """Focus the matching element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to focus.
-
-    """
-    pass
-
-
-def clear(bid: str) -> None:
-    """Clear the input field.
-
-    Args:
-    ----
-        bid (str): The browser ID of the input field to clear.
-
-    """
-    pass
-
-
-def drag_and_drop(from_bid: str, to_bid: str) -> None:
-    """Perform a drag & drop operation.
-
-    Args:
-    ----
-        from_bid (str): The browser ID of the element to drag.
-        to_bid (str): The browser ID of the element to drop onto.
-
-    """
-    pass
-
-
-def upload_file(bid: str, file: Union[str, List[str]]) -> None:
-    """Click an element and wait for a "filechooser" event, then select file(s) for upload.
-
-    Args:
-    ----
-        bid (str): The browser ID of the file input element.
-        file (Union[str, List[str]]): Path(s) to the file(s) to upload.
-
-    """
-    pass
-
-
-def go_back() -> None:
-    """Navigate to the previous page in history."""
-    pass
-
-
-def go_forward() -> None:
-    """Navigate to the next page in history."""
-    pass
-
-
-def goto(url: str) -> None:
-    """Navigate to a url.
-
-    Args:
-    ----
-        url (str): The URL to navigate to.
-
-    """
-    pass
diff --git a/datasets/go-browse-wa/metadata.json b/datasets/go-browse-wa/metadata.json
new file mode 100644
index 00000000..facc8bef
--- /dev/null
+++ b/datasets/go-browse-wa/metadata.json
@@ -0,0 +1,96 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click an element.\n\nArgs:\n----\n    bid (str): The browser ID of the element to click.\n    button (Literal[\"left\", \"middle\", \"right\"]): The mouse button to use. Defaults to \"left\".\n    modifiers (List[Literal[\"Alt\", \"Control\", \"ControlOrMeta\", \"Meta\", \"Shift\"]]):\n        List of modifier keys to hold while clicking. Defaults to None.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "button": {
+              "type": "string",
+              "enum": [
+                "left",
+                "middle",
+                "right"
+              ]
+            },
+            "modifiers": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "fill",
+        "description": "Fill out a form field.\n\nIt focuses the element and triggers an input event with the entered text.\nWorks for <input>, <textarea> and [contenteditable] elements.\n\nArgs:\n----\n    bid (str): The browser ID of the element to fill.\n    value (str): The text to enter into the field.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "value": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid",
+            "value"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "noop",
+        "description": "Do nothing, and optionally wait for the given time (in milliseconds).\n\nArgs:\n----\n    wait_ms (float): Time to wait in milliseconds. Defaults to 1000ms.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "wait_ms": {
+              "type": "number"
+            }
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "select_option",
+        "description": "Select one or multiple options in a <select> element.\n\nArgs:\n----\n    bid (str): The browser ID of the select element.\n    options (Union[str, List[str]]): The option value(s) or label(s) to select.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "options": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid",
+            "options"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/hybrid-gym/api.py b/datasets/hybrid-gym/api.py
deleted file mode 100644
index 0ea369f9..00000000
--- a/datasets/hybrid-gym/api.py
+++ /dev/null
@@ -1,32 +0,0 @@
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str | None = None,
-    old_str: str | None = None,
-    new_str: str | None = None,
-    insert_line: int | None = None,
-    view_range: list | str | None = None,
-) -> None:
-    """View, create, and edit files with the OpenHands file editor.
-
-    Args:
-        command: One of `view`, `create`, `str_replace`, `insert`, or `undo_edit`.
-        path: Absolute path to the target file or directory.
-        file_text: Content for `create` commands.
-        old_str: Existing text for `str_replace` commands.
-        new_str: Replacement or inserted text.
-        insert_line: Line after which to insert text.
-        view_range: Optional `[start_line, end_line]` range to view.
-
-    """
-    return None
-
-
-def think(thought: str) -> None:
-    """Record a private reasoning step.
-
-    Args:
-        thought: The model's reasoning trace.
-
-    """
-    return None
diff --git a/datasets/hybrid-gym/metadata.json b/datasets/hybrid-gym/metadata.json
new file mode 100644
index 00000000..20595f83
--- /dev/null
+++ b/datasets/hybrid-gym/metadata.json
@@ -0,0 +1,55 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "str_replace_editor",
+        "description": "View, create, and edit files with the OpenHands file editor.\n\nArgs:\n    command: One of `view`, `create`, `str_replace`, `insert`, or `undo_edit`.\n    path: Absolute path to the target file or directory.\n    file_text: Content for `create` commands.\n    old_str: Existing text for `str_replace` commands.\n    new_str: Replacement or inserted text.\n    insert_line: Line after which to insert text.\n    view_range: Optional `[start_line, end_line]` range to view.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "command": {
+              "type": "string"
+            },
+            "path": {
+              "type": "string"
+            },
+            "file_text": {},
+            "old_str": {},
+            "new_str": {},
+            "insert_line": {},
+            "view_range": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "command",
+            "path"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "think",
+        "description": "Record a private reasoning step.\n\nArgs:\n    thought: The model's reasoning trace.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "thought": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "thought"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/jupyter-agent-dataset/metadata.json b/datasets/jupyter-agent-dataset/metadata.json
new file mode 100644
index 00000000..71eea4f8
--- /dev/null
+++ b/datasets/jupyter-agent-dataset/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "python"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/kwai-klear_swe-smith-mini_swe_agent_plus-trajectories-66k/metadata.json b/datasets/kwai-klear_swe-smith-mini_swe_agent_plus-trajectories-66k/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/kwai-klear_swe-smith-mini_swe_agent_plus-trajectories-66k/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/llava_plus/api.py b/datasets/llava_plus/api.py
deleted file mode 100644
index b6f95e38..00000000
--- a/datasets/llava_plus/api.py
+++ /dev/null
@@ -1,20 +0,0 @@
-def sam(boxes: list[list]) -> None:
-    """Call SAM (Segment Anything Model) to detect objects.
-
-    Args:
-    ----
-        boxes (list[list]): A list of arrays of bounding box coordinates.
-
-    """
-    pass
-
-
-def inpainting(prompt: str) -> None:
-    """Call stable diffusion inpainting model to edit images.
-
-    Args:
-    ----
-        prompt (str): An instruction for image editing.
-
-    """
-    pass
diff --git a/datasets/llava_plus/metadata.json b/datasets/llava_plus/metadata.json
new file mode 100644
index 00000000..518e3092
--- /dev/null
+++ b/datasets/llava_plus/metadata.json
@@ -0,0 +1,44 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "inpainting",
+        "description": "Call stable diffusion inpainting model to edit images.\n\nArgs:\n----\n    prompt (str): An instruction for image editing.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "prompt": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "prompt"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "sam",
+        "description": "Call SAM (Segment Anything Model) to detect objects.\n\nArgs:\n----\n    boxes (list[list]): A list of arrays of bounding box coordinates.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "boxes": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "boxes"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/logicstar_swe-star/api.py b/datasets/logicstar_swe-star/api.py
deleted file mode 100644
index b0f8b0fe..00000000
--- a/datasets/logicstar_swe-star/api.py
+++ /dev/null
@@ -1,34 +0,0 @@
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str = None,
-    old_str: str = None,
-    new_str: str = None,
-    insert_line: int = None,
-    view_range: list = None,
-) -> None:
-    """View, create, and edit files with this custom editing tool.
-
-    Args:
-    ----
-        command (str): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
-        path (str): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
-        file_text (str): Required parameter of `create` command, with the content of the file to be created.
-        old_str (str): Required parameter of `str_replace` command containing the string in `path` to replace.
-        new_str (str): Optional parameter of `str_replace` command containing the new string. Required parameter of `insert` command containing the string to insert.
-        insert_line (int): Required parameter of `insert` command. The `new_str` will be inserted after this line.
-        view_range (list): Optional `[start_line, end_line]` range to view.
-
-    """
-    pass
-
-
-def think(thought: str) -> None:
-    """Record a private reasoning step.
-
-    Args:
-    ----
-        thought (str): The thought or reasoning to record.
-
-    """
-    pass
diff --git a/datasets/logicstar_swe-star/metadata.json b/datasets/logicstar_swe-star/metadata.json
new file mode 100644
index 00000000..d27bf0be
--- /dev/null
+++ b/datasets/logicstar_swe-star/metadata.json
@@ -0,0 +1,65 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "str_replace_editor",
+        "description": "View, create, and edit files with this custom editing tool.\n\nArgs:\n----\n    command (str): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.\n    path (str): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.\n    file_text (str): Required parameter of `create` command, with the content of the file to be created.\n    old_str (str): Required parameter of `str_replace` command containing the string in `path` to replace.\n    new_str (str): Optional parameter of `str_replace` command containing the new string. Required parameter of `insert` command containing the string to insert.\n    insert_line (int): Required parameter of `insert` command. The `new_str` will be inserted after this line.\n    view_range (list): Optional `[start_line, end_line]` range to view.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "command": {
+              "type": "string"
+            },
+            "path": {
+              "type": "string"
+            },
+            "file_text": {
+              "type": "string"
+            },
+            "old_str": {
+              "type": "string"
+            },
+            "new_str": {
+              "type": "string"
+            },
+            "insert_line": {
+              "type": "integer"
+            },
+            "view_range": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "command",
+            "path"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "think",
+        "description": "Record a private reasoning step.\n\nArgs:\n----\n    thought (str): The thought or reasoning to record.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "thought": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "thought"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/mind2web/api.py b/datasets/mind2web/api.py
deleted file mode 100644
index d4449725..00000000
--- a/datasets/mind2web/api.py
+++ /dev/null
@@ -1,44 +0,0 @@
-def select(xpath: str, value: str) -> None:
-    """Select an option from a dropdown menu.
-
-    Args:
-    ----
-        xpath (str): The xpath of the select element.
-        value (str): The select option to choose.
-
-    """
-    pass
-
-
-def click(xpath: str) -> None:
-    """Click on the element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to click.
-
-    """
-    pass
-
-
-def type(xpath: str, value: str) -> None:
-    """Type some text into an input element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to type into.
-        value (str): The text to type.
-
-    """
-    pass
-
-
-def goto(url: str) -> None:
-    """Navigate to the given URL.
-
-    Args:
-    ----
-        url (str): The URL to navigate to.
-
-    """
-    pass
diff --git a/datasets/mind2web/metadata.json b/datasets/mind2web/metadata.json
new file mode 100644
index 00000000..1ffd24a3
--- /dev/null
+++ b/datasets/mind2web/metadata.json
@@ -0,0 +1,90 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click on the element.\n\nArgs:\n----\n    xpath (str): The xpath of the element to click.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "goto",
+        "description": "Navigate to the given URL.\n\nArgs:\n----\n    url (str): The URL to navigate to.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "url"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "select",
+        "description": "Select an option from a dropdown menu.\n\nArgs:\n----\n    xpath (str): The xpath of the select element.\n    value (str): The select option to choose.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            },
+            "value": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath",
+            "value"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "type",
+        "description": "Type some text into an input element.\n\nArgs:\n----\n    xpath (str): The xpath of the element to type into.\n    value (str): The text to type.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            },
+            "value": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath",
+            "value"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/mini-coder/metadata.json b/datasets/mini-coder/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/mini-coder/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/miroverse_v0_1/api.py b/datasets/miroverse_v0_1/api.py
deleted file mode 100644
index ac821907..00000000
--- a/datasets/miroverse_v0_1/api.py
+++ /dev/null
@@ -1,356 +0,0 @@
-from typing import Any
-
-
-def use_mcp_tool(server_name: str, tool_name: str, arguments: dict[str, Any] | str):
-    """Fallback MCP wrapper for rows whose XML arguments cannot be parsed.
-
-    Args:
-        server_name: Name of the MCP server that provides the tool.
-        tool_name: Name of the tool to execute.
-        arguments: Tool arguments as a JSON object, or the raw argument string when parsing fails.
-
-    """
-    return None
-
-
-def browsing_agent__search_and_browse(subtask: str) -> dict:
-    """[browsing-agent/search_and_browse] This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks.
-
-    Args:
-            subtask: the subtask to be performed.
-
-    Returns:
-            the result of the subtask.
-
-    """
-    return {}
-
-
-def tool_google_search__google_search(
-    q: str,
-    gl: str | None = None,
-    hl: str | None = None,
-    location: str | None = None,
-    num: float | None = None,
-    tbs: str | None = None,
-    page: float | None = None,
-    autocorrect: bool | None = None,
-) -> dict:
-    """[tool-google-search/google_search] Tool to perform web searches via Serper API and retrieve rich results. It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.
-
-    Args:
-        q: Search query string
-        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
-        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
-        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
-        num: Number of results to return (default: 10)
-        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)
-        page: Page number of results to return (default: 1)
-        autocorrect: Whether to autocorrect spelling in query
-
-    """
-    return {}
-
-
-def tool_google_search__scrape(url: str, includeMarkdown: bool | None = None) -> dict:
-    """[tool-google-search/scrape] Tool to scrape a webpage and retrieve the text and, optionally, the markdown content. It will retrieve also the JSON-LD metadata and the head metadata.
-
-    Args:
-        url: The URL of the webpage to scrape.
-        includeMarkdown: Whether to include markdown content.
-
-    """
-    return {}
-
-
-def tool_serper_search__google_search(
-    q: str,
-    gl: str | None = None,
-    hl: str | None = None,
-    location: str | None = None,
-    num: float | None = None,
-    tbs: str | None = None,
-    page: float | None = None,
-    autocorrect: bool | None = None,
-) -> dict:
-    """[tool-serper-search/google_search] Tool to perform web searches via Serper API and retrieve rich results. It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.
-
-    Args:
-        q: Search query string
-        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
-        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
-        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
-        num: Number of results to return (default: 10)
-        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)
-        page: Page number of results to return (default: 1)
-        autocorrect: Whether to autocorrect spelling in query
-
-    """
-    return {}
-
-
-def tool_serper_search__scrape(url: str, includeMarkdown: bool | None = None) -> dict:
-    """[tool-serper-search/scrape] Tool to scrape a webpage and retrieve the text and, optionally, the markdown content. It will retrieve also the JSON-LD metadata and the head metadata.
-
-    Args:
-        url: The URL of the webpage to scrape.
-        includeMarkdown: Whether to include markdown content.
-
-    """
-    return {}
-
-
-def tool_code__create_sandbox(timeout: int = 300) -> dict:
-    """[tool-code/create_sandbox] Create a linux sandbox.
-
-    Args:
-        timeout: Time in seconds before the sandbox is automatically shutdown. The default is 300 seconds.
-
-    Returns:
-        The id of the newly created sandbox. You should use this sandbox_id to run other tools in the sandbox.
-
-    """
-    return {}
-
-
-def tool_code__download_internet_file_to_sandbox(
-    sandbox_id: str, url: str, sandbox_file_path: str = "/home/user"
-) -> dict:
-    """[tool-code/download_internet_file_to_sandbox] Download a file from the internet to the `/home/user` dir of the remote python interpreter.
-
-    You should use this tool to download files from the internet.
-
-    Args:
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
-        url: The URL of the file to download.
-        sandbox_file_path: The path of directory to download the file to in the sandbox. Default is `/home/user/`.
-
-    Returns:
-        The path of the downloaded file in the python interpreter if the download is successful.
-
-    """
-    return {}
-
-
-def tool_code__run_command(command: str, sandbox_id: str) -> dict:
-    """[tool-code/run_command] Execute a command in the linux sandbox.
-
-    Args:
-        command: The command to execute
-        sandbox_id: The id of the sandbox to execute the command in. To create a new sandbox, use tool `create_sandbox`.
-
-    Returns:
-        A result of the command execution, format like (stderr=..., stdout=..., exit_code=..., error=...)
-
-    """
-    return {}
-
-
-def tool_code__run_python_code(code_block: str, sandbox_id: str) -> dict:
-    """[tool-code/run_python_code] Run python code in an interpreter and return the execution result.
-
-    Args:
-        code_block: The python code to run.
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
-
-    Returns:
-        A result of the command execution, format like (stderr=..., stdout=..., exit_code=..., error=...)
-
-    """
-    return {}
-
-
-def tool_code__upload_local_file_to_sandbox(
-    sandbox_id: str, local_file_path: str, sandbox_file_path: str = "/home/user"
-) -> dict:
-    """[tool-code/upload_local_file_to_sandbox] Upload a local file to the `/home/user` dir of the remote python interpreter.
-
-    Args:
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
-        local_file_path: The path of the file on local machine to upload.
-        sandbox_file_path: The path of directory to upload the file to in the sandbox. Default is `/home/user/`.
-
-    Returns:
-        The path of the uploaded file in the remote python interpreter if the upload is successful.
-
-    """
-    return {}
-
-
-def tool_python__create_sandbox(timeout: int = 300) -> dict:
-    """[tool-python/create_sandbox] Create a linux sandbox.
-
-    Args:
-        timeout: Time in seconds before the sandbox is automatically shutdown. The default is 300 seconds.
-
-    Returns:
-        The id of the newly created sandbox. You should use this sandbox_id to run other tools in the sandbox.
-
-    """
-    return {}
-
-
-def tool_python__download_internet_file_to_python_interpreter(
-    url: str, sandbox_id: str | None = None
-) -> dict:
-    """[tool-python/download_internet_file_to_python_interpreter] Download a file from the internet to the `/home/user` dir of the remote python interpreter.
-
-    Args:
-        url: The URL of the file to download.
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. Only create new ones if this is the first time running code in this sandbox.
-
-    Returns:
-        The path of the downloaded file in the python interpreter.
-
-    """
-    return {}
-
-
-def tool_python__download_internet_file_to_sandbox(
-    sandbox_id: str, url: str, sandbox_file_path: str = "/home/user"
-) -> dict:
-    """[tool-python/download_internet_file_to_sandbox] Download a file from the internet to the `/home/user` dir of the remote python interpreter.
-
-    Args:
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
-        url: The URL of the file to download.
-        sandbox_file_path: The path of directory to download the file to in the sandbox. Default is `/home/user/`.
-
-    Returns:
-        The path of the downloaded file in the python interpreter if the download is successful.
-
-    """
-    return {}
-
-
-def tool_python__run_command(command: str, sandbox_id: str) -> dict:
-    """[tool-python/run_command] Execute a command in the linux sandbox.
-
-    Args:
-        command: The command to execute
-        sandbox_id: The id of the sandbox to execute the command in. To create a new sandbox, use tool `create_sandbox`.
-
-    Returns:
-        A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...)
-
-    """
-    return {}
-
-
-def tool_python__run_python_code(
-    code_block: str, timeout: int = 300, sandbox_id: str | None = None
-) -> dict:
-    """[tool-python/run_python_code] Run python code in an interperter and return the execution result.
-
-    Args:
-        code_block: The python code to run.
-        timeout: Time in seconds before the sandbox is automatically shutdown. The default is 300 seconds.
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. Only create new ones if this is the first time running code in this sandbox.
-
-    Returns:
-        An object containing the sandbox id and the execution result object including results, logs and errors.
-
-    """
-    return {}
-
-
-def tool_python__upload_local_file_to_python_interpreter(
-    local_file_path: str, sandbox_id: str | None = None
-) -> dict:
-    """[tool-python/upload_local_file_to_python_interpreter] Upload a local file to the `/home/user` dir of the remote python interpreter.
-
-    Args:
-        local_file_path: The path of the file on local machine to upload.
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. Only create new ones if this is the first time running code in this sandbox.
-
-    Returns:
-        The path of the uploaded file in the remote python interpreter.
-
-    """
-    return {}
-
-
-def tool_python__upload_local_file_to_sandbox(
-    sandbox_id: str, local_file_path: str, sandbox_file_path: str = "/home/user"
-) -> dict:
-    """[tool-python/upload_local_file_to_sandbox] Upload a local file to the `/home/user` dir of the remote python interpreter.
-
-    Args:
-        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
-        local_file_path: The path of the file on local machine to upload.
-        sandbox_file_path: The path of directory to upload the file to in the sandbox. Default is `/home/user/`.
-
-    Returns:
-        The path of the uploaded file in the remote python interpreter if the upload is successful.
-
-    """
-    return {}
-
-
-def tool_reader__convert_to_markdown(uri: str) -> dict:
-    """[tool-reader/convert_to_markdown] Convert a resource described by an http:, https:, file: or data: URI to markdown.
-
-    Args:
-        uri: Uri
-
-    """
-    return {}
-
-
-def tool_reading__convert_to_markdown(uri: str) -> dict:
-    """[tool-reading/convert_to_markdown] Convert resources to markdown.
-
-    Supports doc, ppt, pdf, excel, csv, zip file, and other resources
-    described by a file: or data: URI.
-
-    Args:
-        uri: Required. The URI of the resource to convert. Need to start with 'file:' or 'data:' schemes.
-
-    Returns:
-        str: The converted markdown content, or an error message if conversion fails.
-
-    """
-    return {}
-
-
-def tool_reasoning__reasoning(question: str) -> dict:
-    """[tool-reasoning/reasoning] Solve hard reasoning questions.
-
-    Use this tool to solve hard math problems, puzzles, riddles, and IQ test questions that require a lot of chain-of-thought effort.
-    DO NOT use this tool for simple and obvious question.
-
-    Args:
-        question: The hard question.
-
-    Returns:
-        The answer to the question.
-
-    """
-    return {}
-
-
-def tool_transcribe__audio_transcription(audio_path_or_url: str) -> dict:
-    """[tool-transcribe/audio_transcription] Transcribe audio file to text and return the transcription.
-
-    Args:
-        audio_path_or_url: The path of the audio file locally or its URL.
-
-    Returns:
-        The transcription of the audio file.
-
-    """
-    return {}
-
-
-def tool_vqa__visual_question_answering(image_path_or_url: str, question: str) -> dict:
-    """[tool-vqa/visual_question_answering] This tool is used to ask question about an image or a video and get the answer with both Claude and OpenAI vision language models. It also automatically performs OCR (text extraction) on the image for additional context.
-
-    Args:
-        image_path_or_url: The path of the image file locally or its URL.
-        question: The question to ask about the image.
-
-    Returns:
-        The concatenated answers from both Claude and OpenAI vision models, including both VQA responses and OCR results.
-
-    """
-    return {}
diff --git a/datasets/miroverse_v0_1/metadata.json b/datasets/miroverse_v0_1/metadata.json
new file mode 100644
index 00000000..3b0e889e
--- /dev/null
+++ b/datasets/miroverse_v0_1/metadata.json
@@ -0,0 +1,321 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "browsing_agent__search_and_browse",
+        "description": "[browsing-agent/search_and_browse] This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks.\n\nArgs:\n        subtask: the subtask to be performed.\n\nReturns:\n        the result of the subtask.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "subtask": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "subtask"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_google_search__google_search",
+        "description": "[tool-google-search/google_search] Tool to perform web searches via Serper API and retrieve rich results. It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.\n\nArgs:\n    q: Search query string\n    gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')\n    hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')\n    location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')\n    num: Number of results to return (default: 10)\n    tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)\n    page: Page number of results to return (default: 1)\n    autocorrect: Whether to autocorrect spelling in query",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "q": {
+              "type": "string"
+            },
+            "gl": {},
+            "hl": {},
+            "location": {},
+            "num": {},
+            "tbs": {},
+            "page": {},
+            "autocorrect": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "q"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_google_search__scrape",
+        "description": "[tool-google-search/scrape] Tool to scrape a webpage and retrieve the text and, optionally, the markdown content. It will retrieve also the JSON-LD metadata and the head metadata.\n\nArgs:\n    url: The URL of the webpage to scrape.\n    includeMarkdown: Whether to include markdown content.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string"
+            },
+            "includeMarkdown": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "url"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_serper_search__google_search",
+        "description": "[tool-serper-search/google_search] Tool to perform web searches via Serper API and retrieve rich results. It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.\n\nArgs:\n    q: Search query string\n    gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')\n    hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')\n    location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')\n    num: Number of results to return (default: 10)\n    tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)\n    page: Page number of results to return (default: 1)\n    autocorrect: Whether to autocorrect spelling in query",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "q": {
+              "type": "string"
+            },
+            "gl": {},
+            "hl": {},
+            "location": {},
+            "num": {},
+            "tbs": {},
+            "page": {},
+            "autocorrect": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "q"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_serper_search__scrape",
+        "description": "[tool-serper-search/scrape] Tool to scrape a webpage and retrieve the text and, optionally, the markdown content. It will retrieve also the JSON-LD metadata and the head metadata.\n\nArgs:\n    url: The URL of the webpage to scrape.\n    includeMarkdown: Whether to include markdown content.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string"
+            },
+            "includeMarkdown": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "url"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_code__create_sandbox",
+        "description": "Dataset tool tool_code__create_sandbox.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_code__download_internet_file_to_sandbox",
+        "description": "Dataset tool tool_code__download_internet_file_to_sandbox.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_code__run_command",
+        "description": "Dataset tool tool_code__run_command.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_code__run_python_code",
+        "description": "Dataset tool tool_code__run_python_code.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_code__upload_local_file_to_sandbox",
+        "description": "Dataset tool tool_code__upload_local_file_to_sandbox.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_python__create_sandbox",
+        "description": "Dataset tool tool_python__create_sandbox.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_python__download_internet_file_to_python_interpreter",
+        "description": "Dataset tool tool_python__download_internet_file_to_python_interpreter.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_python__download_internet_file_to_sandbox",
+        "description": "Dataset tool tool_python__download_internet_file_to_sandbox.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_python__run_command",
+        "description": "Dataset tool tool_python__run_command.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_python__run_python_code",
+        "description": "Dataset tool tool_python__run_python_code.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_python__upload_local_file_to_python_interpreter",
+        "description": "Dataset tool tool_python__upload_local_file_to_python_interpreter.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_python__upload_local_file_to_sandbox",
+        "description": "Dataset tool tool_python__upload_local_file_to_sandbox.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_reader__convert_to_markdown",
+        "description": "Dataset tool tool_reader__convert_to_markdown.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_reading__convert_to_markdown",
+        "description": "Dataset tool tool_reading__convert_to_markdown.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_reasoning__reasoning",
+        "description": "Dataset tool tool_reasoning__reasoning.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_transcribe__audio_transcription",
+        "description": "Dataset tool tool_transcribe__audio_transcription.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "tool_vqa__visual_question_answering",
+        "description": "Dataset tool tool_vqa__visual_question_answering.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/nebius_SWE-agent-trajectories/api.py b/datasets/nebius_SWE-agent-trajectories/api.py
deleted file mode 100644
index db3874aa..00000000
--- a/datasets/nebius_SWE-agent-trajectories/api.py
+++ /dev/null
@@ -1,121 +0,0 @@
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str = None,
-    old_str: str = None,
-    new_str: str = None,
-    insert_line: int = None,
-    view_range: list = None,
-) -> None:
-    """View, create, and edit files with this custom editing tool.
-
-    Args:
-    ----
-        command (str): The command to run: view, create, str_replace, insert, or undo_edit.
-        path (str): Path to a file or directory.
-        file_text (str, optional): File contents for create commands.
-        old_str (str, optional): Text to replace for str_replace commands.
-        new_str (str, optional): Replacement or inserted text.
-        insert_line (int, optional): Line number after which to insert.
-        view_range (list, optional): Optional [start, end] line range for view commands.
-
-    """
-    pass
-
-
-def open(path: str, line_number: int = None):
-    """Open the file at the given path in the editor. If line_number is provided, the window will be moved to include that line.
-
-    Args:
-    ----
-        path (str): The path to the file to open.
-        line_number (int, optional): The line number to move the window to (if not provided, the window will start at the top of the file).
-
-    """
-    pass
-
-
-def goto(line_number: int):
-    """Move the window to show the specified line number.
-
-    Args:
-    ----
-        line_number (int): The line number to move the window to.
-
-    """
-    pass
-
-
-def scroll_down():
-    """Move the window down {WINDOW} lines."""
-    pass
-
-
-def scroll_up():
-    """Move the window up {WINDOW} lines."""
-    pass
-
-
-def create(filename: str):
-    """Create and open a new file with the given name.
-
-    Args:
-    ----
-        filename (str): The name of the file to create.
-
-    """
-    pass
-
-
-def submit():
-    """Submit your current code and terminate the interactive session."""
-    pass
-
-
-def search_dir(search_term: str, dir: str = None):
-    """Search for the search_term in all files in the specified directory. If dir is not provided, search in the current directory.
-
-    Args:
-    ----
-        search_term (str): The term to search for.
-        dir (str, optional): The directory to search in (if not provided, search in the current directory).
-
-    """
-    pass
-
-
-def search_file(search_term: str, file: str = None):
-    """Search for the search_term in the specified file. If file is not provided, search in the current open file.
-
-    Args:
-    ----
-        search_term (str): The term to search for.
-        file (str, optional): The file to search in (if not provided, search in the current open file).
-
-    """
-    pass
-
-
-def find_file(file_name: str, dir: str = None):
-    """Find all files with the given name in the specified directory. If dir is not provided, search in the current directory.
-
-    Args:
-    ----
-        file_name (str): The name of the file to search for.
-        dir (str, optional): The directory to search in (if not provided, search in the current directory).
-
-    """
-    pass
-
-
-def edit(start_line: int, end_line: int, replacement_text: str):
-    """Replace lines from start_line to end_line (inclusive) with the given text in the open file. The replacement text is terminated by a line with only 'end_of_edit' on it. All of the replacement text will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again. Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        pr int(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
-
-    Args:
-    ----
-        start_line (int): The line number to start the edit at.
-        end_line (int): The line number to end the edit at (inclusive).
-        replacement_text (str): The text to replace the current selection with.
-
-    """
-    pass
diff --git a/datasets/nebius_SWE-rebench-openhands-trajectories/api.py b/datasets/nebius_SWE-rebench-openhands-trajectories/api.py
deleted file mode 100644
index af32f5f4..00000000
--- a/datasets/nebius_SWE-rebench-openhands-trajectories/api.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from typing import Any
-
-
-def think(thought: str) -> None:
-    """Log reasoning without changing the environment.
-
-    Args:
-    ----
-        thought: The reasoning text to record.
-
-    """
-    pass
-
-
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str | None = None,
-    old_str: str | None = None,
-    new_str: str | None = None,
-    insert_line: int | None = None,
-    view_range: list[int] | None = None,
-) -> None:
-    """View, create, and edit files.
-
-    Args:
-    ----
-        command: The editor command to run.
-        path: Absolute path to the target file or directory.
-        file_text: Content for create operations.
-        old_str: Text to replace for str_replace operations.
-        new_str: Replacement text or insertion text.
-        insert_line: Line after which to insert text.
-        view_range: Optional line range for view operations.
-
-    """
-    pass
-
-
-def task_tracker(command: str, task_list: list[dict[str, Any]]) -> None:
-    """Track task progress.
-
-    Args:
-    ----
-        command: The task tracker command.
-        task_list: The full task list with statuses and notes.
-
-    """
-    pass
diff --git a/datasets/nebius_SWE-rebench-openhands-trajectories/metadata.json b/datasets/nebius_SWE-rebench-openhands-trajectories/metadata.json
new file mode 100644
index 00000000..ddad006a
--- /dev/null
+++ b/datasets/nebius_SWE-rebench-openhands-trajectories/metadata.json
@@ -0,0 +1,80 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "str_replace_editor",
+        "description": "View, create, and edit files.\n\nArgs:\n----\n    command: The editor command to run.\n    path: Absolute path to the target file or directory.\n    file_text: Content for create operations.\n    old_str: Text to replace for str_replace operations.\n    new_str: Replacement text or insertion text.\n    insert_line: Line after which to insert text.\n    view_range: Optional line range for view operations.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "command": {
+              "type": "string"
+            },
+            "path": {
+              "type": "string"
+            },
+            "file_text": {},
+            "old_str": {},
+            "new_str": {},
+            "insert_line": {},
+            "view_range": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "command",
+            "path"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "task_tracker",
+        "description": "Track task progress.\n\nArgs:\n----\n    command: The task tracker command.\n    task_list: The full task list with statuses and notes.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "command": {
+              "type": "string"
+            },
+            "task_list": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "command",
+            "task_list"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "think",
+        "description": "Log reasoning without changing the environment.\n\nArgs:\n----\n    thought: The reasoning text to record.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "thought": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "thought"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/nemotron_terminal_corpus/metadata.json b/datasets/nemotron_terminal_corpus/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/nemotron_terminal_corpus/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/nnetnav-live/api.py b/datasets/nnetnav-live/api.py
deleted file mode 100644
index 3a51f665..00000000
--- a/datasets/nnetnav-live/api.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import time
-from typing import List, Literal
-
-
-def noop(wait_ms: float = 1000) -> None:
-    """Do nothing, and optionally wait for the given time (in milliseconds).
-
-    Args:
-    ----
-        wait_ms (float): Time to wait in milliseconds. Defaults to 1000ms.
-
-    """
-    time.sleep(wait_ms / 1000)
-
-
-def scroll(delta_x: float, delta_y: float) -> None:
-    """Scroll horizontally and vertically.
-
-    Amounts in pixels, positive for right or down scrolling,
-    negative for left or up scrolling. Dispatches a wheel event.
-
-    Args:
-    ----
-        delta_x (float): The distance to scroll horizontally in pixels.
-        delta_y (float): The distance to scroll vertically in pixels.
-
-    """
-    pass
-
-
-def fill(bid: str, value: str) -> None:
-    """Fill out a form field.
-
-    It focuses the element and triggers an input event with the entered text.
-    Works for <input>, <textarea> and [contenteditable] elements.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to fill.
-        value (str): The text to enter into the field.
-
-    """
-    pass
-
-
-def click(
-    bid: str,
-    button: Literal["left", "middle", "right"] = "left",
-    modifiers: List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = None,
-) -> None:
-    """Click an element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to click.
-        button (Literal["left", "middle", "right"]): The mouse button to use. Defaults to "left".
-        modifiers (List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]]):
-            List of modifier keys to hold while clicking. Defaults to None.
-
-    """
-    if modifiers is None:
-        modifiers = []
-    pass
-
-
-def hover(bid: str) -> None:
-    """Hover over an element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to hover over.
-
-    """
-    pass
-
-
-def type(bid: str, text: str, press_enter_after: int = 1) -> None:
-    """Type the given text into an element with the given id.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to type into.
-        text (str): The text to type.
-        press_enter_after (int): Whether to press enter after typing the text. Defaults to 1. 0 means, do not press enter.
-
-    """
-    pass
-
-
-def go_back() -> None:
-    """Navigate to the previous page in history."""
-    pass
-
-
-def go_forward() -> None:
-    """Navigate to the next page in history."""
-    pass
-
-
-def goto(url: str) -> None:
-    """Navigate to a url.
-
-    Args:
-    ----
-        url (str): The URL to navigate to.
-
-    """
-    pass
-
-
-def new_tab() -> None:
-    """Open a new tab."""
-    pass
-
-
-def tab_focus(index: int) -> None:
-    """Bring tab to front (activate tab).
-
-    Args:
-    ----
-        index (int): The index of the tab to focus.
-
-    """
-    pass
-
-
-def tab_close() -> None:
-    """Close the current tab."""
-    pass
-
-
-def keyboard_press(key: str) -> None:
-    """Press a combination of keys.
-
-    Accepts the logical key names that are emitted in the keyboardEvent.key property
-    of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
-    Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
-    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
-    alternatively specify a single character you'd like to produce such as "a" or "#".
-    Following modification shortcuts are also supported: Shift, Control, Alt, Meta,
-    ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux
-    and to Meta on macOS.
-
-    Args:
-    ----
-        key (str): The key or key combination to press.
-
-    """
-    pass
diff --git a/datasets/nnetnav-live/metadata.json b/datasets/nnetnav-live/metadata.json
new file mode 100644
index 00000000..d35808f5
--- /dev/null
+++ b/datasets/nnetnav-live/metadata.json
@@ -0,0 +1,97 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click an element.\n\nArgs:\n----\n    bid (str): The browser ID of the element to click.\n    button (Literal[\"left\", \"middle\", \"right\"]): The mouse button to use. Defaults to \"left\".\n    modifiers (List[Literal[\"Alt\", \"Control\", \"ControlOrMeta\", \"Meta\", \"Shift\"]]):\n        List of modifier keys to hold while clicking. Defaults to None.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "button": {
+              "type": "string",
+              "enum": [
+                "left",
+                "middle",
+                "right"
+              ]
+            },
+            "modifiers": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "go_back",
+        "description": "Navigate to the previous page in history.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "scroll",
+        "description": "Scroll horizontally and vertically.\n\nAmounts in pixels, positive for right or down scrolling,\nnegative for left or up scrolling. Dispatches a wheel event.\n\nArgs:\n----\n    delta_x (float): The distance to scroll horizontally in pixels.\n    delta_y (float): The distance to scroll vertically in pixels.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "delta_x": {
+              "type": "number"
+            },
+            "delta_y": {
+              "type": "number"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "delta_x",
+            "delta_y"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "type",
+        "description": "Type the given text into an element with the given id.\n\nArgs:\n----\n    bid (str): The browser ID of the element to type into.\n    text (str): The text to type.\n    press_enter_after (int): Whether to press enter after typing the text. Defaults to 1. 0 means, do not press enter.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "text": {
+              "type": "string"
+            },
+            "press_enter_after": {
+              "type": "integer"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid",
+            "text"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/nnetnav-wa/api.py b/datasets/nnetnav-wa/api.py
deleted file mode 100644
index 3a51f665..00000000
--- a/datasets/nnetnav-wa/api.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import time
-from typing import List, Literal
-
-
-def noop(wait_ms: float = 1000) -> None:
-    """Do nothing, and optionally wait for the given time (in milliseconds).
-
-    Args:
-    ----
-        wait_ms (float): Time to wait in milliseconds. Defaults to 1000ms.
-
-    """
-    time.sleep(wait_ms / 1000)
-
-
-def scroll(delta_x: float, delta_y: float) -> None:
-    """Scroll horizontally and vertically.
-
-    Amounts in pixels, positive for right or down scrolling,
-    negative for left or up scrolling. Dispatches a wheel event.
-
-    Args:
-    ----
-        delta_x (float): The distance to scroll horizontally in pixels.
-        delta_y (float): The distance to scroll vertically in pixels.
-
-    """
-    pass
-
-
-def fill(bid: str, value: str) -> None:
-    """Fill out a form field.
-
-    It focuses the element and triggers an input event with the entered text.
-    Works for <input>, <textarea> and [contenteditable] elements.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to fill.
-        value (str): The text to enter into the field.
-
-    """
-    pass
-
-
-def click(
-    bid: str,
-    button: Literal["left", "middle", "right"] = "left",
-    modifiers: List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = None,
-) -> None:
-    """Click an element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to click.
-        button (Literal["left", "middle", "right"]): The mouse button to use. Defaults to "left".
-        modifiers (List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]]):
-            List of modifier keys to hold while clicking. Defaults to None.
-
-    """
-    if modifiers is None:
-        modifiers = []
-    pass
-
-
-def hover(bid: str) -> None:
-    """Hover over an element.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to hover over.
-
-    """
-    pass
-
-
-def type(bid: str, text: str, press_enter_after: int = 1) -> None:
-    """Type the given text into an element with the given id.
-
-    Args:
-    ----
-        bid (str): The browser ID of the element to type into.
-        text (str): The text to type.
-        press_enter_after (int): Whether to press enter after typing the text. Defaults to 1. 0 means, do not press enter.
-
-    """
-    pass
-
-
-def go_back() -> None:
-    """Navigate to the previous page in history."""
-    pass
-
-
-def go_forward() -> None:
-    """Navigate to the next page in history."""
-    pass
-
-
-def goto(url: str) -> None:
-    """Navigate to a url.
-
-    Args:
-    ----
-        url (str): The URL to navigate to.
-
-    """
-    pass
-
-
-def new_tab() -> None:
-    """Open a new tab."""
-    pass
-
-
-def tab_focus(index: int) -> None:
-    """Bring tab to front (activate tab).
-
-    Args:
-    ----
-        index (int): The index of the tab to focus.
-
-    """
-    pass
-
-
-def tab_close() -> None:
-    """Close the current tab."""
-    pass
-
-
-def keyboard_press(key: str) -> None:
-    """Press a combination of keys.
-
-    Accepts the logical key names that are emitted in the keyboardEvent.key property
-    of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
-    Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
-    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
-    alternatively specify a single character you'd like to produce such as "a" or "#".
-    Following modification shortcuts are also supported: Shift, Control, Alt, Meta,
-    ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux
-    and to Meta on macOS.
-
-    Args:
-    ----
-        key (str): The key or key combination to press.
-
-    """
-    pass
diff --git a/datasets/nnetnav-wa/metadata.json b/datasets/nnetnav-wa/metadata.json
new file mode 100644
index 00000000..30723c8c
--- /dev/null
+++ b/datasets/nnetnav-wa/metadata.json
@@ -0,0 +1,93 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click an element.\n\nArgs:\n----\n    bid (str): The browser ID of the element to click.\n    button (Literal[\"left\", \"middle\", \"right\"]): The mouse button to use. Defaults to \"left\".\n    modifiers (List[Literal[\"Alt\", \"Control\", \"ControlOrMeta\", \"Meta\", \"Shift\"]]):\n        List of modifier keys to hold while clicking. Defaults to None.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "button": {
+              "type": "string",
+              "enum": [
+                "left",
+                "middle",
+                "right"
+              ]
+            },
+            "modifiers": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "goto",
+        "description": "Navigate to a url.\n\nArgs:\n----\n    url (str): The URL to navigate to.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "url"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "new_tab",
+        "description": "Open a new tab.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "type",
+        "description": "Type the given text into an element with the given id.\n\nArgs:\n----\n    bid (str): The browser ID of the element to type into.\n    text (str): The text to type.\n    press_enter_after (int): Whether to press enter after typing the text. Defaults to 1. 0 means, do not press enter.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "text": {
+              "type": "string"
+            },
+            "press_enter_after": {
+              "type": "integer"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid",
+            "text"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/nvidia_SWE-Zero-openhands-trajectories/api.py b/datasets/nvidia_SWE-Zero-openhands-trajectories/api.py
deleted file mode 100644
index 350814c2..00000000
--- a/datasets/nvidia_SWE-Zero-openhands-trajectories/api.py
+++ /dev/null
@@ -1,34 +0,0 @@
-def think(thought: str) -> None:
-    """Log reasoning without changing the environment.
-
-    Args:
-    ----
-        thought: The reasoning text to record.
-
-    """
-    pass
-
-
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str | None = None,
-    old_str: str | None = None,
-    new_str: str | None = None,
-    insert_line: int | None = None,
-    view_range: list[int] | None = None,
-) -> None:
-    """View, create, and edit files.
-
-    Args:
-    ----
-        command: The editor command to run.
-        path: Absolute path to the target file or directory.
-        file_text: Content for create operations.
-        old_str: Text to replace for str_replace operations.
-        new_str: Replacement text or insertion text.
-        insert_line: Line after which to insert text.
-        view_range: Optional line range for view operations.
-
-    """
-    pass
diff --git a/datasets/nvidia_SWE-Zero-openhands-trajectories/metadata.json b/datasets/nvidia_SWE-Zero-openhands-trajectories/metadata.json
new file mode 100644
index 00000000..494c2acc
--- /dev/null
+++ b/datasets/nvidia_SWE-Zero-openhands-trajectories/metadata.json
@@ -0,0 +1,57 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "str_replace_editor",
+        "description": "View, create, and edit files.\n\nArgs:\n----\n    command: The editor command to run.\n    path: Absolute path to the target file or directory.\n    file_text: Content for create operations.\n    old_str: Text to replace for str_replace operations.\n    new_str: Replacement text or insertion text.\n    insert_line: Line after which to insert text.\n    view_range: Optional line range for view operations.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "command": {
+              "type": "string"
+            },
+            "path": {
+              "type": "string"
+            },
+            "file_text": {},
+            "old_str": {},
+            "new_str": {},
+            "insert_line": {},
+            "view_range": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "command",
+            "path"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "think",
+        "description": "Log reasoning without changing the environment.\n\nArgs:\n----\n    thought: The reasoning text to record.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "thought": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "thought"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/omniact/metadata.json b/datasets/omniact/metadata.json
new file mode 100644
index 00000000..71eea4f8
--- /dev/null
+++ b/datasets/omniact/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "python"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/openhands/api.py b/datasets/openhands/api.py
deleted file mode 100644
index 9b428935..00000000
--- a/datasets/openhands/api.py
+++ /dev/null
@@ -1,366 +0,0 @@
-from typing import List, Literal, Union
-
-
-def initialize(env_vars: dict):
-    """Set environment variables.
-
-    Args:
-    ----
-        env_vars (dict): The environment variables.
-
-    """
-    pass
-
-
-def change_agent_state(agent_state: str):
-    """Change the agent state.
-
-    Args:
-    ----
-        agent_state (str): The new agent state.
-
-    """
-    pass
-
-
-def delegate_to_agent(agent: str, task: str):
-    """Delegate the task to the BrowsingAgent.
-
-    Args:
-    ----
-        agent (str): The agent to delegate the task to.
-        task (str): task description.
-
-    """
-    pass
-
-
-def delegate_to_CrawlAgent(task: str, link: str):
-    """Delegate the task to the CrawlAgent.
-
-    Args:
-    ----
-        task (str): task description.
-        link (str): the link to crawl.
-
-    """
-    pass
-
-
-def delegate_to_RagAgent(task: str, query: str):
-    """Delegate the task to the RagAgent.
-
-    Args:
-    ----
-        task (str): task description.
-        query (str): the query to search.
-
-    """
-    pass
-
-
-def finish(output: str):
-    """Finish the task.
-
-    Args:
-    ----
-        output (str): The output of the task.
-
-    """
-    pass
-
-
-def add_task(goal: str):
-    """Add a task to the task planner.
-
-    Args:
-    ----
-        goal (str): The goal of the task.
-
-    """
-    pass
-
-
-def modify_task(task_id: str, state: str):
-    """Modify the state of a task.
-
-    Args:
-    ----
-        task_id (str): The id of the task.
-        state (str): The new state of the task.
-
-    """
-    pass
-
-
-def save_plan(plan: list[str]):
-    """Save the plan.
-
-    Args:
-    ----
-        plan (list[str]): The plan to save.
-
-    """
-    pass
-
-
-def task_plan(task: str, plan: str):
-    """Plan a task.
-
-    Args:
-    ----
-        task (str): The task to plan.
-        plan (list[str]): The plan.
-
-    """
-    pass
-
-
-def edit(path: str, content: str, start: int, end: int):
-    """Edit a file.
-
-    Args:
-    ----
-        path (str): The path of the file.
-        content (str): The new content of the file.
-        start (int): The start position of the edit.
-        end (int): The end position of the edit.
-
-    """
-    pass
-
-
-def read(path: str, start: int, end: int):
-    """Read a file.
-
-    Args:
-    ----
-        path (str): The path of the file.
-        start (int): The start position of the read.
-        end (int): The end position of the read.
-
-    """
-    pass
-
-
-def crawl(link: str):
-    """Crawl a webpage.
-
-    Args:
-    ----
-        link (str): The link to crawl.
-
-    """
-    pass
-
-
-def rag_search(query: str):
-    """Search using the RAG model.
-
-    Args:
-    ----
-        query (str): The query to search.
-
-    """
-    pass
-
-
-def goto(url: str):
-    """Navigate to a URL.
-
-    Args:
-    ----
-        url (str): The target URL to visit.
-
-    Example: goto('http://www.example.com')
-
-    """
-    pass
-
-
-def go_back():
-    """Navigate to the previous page in history.
-
-    Example: go_back()
-    """
-    pass
-
-
-def go_forward():
-    """Navigate to the next page in history.
-
-    Example: go_forward()
-    """
-    pass
-
-
-def noop(wait_ms: float = 1000):
-    """Do nothing and optionally wait.
-
-    Args:
-    ----
-        wait_ms (float, optional): Time to wait in milliseconds (default is 1000).
-
-    Example: noop()
-             noop(500)
-
-    """
-    pass
-
-
-def scroll(delta_x: float, delta_y: float):
-    """Scroll horizontally and/or vertically in pixels.
-
-    Args:
-    ----
-        delta_x (float): Horizontal scroll in pixels.
-        delta_y (float): Vertical scroll in pixels.
-
-    """
-    pass
-
-
-def fill(bid: str, value: str):
-    """Fill a form field with text.
-
-    Args:
-    ----
-        bid (str): Element ID to fill.
-        value (str): Text to input.
-
-    """
-    pass
-
-
-def select_option(bid: str, options: Union[str, List[str]]):
-    """Select one or more options in a dropdown/select element.
-
-    Args:
-    ----
-        bid (str): The element ID.
-        options (str or list[str]): One or more option values or labels to select.
-
-    """
-    pass
-
-
-def click(
-    bid: str,
-    button: Literal["left", "middle", "right"] = "left",
-    modifiers: List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
-):
-    """Click an element.
-
-    Args:
-    ----
-        bid (str): Element ID to click.
-        button (str, optional): Mouse button to use.
-        modifiers (list, optional): List of modifier keys.
-
-    """
-    pass
-
-
-def dblclick(
-    bid: str,
-    button: Literal["left", "middle", "right"] = "left",
-    modifiers: List[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
-):
-    """Double-click an element.
-
-    Args:
-    ----
-        bid (str): Element ID to double-click.
-        button (str, optional): Mouse button to use.
-        modifiers (list, optional): List of modifier keys.
-
-    """
-    pass
-
-
-def hover(bid: str):
-    """Hover over an element.
-
-    Args:
-    ----
-        bid (str): Element ID to hover on.
-
-    Example: hover('b8')
-
-    """
-    pass
-
-
-def press(bid: str, key_comb: str):
-    """Press a key combination on a focused element.
-
-    Args:
-    ----
-        bid (str): Element ID to focus.
-        key_comb (str): Key combination to simulate.
-
-    """
-    pass
-
-
-def focus(bid: str):
-    """Focus the specified element.
-
-    Args:
-    ----
-        bid (str): Element ID to focus.
-
-    Example: focus('b455')
-
-    """
-    pass
-
-
-def clear(bid: str):
-    """Clear the value of an input field.
-
-    Args:
-    ----
-        bid (str): Element ID to clear.
-
-    Example: clear('996')
-
-    """
-    pass
-
-
-def drag_and_drop(from_bid: str, to_bid: str):
-    """Perform drag-and-drop from one element to another.
-
-    Args:
-    ----
-        from_bid (str): Source element ID.
-        to_bid (str): Target element ID.
-
-    """
-    pass
-
-
-def upload_file(bid: str, file: Union[str, List[str]]):
-    """Upload file(s) via a file input element.
-
-    Args:
-    ----
-        bid (str): Element ID to click for upload.
-        file (str or list[str]): Path(s) to file(s) to upload.
-
-    """
-    pass
-
-
-def send_msg_to_user(msg: str):
-    """Send a message to the user.
-
-    Args:
-    ----
-        msg (str): The message content to send.
-
-    Example: send_msg_to_user('The number of stars for the React repository on GitHub is 225k.')
-
-    """
-    pass
diff --git a/datasets/openhands/metadata.json b/datasets/openhands/metadata.json
new file mode 100644
index 00000000..da4d749f
--- /dev/null
+++ b/datasets/openhands/metadata.json
@@ -0,0 +1,207 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click an element.\n\nArgs:\n----\n    bid (str): Element ID to click.\n    button (str, optional): Mouse button to use.\n    modifiers (list, optional): List of modifier keys.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            },
+            "button": {
+              "type": "string",
+              "enum": [
+                "left",
+                "middle",
+                "right"
+              ]
+            },
+            "modifiers": {
+              "type": "array"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "crawl",
+        "description": "Crawl a webpage.\n\nArgs:\n----\n    link (str): The link to crawl.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "link": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "link"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "delegate_to_CrawlAgent",
+        "description": "Delegate the task to the CrawlAgent.\n\nArgs:\n----\n    task (str): task description.\n    link (str): the link to crawl.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "task": {
+              "type": "string"
+            },
+            "link": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "task",
+            "link"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "delegate_to_RagAgent",
+        "description": "Delegate the task to the RagAgent.\n\nArgs:\n----\n    task (str): task description.\n    query (str): the query to search.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "task": {
+              "type": "string"
+            },
+            "query": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "task",
+            "query"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "delegate_to_agent",
+        "description": "Delegate the task to the BrowsingAgent.\n\nArgs:\n----\n    agent (str): The agent to delegate the task to.\n    task (str): task description.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "agent": {
+              "type": "string"
+            },
+            "task": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "agent",
+            "task"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "goto",
+        "description": "Navigate to a URL.\n\nArgs:\n----\n    url (str): The target URL to visit.\n\nExample: goto('http://www.example.com')",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "url"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "rag_search",
+        "description": "Search using the RAG model.\n\nArgs:\n----\n    query (str): The query to search.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "query": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "query"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "scroll",
+        "description": "Scroll horizontally and/or vertically in pixels.\n\nArgs:\n----\n    delta_x (float): Horizontal scroll in pixels.\n    delta_y (float): Vertical scroll in pixels.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "delta_x": {
+              "type": "number"
+            },
+            "delta_y": {
+              "type": "number"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "delta_x",
+            "delta_y"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "send_msg_to_user",
+        "description": "Send a message to the user.\n\nArgs:\n----\n    msg (str): The message content to send.\n\nExample: send_msg_to_user('The number of stars for the React repository on GitHub is 225k.')",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "msg": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "msg"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash",
+    "python"
+  ],
+  "browser_enabled": true
+}
diff --git a/datasets/openresearcher/api.py b/datasets/openresearcher/api.py
deleted file mode 100644
index c78c3b24..00000000
--- a/datasets/openresearcher/api.py
+++ /dev/null
@@ -1,20 +0,0 @@
-def search(query: str, topn: int = 10, source: str | None = None) -> None:
-    """Search for information related to a query and return ranked results."""
-    pass
-
-
-def open(
-    cursor: int = -1,
-    id: int | str = -1,
-    loc: int = -1,
-    num_lines: int = -1,
-    source: str | None = None,
-    view_source: bool = False,
-) -> None:
-    """Open a search result, URL, local resource, or cursor location."""
-    pass
-
-
-def find(pattern: str, cursor: int = -1) -> None:
-    """Find exact matches of a pattern in the current page or cursor."""
-    pass
diff --git a/datasets/openresearcher/metadata.json b/datasets/openresearcher/metadata.json
new file mode 100644
index 00000000..337a253e
--- /dev/null
+++ b/datasets/openresearcher/metadata.json
@@ -0,0 +1,78 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "find",
+        "description": "Find exact matches of a pattern in the current page or cursor.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "pattern": {
+              "type": "string"
+            },
+            "cursor": {
+              "type": "integer"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "pattern"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "open",
+        "description": "Open a search result, URL, local resource, or cursor location.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "cursor": {
+              "type": "integer"
+            },
+            "id": {},
+            "loc": {
+              "type": "integer"
+            },
+            "num_lines": {
+              "type": "integer"
+            },
+            "source": {},
+            "view_source": {
+              "type": "boolean"
+            }
+          },
+          "additionalProperties": false
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "search",
+        "description": "Search for information related to a query and return ranked results.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "query": {
+              "type": "string"
+            },
+            "topn": {
+              "type": "integer"
+            },
+            "source": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "query"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/openthoughts_tb_dev/metadata.json b/datasets/openthoughts_tb_dev/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/openthoughts_tb_dev/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/orca_agentinstruct/metadata.json b/datasets/orca_agentinstruct/metadata.json
new file mode 100644
index 00000000..dfa26e78
--- /dev/null
+++ b/datasets/orca_agentinstruct/metadata.json
@@ -0,0 +1,5 @@
+{
+  "custom_tools": [],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/scale_swe_distilled/api.py b/datasets/scale_swe_distilled/api.py
deleted file mode 100644
index 8b2502b6..00000000
--- a/datasets/scale_swe_distilled/api.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from typing import List, Optional
-
-
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: Optional[str] = None,
-    old_str: Optional[str] = None,
-    new_str: Optional[str] = None,
-    insert_line: Optional[int] = None,
-    view_range: Optional[List[int]] = None,
-) -> None:
-    """View, create, and edit files with the OpenHands string replacement editor.
-
-    Args:
-    ----
-        command: Editor command such as view, create, str_replace, insert, or undo_edit.
-        path: Absolute path to the file or directory to operate on.
-        file_text: Content to write when creating a file.
-        old_str: Existing text to replace.
-        new_str: Replacement text or text to insert.
-        insert_line: Line number after which to insert new_str.
-        view_range: Optional 1-indexed inclusive line range for view commands.
-
-    """
-    pass
-
-
-def finish(message: str, task_completed: str) -> None:
-    """Finish the task.
-
-    Args:
-    ----
-        message: Final response to the user.
-        task_completed: Whether the task was completed successfully.
-
-    """
-    pass
diff --git a/datasets/scale_swe_distilled/metadata.json b/datasets/scale_swe_distilled/metadata.json
new file mode 100644
index 00000000..beab6bc9
--- /dev/null
+++ b/datasets/scale_swe_distilled/metadata.json
@@ -0,0 +1,67 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "finish",
+        "description": "Finish the task.\n\nArgs:\n----\n    message: Final response to the user.\n    task_completed: Whether the task was completed successfully.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "message": {
+              "type": "string"
+            },
+            "task_completed": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "message",
+            "task_completed"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "str_replace_editor",
+        "description": "View, create, and edit files with the OpenHands string replacement editor.\n\nArgs:\n----\n    command: Editor command such as view, create, str_replace, insert, or undo_edit.\n    path: Absolute path to the file or directory to operate on.\n    file_text: Content to write when creating a file.\n    old_str: Existing text to replace.\n    new_str: Replacement text or text to insert.\n    insert_line: Line number after which to insert new_str.\n    view_range: Optional 1-indexed inclusive line range for view commands.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "command": {
+              "type": "string"
+            },
+            "path": {
+              "type": "string"
+            },
+            "file_text": {
+              "type": "string"
+            },
+            "old_str": {
+              "type": "string"
+            },
+            "new_str": {
+              "type": "string"
+            },
+            "insert_line": {
+              "type": "integer"
+            },
+            "view_range": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "command",
+            "path"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/screenagent/metadata.json b/datasets/screenagent/metadata.json
new file mode 100644
index 00000000..dfa26e78
--- /dev/null
+++ b/datasets/screenagent/metadata.json
@@ -0,0 +1,5 @@
+{
+  "custom_tools": [],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/swe-gym_openhands_sampled_trajectories/api.py b/datasets/swe-gym_openhands_sampled_trajectories/api.py
deleted file mode 100644
index 4a1c4c7a..00000000
--- a/datasets/swe-gym_openhands_sampled_trajectories/api.py
+++ /dev/null
@@ -1,23 +0,0 @@
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str = None,
-    old_str: str = None,
-    new_str: str = None,
-    insert_line: int = None,
-    view_range: list = None,
-) -> None:
-    """View, create, and edit files with this custom editing tool.
-
-    Args:
-    ----
-        command (str): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
-        path (str): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
-        file_text (str): Required parameter of `create` command, with the content of the file to be created.
-        old_str (str): Required parameter of `str_replace` command containing the string in `path` to replace.
-        new_str (str): Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
-        insert_line (int): Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
-        view_range (list): Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.
-
-    """
-    pass
diff --git a/datasets/swe-gym_openhands_sampled_trajectories/metadata.json b/datasets/swe-gym_openhands_sampled_trajectories/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/swe-gym_openhands_sampled_trajectories/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/swe-play-trajectories/api.py b/datasets/swe-play-trajectories/api.py
deleted file mode 100644
index e9564e73..00000000
--- a/datasets/swe-play-trajectories/api.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""API definitions for SWE-Play-trajectories dataset.
-
-These functions represent the OpenHands tools used as ApiAction in the dataset.
-Note: execute_bash and execute_ipython_cell are converted to CodeAction.
-"""
-
-from typing import List, Literal, Optional
-
-
-def str_replace_editor(
-    command: Literal["view", "create", "str_replace", "insert", "undo_edit"],
-    path: str,
-    file_text: Optional[str] = None,
-    old_str: Optional[str] = None,
-    new_str: Optional[str] = None,
-    insert_line: Optional[int] = None,
-    view_range: Optional[List[int]] = None,
-) -> str:
-    """View, create, and edit files.
-
-    Args:
-        command (str): The command to execute. One of: view, create, str_replace, insert, undo_edit.
-        path (str): The absolute path to the file or directory.
-        file_text (str, optional): Required for 'create' command. The content of the file to create.
-        old_str (str, optional): Required for 'str_replace' command. The string to replace.
-        new_str (str, optional): Required for 'str_replace' command. The new string to insert.
-        insert_line (int, optional): Required for 'insert' command. Line number to insert at.
-        view_range (list, optional): Optional for 'view' command. [start_line, end_line] to view.
-
-    Returns:
-        str: The result of the operation.
-
-    Example:
-        str_replace_editor(command="view", path="/workspace/file.py")
-        str_replace_editor(command="create", path="/workspace/new.py", file_text="print('hello')")
-        str_replace_editor(command="str_replace", path="/workspace/file.py", old_str="old", new_str="new")
-
-    """
-    pass
-
-
-def think(thought: str) -> None:
-    """Use this tool to think through a problem step by step.
-
-    Args:
-        thought (str): The thought or reasoning to record.
-
-    Example:
-        think("I need to first understand the existing code structure before making changes.")
-
-    """
-    pass
-
-
-def finish(message: str, task_completed: Literal["true", "false"] = "true") -> None:
-    """Finish the task and provide a final message.
-
-    Args:
-        message (str): A summary message about the task completion.
-        task_completed (str): Whether the task was successfully completed. "true" or "false".
-
-    Example:
-        finish(message="Successfully implemented the feature.", task_completed="true")
-
-    """
-    pass
diff --git a/datasets/swe-play-trajectories/metadata.json b/datasets/swe-play-trajectories/metadata.json
new file mode 100644
index 00000000..4f93a52b
--- /dev/null
+++ b/datasets/swe-play-trajectories/metadata.json
@@ -0,0 +1,62 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "str_replace_editor",
+        "description": "View, create, and edit files.\n\nArgs:\n    command (str): The command to execute. One of: view, create, str_replace, insert, undo_edit.\n    path (str): The absolute path to the file or directory.\n    file_text (str, optional): Required for 'create' command. The content of the file to create.\n    old_str (str, optional): Required for 'str_replace' command. The string to replace.\n    new_str (str, optional): Required for 'str_replace' command. The new string to insert.\n    insert_line (int, optional): Required for 'insert' command. Line number to insert at.\n    view_range (list, optional): Optional for 'view' command. [start_line, end_line] to view.\n\nReturns:\n    str: The result of the operation.\n\nExample:\n    str_replace_editor(command=\"view\", path=\"/workspace/file.py\")\n    str_replace_editor(command=\"create\", path=\"/workspace/new.py\", file_text=\"print('hello')\")\n    str_replace_editor(command=\"str_replace\", path=\"/workspace/file.py\", old_str=\"old\", new_str=\"new\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "command": {},
+            "path": {
+              "type": "string"
+            },
+            "file_text": {
+              "type": "string"
+            },
+            "old_str": {
+              "type": "string"
+            },
+            "new_str": {
+              "type": "string"
+            },
+            "insert_line": {
+              "type": "integer"
+            },
+            "view_range": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "command",
+            "path"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "think",
+        "description": "Use this tool to think through a problem step by step.\n\nArgs:\n    thought (str): The thought or reasoning to record.\n\nExample:\n    think(\"I need to first understand the existing code structure before making changes.\")",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "thought": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "thought"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [
+    "bash",
+    "python"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/swe-smith/api.py b/datasets/swe-smith/api.py
deleted file mode 100644
index 2b6d49c4..00000000
--- a/datasets/swe-smith/api.py
+++ /dev/null
@@ -1,31 +0,0 @@
-def str_replace_editor(
-    command: str,
-    path: str,
-    file_text: str = None,
-    old_str: str = None,
-    new_str: str = None,
-    insert_line: int = None,
-    view_range: list = None,
-) -> None:
-    """View, create, and edit files with this custom editing tool.
-
-    Args:
-    ----
-        command (str): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
-        path (str): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
-        file_text (str): Required parameter of `create` command, with the content of the file to be created.
-        old_str (str): Required parameter of `str_replace` command containing the string in `path` to replace.
-        new_str (str): Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
-        insert_line (int): Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
-        view_range (list): Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.
-
-    """
-    pass
-
-
-def submit():
-    """Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.
-
-    No parameters are required for this function.
-    """
-    pass
diff --git a/datasets/swe-smith/metadata.json b/datasets/swe-smith/metadata.json
new file mode 100644
index 00000000..41212e45
--- /dev/null
+++ b/datasets/swe-smith/metadata.json
@@ -0,0 +1,7 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false
+}
diff --git a/datasets/synatra/api.py b/datasets/synatra/api.py
deleted file mode 100644
index abfc6794..00000000
--- a/datasets/synatra/api.py
+++ /dev/null
@@ -1,82 +0,0 @@
-def scroll(delta_x: float, delta_y: float):
-    """Scroll horizontally and/or vertically in pixels.
-
-    Args:
-    ----
-        delta_x (float): Horizontal scroll in pixels.
-        delta_y (float): Vertical scroll in pixels.
-
-    """
-    pass
-
-
-def type(bid: str, text: str) -> None:
-    """Type the given text into an element with the given id.
-
-    Args:
-    ----
-        bid (str): The id of the element to type into.
-        text (str): The text to type.
-
-    """
-    pass
-
-
-def press(key_comb: str) -> None:
-    """Press a key combination.
-
-    Args:
-    ----
-        key_comb (str): The key combination to press. For example, "Ctrl+C". This is system specific.
-
-    """
-    pass
-
-
-def stop(answer: str = "") -> None:
-    """Stop the execution of the trajectory.
-
-    Args:
-    ----
-        answer (str): The answer to the question. This is optional since a task may not require an answer.
-
-    """
-    pass
-
-
-def new_tab(url: str) -> None:
-    """Open a new tab with the given URL.
-
-    Args:
-    ----
-        url (str): The URL to open in the new tab.
-
-    """
-    pass
-
-
-def tab_focus(page_number: int) -> None:
-    """Focus on the tab with the given page_number.
-
-    Args:
-    ----
-        page_number (int): The page_number of the tab to focus on. Starts from 0.
-
-    """
-    pass
-
-
-def close_tab() -> None:
-    """Close the current tab."""
-    pass
-
-
-def click(bid: str) -> None:
-    """Click on an element with the given id.
-
-    Args:
-    ----
-        bid (str): The id of the element to click on.
-
-    """
-    pass
diff --git a/datasets/synatra/metadata.json b/datasets/synatra/metadata.json
new file mode 100644
index 00000000..1c3e4db2
--- /dev/null
+++ b/datasets/synatra/metadata.json
@@ -0,0 +1,25 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click on an element with the given id.\n\nArgs:\n----\n    bid (str): The id of the element to click on.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "bid": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "bid"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/toolmind/api.py b/datasets/toolmind/api.py
deleted file mode 100644
index 03bbda7a..00000000
--- a/datasets/toolmind/api.py
+++ /dev/null
@@ -1,33 +0,0 @@
-def get_current_time(timezone):
-    """Return the current time for a timezone in the ToolMind sample."""
-    return None
-
-
-def get_date_and_time_by_city(city):
-    """Return the current date and time for a city in the ToolMind sample."""
-    return None
-
-
-def calculate_grid_coordinate_points():
-    """Stub for the advertised ToolMind tool."""
-    return None
-
-
-def get_random_quote():
-    """Stub for the advertised ToolMind tool."""
-    return None
-
-
-def search_books(keyword):
-    """Stub for the advertised ToolMind tool."""
-    return None
-
-
-def sha_text_hash():
-    """Stub for the advertised ToolMind tool."""
-    return None
-
-
-def trend_id():
-    """Stub for the advertised ToolMind tool."""
-    return None
diff --git a/datasets/toolmind/metadata.json b/datasets/toolmind/metadata.json
new file mode 100644
index 00000000..b5a0912c
--- /dev/null
+++ b/datasets/toolmind/metadata.json
@@ -0,0 +1,100 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_time",
+        "description": "Return the current time for a timezone in the ToolMind sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "timezone": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "timezone"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_date_and_time_by_city",
+        "description": "Return the current date and time for a city in the ToolMind sample.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {}
+          },
+          "additionalProperties": false,
+          "required": [
+            "city"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "calculate_grid_coordinate_points",
+        "description": "Dataset tool calculate_grid_coordinate_points.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "get_random_quote",
+        "description": "Dataset tool get_random_quote.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "search_books",
+        "description": "Dataset tool search_books.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "sha_text_hash",
+        "description": "Dataset tool sha_text_hash.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "trend_id",
+        "description": "Dataset tool trend_id.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/toucan_1_5m/api.py b/datasets/toucan_1_5m/api.py
deleted file mode 100644
index 94b26323..00000000
--- a/datasets/toucan_1_5m/api.py
+++ /dev/null
@@ -1,165 +0,0 @@
-from typing import Optional
-
-
-def web_search_exa(query: str, numResults: int = 5) -> dict:
-    """Search the web using Exa AI - performs real-time web searches and can scrape content from specific URLs.
-
-    Args:
-    ----
-        query: Search query
-        numResults: Number of search results to return (default: 5)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def research_paper_search_exa(query: str, numResults: int = 5) -> dict:
-    """Search for academic papers and research using Exa AI - specializes in finding scholarly articles, research papers, and academic content.
-
-    Args:
-    ----
-        query: Research paper search query
-        numResults: Number of research papers to return (default: 5)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def company_research_exa(companyName: str, numResults: int = 5) -> dict:
-    """Research companies using Exa AI - finds comprehensive information about businesses, organizations, and corporations.
-
-    Args:
-    ----
-        companyName: Name of the company to research
-        numResults: Number of search results to return (default: 5)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def crawling_exa(url: str, maxCharacters: int = 3000) -> dict:
-    """Extract and crawl content from specific URLs using Exa AI - retrieves full text content, metadata, and structured information from web pages.
-
-    Args:
-    ----
-        url: URL to crawl and extract content from
-        maxCharacters: Maximum characters to extract (default: 3000)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def competitor_finder_exa(
-    companyName: str, industry: Optional[str] = None, numResults: int = 5
-) -> dict:
-    """Find competitors for a business using Exa AI - identifies similar companies, competitive landscape analysis, and market positioning.
-
-    Args:
-    ----
-        companyName: Name of the company to find competitors for
-        industry: Industry sector (optional, helps narrow search)
-        numResults: Number of competitors to find (default: 5)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def linkedin_search_exa(query: str, searchType: str = "all", numResults: int = 5) -> dict:
-    """Search LinkedIn profiles and companies using Exa AI - finds professional profiles, company pages, and business-related content on LinkedIn.
-
-    Args:
-    ----
-        query: LinkedIn search query (e.g., person name, company, job title)
-        searchType: Type of LinkedIn content to search (default: all)
-        numResults: Number of LinkedIn results to return (default: 5)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def wikipedia_search_exa(query: str, numResults: int = 5) -> dict:
-    """Search Wikipedia articles using Exa AI - finds comprehensive, factual information from Wikipedia entries.
-
-    Args:
-    ----
-        query: Wikipedia search query (topic, person, place, concept, etc.)
-        numResults: Number of Wikipedia articles to return (default: 5)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def github_search_exa(query: str, searchType: str = "all", numResults: int = 5) -> dict:
-    """Search GitHub repositories and code using Exa AI - finds repositories, code snippets, documentation, and developer profiles on GitHub.
-
-    Args:
-    ----
-        query: GitHub search query (repository name, programming language, username, etc.)
-        searchType: Type of GitHub content to search (default: all)
-        numResults: Number of GitHub results to return (default: 5)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def deep_researcher_start(instructions: str, model: str = "exa-research") -> dict:
-    """Start a comprehensive AI-powered deep research task for complex queries.
-
-    Args:
-    ----
-        instructions: Complex research question or detailed instructions for the AI researcher
-        model: Research model: 'exa-research' or 'exa-research-pro' (default: exa-research)
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
-
-
-def deep_researcher_check(taskId: str) -> dict:
-    """Check the status and retrieve results of a deep research task.
-
-    Args:
-    ----
-        taskId: The task ID returned from deep_researcher_start tool
-
-    Returns:
-    -------
-        dict: Response from the API
-
-    """
-    pass
diff --git a/datasets/toucan_1_5m/metadata.json b/datasets/toucan_1_5m/metadata.json
new file mode 100644
index 00000000..2741f8bc
--- /dev/null
+++ b/datasets/toucan_1_5m/metadata.json
@@ -0,0 +1,135 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "company_research_exa",
+        "description": "Research companies using Exa AI - finds comprehensive information about businesses, organizations, and corporations.\n\nArgs:\n----\n    companyName: Name of the company to research\n    numResults: Number of search results to return (default: 5)\n\nReturns:\n-------\n    dict: Response from the API",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "companyName": {
+              "type": "string"
+            },
+            "numResults": {
+              "type": "integer"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "companyName"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "linkedin_search_exa",
+        "description": "Search LinkedIn profiles and companies using Exa AI - finds professional profiles, company pages, and business-related content on LinkedIn.\n\nArgs:\n----\n    query: LinkedIn search query (e.g., person name, company, job title)\n    searchType: Type of LinkedIn content to search (default: all)\n    numResults: Number of LinkedIn results to return (default: 5)\n\nReturns:\n-------\n    dict: Response from the API",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "query": {
+              "type": "string"
+            },
+            "searchType": {
+              "type": "string"
+            },
+            "numResults": {
+              "type": "integer"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "query"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "web_search_exa",
+        "description": "Search the web using Exa AI - performs real-time web searches and can scrape content from specific URLs.\n\nArgs:\n----\n    query: Search query\n    numResults: Number of search results to return (default: 5)\n\nReturns:\n-------\n    dict: Response from the API",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "query": {
+              "type": "string"
+            },
+            "numResults": {
+              "type": "integer"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "query"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "competitor_finder_exa",
+        "description": "Dataset tool competitor_finder_exa.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "crawling_exa",
+        "description": "Dataset tool crawling_exa.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "github_search_exa",
+        "description": "Dataset tool github_search_exa.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "research_paper_search_exa",
+        "description": "Dataset tool research_paper_search_exa.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "wikipedia_search_exa",
+        "description": "Dataset tool wikipedia_search_exa.",
+        "parameters": {
+          "type": "object",
+          "properties": {},
+          "additionalProperties": true
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": false
+}
diff --git a/datasets/turkingbench/api.py b/datasets/turkingbench/api.py
deleted file mode 100644
index 7d873c2d..00000000
--- a/datasets/turkingbench/api.py
+++ /dev/null
@@ -1,56 +0,0 @@
-def type(xpath: str, value: str) -> None:
-    """Type some text into an input element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to type into.
-        value (str): The text to type.
-
-    """
-    pass
-
-
-def click(xpath: str) -> None:
-    """Click on the element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to click.
-
-    """
-    pass
-
-
-def select(xpath: str, value: str) -> None:
-    """Select an option from a dropdown menu.
-
-    Args:
-    ----
-        xpath (str): The xpath of the select element.
-        value (str): The select option to choose.
-
-    """
-    pass
-
-
-def modify_range(xpath: str, value: str) -> None:
-    """For a given range element, set the value.
-
-    Args:
-    ----
-        xpath (str): The xpath of the range element.
-        value (str): The value to set the range element to.
-
-    """
-    pass
-
-
-def goto(url: str) -> None:
-    """Type some text into an input element.
-
-    Args:
-    ----
-        url (str): The url of the website to go to.
-
-    """
-    pass
diff --git a/datasets/turkingbench/metadata.json b/datasets/turkingbench/metadata.json
new file mode 100644
index 00000000..e5c700bb
--- /dev/null
+++ b/datasets/turkingbench/metadata.json
@@ -0,0 +1,44 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click on the element.\n\nArgs:\n----\n    xpath (str): The xpath of the element to click.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "goto",
+        "description": "Navigate to the given URL.\n\nArgs:\n----\n    url (str): The url of the website to go to.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "url"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/webarena_successful/api.py b/datasets/webarena_successful/api.py
deleted file mode 100644
index a114fcdc..00000000
--- a/datasets/webarena_successful/api.py
+++ /dev/null
@@ -1,114 +0,0 @@
-def click(element_id: str) -> None:
-    """Click on an element with the given id.
-
-    Args:
-    ----
-        element_id (str): The id of the element to click.
-
-    """
-    pass
-
-
-def type(element_id: str, text: str) -> None:
-    """Type the given text into an element with the given id.
-
-    Args:
-    ----
-        element_id (str): The id of the element to type into.
-        text (str): The text to type.
-
-    """
-    pass
-
-
-def scroll(dx: float, dy: float) -> None:
-    """Scroll the page. Scroll horizontally dx pixels, vertically dy pixels. Positive for right or down scrolling, negative for left or up scrolling.
-
-    Args:
-    ----
-        dx: the distance to scroll in the x direction.
-        dy: the distance to scroll in the y direction.
-
-    """
-    pass
-
-
-def press(key_comb: str) -> None:
-    """Press a key combination.
-
-    Args:
-    ----
-        key_comb (str): The key combination to press. For example, "Ctrl+C". This is system specific.
-
-    """
-    pass
-
-
-def stop(answer: str = "") -> None:
-    """Stop the execution of the trajectory.
-
-    Args:
-    ----
-        answer (str): The answer to the question. This is optional since a task may not require an answer.
-
-    """
-    pass
-
-
-def hover(element_id: str) -> None:
-    """Hover over an element with the given id.
-
-    Args:
-    ----
-        element_id (str): The id of the element to hover over.
-
-    """
-    pass
-
-
-def goto(url: str) -> None:
-    """Navigate to the given URL.
-
-    Args:
-    ----
-        url (str): The URL to navigate to.
-
-    """
-    pass
-
-
-def new_tab(url: str) -> None:
-    """Open a new tab with the given URL.
-
-    Args:
-    ----
-        url (str): The URL to open in the new tab.
-
-    """
-    pass
-
-
-def tab_focus(page_number: int) -> None:
-    """Focus on the tab with the given page_number.
-
-    Args:
-    ----
-        page_number (int): The page_number of the tab to focus on. Starts from 0.
-
-    """
-    pass
-
-
-def tab_close() -> None:
-    """Close the current tab."""
-    pass
-
-
-def go_back() -> None:
-    """Go back to the previous page."""
-    pass
-
-
-def go_forward() -> None:
-    """Go forward to the next page if go_back was called."""
-    pass
diff --git a/datasets/webarena_successful/metadata.json b/datasets/webarena_successful/metadata.json
new file mode 100644
index 00000000..bcd29de0
--- /dev/null
+++ b/datasets/webarena_successful/metadata.json
@@ -0,0 +1,25 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click on an element with the given id.\n\nArgs:\n----\n    element_id (str): The id of the element to click.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "element_id": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "element_id"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/weblinx/api.py b/datasets/weblinx/api.py
deleted file mode 100644
index e6217b06..00000000
--- a/datasets/weblinx/api.py
+++ /dev/null
@@ -1,67 +0,0 @@
-def click(xpath: str) -> None:
-    """Click on the element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to click.
-
-    """
-    pass
-
-
-def type(xpath: str, value: str) -> None:
-    """Type some text into an input element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to type into.
-        value (str): The text to type.
-
-    """
-    pass
-
-
-def scroll(dx: float, dy: float) -> None:
-    """Scroll the page. Scroll horizontally dx pixels, vertically dy pixels. Positive for right or down scrolling, negative for left or up scrolling.
-
-    Args:
-    ----
-        dx: the distance to scroll in the x direction.
-        dy: the distance to scroll in the y direction.
-
-    """
-    pass
-
-
-def goto(url: str) -> None:
-    """Navigate to the given URL.
-
-    Args:
-    ----
-        url (str): The URL to navigate to.
-
-    """
-    pass
-
-
-def select(xpath: str, value: str) -> None:
-    """Select an option from a dropdown menu.
-
-    Args:
-    ----
-        xpath (str): The xpath of the select element.
-        value (str): The select option to choose.
-
-    """
-    pass
-
-
-def submit(xpath: str) -> None:
-    """Submit the form.
-
-    Args:
-    ----
-        xpath (str): The xpath of the form to submit.
-
-    """
-    pass
diff --git a/datasets/weblinx/metadata.json b/datasets/weblinx/metadata.json
new file mode 100644
index 00000000..c96b0090
--- /dev/null
+++ b/datasets/weblinx/metadata.json
@@ -0,0 +1,113 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click on the element.\n\nArgs:\n----\n    xpath (str): The xpath of the element to click.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "goto",
+        "description": "Navigate to the given URL.\n\nArgs:\n----\n    url (str): The URL to navigate to.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "url"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "scroll",
+        "description": "Scroll the page. Scroll horizontally dx pixels, vertically dy pixels. Positive for right or down scrolling, negative for left or up scrolling.\n\nArgs:\n----\n    dx: the distance to scroll in the x direction.\n    dy: the distance to scroll in the y direction.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "dx": {
+              "type": "number"
+            },
+            "dy": {
+              "type": "number"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "dx",
+            "dy"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "select",
+        "description": "Select an option from a dropdown menu.\n\nArgs:\n----\n    xpath (str): The xpath of the select element.\n    value (str): The select option to choose.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            },
+            "value": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath",
+            "value"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "type",
+        "description": "Type some text into an input element.\n\nArgs:\n----\n    xpath (str): The xpath of the element to type into.\n    value (str): The text to type.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            },
+            "value": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath",
+            "value"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/datasets/wonderbread/api.py b/datasets/wonderbread/api.py
deleted file mode 100644
index d87a3b4a..00000000
--- a/datasets/wonderbread/api.py
+++ /dev/null
@@ -1,47 +0,0 @@
-def click(xpath: str) -> None:
-    """Click on the element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to click.
-
-    """
-    pass
-
-
-def type(xpath: str, value: str) -> None:
-    """Type some text into an input element.
-
-    Args:
-    ----
-        xpath (str): The xpath of the element to type into.
-        value (str): The text to type.
-
-    """
-    pass
-
-
-def keyboard_press(xpath: str, value: str) -> None:
-    """Press the key.
-
-    Valid keys are defined in https://playwright.dev/python/docs/api/class-keyboard#keyboard-press
-
-    Args:
-    ----
-        xpath: the xpath of the selected element.
-        value: the key to press.
-
-    """
-    pass
-
-
-def scroll(dx: float, dy: float) -> None:
-    """Scroll the page. Scroll horizontally dx pixels, vertically dy pixels. Positive for right or down scrolling, negative for left or up scrolling.
-
-    Args:
-    ----
-        dx: the distance to scroll in the x direction.
-        dy: the distance to scroll in the y direction.
-
-    """
-    pass
diff --git a/datasets/wonderbread/metadata.json b/datasets/wonderbread/metadata.json
new file mode 100644
index 00000000..2b22e029
--- /dev/null
+++ b/datasets/wonderbread/metadata.json
@@ -0,0 +1,71 @@
+{
+  "custom_tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "click",
+        "description": "Click on the element.\n\nArgs:\n----\n    xpath (str): The xpath of the element to click.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "scroll",
+        "description": "Scroll the page. Scroll horizontally dx pixels, vertically dy pixels. Positive for right or down scrolling, negative for left or up scrolling.\n\nArgs:\n----\n    dx: the distance to scroll in the x direction.\n    dy: the distance to scroll in the y direction.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "dx": {
+              "type": "number"
+            },
+            "dy": {
+              "type": "number"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "dx",
+            "dy"
+          ]
+        }
+      }
+    },
+    {
+      "type": "function",
+      "function": {
+        "name": "type",
+        "description": "Type some text into an input element.\n\nArgs:\n----\n    xpath (str): The xpath of the element to type into.\n    value (str): The text to type.",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "xpath": {
+              "type": "string"
+            },
+            "value": {
+              "type": "string"
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "xpath",
+            "value"
+          ]
+        }
+      }
+    }
+  ],
+  "code_enabled": [],
+  "browser_enabled": true
+}
diff --git a/tests/test_dataset_structure.py b/tests/test_dataset_structure.py
index 868e634a..f9b6accb 100644
--- a/tests/test_dataset_structure.py
+++ b/tests/test_dataset_structure.py
@@ -1,8 +1,11 @@
+import json
 import os
 from pathlib import Path
 
 import pytest
 
+from schema.dataset_metadata import DatasetMetadata
+
 DATASET_PATH = Path(__file__).parent.parent / "datasets"
 
 
@@ -26,6 +29,20 @@ def test_dataset_structure(subdir):
         "use a shared converter under agents/ instead"
     )
 
+    dataset_api_path = os.path.join(subdir_path, "api.py")
+    assert not os.path.exists(dataset_api_path), (
+        f"Dataset-local api.py is not allowed in {subdir_path}; "
+        "define custom tools in metadata.json instead"
+    )
+
+    metadata_path = Path(subdir_path) / "metadata.json"
+    assert metadata_path.exists(), f"metadata.json not found in {subdir_path}"
+    metadata_data = json.loads(metadata_path.read_text())
+    DatasetMetadata.model_validate(metadata_data)
+    assert (
+        metadata_path.read_text() == json.dumps(metadata_data, indent=2, ensure_ascii=False) + "\n"
+    ), f"metadata.json is not formatted with 2-space indentation: {metadata_path}"
+
     # All datasets should have sample_raw.json
     sample_raw_path = os.path.join(subdir_path, "sample_raw.json")
     assert os.path.exists(sample_raw_path), f"sample_raw.json not found in {subdir_path}"
diff --git a/tests/test_standardized_schemas.py b/tests/test_standardized_schemas.py
index 761cae86..4137297a 100644
--- a/tests/test_standardized_schemas.py
+++ b/tests/test_standardized_schemas.py
@@ -1,5 +1,3 @@
-import importlib.util
-import inspect
 import json
 import os
 from pathlib import Path, PurePosixPath, PureWindowsPath
@@ -9,6 +7,7 @@
 
 from schema.action.api import ApiAction
 from schema.action.code import CodeAction
+from schema.dataset_metadata import custom_tool_names, is_browser_api_action, load_dataset_metadata
 from schema.observation.image import ImageObservation
 from schema.observation.text import TextObservation
 from schema.observation.web import WebObservation
@@ -392,22 +391,10 @@ def test_sample_standardized_against_schema(sample_path):
     assert isinstance(samples, list), "sample_std.json should be a list"
     assert len(samples) > 0, "sample_std.json should have at least one sample"
 
-    # dynamically load api.py in the same directory as sample_std.json
-    dataset_api = None
-    api_function_names = None
-
-    def load_dataset_api():
-        nonlocal dataset_api, api_function_names
-        if dataset_api is None:
-            api_path = os.path.join(os.path.dirname(sample_path), "api.py")
-            assert os.path.exists(api_path)
-            spec = importlib.util.spec_from_file_location("dataset_api", api_path)
-            dataset_api = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(dataset_api)
-            api_function_names = {
-                name for name, _ in inspect.getmembers(dataset_api, inspect.isfunction)
-            }
-        return dataset_api, api_function_names
+    dataset_name = Path(sample_path).parent.name
+    metadata = load_dataset_metadata(dataset_name, required=True)
+    api_function_names = custom_tool_names(metadata)
+    built_in_api_names = {"finish", "stop", "submit", "str_replace_editor", "think", "task_tracker"}
 
     for sample_id, sample in enumerate(samples):
         try:
@@ -431,11 +418,16 @@ def load_dataset_api():
             )
             if traj.available_apis is not None:
                 available_apis = traj.available_apis
-                _, api_function_names = load_dataset_api()
-                missing_available_apis = sorted(set(available_apis) - api_function_names)
-                assert not missing_available_apis, (
-                    f"available_apis contains functions not found in api.py in "
-                    f"{os.path.dirname(sample_path)}: {missing_available_apis}"
+                unsupported_available_apis = sorted(
+                    name
+                    for name in set(available_apis)
+                    if name not in api_function_names
+                    and name not in built_in_api_names
+                    and not is_browser_api_action(name, browser_context=metadata.browser_enabled)
+                )
+                assert not unsupported_available_apis, (
+                    f"available_apis contains functions not found in metadata.json in "
+                    f"{os.path.dirname(sample_path)}: {unsupported_available_apis}"
                 )
                 used_apis = {
                     content.function for content in traj.content if isinstance(content, ApiAction)
@@ -455,14 +447,19 @@ def load_dataset_api():
                         f"content {content_id}: {content.content}"
                     )
                 if isinstance(content, ApiAction):
-                    # Make sure that content.function exists in api.py
-                    dataset_api, _ = load_dataset_api()
-                    assert hasattr(dataset_api, content.function), (
-                        f"{content.function} not found in api.py in {os.path.dirname(sample_path)}"
+                    supported_by_metadata = (
+                        content.function in api_function_names
+                        or content.function in built_in_api_names
+                        or is_browser_api_action(
+                            content.function,
+                            content.kwargs,
+                            browser_context=metadata.browser_enabled,
+                        )
+                    )
+                    assert supported_by_metadata, (
+                        f"{content.function} not found in metadata.json in "
+                        f"{os.path.dirname(sample_path)}"
                     )
-                    # Validate content.kwargs against the function signature
-                    function = getattr(dataset_api, content.function)
-                    function(**content.kwargs)
 
         except ValidationError as e:
             pytest.fail(f"Validation failed for {sample_path}: {str(e)}")

From d0f45a4582be86b982f24e0ebe537dd4c6c0f547 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 13:53:49 -0400
Subject: [PATCH 09/13] Add LLaMA-Factory SFT adapter

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/sft_to_llamafactory.py   | 230 ++++++++++++++++++
 .../test_openhands_sdk_sft_to_llamafactory.py | 163 +++++++++++++
 2 files changed, 393 insertions(+)
 create mode 100644 agents/openhands_sdk/sft_to_llamafactory.py
 create mode 100644 tests/test_openhands_sdk_sft_to_llamafactory.py

diff --git a/agents/openhands_sdk/sft_to_llamafactory.py b/agents/openhands_sdk/sft_to_llamafactory.py
new file mode 100644
index 00000000..77b9b1c6
--- /dev/null
+++ b/agents/openhands_sdk/sft_to_llamafactory.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""Adapt OpenHands SDK OpenAI SFT JSONL for LLaMA-Factory.
+
+The OpenHands SDK SFT exporters write canonical OpenAI chat-completions tool
+calls, where ``tool_calls[].function.arguments`` is a JSON string. That is the
+right wire format, but LLaMA-Factory's Qwen 3.5 tool formatter expects parsed
+argument objects after its OpenAI converter has read each function-call message.
+
+This utility keeps the source SFT format canonical and performs a one-way
+training adapter conversion:
+
+* assistant messages with ``tool_calls`` become ``role="function_call"``
+  messages whose ``content`` is a JSON string containing a list of functions;
+* every function in that JSON has parsed object ``arguments``;
+* nonessential OpenAI fields such as ``tool_call_id`` are dropped from messages
+  to keep the Hugging Face Arrow schema stable;
+* the top-level ``tools`` field is stringified so heterogeneous tool schemas do
+  not become nested Arrow columns.
+
+The output is intended for LLaMA-Factory with ``formatting: openai`` and tags
+matching the defaults emitted by ``write_dataset_info``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+DEFAULT_DATASET_NAME = "openhands_sdk_llamafactory"
+
+
+def text_content(content: Any) -> str:
+    """Flatten OpenAI message content into the plain string LLaMA-Factory expects."""
+    if content is None:
+        return ""
+    if not isinstance(content, list):
+        # Strings pass through unchanged; any other non-list value is stringified.
+        return content if isinstance(content, str) else str(content)
+    parts: list[str] = []
+    for item in content:
+        if not isinstance(item, dict):
+            parts.append(str(item))
+            continue
+        # Prefer a string "text" field, then a string "content" field, else dump the part.
+        for key in ("text", "content"):
+            if isinstance(item.get(key), str):
+                parts.append(item[key])
+                break
+        else:
+            parts.append(json.dumps(item, ensure_ascii=False))
+    return "\n".join(parts)
+
+
+def parse_arguments(
+    arguments: Any, *, record_id: str, message_index: int, call_index: int
+) -> dict[str, Any]:
+    """Decode OpenAI ``function.arguments`` into a dict, raising ValueError if impossible."""
+    decoded = arguments
+    if isinstance(decoded, str):
+        # Canonical OpenAI tool calls carry their arguments as JSON text.
+        try:
+            decoded = json.loads(decoded)
+        except json.JSONDecodeError as exc:
+            message = f"Invalid JSON function.arguments in {record_id} "
+            message += f"message {message_index} tool call {call_index}"
+            raise ValueError(message) from exc
+    if isinstance(decoded, dict):
+        return decoded
+    message = f"function.arguments must decode to an object in {record_id} "
+    message += f"message {message_index} tool call {call_index}: {type(decoded).__name__}"
+    raise ValueError(message)
+
+
+def adapt_tool_calls(
+    record_id: str, message: dict[str, Any], message_index: int
+) -> list[dict[str, Any]]:
+    """Convert OpenAI tool calls to LLaMA-Factory function-call entries.
+
+    Returns one ``{"name": ..., "arguments": {...}}`` dict per tool call, with
+    the arguments decoded by ``parse_arguments``.
+
+    Raises ValueError if a tool call, its ``function``, or its ``name`` is malformed.
+    """
+    where = f"{record_id} message {message_index}"
+    functions: list[dict[str, Any]] = []
+    for call_index, tool_call in enumerate(message.get("tool_calls") or []):
+        # A non-object call would otherwise fail below with an opaque AttributeError.
+        function = tool_call.get("function") if isinstance(tool_call, dict) else None
+        if not isinstance(function, dict):
+            raise ValueError(f"tool_calls[{call_index}].function must be an object in {where}")
+        name = function.get("name")
+        if not isinstance(name, str) or not name:
+            raise ValueError(
+                f"tool_calls[{call_index}].function.name must be a non-empty string in {where}"
+            )
+        arguments = parse_arguments(
+            function.get("arguments", "{}"),
+            record_id=record_id,
+            message_index=message_index,
+            call_index=call_index,
+        )
+        functions.append({"name": name, "arguments": arguments})
+    return functions
+
+
+def adapt_message(record_id: str, message: dict[str, Any], message_index: int) -> dict[str, str]:
+    """Reduce one OpenAI message to the two string fields LLaMA-Factory reads.
+
+    Messages carrying ``tool_calls`` become ``function_call`` messages whose content is
+    the JSON-encoded call list; their own ``content`` is not kept. All other messages
+    keep their role and get flattened text content.
+    """
+    role = message.get("role")
+    if not isinstance(role, str):
+        raise ValueError(f"message role must be a string in {record_id} message {message_index}")
+    if not message.get("tool_calls"):
+        return {"role": role, "content": text_content(message.get("content"))}
+    # Tool-call turns are re-tagged regardless of the original role.
+    calls = adapt_tool_calls(record_id, message, message_index)
+    return {"role": "function_call", "content": json.dumps(calls, ensure_ascii=False)}
+
+
+def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
+    """Build the LLaMA-Factory form of one OpenHands SDK OpenAI SFT record.
+
+    Keeps ``id``, adapts every message, stores ``tools`` as a string, and copies
+    ``metadata`` when present; raises ValueError on a missing or malformed message list.
+    """
+    record_id = str(record.get("id", "<unknown>"))
+    messages = record.get("messages")
+    if not isinstance(messages, list):
+        raise ValueError(f"record {record_id} is missing a messages list")
+
+    # Object messages are converted first; non-object ones are detected by the count check.
+    converted = [
+        adapt_message(record_id, message, index)
+        for index, message in enumerate(messages)
+        if isinstance(message, dict)
+    ]
+    if len(converted) != len(messages):
+        raise ValueError(f"record {record_id} contains a non-object message")
+
+    tools = record.get("tools", "")
+    if tools is None:
+        tools = ""
+    elif not isinstance(tools, str):
+        tools = json.dumps(tools, ensure_ascii=False)
+    adapted: dict[str, Any] = {"id": record.get("id"), "messages": converted, "tools": tools}
+    if "metadata" in record:
+        adapted["metadata"] = record["metadata"]
+    return adapted
+
+
+def convert_jsonl(input_path: Path, output_path: Path) -> int:
+    """Adapt each non-blank line of an SDK SFT JSONL file; return the record count."""
+    # Opening the output for writing truncates it, so converting in place would lose data.
+    if output_path.exists() and output_path.samefile(input_path):
+        raise ValueError(f"Input and output are the same file: {input_path}")
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    written = 0
+    with input_path.open(encoding="utf-8") as src, output_path.open("w", encoding="utf-8") as dst:
+        for line_number, raw_line in enumerate(src, start=1):
+            if not raw_line.strip():
+                continue
+            try:
+                adapted = adapt_record(json.loads(raw_line))
+            except Exception as exc:
+                # Name the 1-based source line so the bad record is easy to find.
+                raise ValueError(f"Failed to adapt {input_path}:{line_number}") from exc
+            dst.write(json.dumps(adapted, ensure_ascii=False) + "\n")
+            written += 1
+    return written
+
+
+def dataset_info(dataset_name: str, file_name: str) -> dict[str, Any]:
+    """Return the ``dataset_info.json`` mapping that registers the adapted file."""
+    # Maps LLaMA-Factory tag names to the field and role strings found in adapted messages.
+    tags = {
+        "role_tag": "role",
+        "content_tag": "content",
+        "user_tag": "user",
+        "assistant_tag": "assistant",
+        "observation_tag": "tool",
+        "function_tag": "function_call",
+        "system_tag": "system",
+    }
+    # Keys are inserted in the order they should appear in the written JSON.
+    entry: dict[str, Any] = {"formatting": "openai"}
+    entry["columns"] = {"messages": "messages", "tools": "tools"}
+    entry["tags"] = tags
+    entry["file_name"] = file_name
+    return {dataset_name: entry}
+
+
+def write_dataset_info(path: Path, *, dataset_name: str, file_name: str) -> None:
+    """Write a one-entry dataset_info.json for the adapted JSONL, replacing any existing file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(dataset_info(dataset_name, file_name), indent=2) + "\n")
+
+
+def main() -> None:
+    """Parse the CLI arguments, run the conversion, and print a one-line JSON summary."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--input", type=Path, required=True, help="OpenHands SDK OpenAI SFT JSONL")
+    parser.add_argument("--output", type=Path, required=True, help="Adapted LLaMA-Factory JSONL")
+    parser.add_argument(
+        "--dataset-info",
+        type=Path,
+        help="Optional dataset_info.json path to write; an existing file there is replaced.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        default=DEFAULT_DATASET_NAME,
+        help="Dataset key to write when --dataset-info is provided.",
+    )
+    args = parser.parse_args()
+    count = convert_jsonl(args.input, args.output)
+    if args.dataset_info:
+        write_dataset_info(
+            args.dataset_info,
+            dataset_name=args.dataset_name,
+            file_name=args.output.name,
+        )
+    print(json.dumps({"input": str(args.input), "output": str(args.output), "records": count}))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_openhands_sdk_sft_to_llamafactory.py b/tests/test_openhands_sdk_sft_to_llamafactory.py
new file mode 100644
index 00000000..19cb8dda
--- /dev/null
+++ b/tests/test_openhands_sdk_sft_to_llamafactory.py
@@ -0,0 +1,163 @@
+import json
+import subprocess
+import sys
+
+import pytest
+
+from agents.openhands_sdk.sft_to_llamafactory import adapt_record, dataset_info
+
+
+def test_adapt_record_converts_openai_tool_calls_to_function_messages():
+    record = {
+        "id": "example-1",
+        "messages": [
+            {"role": "system", "content": "System prompt"},
+            {"role": "user", "content": "List files"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "type": "function",
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps({"command": "ls", "timeout": 30}),
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_1",
+                "name": "terminal",
+                "content": "README.md",
+            },
+            {"role": "assistant", "content": "Done"},
+        ],
+        "tools": [
+            {
+                "type": "function",
+                "function": {
+                    "name": "terminal",
+                    "parameters": {"type": "object", "properties": {}},
+                },
+            }
+        ],
+        "metadata": {"source_dataset": "unit"},
+    }
+
+    adapted = adapt_record(record)
+
+    assert adapted["messages"] == [
+        {"role": "system", "content": "System prompt"},
+        {"role": "user", "content": "List files"},
+        {
+            "role": "function_call",
+            "content": json.dumps(
+                [{"name": "terminal", "arguments": {"command": "ls", "timeout": 30}}],
+                ensure_ascii=False,
+            ),
+        },
+        {"role": "tool", "content": "README.md"},
+        {"role": "assistant", "content": "Done"},
+    ]
+    assert isinstance(adapted["tools"], str)
+    assert json.loads(adapted["tools"])[0]["function"]["name"] == "terminal"
+    assert adapted["metadata"] == {"source_dataset": "unit"}
+
+
+def test_adapt_record_rejects_non_object_arguments():
+    record = {
+        "id": "bad-args",
+        "messages": [
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps(["not", "an", "object"]),
+                        }
+                    }
+                ],
+            }
+        ],
+    }
+
+    with pytest.raises(ValueError, match="must decode to an object"):
+        adapt_record(record)
+
+
+def test_dataset_info_uses_llamafactory_openai_tags():
+    info = dataset_info("demo", "demo.jsonl")
+
+    assert info == {
+        "demo": {
+            "formatting": "openai",
+            "columns": {"messages": "messages", "tools": "tools"},
+            "tags": {
+                "role_tag": "role",
+                "content_tag": "content",
+                "user_tag": "user",
+                "assistant_tag": "assistant",
+                "observation_tag": "tool",
+                "function_tag": "function_call",
+                "system_tag": "system",
+            },
+            "file_name": "demo.jsonl",
+        }
+    }
+
+
+def test_cli_converts_jsonl_and_writes_dataset_info(tmp_path):
+    input_path = tmp_path / "input.jsonl"
+    output_path = tmp_path / "output.jsonl"
+    info_path = tmp_path / "dataset_info.json"
+    input_record = {
+        "id": "cli-1",
+        "messages": [
+            {"role": "user", "content": "Run command"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps({"command": "pwd"}),
+                        }
+                    }
+                ],
+            },
+        ],
+        "tools": [],
+    }
+    input_path.write_text(json.dumps(input_record) + "\n")
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "agents.openhands_sdk.sft_to_llamafactory",
+            "--input",
+            str(input_path),
+            "--output",
+            str(output_path),
+            "--dataset-info",
+            str(info_path),
+            "--dataset-name",
+            "cli_demo",
+        ],
+        check=True,
+        text=True,
+        capture_output=True,
+    )
+
+    assert json.loads(result.stdout)["records"] == 1
+    [adapted] = [json.loads(line) for line in output_path.read_text().splitlines()]
+    assert adapted["messages"][1]["role"] == "function_call"
+    assert json.loads(adapted["messages"][1]["content"])[0] == {
+        "name": "terminal",
+        "arguments": {"command": "pwd"},
+    }
+    assert json.loads(info_path.read_text())["cli_demo"]["file_name"] == "output.jsonl"

From e1e1ebff71f9450583a665f72ee3adbb56b76aaa Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 14:19:33 -0400
Subject: [PATCH 10/13] Merge adjacent user messages in LLaMA-Factory adapter

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/sft_to_llamafactory.py   | 37 +++++++++++++++----
 .../test_openhands_sdk_sft_to_llamafactory.py | 33 +++++++++++++++++
 2 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/agents/openhands_sdk/sft_to_llamafactory.py b/agents/openhands_sdk/sft_to_llamafactory.py
index 77b9b1c6..105147c5 100644
--- a/agents/openhands_sdk/sft_to_llamafactory.py
+++ b/agents/openhands_sdk/sft_to_llamafactory.py
@@ -12,6 +12,10 @@
 * assistant messages with ``tool_calls`` become ``role="function_call"``
   messages whose ``content`` is a JSON string containing a list of functions;
 * every function in that JSON has parsed object ``arguments``;
+* adjacent ``user`` messages are merged because LLaMA-Factory's OpenAI
+  converter assumes strict user/function-or-assistant alternation, while SDK
+  post-condensation trajectory segments can contain ``user`` followed by a
+  condensation-summary ``user`` message;
 * nonessential OpenAI fields such as ``tool_call_id`` are dropped from messages
   to keep the Hugging Face Arrow schema stable;
 * the top-level ``tools`` field is stringified so heterogeneous tool schemas do
@@ -122,6 +126,23 @@ def adapt_message(record_id: str, message: dict[str, Any], message_index: int) -
     }
 
 
+def merge_adjacent_user_messages(messages: list[dict[str, str]]) -> list[dict[str, str]]:
+    """Merge adjacent user messages for LLaMA-Factory role alternation.
+
+    SDK post-condensation trajectory segments can contain an original user message
+    followed by a user-role condensation summary. LLaMA-Factory's OpenAI converter
+    treats consecutive user messages as abnormal, so combine them without changing
+    non-user role boundaries.
+    """
+    merged: list[dict[str, str]] = []
+    for message in messages:
+        if merged and message["role"] == "user" and merged[-1]["role"] == "user":
+            merged[-1]["content"] = f"{merged[-1]['content']}\n\n{message['content']}"
+        else:
+            merged.append(dict(message))
+    return merged
+
+
 def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
     """Adapt a single OpenHands SDK OpenAI SFT record for LLaMA-Factory."""
     record_id = str(record.get("id", "<unknown>"))
@@ -129,16 +150,18 @@ def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
     if not isinstance(messages, list):
         raise ValueError(f"record {record_id} is missing a messages list")
 
+    adapted_messages = [
+        adapt_message(record_id, message, index)
+        for index, message in enumerate(messages)
+        if isinstance(message, dict)
+    ]
+    if len(adapted_messages) != len(messages):
+        raise ValueError(f"record {record_id} contains a non-object message")
+
     adapted: dict[str, Any] = {
         "id": record.get("id"),
-        "messages": [
-            adapt_message(record_id, message, index)
-            for index, message in enumerate(messages)
-            if isinstance(message, dict)
-        ],
+        "messages": merge_adjacent_user_messages(adapted_messages),
     }
-    if len(adapted["messages"]) != len(messages):
-        raise ValueError(f"record {record_id} contains a non-object message")
 
     tools = record.get("tools", "")
     if isinstance(tools, str):
diff --git a/tests/test_openhands_sdk_sft_to_llamafactory.py b/tests/test_openhands_sdk_sft_to_llamafactory.py
index 19cb8dda..96e0ee01 100644
--- a/tests/test_openhands_sdk_sft_to_llamafactory.py
+++ b/tests/test_openhands_sdk_sft_to_llamafactory.py
@@ -67,6 +67,39 @@ def test_adapt_record_converts_openai_tool_calls_to_function_messages():
     assert adapted["metadata"] == {"source_dataset": "unit"}
 
 
+def test_adapt_record_merges_adjacent_user_messages():
+    record = {
+        "id": "post-condensation",
+        "messages": [
+            {"role": "system", "content": "system"},
+            {"role": "user", "content": "original request"},
+            {"role": "user", "content": "condensation summary"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps({"command": "pwd"}),
+                        }
+                    }
+                ],
+            },
+            {"role": "tool", "content": "/workspace"},
+        ],
+    }
+
+    adapted = adapt_record(record)
+
+    assert [message["role"] for message in adapted["messages"]] == [
+        "system",
+        "user",
+        "function_call",
+        "tool",
+    ]
+    assert adapted["messages"][1]["content"] == "original request\n\ncondensation summary"
+
+
 def test_adapt_record_rejects_non_object_arguments():
     record = {
         "id": "bad-args",

From 2cbd497ba4bed369051dcd832418930d3d755158 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 15:14:26 -0400
Subject: [PATCH 11/13] Keep LLaMA-Factory adapter format-only

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/sft_to_llamafactory.py   | 23 +------------
 .../test_openhands_sdk_sft_to_llamafactory.py | 32 -------------------
 2 files changed, 1 insertion(+), 54 deletions(-)

diff --git a/agents/openhands_sdk/sft_to_llamafactory.py b/agents/openhands_sdk/sft_to_llamafactory.py
index 105147c5..64e78327 100644
--- a/agents/openhands_sdk/sft_to_llamafactory.py
+++ b/agents/openhands_sdk/sft_to_llamafactory.py
@@ -12,10 +12,6 @@
 * assistant messages with ``tool_calls`` become ``role="function_call"``
   messages whose ``content`` is a JSON string containing a list of functions;
 * every function in that JSON has parsed object ``arguments``;
-* adjacent ``user`` messages are merged because LLaMA-Factory's OpenAI
-  converter assumes strict user/function-or-assistant alternation, while SDK
-  post-condensation trajectory segments can contain ``user`` followed by a
-  condensation-summary ``user`` message;
 * nonessential OpenAI fields such as ``tool_call_id`` are dropped from messages
   to keep the Hugging Face Arrow schema stable;
 * the top-level ``tools`` field is stringified so heterogeneous tool schemas do
@@ -126,23 +122,6 @@ def adapt_message(record_id: str, message: dict[str, Any], message_index: int) -
     }
 
 
-def merge_adjacent_user_messages(messages: list[dict[str, str]]) -> list[dict[str, str]]:
-    """Merge adjacent user messages for LLaMA-Factory role alternation.
-
-    SDK post-condensation trajectory segments can contain an original user message
-    followed by a user-role condensation summary. LLaMA-Factory's OpenAI converter
-    treats consecutive user messages as abnormal, so combine them without changing
-    non-user role boundaries.
-    """
-    merged: list[dict[str, str]] = []
-    for message in messages:
-        if merged and message["role"] == "user" and merged[-1]["role"] == "user":
-            merged[-1]["content"] = f"{merged[-1]['content']}\n\n{message['content']}"
-        else:
-            merged.append(dict(message))
-    return merged
-
-
 def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
     """Adapt a single OpenHands SDK OpenAI SFT record for LLaMA-Factory."""
     record_id = str(record.get("id", "<unknown>"))
@@ -160,7 +139,7 @@ def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
 
     adapted: dict[str, Any] = {
         "id": record.get("id"),
-        "messages": merge_adjacent_user_messages(adapted_messages),
+        "messages": adapted_messages,
     }
 
     tools = record.get("tools", "")
diff --git a/tests/test_openhands_sdk_sft_to_llamafactory.py b/tests/test_openhands_sdk_sft_to_llamafactory.py
index 96e0ee01..424a84b5 100644
--- a/tests/test_openhands_sdk_sft_to_llamafactory.py
+++ b/tests/test_openhands_sdk_sft_to_llamafactory.py
@@ -67,38 +67,6 @@ def test_adapt_record_converts_openai_tool_calls_to_function_messages():
     assert adapted["metadata"] == {"source_dataset": "unit"}
 
 
-def test_adapt_record_merges_adjacent_user_messages():
-    record = {
-        "id": "post-condensation",
-        "messages": [
-            {"role": "system", "content": "system"},
-            {"role": "user", "content": "original request"},
-            {"role": "user", "content": "condensation summary"},
-            {
-                "role": "assistant",
-                "tool_calls": [
-                    {
-                        "function": {
-                            "name": "terminal",
-                            "arguments": json.dumps({"command": "pwd"}),
-                        }
-                    }
-                ],
-            },
-            {"role": "tool", "content": "/workspace"},
-        ],
-    }
-
-    adapted = adapt_record(record)
-
-    assert [message["role"] for message in adapted["messages"]] == [
-        "system",
-        "user",
-        "function_call",
-        "tool",
-    ]
-    assert adapted["messages"][1]["content"] == "original request\n\ncondensation summary"
-
 
 def test_adapt_record_rejects_non_object_arguments():
     record = {

From 825e91434631a4433c261234a6ec7147d8dfeebf Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 16:01:09 -0400
Subject: [PATCH 12/13] Recover trainable LLaMA-Factory prefixes

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/sft_to_llamafactory.py   |  98 ++++++++++++--
 .../test_openhands_sdk_sft_to_llamafactory.py | 123 +++++++++++++++++-
 2 files changed, 209 insertions(+), 12 deletions(-)

diff --git a/agents/openhands_sdk/sft_to_llamafactory.py b/agents/openhands_sdk/sft_to_llamafactory.py
index 64e78327..c5d0e32e 100644
--- a/agents/openhands_sdk/sft_to_llamafactory.py
+++ b/agents/openhands_sdk/sft_to_llamafactory.py
@@ -15,7 +15,13 @@
 * nonessential OpenAI fields such as ``tool_call_id`` are dropped from messages
   to keep the Hugging Face Arrow schema stable;
 * the top-level ``tools`` field is stringified so heterogeneous tool schemas do
-  not become nested Arrow columns.
+  not become nested Arrow columns;
+* adjacent prompt-side messages (``user`` and ``tool``), which are valid
+  OpenAI chat history but not accepted by LLaMA-Factory's paired-turn converter,
+  are merged;
+* when requested, OpenAI-valid conversation prefixes are converted into
+  trainable prefixes by trimming trailing prompt-side messages (for example a
+  final tool response) so the adapted record ends on an assistant/function turn.
 
 The output is intended for LLaMA-Factory with ``formatting: openai`` and tags
 matching the defaults emitted by ``write_dataset_info``.
@@ -29,6 +35,12 @@
 from typing import Any
 
 DEFAULT_DATASET_NAME = "openhands_sdk_llamafactory"
+PROMPT_ROLES = {"user", "tool"}
+RESPONSE_ROLES = {"assistant", "function_call"}
+
+
+class UntrainableRecordError(ValueError):
+    """Raised when a record has no assistant/function response to train on."""
 
 
 def text_content(content: Any) -> str:
@@ -122,7 +134,45 @@ def adapt_message(record_id: str, message: dict[str, Any], message_index: int) -
     }
 
 
-def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
+def is_prompt_side(message: dict[str, str]) -> bool:
+    return message["role"] in PROMPT_ROLES
+
+
+def merge_adjacent_prompt_messages(messages: list[dict[str, str]]) -> list[dict[str, str]]:
+    """Merge adjacent user/tool messages for LLaMA-Factory paired-turn conversion."""
+    merged: list[dict[str, str]] = []
+    for message in messages:
+        if merged and is_prompt_side(message) and is_prompt_side(merged[-1]):
+            merged[-1]["content"] = f"{merged[-1]['content']}\n\n{message['content']}"
+        else:
+            merged.append(dict(message))
+    return merged
+
+
+def trainable_prefix(messages: list[dict[str, str]]) -> list[dict[str, str]]:
+    """Trim a LLaMA-Factory record to the latest trainable assistant/function turn.
+
+    Many OpenAI Chat Completions histories are valid prefixes ending in ``tool``
+    or ``user``. They are useful context, but the single-record SFT format needs
+    the final message to be a response-side message that can become the label.
+    """
+    if not messages:
+        raise UntrainableRecordError("record contains no messages")
+
+    prefix = [dict(messages[0])] if messages[0]["role"] == "system" else []
+    body = messages[1:] if prefix else messages
+    last_response_index: int | None = None
+    for index, message in enumerate(body):
+        if message["role"] in RESPONSE_ROLES:
+            last_response_index = index
+
+    if last_response_index is None:
+        raise UntrainableRecordError("record contains no trainable assistant/function response")
+
+    return prefix + [dict(message) for message in body[: last_response_index + 1]]
+
+
+def adapt_record(record: dict[str, Any], *, trim_to_trainable: bool = False) -> dict[str, Any]:
     """Adapt a single OpenHands SDK OpenAI SFT record for LLaMA-Factory."""
     record_id = str(record.get("id", "<unknown>"))
     messages = record.get("messages")
@@ -137,9 +187,10 @@ def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
     if len(adapted_messages) != len(messages):
         raise ValueError(f"record {record_id} contains a non-object message")
 
+    merged_messages = merge_adjacent_prompt_messages(adapted_messages)
     adapted: dict[str, Any] = {
         "id": record.get("id"),
-        "messages": adapted_messages,
+        "messages": trainable_prefix(merged_messages) if trim_to_trainable else merged_messages,
     }
 
     tools = record.get("tools", "")
@@ -155,9 +206,15 @@ def adapt_record(record: dict[str, Any]) -> dict[str, Any]:
     return adapted
 
 
-def convert_jsonl(input_path: Path, output_path: Path) -> int:
+def convert_jsonl(
+    input_path: Path,
+    output_path: Path,
+    *,
+    trim_to_trainable: bool = False,
+    skip_untrainable: bool = False,
+) -> dict[str, int]:
     """Convert an OpenHands SDK OpenAI SFT JSONL file."""
-    count = 0
+    stats = {"read": 0, "written": 0, "skipped_untrainable": 0}
     output_path.parent.mkdir(parents=True, exist_ok=True)
     with (
         input_path.open(encoding="utf-8") as in_handle,
@@ -166,14 +223,20 @@ def convert_jsonl(input_path: Path, output_path: Path) -> int:
         for line_number, line in enumerate(in_handle, 1):
             if not line.strip():
                 continue
+            stats["read"] += 1
             try:
                 record = json.loads(line)
-                adapted = adapt_record(record)
+                adapted = adapt_record(record, trim_to_trainable=trim_to_trainable)
+            except UntrainableRecordError:
+                if skip_untrainable:
+                    stats["skipped_untrainable"] += 1
+                    continue
+                raise
             except Exception as exc:
                 raise ValueError(f"Failed to adapt {input_path}:{line_number}") from exc
             out_handle.write(json.dumps(adapted, ensure_ascii=False) + "\n")
-            count += 1
-    return count
+            stats["written"] += 1
+    return stats
 
 
 def dataset_info(dataset_name: str, file_name: str) -> dict[str, Any]:
@@ -216,16 +279,31 @@ def main() -> None:
         default=DEFAULT_DATASET_NAME,
         help="Dataset key to write when --dataset-info is provided.",
     )
+    parser.add_argument(
+        "--trim-to-trainable",
+        action="store_true",
+        help="Trim OpenAI conversation prefixes so each output record ends on an assistant/function turn.",
+    )
+    parser.add_argument(
+        "--skip-untrainable",
+        action="store_true",
+        help="Skip records with no assistant/function turn; only takes effect with --trim-to-trainable.",
+    )
     args = parser.parse_args()
 
-    count = convert_jsonl(args.input, args.output)
+    stats = convert_jsonl(
+        args.input,
+        args.output,
+        trim_to_trainable=args.trim_to_trainable,
+        skip_untrainable=args.skip_untrainable,
+    )
     if args.dataset_info:
         write_dataset_info(
             args.dataset_info,
             dataset_name=args.dataset_name,
             file_name=args.output.name,
         )
-    print(json.dumps({"input": str(args.input), "output": str(args.output), "records": count}))
+    print(json.dumps({"input": str(args.input), "output": str(args.output), **stats}))
 
 
 if __name__ == "__main__":
diff --git a/tests/test_openhands_sdk_sft_to_llamafactory.py b/tests/test_openhands_sdk_sft_to_llamafactory.py
index 424a84b5..69b07181 100644
--- a/tests/test_openhands_sdk_sft_to_llamafactory.py
+++ b/tests/test_openhands_sdk_sft_to_llamafactory.py
@@ -4,7 +4,11 @@
 
 import pytest
 
-from agents.openhands_sdk.sft_to_llamafactory import adapt_record, dataset_info
+from agents.openhands_sdk.sft_to_llamafactory import (
+    UntrainableRecordError,
+    adapt_record,
+    dataset_info,
+)
 
 
 def test_adapt_record_converts_openai_tool_calls_to_function_messages():
@@ -67,6 +71,121 @@ def test_adapt_record_converts_openai_tool_calls_to_function_messages():
     assert adapted["metadata"] == {"source_dataset": "unit"}
 
 
+def test_adapt_record_merges_adjacent_user_messages():
+    record = {
+        "id": "post-condensation",
+        "messages": [
+            {"role": "system", "content": "system"},
+            {"role": "user", "content": "original request"},
+            {"role": "user", "content": "condensation summary"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps({"command": "pwd"}),
+                        }
+                    }
+                ],
+            },
+        ],
+    }
+
+    adapted = adapt_record(record)
+
+    assert [message["role"] for message in adapted["messages"]] == [
+        "system",
+        "user",
+        "function_call",
+    ]
+    assert adapted["messages"][1]["content"] == "original request\n\ncondensation summary"
+
+
+def test_adapt_record_merges_tool_then_user_prompt_messages():
+    record = {
+        "id": "tool-user",
+        "messages": [
+            {"role": "user", "content": "request"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps({"command": "pwd"}),
+                        },
+                    }
+                ],
+            },
+            {"role": "tool", "tool_call_id": "call_1", "content": "/workspace"},
+            {"role": "user", "content": "continue with that result"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": "call_2",
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps({"command": "ls"}),
+                        },
+                    }
+                ],
+            },
+        ],
+    }
+
+    adapted = adapt_record(record)
+
+    assert [message["role"] for message in adapted["messages"]] == [
+        "user",
+        "function_call",
+        "tool",
+        "function_call",
+    ]
+    assert adapted["messages"][2]["content"] == "/workspace\n\ncontinue with that result"
+
+
+def test_adapt_record_can_trim_trailing_tool_response_for_training():
+    record = {
+        "id": "tool-tail",
+        "messages": [
+            {"role": "user", "content": "Run pwd"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "function": {
+                            "name": "terminal",
+                            "arguments": json.dumps({"command": "pwd"}),
+                        },
+                    }
+                ],
+            },
+            {"role": "tool", "tool_call_id": "call_1", "content": "/workspace"},
+        ],
+    }
+
+    adapted = adapt_record(record, trim_to_trainable=True)
+
+    assert [message["role"] for message in adapted["messages"]] == ["user", "function_call"]
+    assert json.loads(adapted["messages"][-1]["content"])[0] == {
+        "name": "terminal",
+        "arguments": {"command": "pwd"},
+    }
+
+
+def test_adapt_record_rejects_prompt_only_record_when_trimming():
+    record = {
+        "id": "prompt-only",
+        "messages": [{"role": "user", "content": "No response follows"}],
+    }
+
+    with pytest.raises(UntrainableRecordError, match="no trainable"):
+        adapt_record(record, trim_to_trainable=True)
+
 
 def test_adapt_record_rejects_non_object_arguments():
     record = {
@@ -154,7 +273,7 @@ def test_cli_converts_jsonl_and_writes_dataset_info(tmp_path):
         capture_output=True,
     )
 
-    assert json.loads(result.stdout)["records"] == 1
+    assert json.loads(result.stdout)["written"] == 1
     [adapted] = [json.loads(line) for line in output_path.read_text().splitlines()]
     assert adapted["messages"][1]["role"] == "function_call"
     assert json.loads(adapted["messages"][1]["content"])[0] == {

From a3f92511fad9f695de336d485cdfb10165e64a5c Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Tue, 2 Jun 2026 21:04:04 -0400
Subject: [PATCH 13/13] Escape media tags for LLaMA-Factory

Co-authored-by: openhands <openhands@all-hands.dev>
---
 agents/openhands_sdk/sft_to_llamafactory.py   | 42 ++++++++--
 .../test_openhands_sdk_sft_to_llamafactory.py | 76 +++++++++++++++++++
 2 files changed, 111 insertions(+), 7 deletions(-)

diff --git a/agents/openhands_sdk/sft_to_llamafactory.py b/agents/openhands_sdk/sft_to_llamafactory.py
index c5d0e32e..a0f6cd62 100644
--- a/agents/openhands_sdk/sft_to_llamafactory.py
+++ b/agents/openhands_sdk/sft_to_llamafactory.py
@@ -21,7 +21,9 @@
   are merged;
 * when requested, OpenAI-valid conversation prefixes are converted into
   trainable prefixes by trimming trailing prompt-side messages (for example a
-  final tool response) so the adapted record ends on an assistant/function turn.
+  final tool response) so the adapted record ends on an assistant/function turn;
+* literal media tags are escaped so Qwen-VL/LLaMA-Factory does not treat source
+  text such as XML ``<image>`` blocks as multimodal placeholders.
 
 The output is intended for LLaMA-Factory with ``formatting: openai`` and tags
 matching the defaults emitted by ``write_dataset_info``.
@@ -37,18 +39,44 @@
 DEFAULT_DATASET_NAME = "openhands_sdk_llamafactory"
 PROMPT_ROLES = {"user", "tool"}
 RESPONSE_ROLES = {"assistant", "function_call"}
+MEDIA_TAG_REPLACEMENTS = {
+    "<image>": "&lt;image&gt;",
+    "</image>": "&lt;/image&gt;",
+    "<video>": "&lt;video&gt;",
+    "</video>": "&lt;/video&gt;",
+    "<audio>": "&lt;audio&gt;",
+    "</audio>": "&lt;/audio&gt;",
+}
 
 
 class UntrainableRecordError(ValueError):
     """Raised when a record has no assistant/function response to train on."""
 
 
+def escape_media_tags(text: str) -> str:
+    """Escape literal media tags reserved by multimodal chat templates."""
+    for old, new in MEDIA_TAG_REPLACEMENTS.items():
+        text = text.replace(old, new)
+    return text
+
+
+def escape_media_tags_in_json(value: Any) -> Any:
+    """Recursively escape reserved media tags inside JSON-compatible values."""
+    if isinstance(value, str):
+        return escape_media_tags(value)
+    if isinstance(value, list):
+        return [escape_media_tags_in_json(item) for item in value]
+    if isinstance(value, dict):
+        return {key: escape_media_tags_in_json(item) for key, item in value.items()}
+    return value
+
+
 def text_content(content: Any) -> str:
     """Convert OpenAI message content into the string LLaMA-Factory expects."""
     if content is None:
         return ""
     if isinstance(content, str):
-        return content
+        return escape_media_tags(content)
     if isinstance(content, list):
         parts: list[str] = []
         for item in content:
@@ -61,8 +89,8 @@ def text_content(content: Any) -> str:
                     parts.append(json.dumps(item, ensure_ascii=False))
             else:
                 parts.append(str(item))
-        return "\n".join(parts)
-    return str(content)
+        return escape_media_tags("\n".join(parts))
+    return escape_media_tags(str(content))
 
 
 def parse_arguments(
@@ -82,7 +110,7 @@ def parse_arguments(
             f"function.arguments must decode to an object in {record_id} "
             f"message {message_index} tool call {call_index}: {type(arguments).__name__}"
         )
-    return arguments
+    return escape_media_tags_in_json(arguments)
 
 
 def adapt_tool_calls(
@@ -195,11 +223,11 @@ def adapt_record(record: dict[str, Any], *, trim_to_trainable: bool = False) ->
 
     tools = record.get("tools", "")
     if isinstance(tools, str):
-        adapted["tools"] = tools
+        adapted["tools"] = escape_media_tags(tools)
     elif tools is None:
         adapted["tools"] = ""
     else:
-        adapted["tools"] = json.dumps(tools, ensure_ascii=False)
+        adapted["tools"] = escape_media_tags(json.dumps(tools, ensure_ascii=False))
 
     if "metadata" in record:
         adapted["metadata"] = record["metadata"]
diff --git a/tests/test_openhands_sdk_sft_to_llamafactory.py b/tests/test_openhands_sdk_sft_to_llamafactory.py
index 69b07181..778ad0dd 100644
--- a/tests/test_openhands_sdk_sft_to_llamafactory.py
+++ b/tests/test_openhands_sdk_sft_to_llamafactory.py
@@ -209,6 +209,82 @@ def test_adapt_record_rejects_non_object_arguments():
         adapt_record(record)
 
 
+def test_cli_escapes_literal_media_tags(tmp_path):
+    """Check that the CLI escapes literal <image>/<audio> tags in message content."""
+    src = tmp_path / "input_media.jsonl"
+    dst = tmp_path / "output_media.jsonl"
+    user_turn = {"role": "user", "content": "XML: <image><url>x</url></image>"}
+    assistant_turn = {"role": "assistant", "content": "Use <audio> literally, not as media."}
+    record = {"id": "media-tags", "messages": [user_turn, assistant_turn]}
+    src.write_text(json.dumps(record) + "\n")
+
+    # Invoke the converter through `python -m` in a child process.
+    command = [
+        sys.executable,
+        "-m",
+        "agents.openhands_sdk.sft_to_llamafactory",
+        "--input",
+        str(src),
+        "--output",
+        str(dst),
+    ]
+    subprocess.run(command, check=True, text=True, capture_output=True)
+
+    # Unpacking into a one-element list fails unless exactly one line was written.
+    lines = dst.read_text().splitlines()
+    [adapted] = [json.loads(line) for line in lines]
+    messages = adapted["messages"]
+    # Only the <image> and <audio> tags are expected to be escaped; <url> stays as-is.
+    expected_user = "XML: &lt;image&gt;<url>x</url>&lt;/image&gt;"
+    expected_assistant = "Use &lt;audio&gt; literally, not as media."
+    assert messages[0]["content"] == expected_user
+    assert messages[1]["content"] == expected_assistant
+
+
+def test_cli_escapes_literal_media_tags_inside_tool_arguments(tmp_path):
+    """Check that the CLI escapes literal <image> tags found in tool-call arguments."""
+    src = tmp_path / "input_media_args.jsonl"
+    dst = tmp_path / "output_media_args.jsonl"
+    raw_arguments = json.dumps({"command": "cat <<'EOF'\n<image>x</image>\nEOF"})
+    tool_call = {
+        "function": {
+            "name": "terminal",
+            "arguments": raw_arguments,
+        }
+    }
+    record = {
+        "id": "media-args",
+        "messages": [
+            {"role": "user", "content": "Run script"},
+            {"role": "assistant", "tool_calls": [tool_call]},
+        ],
+    }
+    src.write_text(json.dumps(record) + "\n")
+
+    # Invoke the converter through `python -m` in a child process.
+    command = [
+        sys.executable,
+        "-m",
+        "agents.openhands_sdk.sft_to_llamafactory",
+        "--input",
+        str(src),
+        "--output",
+        str(dst),
+    ]
+    subprocess.run(command, check=True, text=True, capture_output=True)
+
+    # Unpacking into a one-element list fails unless exactly one line was written.
+    lines = dst.read_text().splitlines()
+    [adapted] = [json.loads(line) for line in lines]
+
+    # The assistant content is parsed as JSON and its first entry is inspected.
+    assistant_content = adapted["messages"][1]["content"]
+    function_call = json.loads(assistant_content)[0]
+    # The heredoc text must survive intact apart from the escaped <image> tags.
+    expected_command = "cat <<'EOF'\n&lt;image&gt;x&lt;/image&gt;\nEOF"
+    assert function_call["arguments"]["command"] == expected_command
+
+
 def test_dataset_info_uses_llamafactory_openai_tags():
     info = dataset_info("demo", "demo.jsonl")