Skip to content
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 120 additions & 14 deletions agents/openhands/std_to_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from schema.action.api import ApiAction
from schema.action.code import CodeAction
from schema.action.message import MessageAction
from schema.observation.image import ImageObservation
from schema.observation.text import TextObservation
from schema.observation.web import WebObservation
from schema.trajectory import Trajectory
Expand Down Expand Up @@ -91,7 +92,52 @@ def standardized_event_to_openhands_message(
else:
axtree = generate_axtree.last_xtree
prompt = get_web_user_message("", event.url, axtree, PREV_BID)
return {"from": "human", "value": prompt}

Comment thread
MajikalExplosions marked this conversation as resolved.
# Handle nested image observation
image_path = None
if hasattr(event, "image_observation") and event.image_observation:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit confused: where would this nested image observation come from? I didn't see it anywhere else in the code.

In general, "getattr" and "hasattr" are anti-patterns in Python. They indicate that the code is not strictly adhering to its type definitions, and they can cause all kinds of tricky runtime errors. Let's try to write this without them.

image_path = event.image_observation.content

# Add visual observation section
prompt += "\n\n---\nVISUAL OBSERVATION:\n<image>"

# Add image annotations if present (using enhanced parsing)
if (
hasattr(event.image_observation, "annotations")
and event.image_observation.annotations
):
annotations = []
for annotation in event.image_observation.annotations:
# Build annotation description from available fields
parts = []
if hasattr(annotation, "text") and annotation.text:
parts.append(annotation.text)
elif (
hasattr(annotation, "content_description")
and annotation.content_description
):
parts.append(annotation.content_description)

# Add element type
if hasattr(annotation, "element_type"):
parts.append(f"({annotation.element_type})")

# Add interactivity info
attrs = []
if hasattr(annotation, "clickable") and annotation.clickable:
attrs.append("clickable")
if hasattr(annotation, "editable") and annotation.editable:
attrs.append("editable")
if attrs:
parts.append(f"[{', '.join(attrs)}]")

if parts:
annotations.append(" ".join(parts))

if annotations:
prompt += "\nElements detected: " + ", ".join(annotations)

return {"from": "human", "value": prompt, "_image_path": image_path}

if isinstance(event, ApiAction):
PREV_BID = None
Expand Down Expand Up @@ -133,10 +179,23 @@ def standardized_event_to_openhands_message(
event_xpath = event.kwargs.get("xpath", None)
if event_xpath:
browsergym_id = generate_axtree.get_bid(id, event_xpath, "all")

# Generate placeholder bid for web datasets when get_bid fails
if not browsergym_id and is_web:
event_xpath = event.kwargs.get("xpath", None)
if event_xpath:
# Use xpath hash as placeholder to maintain some consistency
placeholder_id = f"placeholder_bid_{abs(hash(event_xpath)) % 10000}"
browsergym_id = f'"{placeholder_id}"'
print(
f"Warning: Generated placeholder bid {browsergym_id} for xpath: {event_xpath}",
file=sys.stderr,
)

# for tool calls that are not browser based since there is no browsergym_id
# and tool calls that are specified as non-web
# these should all be dataset specific apis
if (not browsergym_id or not is_web) and function_name in api_sigs:
if not is_web and function_name in api_sigs:
if not api_env:
# Default to 'execute_ipython_cell' if api_env is not specified
api_env = "execute_ipython_cell"
Expand All @@ -151,7 +210,8 @@ def standardized_event_to_openhands_message(
return {"from": "function_call", "value": f"{thought}{function_call}"}

api_env = "browser"
if not browsergym_id[0] == browsergym_id[-1] == '"':
# Fix: Add None check before accessing browsergym_id indices
Comment thread
MajikalExplosions marked this conversation as resolved.
Outdated
if browsergym_id and not browsergym_id[0] == browsergym_id[-1] == '"':
browsergym_id = f'"{browsergym_id[0]}"'
PREV_BID = browsergym_id
# for apis that are browser based but are not OH default browser apis
Expand Down Expand Up @@ -223,25 +283,49 @@ def standardized_event_to_openhands_message(
raise ValueError(f"Wrong event source: {event.source}")
return {"from": event.source, "value": event.content}

elif hasattr(event, "__class__") and event.__class__.__name__ == "ImageObservation":
elif isinstance(event, ImageObservation):
Comment thread
MajikalExplosions marked this conversation as resolved.
# Handle ImageObservation
annotations_text = ""
if hasattr(event, "annotations") and event.annotations:
annotations = []
for annotation in event.annotations:
# Build annotation description from available fields
parts = []
if hasattr(annotation, "text") and annotation.text:
annotations.append(f"{annotation.text} ({annotation.element_type})")
parts.append(annotation.text)
elif hasattr(annotation, "content_description") and annotation.content_description:
parts.append(annotation.content_description)

# Add element type
if hasattr(annotation, "element_type"):
parts.append(f"({annotation.element_type})")

# Add interactivity info
attrs = []
if hasattr(annotation, "clickable") and annotation.clickable:
attrs.append("clickable")
if hasattr(annotation, "editable") and annotation.editable:
attrs.append("editable")
if attrs:
parts.append(f"[{', '.join(attrs)}]")

if parts:
annotations.append(" ".join(parts))

if annotations:
annotations_text = "Elements detected: " + ", ".join(annotations)

image_path = getattr(event, "content", "unknown_image_path")
return {"from": "observation", "value": f"[Image: {image_path}]\n{annotations_text}"}
return {
"from": "observation",
"value": f"<image>{annotations_text}",
"_image_path": event.content,
}

else:
raise ValueError(f"Unknown event type: {type(event)}\n{event}")


def process_row(line, is_web, api_env, api_tool_description, api_sigs):
def process_row(line, is_web, api_env, api_tool_description, api_sigs, output_format="default"):
std_dataset = [json.loads(line)]
std_data = std_dataset[0]
trajectory = Trajectory(**std_data)
Expand All @@ -251,6 +335,7 @@ def process_row(line, is_web, api_env, api_tool_description, api_sigs):
conversations = []
previous_web_actions = []
languages = []
image_paths = []
for i in range(len(events)):
event = events[i]
try:
Expand All @@ -259,6 +344,13 @@ def process_row(line, is_web, api_env, api_tool_description, api_sigs):
)
if not message:
return None

# Extract image path if present
if "_image_path" in message:
path = message.pop("_image_path")
if path:
image_paths.append(path)

if len(conversations) == 0:
# append api function docs to first user message when available
if api_env:
Expand Down Expand Up @@ -290,18 +382,24 @@ def process_row(line, is_web, api_env, api_tool_description, api_sigs):
language_descriptions = get_language_descriptions(languages)
conversations[0]["value"] = language_descriptions + "\n\n" + conversations[0]["value"]
for m in conversations:
if m["from"] == "function_call":
if output_format == "finetune" and m["from"] == "function_call":
m["from"] = "gpt"
if m["from"] == "observation":
m["from"] = "human"
return {

output = {
"id": trajectory.id,
"conversations": conversations,
"system": get_system_message(),
}

if image_paths:
output["images"] = image_paths

return output

def process_line(line, is_web, api_env):

def process_line(line, is_web, api_env, output_format="default"):
exclude_apis = browser_default_apis if is_web else {}
api_tool_description, api_sigs = get_api_tool_description(dataset, exclude_apis, api_env)
output_line = process_row(
Expand All @@ -310,6 +408,7 @@ def process_line(line, is_web, api_env):
api_env=api_env,
api_tool_description=api_tool_description,
api_sigs=api_sigs,
output_format=output_format,
)
output_line = json.dumps(output_line)
# if output_line:
Expand All @@ -325,8 +424,8 @@ def process_line(line, is_web, api_env):


# Keep the old main function for backward compatibility
Comment thread
MajikalExplosions marked this conversation as resolved.
Outdated
def main_with_args(line, is_web, api_env):
return process_line(line, is_web, api_env)
def main_with_args(line, is_web, api_env, output_format="default"):
    """Convert a single input line by delegating to ``process_line``.

    Thin wrapper kept so existing callers of the old entry point keep
    working; every argument is forwarded unchanged.

    Args:
        line: One line of input text, passed through as-is.
        is_web: Flag forwarded to ``process_line``.
        api_env: API environment name forwarded to ``process_line``.
        output_format: Output format selector forwarded to ``process_line``;
            defaults to ``"default"``.

    Returns:
        Whatever ``process_line`` returns for these arguments.
    """
    return process_line(line, is_web, api_env, output_format=output_format)


def main():
Expand All @@ -346,10 +445,17 @@ def main():
help="The environment in which the APIs are pre-defined",
default=None,
)
parser.add_argument(
"--output_format",
type=str,
choices=["default", "finetune"],
Comment thread
MajikalExplosions marked this conversation as resolved.
Outdated
default="default",
help="Output format: 'default' keeps function_call, 'finetune' converts to gpt",
)
args = parser.parse_args()
args.is_web = args.is_web == "yes"
for line in sys.stdin:
print(main_with_args(line, args.is_web, args.api_env))
print(main_with_args(line, args.is_web, args.api_env, args.output_format))


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion agents/openhands/system_prompt/tools/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@

scroll(-50.2, -100.5)

fill(bid: str, value: str)
fill(bid: str, value: str, enable_autocomplete_menu: bool = False)
Comment thread
MajikalExplosions marked this conversation as resolved.
Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
Examples:
fill('237', 'example value')
Expand Down
74 changes: 74 additions & 0 deletions convert_samples.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# PowerShell script to convert all 9 dataset samples: Raw -> Standardized -> SFT (OpenHands format)
Comment thread
MajikalExplosions marked this conversation as resolved.
Outdated
# Run this from the agent-data-collection root directory

Write-Host "========================================" -ForegroundColor Cyan
Write-Host "Dataset Sample Conversion Script" -ForegroundColor Cyan
Write-Host "========================================" -ForegroundColor Cyan
Write-Host ""

# Prepend the current directory to PYTHONPATH so the Python stages can import
# modules that live at the repository root. Use the platform's path-list
# separator (';' on Windows, ':' elsewhere) and avoid a dangling separator
# when PYTHONPATH was previously empty.
$pathSeparator = [System.IO.Path]::PathSeparator
if ($env:PYTHONPATH) {
    $env:PYTHONPATH = "$PWD$pathSeparator$env:PYTHONPATH"
} else {
    $env:PYTHONPATH = "$PWD"
}

# Helper scripts shared by every dataset. Join-Path picks the directory
# separator that is correct for the current platform.
$jsonToJsonl = Join-Path "scripts" "json_to_jsonl.py"
$jsonlToJson = Join-Path "scripts" "jsonl_to_json.py"
$stdToSft = Join-Path (Join-Path "agents" "openhands") "std_to_sft.py"

# Datasets to convert, with the --is_web / --api_env values passed to std_to_sft.py
$datasets = @(
    @{Name="android_in_the_wild"; IsWeb="no"; ApiEnv="execute_bash"},
    @{Name="androidcontrol"; IsWeb="no"; ApiEnv="execute_bash"},
    @{Name="llava_plus"; IsWeb="no"; ApiEnv="execute_bash"},
    @{Name="omniact"; IsWeb="no"; ApiEnv="execute_bash"},
    @{Name="webarena_successful"; IsWeb="yes"; ApiEnv="browser"},
    @{Name="weblinx"; IsWeb="yes"; ApiEnv="browser"},
    @{Name="wonderbread"; IsWeb="yes"; ApiEnv="browser"},
    @{Name="go-browse-wa"; IsWeb="yes"; ApiEnv="browser"},
    @{Name="openhands"; IsWeb="no"; ApiEnv="execute_bash"}
)

# Derive the progress total from the list so it cannot drift when datasets
# are added or removed.
$total = $datasets.Count
$count = 0
$converted = 0
$skipped = 0
$failed = 0

foreach ($dataset in $datasets) {
    $count++
    $name = $dataset.Name
    $isWeb = $dataset.IsWeb
    $apiEnv = $dataset.ApiEnv

    Write-Host "[$count/$total] Processing $name..." -ForegroundColor Yellow

    $datasetDir = Join-Path "datasets" $name
    $rawPath = Join-Path $datasetDir "sample_raw.json"
    $stdPath = Join-Path $datasetDir "sample_std.json"
    $sftPath = Join-Path $datasetDir "sample_sft.json"
    $rawToStd = Join-Path $datasetDir "raw_to_standardized.py"

    if (-not (Test-Path $rawPath)) {
        Write-Host " - SKIPPED: sample_raw.json not found" -ForegroundColor DarkGray
        Write-Host ""
        $skipped++
        continue
    }

    Write-Host " - Converting raw to standardized..." -ForegroundColor Gray

    # Raw -> Standardized
    Get-Content $rawPath |
        python $jsonToJsonl |
        python $rawToStd |
        python $jsonlToJson |
        Set-Content $stdPath

    # Best-effort failure detection: $LASTEXITCODE holds the exit code of the
    # most recently finished native command, which is normally the final
    # Python stage of the pipeline above.
    if ($LASTEXITCODE -ne 0) {
        Write-Host " - FAILED: raw to standardized conversion exited with code $LASTEXITCODE" -ForegroundColor Red
        Write-Host ""
        $failed++
        continue
    }

    Write-Host " - Converting standardized to SFT (OpenHands)..." -ForegroundColor Gray

    # Set MY_DATASET environment variable for this dataset
    $env:MY_DATASET = $name

    # Standardized -> SFT
    Get-Content $stdPath |
        python $jsonToJsonl |
        python $stdToSft --is_web=$isWeb --api_env=$apiEnv |
        python $jsonlToJson |
        Set-Content $sftPath

    if ($LASTEXITCODE -ne 0) {
        Write-Host " - FAILED: standardized to SFT conversion exited with code $LASTEXITCODE" -ForegroundColor Red
        Write-Host ""
        $failed++
        continue
    }

    Write-Host " - Done!" -ForegroundColor Green
    Write-Host ""
    $converted++
}

Write-Host "========================================" -ForegroundColor Cyan
Write-Host "All datasets processed!" -ForegroundColor Cyan
Write-Host "========================================" -ForegroundColor Cyan
Write-Host ""
Write-Host "Results:" -ForegroundColor White
Write-Host "- Converted: $converted, skipped: $skipped, failed: $failed (of $total)" -ForegroundColor White
Write-Host "- sample_std.json files have been regenerated (with ImageObservation schema fixes)" -ForegroundColor White
Write-Host "- sample_sft.json files contain OpenHands format conversions" -ForegroundColor White
Write-Host ""
Loading