-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathreward.py
More file actions
152 lines (122 loc) · 4.87 KB
/
reward.py
File metadata and controls
152 lines (122 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Reward functions for GRPO training.
Provides binary task-success rewards and group-relative advantage
computation following the GRPO algorithm (Shao et al., 2024).
GRPO computes advantages relative to the group mean rather than using
a learned value function, which is simpler and works well for sparse
binary rewards (task success/failure).
Also provides ``evaluate_milestones_screenshot``, a standalone utility
that evaluates milestone-based rewards from a screenshot without needing
the WAA /evaluate endpoint. This is the local-evaluation path used by
the standalone GRPO trainer when ``--task-dir`` is set.
"""
from __future__ import annotations
import logging
logger = logging.getLogger(__name__)
def binary_task_success(score: float, threshold: float = 0.5) -> float:
    """Collapse a raw evaluator score into a binary success reward.

    Args:
        score: Raw evaluator score (0.0-1.0) from WAA environment.
        threshold: Score at or above which the task is considered successful.

    Returns:
        1.0 if score >= threshold, else 0.0.
    """
    # Sparse binary reward: anything at or above the cutoff counts as success.
    if score >= threshold:
        return 1.0
    return 0.0
def compute_group_advantages(rewards: list[float]) -> list[float]:
    """Normalize a group of rollout rewards into GRPO advantages.

    GRPO computes advantages relative to the group statistics instead of a
    learned value function:

        advantage[i] = (reward[i] - mean) / (std + eps)

    A group with zero variance (all rewards equal) yields all-zero
    advantages: there is no gradient signal, and this also avoids a
    division-by-zero NaN.

    Args:
        rewards: List of scalar rewards for each rollout in the group.

    Returns:
        List of advantage values, same length as rewards.
    """
    if not rewards:
        return []
    count = len(rewards)
    eps = 1e-8
    mu = sum(rewards) / count
    # Population (biased) standard deviation over the group.
    sigma = (sum((r - mu) ** 2 for r in rewards) / count) ** 0.5
    if sigma < eps:
        # Identical outcomes across the group: no learning signal.
        return [0.0] * count
    scale = sigma + eps
    return [(r - mu) / scale for r in rewards]
def evaluate_milestones_screenshot(
    task_config: object,
    screenshot_bytes: bytes,
    vlm_model: str = "gpt-4.1-mini",
    vlm_provider: str = "openai",
) -> float:
    """Score screenshot-type milestones locally, without the WAA server.

    Walks the milestones on a ``TaskConfig`` and asks a VLM judge whether
    each ``screenshot``-type milestone is satisfied by the given image.
    Milestones of any other check type are ignored here — they need a live
    server to evaluate.

    Usable standalone, independent of the trainer::

        from openadapt_ml.training.grpo.reward import evaluate_milestones_screenshot
        reward = evaluate_milestones_screenshot(task_config, screenshot_bytes)

    Args:
        task_config: A ``TaskConfig`` instance (from ``openadapt_evals.task_config``).
            Must have a ``milestones`` attribute (list of ``Milestone`` objects).
        screenshot_bytes: PNG screenshot bytes to evaluate against.
        vlm_model: VLM model name for the judge.
        vlm_provider: VLM provider (``"openai"`` or ``"anthropic"``).

    Returns:
        Fraction of screenshot milestones that passed (0.0 to 1.0).
        Returns 0.0 if there are no milestones or no screenshot milestones.
    """
    all_milestones = getattr(task_config, "milestones", None)
    if not all_milestones:
        return 0.0

    # Keep only milestones whose check type can be judged from an image.
    candidates = [
        ms for ms in all_milestones
        if getattr(ms.check, "check", None) == "screenshot"
    ]
    if not candidates:
        return 0.0

    # Lazy import: the judge lives in an optional dependency.
    try:
        from openadapt_evals.vlm_evaluator import vlm_judge
    except ImportError:
        logger.warning(
            "openadapt-evals is not installed; cannot evaluate screenshot "
            "milestones. Install with: pip install openadapt-evals"
        )
        return 0.0

    pass_count = 0
    for ms in candidates:
        criterion = getattr(ms.check, "description", None) or ""
        if not criterion:
            # No description means nothing to ask the judge; note that this
            # milestone still counts in the denominator below.
            continue
        try:
            ok, _conf = vlm_judge(
                screenshot_bytes,
                criterion,
                model=vlm_model,
                provider=vlm_provider,
            )
        except Exception as exc:
            # A failed judge call is treated as a failed milestone.
            logger.warning(
                "Milestone '%s' evaluation failed: %s",
                getattr(ms, "name", "?"),
                exc,
            )
            continue
        if ok:
            pass_count += 1
        logger.debug(
            "Milestone '%s': %s",
            getattr(ms, "name", "?"),
            "PASS" if ok else "FAIL",
        )

    denom = len(candidates)
    frac = pass_count / denom if denom > 0 else 0.0
    logger.info(
        "Milestone evaluation: %d/%d screenshot milestones passed (%.2f)",
        pass_count,
        denom,
        frac,
    )
    return frac