Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 178 additions & 68 deletions examples/geo3k_vlm/run_geo3k_qwen35.sh
Original file line number Diff line number Diff line change
@@ -1,24 +1,38 @@
#!/bin/bash
#
# Qwen3.5-VL RL training on geo3k dataset.
#
# Supports both Dense (Qwen3.5-9B / Qwen3.5-27B) and MoE (Qwen3.5-35B-A3B,
# Qwen3.5-397B-A17B, ...) variants via the official NVIDIA Megatron-Bridge
# package (>= 0.4.0). Selection is by env var:
#
# # Dense — default
# MODEL_NAME=Qwen3.5-9B ./run_geo3k_qwen35.sh
# MODEL_NAME=Qwen3.5-27B ./run_geo3k_qwen35.sh
#
# # MoE
# MODEL_NAME=Qwen3.5-35B-A3B ./run_geo3k_qwen35.sh
#
# The Megatron-side provider is built by megatron-bridge directly from the
# HuggingFace config of the checkpoint at $HF_CHECKPOINT (no fork required).

# Qwen3.5-35B-A3B VL RL training on geo3k dataset

pip install -U transformers

# IMPORTANT: This branch is specially modified for slime's current Megatron
# version and Qwen3.5 from the main Megatron Bridge. Other models are not verified!
# To restore the original Megatron Bridge, run:
# pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation
# TODO: Remove this once Megatron & Megatron Bridge are upgraded upstream.
pip install git+https://github.com/coding-famer/Megatron-Bridge-slime.git@qwen35 --no-build-isolation

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
TRAIN_BACKEND="megatron"
MODEL_NAME="Qwen3_5-35B-A3B"
MODEL_NAME=${MODEL_NAME:-"Qwen3.5-9B"}
DATASET_NAME=${SLIME_SCRIPT_DATASET_NAME:-"chenhegu/geo3k_imgurl"}
NUM_GPUS=${SLIME_SCRIPT_NUM_GPUS:-8}
DATASET_LOCAL_NAME=$(basename "$DATASET_NAME")

MODEL_NAME_LOWER=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]')
BASE_FOLDED=${SLIME_BASE_FOLDED:-"/root"}

# MoE detection heuristic: a model name that contains "A", a digit, and a
# later "B" (e.g. the A3B / A17B tags) selects the MoE recipe; every other
# name is treated as dense.
case "$MODEL_NAME" in
    *A[0-9]*B*) IS_MOE=1 ;;
    *)          IS_MOE=0 ;;
esac

# External Ray flag
if [ -z "$SLIME_SCRIPT_EXTERNAL_RAY" ] || [ "$SLIME_SCRIPT_EXTERNAL_RAY" = "0" ]; then
Expand All @@ -27,7 +41,9 @@ else
USE_EXTERNAL_RAY=1
fi

# Cleanup
# ---------------------------------------------------------------------------
# Cleanup (no set -e here: pkill returns non-zero when no process matches)
# ---------------------------------------------------------------------------
pkill -9 sglang
sleep 3
if [ "$USE_EXTERNAL_RAY" = "0" ]; then
Expand All @@ -54,25 +70,30 @@ else
HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
echo "MODEL_NAME: $MODEL_NAME (IS_MOE=$IS_MOE)"

# ---------------------------------------------------------------------------
# Download model and dataset
mkdir -p /root/models /root/datasets
if [ ! -d "/root/models/${MODEL_NAME}" ]; then
hf download Qwen/${MODEL_NAME} --local-dir /root/models/${MODEL_NAME}
# ---------------------------------------------------------------------------
mkdir -p ${BASE_FOLDED}/models ${BASE_FOLDED}/datasets
if [ ! -d "${BASE_FOLDED}/models/${MODEL_NAME}" ]; then
hf download Qwen/${MODEL_NAME} --local-dir ${BASE_FOLDED}/models/${MODEL_NAME}
fi
if [ ! -d "/root/datasets/${DATASET_LOCAL_NAME}" ]; then
hf download --repo-type dataset ${DATASET_NAME} --local-dir /root/datasets/${DATASET_LOCAL_NAME}
if [ ! -d "${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}" ]; then
hf download --repo-type dataset ${DATASET_NAME} --local-dir ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}
fi

# Common args
# ---------------------------------------------------------------------------
# Args common to dense and MoE
# ---------------------------------------------------------------------------
CKPT_ARGS=(
--hf-checkpoint /root/models/${MODEL_NAME}
--load /root/models/${MODEL_NAME}
--hf-checkpoint ${BASE_FOLDED}/models/${MODEL_NAME}
--load ${BASE_FOLDED}/models/${MODEL_NAME}
--megatron-to-hf-mode bridge
)

ROLLOUT_ARGS=(
--prompt-data /root/datasets/${DATASET_LOCAL_NAME}/train.parquet
--prompt-data ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}/train.parquet
--input-key problem
--label-key answer
--apply-chat-template
Expand All @@ -86,12 +107,12 @@ ROLLOUT_ARGS=(
--global-batch-size 512
)

# required for vlm datasets
# Required for VLM datasets — geo3k stores image URLs under "images"
MULTIMODAL_KEYS='{"image": "images"}'

EVAL_ARGS=(
--eval-interval 20
--eval-prompt-data ${DATASET_LOCAL_NAME} /root/datasets/${DATASET_LOCAL_NAME}/test.parquet
--eval-prompt-data ${DATASET_LOCAL_NAME} ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}/test.parquet
--n-samples-per-eval-prompt 1
--eval-max-response-len 4096
)
Expand All @@ -115,21 +136,6 @@ OPTIMIZER_ARGS=(
--adam-beta2 0.98
)

SGLANG_ARGS=(
--rollout-num-gpus-per-engine 8
--sglang-mem-fraction-static 0.7
--sglang-ep-size 8
--sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256

# MTP speculative decoding
--sglang-speculative-algorithm EAGLE
--sglang-speculative-num-steps 2
--sglang-speculative-eagle-topk 1
--sglang-speculative-num-draft-tokens 3

--sglang-max-running-requests 512
)

# Wandb args (only if WANDB_API_KEY is set)
if [ -n "$WANDB_API_KEY" ]; then
WANDB_ARGS=(
Expand All @@ -147,47 +153,151 @@ MISC_ARGS=(
--colocate
)

# Backend-specific args
# megatron backend
BACKEND_ARGS=(
--train-backend megatron
# Qwen3.5-35B-A3B has num_query_groups = 2
--tensor-model-parallel-size 2
--sequence-parallel
--pipeline-model-parallel-size 1
--context-parallel-size 1
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--recompute-granularity full
--recompute-method uniform
--recompute-num-layers 1
--attention-dropout 0.0
--hidden-dropout 0.0
--accumulate-allreduce-grads-in-fp32
--attention-softmax-in-fp32
--attention-backend flash

# Packing is not supported for GDN currently
--qkv-format bshd
--micro-batch-size 1
)
# ---------------------------------------------------------------------------
# Variant-specific args (Dense vs MoE)
# ---------------------------------------------------------------------------
# Flags shared by the dense and MoE recipes are declared once. The per-variant
# arrays below splice them around the MoE-only flags, so the resulting argument
# order is exactly what it would be if each array were written out in full.

# SGLang rollout flags that lead both variants.
_SGLANG_SHARED_HEAD=(
    --rollout-num-gpus-per-engine 8
    --sglang-mem-fraction-static 0.7
)

# SGLang rollout flags that close both variants.
_SGLANG_SHARED_TAIL=(
    --sglang-max-running-requests 512

    # Workaround: SGLang's symmetric-memory custom all-reduce trips
    # `cudaIpcOpenMemHandle` / `share_graph_inputs` on some driver / IPC
    # configs ("CUDA error: invalid argument" inside custom_all_reduce.cuh).
    # NCCL all-reduce is plenty fast for the rollout workers.
    --sglang-disable-custom-all-reduce
)

# Megatron parallelism layout used by both variants.
_BACKEND_SHARED_HEAD=(
    --train-backend megatron
    # TP=2: the MoE Qwen3.5-35B-A3B has num_query_groups = 2 (gated
    # attention); the dense recipe uses the same tensor-parallel size.
    --tensor-model-parallel-size 2
    --sequence-parallel
    --pipeline-model-parallel-size 1
    --context-parallel-size 1
)

# Megatron memory / numerics / layout knobs used by both variants.
_BACKEND_SHARED_TAIL=(
    --recompute-granularity full
    --recompute-method uniform
    --recompute-num-layers 1
    --attention-dropout 0.0
    --hidden-dropout 0.0
    --accumulate-allreduce-grads-in-fp32
    --attention-softmax-in-fp32
    --attention-backend flash

    # GDN (Gated DeltaNet, Qwen3.5's linear-attention branch) does not
    # support packed sequences in megatron-core today
    # (`gated_delta_net.py:300 NotImplementedError`). Force the padded
    # BSHD layout — slime's data pipeline then sets
    # packed_seq_params=None and the GDN guard is not tripped.
    --qkv-format bshd
    --micro-batch-size 1
)

if [ "$IS_MOE" = "1" ]; then
    # MoE branch — Qwen3.5-35B-A3B / 397B-A17B follow the same SGLang recipe
    # as Qwen3-Next (LMSYS cookbook):
    #   https://lmsysorg.mintlify.app/cookbook/autoregressive/Qwen/Qwen3-Next
    # i.e. NEXTN speculative decoding (uses the model's built-in MTP head,
    # which Qwen3.5 ships with) + extra_buffer mamba scheduler + page-size=64
    # so radix cache stays enabled. SGLANG_ENABLE_SPEC_V2=1 is exported in
    # the Ray runtime_env further down in this script.
    SGLANG_ARGS=(
        "${_SGLANG_SHARED_HEAD[@]}"
        --sglang-ep-size 8
        --sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256

        # NEXTN speculative decoding (MTP-based, native to Qwen3.5).
        --sglang-speculative-algorithm NEXTN
        --sglang-speculative-num-steps 3
        --sglang-speculative-eagle-topk 1
        --sglang-speculative-num-draft-tokens 4

        # Hybrid (mamba-style) scheduler tuned for Qwen3-Next / Qwen3.5 MoE.
        # extra_buffer + page-size=64 lets radix cache coexist with
        # speculative decoding. page-size must satisfy
        # FLA_CHUNK_SIZE % page_size == 0 (cookbook value: 64).
        --sglang-mamba-scheduler-strategy extra_buffer
        --sglang-page-size 64

        "${_SGLANG_SHARED_TAIL[@]}"
    )

    BACKEND_ARGS=(
        "${_BACKEND_SHARED_HEAD[@]}"
        # Expert parallelism is only meaningful for the MoE variant.
        --expert-model-parallel-size 8
        --expert-tensor-parallel-size 1
        "${_BACKEND_SHARED_TAIL[@]}"
    )
else
    # Dense branch — Qwen3.5-9B / 27B. The bridge derives the full provider
    # config (hidden size, GDN heads, mRoPE sections, etc.) from the HF
    # checkpoint's config.json, so no model-specific Megatron flags are
    # required. Only parallelism + memory knobs are set here.
    SGLANG_ARGS=(
        "${_SGLANG_SHARED_HEAD[@]}"
        "${_SGLANG_SHARED_TAIL[@]}"
    )

    BACKEND_ARGS=(
        "${_BACKEND_SHARED_HEAD[@]}"
        "${_BACKEND_SHARED_TAIL[@]}"
    )
fi

# ---------------------------------------------------------------------------
# Optional legacy text-spec MODEL_ARGS (only sourced for the matching variant)
# ---------------------------------------------------------------------------
SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)"
source "${SLIME_DIR}/scripts/models/qwen3.5-35B-A3B.sh"

# Start Ray if not using external Ray
# Pick up the per-model spec script from scripts/models/ when one exists for
# the requested variant. In bridge mode (--megatron-to-hf-mode bridge) the
# provider comes from the HF config, but slime's CLI parser still needs
# MODEL_ARGS to be defined, so it starts out as an empty array and is only
# overridden if a matching script is found and sourced.
MODEL_ARGS=()
CANDIDATE_MODEL_SH=""
# Tags are tried in this order and the first one contained in MODEL_NAME wins.
for _model_tag in 0.8B 4B 9B 27B 35B-A3B; do
    if [[ "$MODEL_NAME" == *"$_model_tag"* ]]; then
        CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-${_model_tag}.sh"
        break
    fi
done
if [[ -n "$CANDIDATE_MODEL_SH" && -f "$CANDIDATE_MODEL_SH" ]]; then
    # shellcheck disable=SC1090
    source "$CANDIDATE_MODEL_SH"
fi

# ---------------------------------------------------------------------------
# Start Ray (if not external) and submit the job
# ---------------------------------------------------------------------------
if [[ "$USE_EXTERNAL_RAY" == "0" ]]; then
    # No external cluster was supplied: bring up a single-node Ray head here.
    # MASTER_ADDR defaults to loopback; both loopback and the head address are
    # listed in no_proxy.
    MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
    no_proxy="127.0.0.1,${MASTER_ADDR}"
    export MASTER_ADDR no_proxy
    ray start --head \
        --node-ip-address ${MASTER_ADDR} \
        --num-gpus ${NUM_GPUS} \
        --disable-usage-stats \
        --dashboard-host=0.0.0.0 \
        --dashboard-port=8265
fi

# Build runtime env
RUNTIME_ENV_JSON="{
\"env_vars\": {
\"PYTHONPATH\": \"/root/Megatron-LM/\",
\"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
\"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\"
\"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
\"SGLANG_ENABLE_SPEC_V2\": \"1\"
}
}"

Expand Down
1 change: 1 addition & 0 deletions slime_plugins/megatron_bridge/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
import slime_plugins.megatron_bridge.glm4v_moe # noqa: F401 # register GLM-4.6V bridge
import slime_plugins.megatron_bridge.qwen3_5_vl # noqa: F401 # register Qwen3.5-VL bridges (dense + MoE)
Loading
Loading