From e9c4ab24e586a2374df753b8a44c8899b14579e2 Mon Sep 17 00:00:00 2001 From: demouo <2081510953@qq.com> Date: Sun, 14 Jun 2026 17:04:46 +0800 Subject: [PATCH] feat(qwen3.5-vl): wire NVIDIA megatron-bridge Qwen3.5-VL bridges (dense + MoE) with MTP-naming alias and end-to-end geo3k example Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/geo3k_vlm/run_geo3k_qwen35.sh | 246 ++++++++++++++------ slime_plugins/megatron_bridge/__init__.py | 1 + slime_plugins/megatron_bridge/qwen3_5_vl.py | 88 +++++++ 3 files changed, 267 insertions(+), 68 deletions(-) create mode 100644 slime_plugins/megatron_bridge/qwen3_5_vl.py diff --git a/examples/geo3k_vlm/run_geo3k_qwen35.sh b/examples/geo3k_vlm/run_geo3k_qwen35.sh index 8f057de887..0ca30db74b 100644 --- a/examples/geo3k_vlm/run_geo3k_qwen35.sh +++ b/examples/geo3k_vlm/run_geo3k_qwen35.sh @@ -1,24 +1,38 @@ #!/bin/bash +# +# Qwen3.5-VL RL training on geo3k dataset. +# +# Supports both Dense (Qwen3.5-9B / Qwen3.5-27B) and MoE (Qwen3.5-35B-A3B, +# Qwen3.5-397B-A17B, ...) variants via the official NVIDIA Megatron-Bridge +# package (>= 0.4.0). Selection is by env var: +# +# # Dense — default +# MODEL_NAME=Qwen3.5-9B ./run_geo3k_qwen35.sh +# MODEL_NAME=Qwen3.5-27B ./run_geo3k_qwen35.sh +# +# # MoE +# MODEL_NAME=Qwen3.5-35B-A3B ./run_geo3k_qwen35.sh +# +# The Megatron-side provider is built by megatron-bridge directly from the +# HuggingFace config of the checkpoint at $HF_CHECKPOINT (no fork required). -# Qwen3.5-35B-A3B VL RL training on geo3k dataset - -pip install -U transformers - -# IMPORTANT: This branch is specially modified for slime's current Megatron -# version and Qwen3.5 from the main Megatron Bridge. Other models are not verified! -# To restore the original Megatron Bridge, run: -# pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation -# TODO: Remove this once Megatron & Megatron Bridge are upgraded upstream. 
-pip install git+https://github.com/coding-famer/Megatron-Bridge-slime.git@qwen35 --no-build-isolation - +# --------------------------------------------------------------------------- # Configuration +# --------------------------------------------------------------------------- TRAIN_BACKEND="megatron" -MODEL_NAME="Qwen3_5-35B-A3B" +MODEL_NAME=${MODEL_NAME:-"Qwen3.5-9B"} DATASET_NAME=${SLIME_SCRIPT_DATASET_NAME:-"chenhegu/geo3k_imgurl"} NUM_GPUS=${SLIME_SCRIPT_NUM_GPUS:-8} DATASET_LOCAL_NAME=$(basename "$DATASET_NAME") - MODEL_NAME_LOWER=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]') +BASE_FOLDED=${SLIME_BASE_FOLDED:-"/root"} + +# Heuristic: any "*AB*" suffix denotes a MoE variant (A3B / A17B / ...) +if [[ "$MODEL_NAME" == *A[0-9]*B* ]]; then + IS_MOE=1 +else + IS_MOE=0 +fi # External Ray flag if [ -z "$SLIME_SCRIPT_EXTERNAL_RAY" ] || [ "$SLIME_SCRIPT_EXTERNAL_RAY" = "0" ]; then @@ -27,7 +41,9 @@ else USE_EXTERNAL_RAY=1 fi -# Cleanup +# --------------------------------------------------------------------------- +# Cleanup (no set -e here: pkill returns non-zero when no process matches) +# --------------------------------------------------------------------------- pkill -9 sglang sleep 3 if [ "$USE_EXTERNAL_RAY" = "0" ]; then @@ -54,25 +70,30 @@ else HAS_NVLINK=0 fi echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" +echo "MODEL_NAME: $MODEL_NAME (IS_MOE=$IS_MOE)" +# --------------------------------------------------------------------------- # Download model and dataset -mkdir -p /root/models /root/datasets -if [ ! -d "/root/models/${MODEL_NAME}" ]; then - hf download Qwen/${MODEL_NAME} --local-dir /root/models/${MODEL_NAME} +# --------------------------------------------------------------------------- +mkdir -p ${BASE_FOLDED}/models ${BASE_FOLDED}/datasets +if [ ! -d "${BASE_FOLDED}/models/${MODEL_NAME}" ]; then + hf download Qwen/${MODEL_NAME} --local-dir ${BASE_FOLDED}/models/${MODEL_NAME} fi -if [ ! 
-d "/root/datasets/${DATASET_LOCAL_NAME}" ]; then - hf download --repo-type dataset ${DATASET_NAME} --local-dir /root/datasets/${DATASET_LOCAL_NAME} +if [ ! -d "${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}" ]; then + hf download --repo-type dataset ${DATASET_NAME} --local-dir ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME} fi -# Common args +# --------------------------------------------------------------------------- +# Args common to dense and MoE +# --------------------------------------------------------------------------- CKPT_ARGS=( - --hf-checkpoint /root/models/${MODEL_NAME} - --load /root/models/${MODEL_NAME} + --hf-checkpoint ${BASE_FOLDED}/models/${MODEL_NAME} + --load ${BASE_FOLDED}/models/${MODEL_NAME} --megatron-to-hf-mode bridge ) ROLLOUT_ARGS=( - --prompt-data /root/datasets/${DATASET_LOCAL_NAME}/train.parquet + --prompt-data ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}/train.parquet --input-key problem --label-key answer --apply-chat-template @@ -86,12 +107,12 @@ ROLLOUT_ARGS=( --global-batch-size 512 ) -# required for vlm datasets +# Required for VLM datasets — geo3k stores image URLs under "images" MULTIMODAL_KEYS='{"image": "images"}' EVAL_ARGS=( --eval-interval 20 - --eval-prompt-data ${DATASET_LOCAL_NAME} /root/datasets/${DATASET_LOCAL_NAME}/test.parquet + --eval-prompt-data ${DATASET_LOCAL_NAME} ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}/test.parquet --n-samples-per-eval-prompt 1 --eval-max-response-len 4096 ) @@ -115,21 +136,6 @@ OPTIMIZER_ARGS=( --adam-beta2 0.98 ) -SGLANG_ARGS=( - --rollout-num-gpus-per-engine 8 - --sglang-mem-fraction-static 0.7 - --sglang-ep-size 8 - --sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 - - # MTP speculative decoding - --sglang-speculative-algorithm EAGLE - --sglang-speculative-num-steps 2 - --sglang-speculative-eagle-topk 1 - --sglang-speculative-num-draft-tokens 3 - - --sglang-max-running-requests 512 -) - # 
Wandb args (only if WANDB_API_KEY is set) if [ -n "$WANDB_API_KEY" ]; then WANDB_ARGS=( @@ -147,47 +153,151 @@ MISC_ARGS=( --colocate ) -# Backend-specific args -# megatron backend -BACKEND_ARGS=( - --train-backend megatron - # Qwen3.5-35B-A3B has num_query_groups = 2 - --tensor-model-parallel-size 2 - --sequence-parallel - --pipeline-model-parallel-size 1 - --context-parallel-size 1 - --expert-model-parallel-size 8 - --expert-tensor-parallel-size 1 - --recompute-granularity full - --recompute-method uniform - --recompute-num-layers 1 - --attention-dropout 0.0 - --hidden-dropout 0.0 - --accumulate-allreduce-grads-in-fp32 - --attention-softmax-in-fp32 - --attention-backend flash - - # Packing is not supported for GDN currently - --qkv-format bshd - --micro-batch-size 1 -) +# --------------------------------------------------------------------------- +# Variant-specific args (Dense vs MoE) +# --------------------------------------------------------------------------- +if [ "$IS_MOE" = "1" ]; then + # MoE branch — Qwen3.5-35B-A3B / 397B-A17B follow the same SGLang recipe + # as Qwen3-Next (LMSYS cookbook): + # https://lmsysorg.mintlify.app/cookbook/autoregressive/Qwen/Qwen3-Next + # i.e. NEXTN speculative decoding (uses the model's built-in MTP head, + # which Qwen3.5 ships with) + extra_buffer mamba scheduler + page-size=64 + # so radix cache stays enabled. SGLANG_ENABLE_SPEC_V2=1 is exported in + # the Ray runtime_env below. + SGLANG_ARGS=( + --rollout-num-gpus-per-engine 8 + --sglang-mem-fraction-static 0.7 + --sglang-ep-size 8 + --sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256 + + # NEXTN speculative decoding (MTP-based, native to Qwen3.5). + --sglang-speculative-algorithm NEXTN + --sglang-speculative-num-steps 3 + --sglang-speculative-eagle-topk 1 + --sglang-speculative-num-draft-tokens 4 + + # Hybrid (mamba-style) scheduler tuned for Qwen3-Next / Qwen3.5 MoE. 
+ # extra_buffer + page-size=64 lets radix cache coexist with + # speculative decoding. page-size must satisfy + # FLA_CHUNK_SIZE % page_size == 0 (cookbook value: 64). + --sglang-mamba-scheduler-strategy extra_buffer + --sglang-page-size 64 + --sglang-max-running-requests 512 + + # Workaround: SGLang's symmetric-memory custom all-reduce trips + # `cudaIpcOpenMemHandle` / `share_graph_inputs` on some driver / IPC + # configs ("CUDA error: invalid argument" inside custom_all_reduce.cuh). + # NCCL all-reduce is plenty fast for the rollout workers. + --sglang-disable-custom-all-reduce + ) + + BACKEND_ARGS=( + --train-backend megatron + # MoE Qwen3.5-35B-A3B has num_query_groups = 2 (gated attention) + --tensor-model-parallel-size 2 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 8 + --expert-tensor-parallel-size 1 + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + --attention-backend flash + + # GDN (Gated DeltaNet, Qwen3.5's linear-attention branch) does not + # support packed sequences in megatron-core today + # (`gated_delta_net.py:300 NotImplementedError`). Force the padded + # BSHD layout — slime's data pipeline then sets + # packed_seq_params=None and the GDN guard is not tripped. + --qkv-format bshd + --micro-batch-size 1 + ) +else + # Dense branch — Qwen3.5-9B / 27B. The bridge derives the full provider + # config (hidden size, GDN heads, mRoPE sections, etc.) from the HF + # checkpoint's config.json, so no model-specific Megatron flags are + # required. We only set parallelism + memory knobs here. 
+ SGLANG_ARGS=( + --rollout-num-gpus-per-engine 8 + --sglang-mem-fraction-static 0.7 + --sglang-max-running-requests 512 + + # Workaround: SGLang's symmetric-memory custom all-reduce trips + # `cudaIpcOpenMemHandle` / `share_graph_inputs` on some driver / IPC + # configs ("CUDA error: invalid argument" inside custom_all_reduce.cuh). + # NCCL all-reduce is plenty fast for the rollout workers. + --sglang-disable-custom-all-reduce + ) + + BACKEND_ARGS=( + --train-backend megatron + --tensor-model-parallel-size 2 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + --attention-backend flash + + # GDN (Gated DeltaNet, Qwen3.5's linear-attention branch) does not + # support packed sequences in megatron-core today + # (`gated_delta_net.py:300 NotImplementedError`). Force the padded + # BSHD layout — slime's data pipeline then sets + # packed_seq_params=None and the GDN guard is not tripped. + --qkv-format bshd + --micro-batch-size 1 + ) +fi + +# --------------------------------------------------------------------------- +# Optional legacy text-spec MODEL_ARGS (only sourced for the matching variant) +# --------------------------------------------------------------------------- SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)" -source "${SLIME_DIR}/scripts/models/qwen3.5-35B-A3B.sh" -# Start Ray if not using external Ray +# Source the matching scripts/models/qwen3.5-*.sh if present. In bridge mode +# (--megatron-to-hf-mode bridge) the provider comes from the HF config, but +# slime's CLI parser still needs MODEL_ARGS to be defined. We fall back to an +# empty array if no matching file exists for the requested variant. 
+MODEL_ARGS=() +case "$MODEL_NAME" in + *0.8B*) CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-0.8B.sh" ;; + *4B*) CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-4B.sh" ;; + *9B*) CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-9B.sh" ;; + *27B*) CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-27B.sh" ;; + *35B-A3B*) CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-35B-A3B.sh" ;; + *) CANDIDATE_MODEL_SH="" ;; +esac +if [ -n "$CANDIDATE_MODEL_SH" ] && [ -f "$CANDIDATE_MODEL_SH" ]; then + # shellcheck disable=SC1090 + source "$CANDIDATE_MODEL_SH" +fi + +# --------------------------------------------------------------------------- +# Start Ray (if not external) and submit the job +# --------------------------------------------------------------------------- if [ "$USE_EXTERNAL_RAY" = "0" ]; then export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} export no_proxy="127.0.0.1,${MASTER_ADDR}" ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus ${NUM_GPUS} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265 fi -# Build runtime env RUNTIME_ENV_JSON="{ \"env_vars\": { \"PYTHONPATH\": \"/root/Megatron-LM/\", \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\", - \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\" + \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\", + \"SGLANG_ENABLE_SPEC_V2\": \"1\" } }" diff --git a/slime_plugins/megatron_bridge/__init__.py b/slime_plugins/megatron_bridge/__init__.py index a0425d491b..49b7d58500 100644 --- a/slime_plugins/megatron_bridge/__init__.py +++ b/slime_plugins/megatron_bridge/__init__.py @@ -1 +1,2 @@ import slime_plugins.megatron_bridge.glm4v_moe # noqa: F401 # register GLM-4.6V bridge +import slime_plugins.megatron_bridge.qwen3_5_vl # noqa: F401 # register Qwen3.5-VL bridges (dense + MoE) diff --git a/slime_plugins/megatron_bridge/qwen3_5_vl.py b/slime_plugins/megatron_bridge/qwen3_5_vl.py new file mode 100644 index 0000000000..d47b37c5d4 --- /dev/null +++ b/slime_plugins/megatron_bridge/qwen3_5_vl.py @@ 
"""Qwen3.5 Vision-Language bridges (dense + MoE).

This module is a *thin* registration shim: it imports the official Qwen3.5-VL
bridges from `megatron.bridge` so that their ``@MegatronModelBridge.register_bridge``
decorators run, registering both the dense and MoE Qwen3.5-VL HF architectures
with `AutoBridge.from_hf_pretrained`:

  - ``Qwen3_5ForConditionalGeneration``    -> Qwen35VLBridge    (e.g. Qwen3.5-9B / 27B)
  - ``Qwen3_5MoeForConditionalGeneration`` -> Qwen35VLMoEBridge (e.g. Qwen3.5-35B-A3B / 397B-A17B)

The bridges, providers, mapping registries and hybrid (GDN + Gated Attention)
layer specs all live in NVIDIA's package; we explicitly do NOT reimplement
them on the slime side. See:
  https://docs.nvidia.com/nemo/megatron-bridge/0.4.0/apidocs/bridge/bridge.models.qwen_vl.qwen35_vl_bridge.html

Requires `megatron-bridge >= 0.4.0` and a `transformers` version that exposes
``Qwen3_5ForConditionalGeneration`` (and, for the MoE path,
``Qwen3_5MoeForConditionalGeneration``). The import is wrapped in
``try/except`` so older environments that don't yet ship Qwen3.5-VL still
load the plugin without errors — just without the Qwen3.5-VL bridge
registered.
"""

from __future__ import annotations

import copy
import functools
import logging

logger = logging.getLogger(__name__)

# Parameter-name fragment used by the bridge for MTP layers, and the legacy
# Megatron-LM fragment that the alias entries are registered under.
_MTP_LAYER_TOKEN = ".mtp_model_layer."
_LEGACY_LAYER_TOKEN = ".transformer_layer."


def _add_legacy_mtp_aliases(registry):
    """Duplicate every ``mtp.*.mtp_model_layer.*`` mapping with the legacy
    Megatron-LM name ``transformer_layer``.

    The bridge's ``mapping_registry()`` returns a fresh registry on every call
    and ``MegatronMappingRegistry.__init__`` *pre-compiles* the patterns into
    ``_compiled_patterns`` / ``_reverse_patterns`` — so we cannot just append
    to ``registry.mappings``: the new entries would never be matched at
    lookup time. Instead we build a brand-new registry from the augmented
    mapping list, which lets ``__init__`` re-compile everything.

    The operation is idempotent: an alias is only added when no mapping with
    the same ``megatron_param`` is already present, so a registry that already
    carries the legacy names (for example one that was augmented earlier) is
    returned unchanged instead of accumulating duplicates.

    Args:
        registry: Object exposing an iterable ``mappings`` attribute and whose
            class can be constructed as ``cls(*mappings)``; ``None`` is
            passed through untouched.

    Returns:
        ``registry`` itself when there is nothing to add, otherwise a new
        instance of ``registry``'s class holding the original mappings
        followed by the alias copies. The input registry and its mapping
        objects are never mutated.
    """
    if registry is None:
        return registry

    original = list(registry.mappings)

    # Names already present; used to skip aliases that would be duplicates.
    known = set()
    for mapping in original:
        name = getattr(mapping, "megatron_param", None)
        if isinstance(name, str):
            known.add(name)

    extra = []
    for mapping in original:
        m_param = getattr(mapping, "megatron_param", None)
        if not isinstance(m_param, str) or _MTP_LAYER_TOKEN not in m_param:
            continue
        alias_name = m_param.replace(_MTP_LAYER_TOKEN, _LEGACY_LAYER_TOKEN)
        if alias_name in known:
            continue
        # Shallow copy: the alias shares everything with the source mapping
        # except the Megatron-side parameter name.
        alias = copy.copy(mapping)
        alias.megatron_param = alias_name
        known.add(alias_name)
        extra.append(alias)

    if not extra:
        return registry

    # Re-instantiate so the registry constructor sees the aliases.
    return registry.__class__(*original, *extra)


def _patch_bridge_mapping_registry(bridge_cls):
    """Wrap ``bridge_cls.mapping_registry`` so callers see the alias-augmented
    registry. Idempotent — no-op if already wrapped.

    Args:
        bridge_cls: Class whose ``mapping_registry`` method is replaced in
            place by a wrapper that post-processes the returned registry with
            ``_add_legacy_mtp_aliases``.
    """
    original = bridge_cls.mapping_registry
    if getattr(original, "_slime_mtp_alias_patched", False):
        return

    # functools.wraps keeps the wrapped method's name/docstring and exposes it
    # as ``__wrapped__`` for introspection.
    @functools.wraps(original)
    def patched(self, *args, **kwargs):
        return _add_legacy_mtp_aliases(original(self, *args, **kwargs))

    patched._slime_mtp_alias_patched = True  # type: ignore[attr-defined]
    bridge_cls.mapping_registry = patched


try:
    # Importing these triggers @MegatronModelBridge.register_bridge(...) at
    # module-import time, which is the entire purpose of this file.
    from megatron.bridge.models.qwen_vl.qwen35_vl_bridge import Qwen35VLBridge, Qwen35VLMoEBridge  # noqa: F401

    _patch_bridge_mapping_registry(Qwen35VLBridge)
    _patch_bridge_mapping_registry(Qwen35VLMoEBridge)
except ImportError as exc:  # pragma: no cover - environment dependent
    logger.info(
        "Qwen3.5-VL bridges not registered (megatron-bridge >= 0.4.0 with a "
        "compatible transformers version is required): %s",
        exc,
    )