From e9c4ab24e586a2374df753b8a44c8899b14579e2 Mon Sep 17 00:00:00 2001
From: demouo <2081510953@qq.com>
Date: Sun, 14 Jun 2026 17:04:46 +0800
Subject: [PATCH] feat(qwen3.5-vl): wire NVIDIA megatron-bridge Qwen3.5-VL
 bridges (dense + MoE) with MTP-naming alias and end-to-end geo3k example

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 examples/geo3k_vlm/run_geo3k_qwen35.sh      | 246 ++++++++++++++------
 slime_plugins/megatron_bridge/__init__.py   |   1 +
 slime_plugins/megatron_bridge/qwen3_5_vl.py |  88 +++++++
 3 files changed, 267 insertions(+), 68 deletions(-)
 create mode 100644 slime_plugins/megatron_bridge/qwen3_5_vl.py
diff --git a/examples/geo3k_vlm/run_geo3k_qwen35.sh b/examples/geo3k_vlm/run_geo3k_qwen35.sh
index 8f057de887..0ca30db74b 100644
--- a/examples/geo3k_vlm/run_geo3k_qwen35.sh
+++ b/examples/geo3k_vlm/run_geo3k_qwen35.sh
@@ -1,24 +1,38 @@
 #!/bin/bash
+#
+# Qwen3.5-VL RL training on geo3k dataset.
+#
+# Supports both Dense (Qwen3.5-9B / Qwen3.5-27B) and MoE (Qwen3.5-35B-A3B,
+# Qwen3.5-397B-A17B, ...) variants via the official NVIDIA Megatron-Bridge
+# package (>= 0.4.0). Selection is by env var:
+#
+#     # Dense — default
+#     MODEL_NAME=Qwen3.5-9B  ./run_geo3k_qwen35.sh
+#     MODEL_NAME=Qwen3.5-27B ./run_geo3k_qwen35.sh
+#
+#     # MoE
+#     MODEL_NAME=Qwen3.5-35B-A3B ./run_geo3k_qwen35.sh
+#
+# The Megatron-side provider is built by megatron-bridge directly from the
+# HuggingFace config of the checkpoint at $HF_CHECKPOINT (no fork required).
 
-# Qwen3.5-35B-A3B VL RL training on geo3k dataset
-
-pip install -U transformers
-
-# IMPORTANT: This branch is specially modified for slime's current Megatron
-# version and Qwen3.5 from the main Megatron Bridge. Other models are not verified!
-# To restore the original Megatron Bridge, run:
-#   pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation
-# TODO: Remove this once Megatron & Megatron Bridge are upgraded upstream.
-pip install git+https://github.com/coding-famer/Megatron-Bridge-slime.git@qwen35 --no-build-isolation
-
+# ---------------------------------------------------------------------------
 # Configuration
+# ---------------------------------------------------------------------------
 TRAIN_BACKEND="megatron"
-MODEL_NAME="Qwen3_5-35B-A3B"
+MODEL_NAME=${MODEL_NAME:-"Qwen3.5-9B"}
 DATASET_NAME=${SLIME_SCRIPT_DATASET_NAME:-"chenhegu/geo3k_imgurl"}
 NUM_GPUS=${SLIME_SCRIPT_NUM_GPUS:-8}
 DATASET_LOCAL_NAME=$(basename "$DATASET_NAME")
-
 MODEL_NAME_LOWER=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]')
+BASE_FOLDED=${SLIME_BASE_FOLDED:-"/root"}
+
+# Heuristic: a name containing "A<digit>...B" (A3B / A17B / ...) is a MoE variant
+if [[ "$MODEL_NAME" == *A[0-9]*B* ]]; then
+    IS_MOE=1
+else
+    IS_MOE=0
+fi
 
 # External Ray flag
 if [ -z "$SLIME_SCRIPT_EXTERNAL_RAY" ] || [ "$SLIME_SCRIPT_EXTERNAL_RAY" = "0" ]; then
@@ -27,7 +41,9 @@ else
    USE_EXTERNAL_RAY=1
 fi
 
-# Cleanup
+# ---------------------------------------------------------------------------
+# Cleanup (no set -e here: pkill returns non-zero when no process matches)
+# ---------------------------------------------------------------------------
 pkill -9 sglang
 sleep 3
 if [ "$USE_EXTERNAL_RAY" = "0" ]; then
@@ -54,25 +70,30 @@ else
    HAS_NVLINK=0
 fi
 echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
+echo "MODEL_NAME: $MODEL_NAME (IS_MOE=$IS_MOE)"
 
+# ---------------------------------------------------------------------------
 # Download model and dataset
-mkdir -p /root/models /root/datasets
-if [ ! -d "/root/models/${MODEL_NAME}" ]; then
-   hf download Qwen/${MODEL_NAME} --local-dir /root/models/${MODEL_NAME}
+# ---------------------------------------------------------------------------
+mkdir -p ${BASE_FOLDED}/models ${BASE_FOLDED}/datasets
+if [ ! -d "${BASE_FOLDED}/models/${MODEL_NAME}" ]; then
+   hf download Qwen/${MODEL_NAME} --local-dir ${BASE_FOLDED}/models/${MODEL_NAME}
 fi
-if [ ! -d "/root/datasets/${DATASET_LOCAL_NAME}" ]; then
-   hf download --repo-type dataset ${DATASET_NAME} --local-dir /root/datasets/${DATASET_LOCAL_NAME}
+if [ ! -d "${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}" ]; then
+   hf download --repo-type dataset ${DATASET_NAME} --local-dir ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}
 fi
 
-# Common args
+# ---------------------------------------------------------------------------
+# Args common to dense and MoE
+# ---------------------------------------------------------------------------
 CKPT_ARGS=(
-   --hf-checkpoint /root/models/${MODEL_NAME}
-   --load /root/models/${MODEL_NAME}
+   --hf-checkpoint ${BASE_FOLDED}/models/${MODEL_NAME}
+   --load ${BASE_FOLDED}/models/${MODEL_NAME}
    --megatron-to-hf-mode bridge
 )
 
 ROLLOUT_ARGS=(
-   --prompt-data /root/datasets/${DATASET_LOCAL_NAME}/train.parquet
+   --prompt-data ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}/train.parquet
    --input-key problem
    --label-key answer
    --apply-chat-template
@@ -86,12 +107,12 @@ ROLLOUT_ARGS=(
    --global-batch-size 512
 )
 
-# required for vlm datasets
+# Required for VLM datasets — geo3k stores image URLs under "images"
 MULTIMODAL_KEYS='{"image": "images"}'
 
 EVAL_ARGS=(
    --eval-interval 20
-   --eval-prompt-data ${DATASET_LOCAL_NAME} /root/datasets/${DATASET_LOCAL_NAME}/test.parquet
+   --eval-prompt-data ${DATASET_LOCAL_NAME} ${BASE_FOLDED}/datasets/${DATASET_LOCAL_NAME}/test.parquet
    --n-samples-per-eval-prompt 1
    --eval-max-response-len 4096
 )
@@ -115,21 +136,6 @@ OPTIMIZER_ARGS=(
    --adam-beta2 0.98
 )
 
-SGLANG_ARGS=(
-   --rollout-num-gpus-per-engine 8
-   --sglang-mem-fraction-static 0.7
-   --sglang-ep-size 8
-   --sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256
-
-   # MTP speculative decoding
-   --sglang-speculative-algorithm EAGLE
-   --sglang-speculative-num-steps 2
-   --sglang-speculative-eagle-topk 1
-   --sglang-speculative-num-draft-tokens 3
-
-   --sglang-max-running-requests 512
-)
-
 # Wandb args (only if WANDB_API_KEY is set)
 if [ -n "$WANDB_API_KEY" ]; then
    WANDB_ARGS=(
@@ -147,47 +153,151 @@ MISC_ARGS=(
    --colocate
 )
 
-# Backend-specific args
-# megatron backend
-BACKEND_ARGS=(
-   --train-backend megatron
-   # Qwen3.5-35B-A3B has num_query_groups = 2
-   --tensor-model-parallel-size 2
-   --sequence-parallel
-   --pipeline-model-parallel-size 1
-   --context-parallel-size 1
-   --expert-model-parallel-size 8
-   --expert-tensor-parallel-size 1
-   --recompute-granularity full
-   --recompute-method uniform
-   --recompute-num-layers 1
-   --attention-dropout 0.0
-   --hidden-dropout 0.0
-   --accumulate-allreduce-grads-in-fp32
-   --attention-softmax-in-fp32
-   --attention-backend flash
-
-   # Packing is not supported for GDN currently
-   --qkv-format bshd
-   --micro-batch-size 1
-)
+# ---------------------------------------------------------------------------
+# Variant-specific args (Dense vs MoE)
+# ---------------------------------------------------------------------------
+if [ "$IS_MOE" = "1" ]; then
+   # MoE branch — Qwen3.5-35B-A3B / 397B-A17B follow the same SGLang recipe
+   # as Qwen3-Next (LMSYS cookbook):
+   #   https://lmsysorg.mintlify.app/cookbook/autoregressive/Qwen/Qwen3-Next
+   # i.e. NEXTN speculative decoding (uses the model's built-in MTP head,
+   # which Qwen3.5 ships with) + extra_buffer mamba scheduler + page-size=64
+   # so radix cache stays enabled. SGLANG_ENABLE_SPEC_V2=1 is exported in
+   # the Ray runtime_env below.
+   SGLANG_ARGS=(
+      --rollout-num-gpus-per-engine 8
+      --sglang-mem-fraction-static 0.7
+      --sglang-ep-size 8
+      --sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 256
+
+      # NEXTN speculative decoding (MTP-based, native to Qwen3.5).
+      --sglang-speculative-algorithm NEXTN
+      --sglang-speculative-num-steps 3
+      --sglang-speculative-eagle-topk 1
+      --sglang-speculative-num-draft-tokens 4
+
+      # Hybrid (mamba-style) scheduler tuned for Qwen3-Next / Qwen3.5 MoE.
+      # extra_buffer + page-size=64 lets radix cache coexist with
+      # speculative decoding. page-size must satisfy
+      # FLA_CHUNK_SIZE % page_size == 0 (cookbook value: 64).
+      --sglang-mamba-scheduler-strategy extra_buffer
+      --sglang-page-size 64
 
+      --sglang-max-running-requests 512
+
+      # Workaround: SGLang's symmetric-memory custom all-reduce trips
+      # `cudaIpcOpenMemHandle` / `share_graph_inputs` on some driver / IPC
+      # configs ("CUDA error: invalid argument" inside custom_all_reduce.cuh).
+      # NCCL all-reduce is plenty fast for the rollout workers.
+      --sglang-disable-custom-all-reduce
+   )
+
+   BACKEND_ARGS=(
+      --train-backend megatron
+      # MoE Qwen3.5-35B-A3B has num_query_groups = 2 (gated attention)
+      --tensor-model-parallel-size 2
+      --sequence-parallel
+      --pipeline-model-parallel-size 1
+      --context-parallel-size 1
+      --expert-model-parallel-size 8
+      --expert-tensor-parallel-size 1
+      --recompute-granularity full
+      --recompute-method uniform
+      --recompute-num-layers 1
+      --attention-dropout 0.0
+      --hidden-dropout 0.0
+      --accumulate-allreduce-grads-in-fp32
+      --attention-softmax-in-fp32
+      --attention-backend flash
+
+      # GDN (Gated DeltaNet, Qwen3.5's linear-attention branch) does not
+      # support packed sequences in megatron-core today
+      # (`gated_delta_net.py:300 NotImplementedError`). Force the padded
+      # BSHD layout — slime's data pipeline then sets
+      # packed_seq_params=None and the GDN guard is not tripped.
+      --qkv-format bshd
+      --micro-batch-size 1
+   )
+else
+   # Dense branch — Qwen3.5-9B / 27B. The bridge derives the full provider
+   # config (hidden size, GDN heads, mRoPE sections, etc.) from the HF
+   # checkpoint's config.json, so no model-specific Megatron flags are
+   # required. We only set parallelism + memory knobs here.
+   SGLANG_ARGS=(
+      --rollout-num-gpus-per-engine 8
+      --sglang-mem-fraction-static 0.7
+      --sglang-max-running-requests 512
+
+      # Workaround: SGLang's symmetric-memory custom all-reduce trips
+      # `cudaIpcOpenMemHandle` / `share_graph_inputs` on some driver / IPC
+      # configs ("CUDA error: invalid argument" inside custom_all_reduce.cuh).
+      # NCCL all-reduce is plenty fast for the rollout workers.
+      --sglang-disable-custom-all-reduce
+   )
+
+   BACKEND_ARGS=(
+      --train-backend megatron
+      --tensor-model-parallel-size 2
+      --sequence-parallel
+      --pipeline-model-parallel-size 1
+      --context-parallel-size 1
+      --recompute-granularity full
+      --recompute-method uniform
+      --recompute-num-layers 1
+      --attention-dropout 0.0
+      --hidden-dropout 0.0
+      --accumulate-allreduce-grads-in-fp32
+      --attention-softmax-in-fp32
+      --attention-backend flash
+
+      # GDN (Gated DeltaNet, Qwen3.5's linear-attention branch) does not
+      # support packed sequences in megatron-core today
+      # (`gated_delta_net.py:300 NotImplementedError`). Force the padded
+      # BSHD layout — slime's data pipeline then sets
+      # packed_seq_params=None and the GDN guard is not tripped.
+      --qkv-format bshd
+      --micro-batch-size 1
+   )
+fi
+
+# ---------------------------------------------------------------------------
+# Optional legacy text-spec MODEL_ARGS (only sourced for the matching variant)
+# ---------------------------------------------------------------------------
 SLIME_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." &>/dev/null && pwd)"
-source "${SLIME_DIR}/scripts/models/qwen3.5-35B-A3B.sh"
 
-# Start Ray if not using external Ray
+# Source the matching scripts/models/qwen3.5-*.sh if present. In bridge mode
+# (--megatron-to-hf-mode bridge) the provider comes from the HF config, but
+# slime's CLI parser still needs MODEL_ARGS to be defined. We fall back to an
+# empty array if no matching file exists for the requested variant.
+MODEL_ARGS=()
+case "$MODEL_NAME" in
+   *0.8B*)      CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-0.8B.sh" ;;
+   *4B*)        CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-4B.sh" ;;
+   *9B*)        CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-9B.sh" ;;
+   *27B*)       CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-27B.sh" ;;
+   *35B-A3B*)   CANDIDATE_MODEL_SH="${SLIME_DIR}/scripts/models/qwen3.5-35B-A3B.sh" ;;
+   *)           CANDIDATE_MODEL_SH="" ;;
+esac
+if [ -n "$CANDIDATE_MODEL_SH" ] && [ -f "$CANDIDATE_MODEL_SH" ]; then
+   # shellcheck disable=SC1090
+   source "$CANDIDATE_MODEL_SH"
+fi
+
+# ---------------------------------------------------------------------------
+# Start Ray (if not external) and submit the job
+# ---------------------------------------------------------------------------
 if [ "$USE_EXTERNAL_RAY" = "0" ]; then
    export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
    export no_proxy="127.0.0.1,${MASTER_ADDR}"
    ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus ${NUM_GPUS} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
 fi
 
-# Build runtime env
 RUNTIME_ENV_JSON="{
   \"env_vars\": {
     \"PYTHONPATH\": \"/root/Megatron-LM/\",
     \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
-    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\"
+    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
+    \"SGLANG_ENABLE_SPEC_V2\": \"1\"
   }
 }"
 
diff --git a/slime_plugins/megatron_bridge/__init__.py b/slime_plugins/megatron_bridge/__init__.py
index a0425d491b..49b7d58500 100644
--- a/slime_plugins/megatron_bridge/__init__.py
+++ b/slime_plugins/megatron_bridge/__init__.py
@@ -1 +1,2 @@
 import slime_plugins.megatron_bridge.glm4v_moe  # noqa: F401  # register GLM-4.6V bridge
+import slime_plugins.megatron_bridge.qwen3_5_vl  # noqa: F401  # register Qwen3.5-VL bridges (dense + MoE)
diff --git a/slime_plugins/megatron_bridge/qwen3_5_vl.py b/slime_plugins/megatron_bridge/qwen3_5_vl.py
new file mode 100644
index 0000000000..d47b37c5d4
--- /dev/null
+++ b/slime_plugins/megatron_bridge/qwen3_5_vl.py
@@ -0,0 +1,88 @@
+"""Qwen3.5 Vision-Language bridges (dense + MoE).
+
+This module is a *thin* registration shim: it imports the official Qwen3.5-VL
+bridges from `megatron.bridge` so that their ``@MegatronModelBridge.register_bridge``
+decorators run, registering both the dense and MoE Qwen3.5-VL HF architectures
+with `AutoBridge.from_hf_pretrained`:
+
+  - ``Qwen3_5ForConditionalGeneration``     -> Qwen35VLBridge      (e.g. Qwen3.5-9B / 27B)
+  - ``Qwen3_5MoeForConditionalGeneration``  -> Qwen35VLMoEBridge   (e.g. Qwen3.5-35B-A3B / 397B-A17B)
+
+The bridges, providers, mapping registries and hybrid (GDN + Gated Attention)
+layer specs all live in NVIDIA's package and are NOT reimplemented here; slime
+only wraps ``mapping_registry()`` to add legacy MTP-name aliases. See:
+  https://docs.nvidia.com/nemo/megatron-bridge/0.4.0/apidocs/bridge/bridge.models.qwen_vl.qwen35_vl_bridge.html
+
+Requires `megatron-bridge >= 0.4.0` and a `transformers` version that exposes
+``Qwen3_5ForConditionalGeneration`` (and, for the MoE path,
+``Qwen3_5MoeForConditionalGeneration``). The import is wrapped in
+``try/except`` so older environments that don't yet ship Qwen3.5-VL still
+load the plugin without errors — just without the Qwen3.5-VL bridge
+registered.
+"""
+
+from __future__ import annotations
+
+import copy
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def _add_legacy_mtp_aliases(registry):
+    """Duplicate every ``mtp.*.mtp_model_layer.*`` mapping with the legacy
+    Megatron-LM name ``transformer_layer``.
+
+    The bridge's ``mapping_registry()`` returns a fresh registry on every call
+    and ``MegatronMappingRegistry.__init__`` *pre-compiles* the patterns into
+    ``_compiled_patterns`` / ``_reverse_patterns`` — so we cannot just append
+    to ``registry.mappings``: the new entries would never be matched at
+    lookup time. Instead we build a brand-new registry from the augmented
+    mapping list, which lets ``__init__`` re-compile everything.
+    """
+    if registry is None:
+        return registry
+    original = list(registry.mappings)
+    extra = []
+    for mapping in original:
+        m_param = getattr(mapping, "megatron_param", None)
+        if isinstance(m_param, str) and ".mtp_model_layer." in m_param:
+            alias = copy.copy(mapping)
+            alias.megatron_param = m_param.replace(".mtp_model_layer.", ".transformer_layer.")
+            extra.append(alias)
+    if not extra:
+        return registry
+
+    cls = registry.__class__
+    new_registry = cls(*original, *extra)
+    return new_registry
+
+
+def _patch_bridge_mapping_registry(bridge_cls):
+    """Wrap ``bridge_cls.mapping_registry`` so callers see the alias-augmented
+    registry.  Idempotent — no-op if already wrapped."""
+    original = bridge_cls.mapping_registry
+    if getattr(original, "_slime_mtp_alias_patched", False):
+        return
+
+    def patched(self, *args, **kwargs):
+        registry = original(self, *args, **kwargs)
+        return _add_legacy_mtp_aliases(registry)
+
+    patched._slime_mtp_alias_patched = True  # type: ignore[attr-defined]
+    bridge_cls.mapping_registry = patched
+
+
+try:
+    # Importing these triggers @MegatronModelBridge.register_bridge(...) at
+    # module-import time, which is the entire purpose of this file.
+    from megatron.bridge.models.qwen_vl.qwen35_vl_bridge import Qwen35VLBridge, Qwen35VLMoEBridge  # noqa: F401
+
+    _patch_bridge_mapping_registry(Qwen35VLBridge)
+    _patch_bridge_mapping_registry(Qwen35VLMoEBridge)
+except ImportError as exc:  # pragma: no cover - environment dependent
+    logger.info(
+        "Qwen3.5-VL bridges not registered (megatron-bridge >= 0.4.0 with a "
+        "compatible transformers version is required): %s",
+        exc,
+    )