vllm-project · jasl · May 6, 2026 · May 5, 2026 · May 6, 2026 · May 6, 2026
diff --git a/csrc/libtorch_stable/moe/marlin_moe_wna16/ops.cu b/csrc/libtorch_stable/moe/marlin_moe_wna16/ops.cu
@@ -437,6 +437,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
   cudaDeviceGetAttribute(&max_shared_mem,
                          cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
   STD_TORCH_CHECK(max_shared_mem > 0);
+  int device_max_shared_mem = max_shared_mem;
 
   int major_capability, minor_capability;
   cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
@@ -527,10 +528,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
   }
 
   cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
-                       max_shared_mem);
+                       device_max_shared_mem);
   // avoid ">>>" being formatted to "> > >"
   // clang-format off
-  kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
+  kernel<<<blocks, num_threads, sh_cache_size, stream>>>(
       A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
       sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
       topk_weights_ptr, top_k, mul_topk_weights, num_groups, prob_m,
@@ -708,9 +709,8 @@ torch::stable::Tensor moe_wna16_marlin_gemm(
   torch::stable::Tensor c_tmp;
   if (use_fp32_reduce && !use_atomic_add) {
     // max num of threadblocks is sms * 4
-    long max_c_tmp_size = min(
-        (long)size_n * sorted_token_ids.size(0),
-        (long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n);
+    long max_c_tmp_size =
+        (long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n;
     if (moe_block_size == 8) max_c_tmp_size *= 2;
     c_tmp = torch::stable::new_empty(a, {max_c_tmp_size}, kFloat);
   } else {

diff --git a/tests/compile/passes/test_functionalization.py b/tests/compile/passes/test_functionalization.py
@@ -251,12 +251,70 @@ def ops_not_in_model(self):
         return []
 
 
+class TestFusedDeepseekV4QnormRopeKvInsert(torch.nn.Module):
+    """Module whose forward pass is a single custom op mutating two inputs.
+
+    The op registered by this class is a test stand-in: its implementation
+    adds ``kv`` in place to both ``q`` and ``k_cache`` and returns nothing,
+    and it is declared with ``mutates_args=["q", "k_cache"]``.
+    """
+
+    # Class-level flag: the custom op is registered at most once, however many
+    # instances of this module are constructed.
+    OP_REGISTERED = False
+
+    def __init__(self):
+        super().__init__()
+        self.register_test_custom_op()
+
+    @classmethod
+    def register_test_custom_op(cls):
+        """Register the stand-in custom op unless that already happened."""
+        if cls.OP_REGISTERED:
+            return
+
+        def fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert_impl(
+            q: torch.Tensor,
+            kv: torch.Tensor,
+            k_cache: torch.Tensor,
+        ) -> None:
+            # In-place updates, in the same order as mutates_args lists them.
+            for mutated in (q, k_cache):
+                mutated.add_(kv)
+
+        def fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert_fake(
+            q: torch.Tensor,
+            kv: torch.Tensor,
+            k_cache: torch.Tensor,
+        ) -> None:
+            # The op has no return value, so the fake has nothing to produce.
+            return
+
+        direct_register_custom_op(
+            op_name="fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert",
+            op_func=fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert_impl,
+            mutates_args=["q", "k_cache"],
+            fake_impl=fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert_fake,
+        )
+
+        cls.OP_REGISTERED = True
+
+    def forward(
+        self, q: torch.Tensor, kv: torch.Tensor, k_cache: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Run the custom op, then return the two tensors it mutated."""
+        torch.ops.vllm.fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert(q, kv, k_cache)
+        return q, k_cache
+
+    def example_inputs(self, num_tokens=32, hidden_size=128):
+        """Return three random ``(num_tokens, hidden_size)`` tensors."""
+        shape = (num_tokens, hidden_size)
+        # Drawn one after another, in (q, kv, k_cache) order.
+        return tuple(torch.rand(shape) for _ in range(3))
+
+    def ops_in_model(self, do_fusion):
+        """Return the op overload expected in the graph.
+
+        ``do_fusion`` is accepted but does not affect the result.
+        """
+        vllm_ops = torch.ops.vllm
+        return [vllm_ops.fused_deepseek_v4_qnorm_rope_kv_rope_quant_insert.default]
+
+    def ops_not_in_model(self):
+        """Return the ops that must be absent; none for this model."""
+        return []
+
+
 MODELS_AND_DO_FUSION = {
     TestSiluMul: [True, False],
     TestFusedAddRMSNorm: [True, False],
     TestRotaryEmbedding: [False],
     TestRotaryEmbeddingSliceScatter: [False],
     TestFunctionWithMutatedArgsAndReturn: [False],
+    TestFusedDeepseekV4QnormRopeKvInsert: [False],
 }
 
 

diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
@@ -468,6 +468,38 @@ def test_cudagraph_sizes_post_init(
         )
 
 
+def test_spec_decode_cudagraph_sizes_keep_small_full_decode_batches_exact():
+    """Every ``3 * num_reqs`` for 1..32 requests must be a capture size.
+
+    The config starts from capture sizes 1, 2, 4, 8 and the multiples of 8
+    from 16 through 96, and is then adjusted for spec decode with a uniform
+    decode query length of 3 and tensor parallel size 1.
+    """
+    query_len = 3
+    # 16, 24, ..., 96 spelled as a range instead of a literal list.
+    initial_sizes = [1, 2, 4, 8, *range(16, 97, 8)]
+
+    config = CompilationConfig(
+        cudagraph_mode=CUDAGraphMode.FULL_AND_PIECEWISE,
+        cudagraph_capture_sizes=initial_sizes,
+        max_cudagraph_capture_size=96,
+    )
+
+    config.adjust_cudagraph_sizes_for_spec_decode(
+        uniform_decode_query_len=query_len,
+        tensor_parallel_size=1,
+    )
+
+    adjusted_sizes = config.cudagraph_capture_sizes
+    expected_sizes = [query_len * num_reqs for num_reqs in range(1, 33)]
+    for size in expected_sizes:
+        assert size in adjusted_sizes
+
+
 @pytest.mark.skipif(
     not current_platform.support_static_graph_mode(),
     reason="Skip if not cudagraph mode supported",

diff --git a/tests/config/test_deepseek_v4_cudagraph_config.py b/tests/config/test_deepseek_v4_cudagraph_config.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from types import SimpleNamespace
+
+from vllm.config.vllm import _should_auto_enable_breakable_cudagraph
+
+
+def _model_config(*architectures: str):
+    """Return a stand-in object exposing only an ``architectures`` list."""
+    stub = SimpleNamespace(architectures=[*architectures])
+    return stub
+
+
+def test_deepseek_v4_does_not_auto_enable_breakable_cudagraph():
+    """Neither DeepSeek-V4 architecture name triggers the auto-enable check."""
+    # Breakable cudagraph disables torch.compile and is 1.5-3.8x slower for MTP
+    # decode on SM12x (measured); DeepSeek-V4 defaults to FULL_AND_PIECEWISE.
+    for arch in ("DeepseekV4ForCausalLM", "DeepSeekV4MTPModel"):
+        assert not _should_auto_enable_breakable_cudagraph(_model_config(arch))
+
+
+def test_minimax_m3_auto_enables_breakable_cudagraph():
+    """Both MiniMax M3 architecture names trigger the auto-enable check."""
+    # MiniMax M3 retains upstream's unconditional auto-enable.
+    minimax_archs = (
+        "MiniMaxM3SparseForCausalLM",
+        "MiniMaxM3SparseForConditionalGeneration",
+    )
+    for arch in minimax_archs:
+        assert _should_auto_enable_breakable_cudagraph(_model_config(arch))
+
+
+def test_other_models_do_not_auto_enable_breakable_cudagraph():
+    """An unrelated architecture (Qwen3) does not trigger the check."""
+    qwen3_config = _model_config("Qwen3ForCausalLM")
+    assert not _should_auto_enable_breakable_cudagraph(qwen3_config)
@@ -0,0 +1,173 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import sys
+import types
+
+import torch
+
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    RoutingMethodType,
+    mxfp4_mxfp8_moe_quant_config,
+)
+from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutlass_moe import (
+    FlashInferExperts,
+)
+from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import (
+    Mxfp4MoeBackend,
+    convert_weight_to_mxfp4_moe_kernel_format,
+)
+
+
+def _make_moe_config() -> FusedMoEConfig:
+    """Build a small CPU, bfloat16, two-expert ``FusedMoEConfig``.
+
+    Uses top-k routing with one expert per token, SiLU activation, and no
+    MoE parallelism.
+    """
+    expert_count = 2
+    layer_width = 16
+    moe_config = FusedMoEConfig(
+        num_experts=expert_count,
+        experts_per_token=1,
+        hidden_dim=layer_width,
+        intermediate_size_per_partition=layer_width,
+        num_local_experts=expert_count,
+        num_logical_experts=expert_count,
+        activation=MoEActivation.SILU,
+        device="cpu",
+        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+        in_dtype=torch.bfloat16,
+        routing_method=RoutingMethodType.TopK,
+        max_num_tokens=16,
+    )
+    return moe_config
+
+
+def _make_experts(
+    *,
+    gemm1_alpha: float | None = None,
+    gemm1_beta: float | None = None,
+    gemm1_clamp_limit: float | None = None,
+) -> FlashInferExperts:
+    """Construct ``FlashInferExperts`` from an MXFP4/MXFP8 quant config.
+
+    The three ``gemm1_*`` values are forwarded unchanged to
+    ``mxfp4_mxfp8_moe_quant_config``; both weight scales are all-ones
+    ``float8_e4m3fn`` tensors. The experts object is created while a
+    ``VllmConfig`` with ``pipeline_parallel_size=1`` is set as current.
+    """
+
+    def ones_scale(rows: int) -> torch.Tensor:
+        return torch.ones((2, rows, 1), dtype=torch.float8_e4m3fn)
+
+    swiglu_kwargs = dict(
+        gemm1_alpha=gemm1_alpha,
+        gemm1_beta=gemm1_beta,
+        gemm1_clamp_limit=gemm1_clamp_limit,
+    )
+    quant_config = mxfp4_mxfp8_moe_quant_config(
+        w1_scale=ones_scale(32),
+        w2_scale=ones_scale(16),
+        **swiglu_kwargs,
+    )
+
+    vllm_config = VllmConfig(
+        parallel_config=ParallelConfig(pipeline_parallel_size=1)
+    )
+    with set_current_vllm_config(vllm_config):
+        # The MoE config is built inside the context, as the experts are.
+        experts = FlashInferExperts(
+            moe_config=_make_moe_config(),
+            quant_config=quant_config,
+        )
+        return experts
+
+
+def test_mxfp4_swiglu_parameters_stay_unset_without_quant_config() -> None:
+    """With no ``gemm1_*`` values supplied, the experts expose ``None``.
+
+    ``_make_experts()`` still builds a quant config; it just passes ``None``
+    for all three SwiGLU parameters.
+    """
+    experts = _make_experts()
+
+    for attr_name in ("gemm1_alpha", "gemm1_beta", "gemm1_clamp_limit"):
+        assert getattr(experts, attr_name) is None
+
+
+def test_mxfp4_swiglu_parameters_follow_quant_config() -> None:
+    """Supplied ``gemm1_*`` scalars come back as two-element tensors.
+
+    Each attribute is expected to hold the scalar repeated twice (presumably
+    once per expert; the helper config uses two experts).
+    """
+    requested = {
+        "gemm1_alpha": 1.25,
+        "gemm1_beta": 0.75,
+        "gemm1_clamp_limit": 5.5,
+    }
+    experts = _make_experts(**requested)
+
+    for attr_name, scalar in requested.items():
+        expected = torch.tensor([scalar, scalar])
+        torch.testing.assert_close(getattr(experts, attr_name), expected)
+
+
+def test_cutlass_mxfp8_kernel_format_converts_gate_up_layout(monkeypatch) -> None:
+    """Check the FLASHINFER_CUTLASS_MXFP4_MXFP8 kernel-format conversion.
+
+    Expectations encoded below: the two halves of the w13 weight, scale and
+    bias along dim 1 are exchanged; the w2 weight, scale and bias come back
+    unchanged; both weight outputs are contiguous. ``flashinfer`` is replaced
+    by a stub whose ``block_scale_interleave`` only makes its input
+    contiguous, so no real flashinfer install is needed.
+    """
+    monkeypatch.setitem(
+        sys.modules,
+        "flashinfer",
+        types.SimpleNamespace(block_scale_interleave=lambda x: x.contiguous()),
+    )
+
+    num_experts = 1
+    intermediate_size = 64
+    hidden_size = 64
+    packed_hidden_size = hidden_size // 2
+    sf_block_size = 32
+
+    def byte_ramp(count: int, modulus: int) -> torch.Tensor:
+        # Build in int64 and reduce explicitly, so the uint8 contents never
+        # depend on how arange treats values above 255.
+        return (torch.arange(count, dtype=torch.int64) % modulus).to(torch.uint8)
+
+    # One half of w13 spans intermediate_size * packed_hidden_size = 2048
+    # bytes. A ramp with period 256 would therefore repeat exactly at the half
+    # boundary, making both halves identical and the swap check below pass
+    # even if nothing were swapped. 251 does not divide 2048, so the halves
+    # differ.
+    w13_weight = byte_ramp(
+        num_experts * 2 * intermediate_size * packed_hidden_size,
+        251,
+    ).reshape(num_experts, 2 * intermediate_size, packed_hidden_size)
+    # Guard the fixture itself: the swap assertion is only meaningful when the
+    # two halves are distinguishable.
+    assert not torch.equal(
+        w13_weight[:, :intermediate_size, :],
+        w13_weight[:, intermediate_size:, :],
+    )
+    w2_weight = byte_ramp(
+        num_experts * hidden_size * (intermediate_size // 2),
+        256,
+    ).reshape(num_experts, hidden_size, intermediate_size // 2)
+    # The scale ramps hold 256 and 128 values respectively, so they fit in
+    # uint8 without any wrap-around.
+    w13_scale_u8 = torch.arange(
+        num_experts * 2 * intermediate_size * (hidden_size // sf_block_size),
+        dtype=torch.uint8,
+    ).reshape(num_experts, 2 * intermediate_size, hidden_size // sf_block_size)
+    w2_scale_u8 = torch.arange(
+        num_experts * hidden_size * (intermediate_size // sf_block_size),
+        dtype=torch.uint8,
+    ).reshape(num_experts, hidden_size, intermediate_size // sf_block_size)
+    w13_bias = torch.arange(
+        num_experts * 2 * intermediate_size,
+        dtype=torch.bfloat16,
+    ).reshape(num_experts, 2 * intermediate_size)
+    w2_bias = torch.arange(
+        num_experts * hidden_size,
+        dtype=torch.bfloat16,
+    ).reshape(num_experts, hidden_size)
+
+    (
+        out_w13,
+        out_w2,
+        out_w13_scale,
+        out_w2_scale,
+        out_w13_bias,
+        out_w2_bias,
+    ) = convert_weight_to_mxfp4_moe_kernel_format(
+        mxfp4_backend=Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+        layer=torch.nn.Module(),
+        w13_weight=w13_weight,
+        w2_weight=w2_weight,
+        # Scales are handed over as float8_e4m3fn views of the uint8 ramps.
+        w13_weight_scale=w13_scale_u8.view(torch.float8_e4m3fn),
+        w2_weight_scale=w2_scale_u8.view(torch.float8_e4m3fn),
+        w13_bias=w13_bias,
+        w2_bias=w2_bias,
+    )
+
+    # Expected w13 tensors: second half along dim 1 first, then the first half.
+    expected_w13 = torch.cat(
+        [
+            w13_weight[:, intermediate_size:, :],
+            w13_weight[:, :intermediate_size, :],
+        ],
+        dim=1,
+    )
+    expected_w13_scale = torch.cat(
+        [
+            w13_scale_u8[:, intermediate_size:, :],
+            w13_scale_u8[:, :intermediate_size, :],
+        ],
+        dim=1,
+    )
+    expected_w13_bias = torch.cat(
+        [
+            w13_bias[:, intermediate_size:],
+            w13_bias[:, :intermediate_size],
+        ],
+        dim=1,
+    )
+
+    assert out_w13.is_contiguous()
+    assert out_w2.is_contiguous()
+    torch.testing.assert_close(out_w13, expected_w13)
+    torch.testing.assert_close(out_w2, w2_weight)
+    torch.testing.assert_close(out_w13_scale, expected_w13_scale)
+    torch.testing.assert_close(out_w2_scale, w2_scale_u8)
+    torch.testing.assert_close(out_w13_bias, expected_w13_bias)
+    torch.testing.assert_close(out_w2_bias, w2_bias)