
Commit 1619421

Added support for MoE for vllm >= 0.14.0rc1 (#1162)
### What does this PR do?

**Type of change:** Bug fix

`_QuantFusedMoEBase.forward()` previously monkey-patched `vllm_fused_moe_package.invoke_fused_moe_kernel`, an entry point that was replaced starting in vLLM v0.14.0rc1. Since that release there are two paths for FusedMoE forward:

```
Path 1 (Modular — standard CUDA path):
FusedMoE.forward()
  → self.runner.forward()
  → TritonExperts.apply()
  → invoke_fused_moe_triton_kernel()   ← called twice (w1, w2)

Path 2 (legacy):
inplace_fused_experts / outplace_fused_experts
  → fused_experts_impl()
  → dispatch_fused_moe_kernel()
  → invoke_fused_moe_triton_kernel()
    or invoke_fused_moe_wna16_triton_kernel()
    or invoke_fused_moe_wna16_cuda_kernel()
```

The stale attribute name caused an `AttributeError` / assertion failure for any MoE model quantized with vLLM ≥ v0.14.0rc1. The fix refactors the kernel-patching logic to probe the fused-MoE module for all known entry-point names and patch whichever ones the installed vLLM exposes; inspecting every release from v0.10.0 to v0.19.1 confirmed that the old name and the new names never coexist in the same version. Because `dispatch_fused_moe_kernel()` can itself call `invoke_fused_moe_triton_kernel()`, a context variable guards against applying fakequant twice on nested entries.

### Usage

N/A

### Testing

```
docker run --gpus all -it --shm-size=160GB --network host --rm -v <modelopt path>:/home/modelopt \
  vllm/vllm-openai:v0.15.0 bash -c "cd /home/modelopt && pip install . && pip install datasets && \
  QUANT_CFG=NVFP4_DEFAULT_CFG python3 /home/modelopt/examples/vllm_serve/vllm_serve_fakequant.py \
  nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 -tp 1 --served-model-name NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
  --host 0.0.0.0 --port 8001 --trust-remote-code --disable-custom-all-reduce \
  --gpu-memory-utilization 0.8"
```

### Before your PR is "*Ready for review*"

Make sure you read and follow the [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`).

Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.).

- Is this change backward compatible?: ✅
- If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`?: N/A
- Did you write any new necessary tests?: N/A
- Did you update the [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: N/A

## Summary by CodeRabbit

* **Refactor**
  * Ensures quantized expert weights are correctly used by the fused-MoE execution path so inference uses the intended quantized tensors.
  * Replaces fragile manual swapping of the runtime kernel with a safer swap that reliably caches and restores the originals.
  * Adds runtime detection and selection among the available fused-MoE kernel entry points to support multiple vLLM versions.

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
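For illustration, here is a minimal, self-contained sketch of the pattern the fix relies on. The toy module, kernel bodies, and numbers are invented for the example; only the shape of the solution (patch every exposed entry point, guard nested calls with a `contextvars.ContextVar`, restore the originals in `finally`) mirrors the actual change below.

```python
import contextvars
import types
from functools import partial

# Toy module standing in for vllm.model_executor.layers.fused_moe.fused_moe.
# As in vLLM >= 0.14.0rc1, one module-level entry point calls another.
mod = types.ModuleType("toy_fused_moe")
mod.invoke_fused_moe_triton_kernel = lambda x: x + 1
mod.dispatch_fused_moe_kernel = lambda x: mod.invoke_fused_moe_triton_kernel(x)

_active = contextvars.ContextVar("fakequant_active", default=False)

def fakequant_wrapper(x, original_kernel):
    if _active.get():              # nested entry: run the real kernel, no re-quant
        return original_kernel(x)
    token = _active.set(True)
    try:
        x = x * 10                 # stand-in for input fake-quantization
        return original_kernel(x)
    finally:
        _active.reset(token)

names = ("invoke_fused_moe_triton_kernel", "dispatch_fused_moe_kernel")
originals = {n: getattr(mod, n) for n in names}
try:
    for n in names:                # patch every entry point this "vLLM" exposes
        setattr(mod, n, partial(fakequant_wrapper, original_kernel=originals[n]))
    assert mod.dispatch_fused_moe_kernel(2) == 21   # quantized once: (2 * 10) + 1
finally:
    for n in names:                # always restore the module's real kernels
        setattr(mod, n, originals[n])
```

Without the guard, the nested call through the patched `invoke_fused_moe_triton_kernel` would scale the input a second time.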
1 parent 3131195 commit 1619421

4 files changed: 100 additions & 28 deletions


examples/vllm_serve/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@ This is a simple example to demonstrate calibrating and serving ModelOpt fakequa
 
 Compared with realquant, fakequant is 2-5x slower, but doesn't require dedicated kernel support and facilitates research.
 
-This example is tested with vllm 0.9.0 and 0.11.2
+This example is tested with vllm 0.9.0 and 0.19.1
 
 ## Prepare environment
 
```

examples/vllm_serve/fakequant_worker.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -134,12 +134,15 @@ def determine_available_memory(self) -> int:
         with disable_compilation(model):
             return super().determine_available_memory()
 
-    def compile_or_warm_up_model(self) -> None:
+    def compile_or_warm_up_model(self) -> float:
         if (
             quant_config["quant_cfg"]
             or quant_config["kv_quant_cfg"]
             or quant_config["modelopt_state_path"]
             or quant_config["recipe_path"]
         ):
             _fakequant_run_prolog_worker(self)
-        super().compile_or_warm_up_model()
+        # Must return the base worker's compilation time (seconds). Returning None
+        # breaks vLLM V1 executor: initialize_from_config does max(compilation_times)
+        # across TP workers.
+        return super().compile_or_warm_up_model()
```
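The new comment is the whole bug in miniature. A toy reproduction of the aggregation failure it describes (the times are invented):

```python
# One TP worker returned None instead of a float; max() over the collected
# per-worker compilation times then raises rather than picking the largest.
compilation_times = [1.8, None]
try:
    max(compilation_times)
except TypeError as err:
    print(err)  # roughly: '>' not supported between instances of 'NoneType' and 'float'
```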

examples/vllm_serve/vllm_serve_fakequant.py

Lines changed: 12 additions & 4 deletions

```diff
@@ -62,14 +62,12 @@
 
 vllm_version = version.parse(vllm.__version__)
 if vllm_version <= version.parse("0.11.0"):
-    from vllm.executor.ray_distributed_executor import RayDistributedExecutor
     from vllm.utils import FlexibleArgumentParser
 else:
     from vllm.utils.argparse_utils import FlexibleArgumentParser
-    from vllm.v1.executor.ray_executor import RayDistributedExecutor
 
 
-# Adding the envs you want to pass to the workers
+# Env vars to copy from the driver to Ray workers (must match fakequant_worker / vllm_ptq_utils).
 additional_env_vars = {
     "QUANT_DATASET",
     "QUANT_CALIB_SIZE",
@@ -82,7 +80,17 @@
     "TRUST_REMOTE_CODE",
 }
 
-RayDistributedExecutor.ADDITIONAL_ENV_VARS.update(additional_env_vars)
+try:
+    from vllm.executor.ray_distributed_executor import RayDistributedExecutor
+
+    RayDistributedExecutor.ADDITIONAL_ENV_VARS.update(additional_env_vars)
+except (ImportError, AttributeError):
+    # vLLM v1 Ray: vllm/ray/ray_env.py (get_env_vars_to_copy); merge with any user-set list.
+    extra_env_var = "VLLM_RAY_EXTRA_ENV_VARS_TO_COPY"
+    merged_env_vars = {
+        t.strip() for t in os.environ.get(extra_env_var, "").split(",") if t.strip()
+    } | additional_env_vars
+    os.environ[extra_env_var] = ",".join(sorted(merged_env_vars))
 
 
 def main():
```
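To make the merge semantics of the `except` branch concrete, a small standalone sketch (the env-var values are made up): names the user already listed in `VLLM_RAY_EXTRA_ENV_VARS_TO_COPY` survive, and the quantization variables are appended.

```python
import os

# Pretend the user already asked Ray to copy two of their own variables.
os.environ["VLLM_RAY_EXTRA_ENV_VARS_TO_COPY"] = "MY_TOKEN, HF_HOME"
additional_env_vars = {"QUANT_DATASET", "QUANT_CALIB_SIZE"}  # subset, for brevity

extra_env_var = "VLLM_RAY_EXTRA_ENV_VARS_TO_COPY"
merged_env_vars = {
    t.strip() for t in os.environ.get(extra_env_var, "").split(",") if t.strip()
} | additional_env_vars
os.environ[extra_env_var] = ",".join(sorted(merged_env_vars))

print(os.environ[extra_env_var])
# HF_HOME,MY_TOKEN,QUANT_CALIB_SIZE,QUANT_DATASET
```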

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 82 additions & 21 deletions

```diff
@@ -15,8 +15,11 @@
 
 """Support quantization for VLLM layers."""
 
+import contextvars
 import importlib
+from collections.abc import Callable
 from contextlib import contextmanager
+from functools import partial
 from itertools import chain
 
 import torch
@@ -85,6 +88,21 @@
 )
 
 vllm_fused_moe_package = importlib.import_module("vllm.model_executor.layers.fused_moe.fused_moe")
+# vLLM may call one entry (e.g. ``dispatch_fused_moe_kernel``) which then calls another on the same
+# module (e.g. ``invoke_fused_moe_triton_kernel``). Patching every name would otherwise apply fakequant
+# twice; see ``_moe_fakequant_active`` in ``invoke_fused_moe_quantized``.
+_FUSED_MOE_KERNEL_CANDIDATES = (
+    "invoke_fused_moe_kernel",
+    "invoke_fused_moe_triton_kernel",
+    "dispatch_fused_moe_kernel",
+)
+_FUSED_MOE_KERNEL_FUNCS = tuple(
+    n for n in _FUSED_MOE_KERNEL_CANDIDATES if hasattr(vllm_fused_moe_package, n)
+)
+
+_moe_fakequant_active: contextvars.ContextVar[bool] = contextvars.ContextVar(
+    "moe_fakequant_active", default=False
+)
 
 
 @contextmanager
```
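A short aside on why the guard is a `contextvars.ContextVar` rather than a plain module-level bool: each thread (and asyncio task) sees its own value, so concurrent `forward()` calls cannot clobber each other's flag. A minimal standalone demonstration (not vLLM code):

```python
import contextvars
import threading

flag = contextvars.ContextVar("flag", default=False)

def worker() -> None:
    token = flag.set(True)   # visible only inside this thread's context
    try:
        assert flag.get() is True
    finally:
        flag.reset(token)

t = threading.Thread(target=worker)
t.start()
t.join()
print(flag.get())  # False: the worker's set() never leaked into the main thread
```

The remaining hunks wire the wrapper and this guard into the kernel call and `forward()`: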
```diff
@@ -340,29 +358,64 @@ def invoke_fused_moe_quantized(
         B: torch.Tensor,  # noqa: N803
         C: torch.Tensor,  # noqa: N803
         *args,
+        original_kernel: Callable,
+        **kwargs,
+    ):
+        # Nested module-level entry (e.g. dispatch -> triton): call the real kernel once, no second quant.
+        if _moe_fakequant_active.get():
+            return original_kernel(A, B, C, *args, **kwargs)
+        token = _moe_fakequant_active.set(True)
+        try:
+            return self._invoke_fused_moe_quantized_function(
+                A, B, C, *args, original_kernel=original_kernel, **kwargs
+            )
+        finally:
+            _moe_fakequant_active.reset(token)
+
+    def _invoke_fused_moe_quantized_function(
+        self,
+        A: torch.Tensor,  # noqa: N803
+        B: torch.Tensor,  # noqa: N803
+        C: torch.Tensor,  # noqa: N803
+        *args,
+        original_kernel: Callable,
         **kwargs,
     ):
         if B is self.w13_weight:
             # First layer of expert
             A = self.w13_input_quantizer(A)  # noqa: N806
-            if self.w13_weight_quantizer.is_enabled:
-                original_weight = self.w13_weight
-                self.w13_weight = self.w13_weight_quantizer(self.w13_weight)
-                vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
-                self.w13_weight = original_weight
+            if self.w13_weight_quantizer.is_enabled:  # pragma: no cover
+                original_weight, self.w13_weight = (
+                    self.w13_weight,
+                    self.w13_weight_quantizer(self.w13_weight),
+                )
+                # In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
+                # quantized weight to the kernel.
+                B = self.w13_weight  # noqa: N806
+                try:
+                    original_kernel(A, B, C, *args, **kwargs)
+                finally:
+                    self.w13_weight = original_weight
             else:
-                vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
+                original_kernel(A, B, C, *args, **kwargs)
             if self.w13_output_quantizer.is_enabled:
                 C[:] = self.w13_output_quantizer(C)
         elif B is self.w2_weight:
             A = self.w2_input_quantizer(A)  # noqa: N806
-            if self.w2_weight_quantizer.is_enabled:
-                original_weight = self.w2_weight
-                self.w2_weight = self.w2_weight_quantizer(self.w2_weight)
-                vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
-                self.w2_weight = original_weight
+            if self.w2_weight_quantizer.is_enabled:  # pragma: no cover
+                original_weight, self.w2_weight = (
+                    self.w2_weight,
+                    self.w2_weight_quantizer(self.w2_weight),
+                )
+                # In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
+                # quantized weight to the kernel.
+                B = self.w2_weight  # noqa: N806
+                try:
+                    original_kernel(A, B, C, *args, **kwargs)
+                finally:
+                    self.w2_weight = original_weight
             else:
-                vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
+                original_kernel(A, B, C, *args, **kwargs)
             if self.w2_output_quantizer.is_enabled:
                 C[:] = self.w2_output_quantizer(C)
         else:
@@ -372,24 +425,31 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
         # This is again due to the bad coding of vLLM
         # fused_moe submodule is overwritten by the fused_moe function
         # so we need to import the fused_moe module explicitly
-        assert vllm_fused_moe_package.invoke_fused_moe_kernel is not None
+        assert _FUSED_MOE_KERNEL_FUNCS and all(
+            getattr(vllm_fused_moe_package, n, None) is not None for n in _FUSED_MOE_KERNEL_FUNCS
+        )
         # This context manager will conflict with torch.compile
         # with replace_function(
         #     vllm_fused_moe_package,
         #     "invoke_fused_moe_kernel",
         #     self.invoke_fused_moe_quantized,
         # ):
+        originals = {n: getattr(vllm_fused_moe_package, n) for n in _FUSED_MOE_KERNEL_FUNCS}
         try:
-            vllm_fused_moe_package._invoke_fused_moe_kernel = (  # type: ignore[attr-defined]
-                vllm_fused_moe_package.invoke_fused_moe_kernel
-            )
-            vllm_fused_moe_package.invoke_fused_moe_kernel = self.invoke_fused_moe_quantized  # type: ignore[attr-defined]
+            for n in _FUSED_MOE_KERNEL_FUNCS:
+                setattr(
+                    vllm_fused_moe_package,
+                    n,
+                    partial(
+                        self.invoke_fused_moe_quantized,
+                        original_kernel=originals[n],
+                    ),
+                )
             output = super().forward(hidden_states, router_logits)
             return output
         finally:
-            vllm_fused_moe_package.invoke_fused_moe_kernel = (  # type: ignore[attr-defined]
-                vllm_fused_moe_package._invoke_fused_moe_kernel
-            )
+            for n in _FUSED_MOE_KERNEL_FUNCS:
+                setattr(vllm_fused_moe_package, n, originals[n])
 
     @torch.no_grad()
     def fold_weight(self, keep_attrs: bool = False):
@@ -409,7 +469,8 @@ def fold_weight(self, keep_attrs: bool = False):
             )
             self.w2_weight_quantizer.disable()
 
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
 
 @QuantModuleRegistry.register({vllm_fused_moe_layer.FusedMoE: "vllm_FusedMoE"})
```
