
Commit d541324

Disable QKV NVFP4 quantization for Qwen3 MOE (#735)
## What does this PR do?

**Type of change:** Recipe improvement

**Overview:** Disable QKV NVFP4 quantization for Qwen3 MOE models, following the Qwen3 Next recipe, to recover accuracy.

## Testing

Model accuracy benchmarking

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
1 parent b655321 commit d541324

2 files changed: 2 additions & 1 deletion


examples/llm_ptq/example_utils.py (1 addition & 1 deletion)

@@ -180,7 +180,7 @@ def build_quant_cfg(
        quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
        quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}

-    if model_type == "qwen3next" and qformat == "nvfp4":
+    if model_type in ["qwen3moe", "qwen3next"] and qformat == "nvfp4":
        # Disable the attention projection layers to retain accuracy
        quant_cfg["quant_cfg"]["model*.*attn*in_proj*"] = {"enable": False}
        quant_cfg["quant_cfg"]["model*.*attn*q_proj*"] = {"enable": False}
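The wildcard keys set to `{"enable": False}` above act as name-pattern filters over the model's modules. A minimal sketch of how such patterns could be resolved against module names using `fnmatch`-style matching (this is an illustrative assumption, not ModelOpt's actual matching code):

```python
from fnmatch import fnmatch

# Hypothetical quant_cfg fragment mirroring the patterns in the diff above.
quant_cfg = {
    "*image*": {"enable": False},
    "*vision*": {"enable": False},
    "model*.*attn*in_proj*": {"enable": False},
    "model*.*attn*q_proj*": {"enable": False},
}

def is_quant_enabled(module_name: str, cfg: dict) -> bool:
    """Return False if any disabling pattern matches the module name."""
    for pattern, opts in cfg.items():
        if fnmatch(module_name, pattern) and not opts.get("enable", True):
            return False
    return True

# Attention q_proj layers are excluded from quantization; MLP layers are not.
print(is_quant_enabled("model.layers.0.self_attn.q_proj", quant_cfg))  # False
print(is_quant_enabled("model.layers.0.mlp.gate_proj", quant_cfg))     # True
```

With this change, the same attention-projection exclusions previously applied only to `qwen3next` now also apply to `qwen3moe` under the `nvfp4` format.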

modelopt/torch/export/model_utils.py (1 addition & 0 deletions)

@@ -29,6 +29,7 @@
    "MPT": "mpt",
    "Bloom": "bloom",
    "ChatGLM": "chatglm",
+   "Qwen3Moe": "qwen3moe",
    "Qwen3Next": "qwen3next",
    "QWen": "qwen",
    "RecurrentGemma": "recurrentgemma",

0 commit comments
