
Commit 4eb1835

Support for KV cache quantization for MLA Attention vLLM fakequant (#714)
## What does this PR do?

**Type of change:** Feature extension

**Overview:** Added support for quantizing the KV cache in vLLM fakequant by adding quantization support for [MLAAttention](https://github.com/vllm-project/vllm/blob/v0.11.1/vllm/attention/layer.py#L641).

## Usage

Please refer to the [README](https://github.com/NVIDIA/Model-Optimizer/tree/kinjal/vllm_att_quant/examples/vllm_serve#calibrate-and-serve-fake-quant-model-in-vllm).

```shell
KV_QUANT_CFG=NVFP4_KV_CFG QUANT_CFG=NVFP4_DEFAULT_CFG python vllm_serve_fakequant.py deepseek-ai/DeepSeek-V2 --served-model-name deepseek-ai/DeepSeek-V2 --host 0.0.0.0 --port 8001 --trust-remote-code --enforce-eager --gpu-memory-utilization 0.8
```

## Testing

Locally tested KV cache quantization; the quantized MLA block prints as:

```
(rotary_emb): DeepseekScalingRotaryEmbedding()
(mla_attn): MultiHeadLatentAttentionWrapper(
  (fused_qkv_a_proj): QuantMergedColumnParallelLinear(
    in_features=5120, output_features=2112, bias=False, tp_size=1, gather_output=False
    (input_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=141.0000 calibrator=MaxCalibrator quant)
    (weight_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=1.4297 calibrator=MaxCalibrator quant)
    (output_quantizer): TensorQuantizer(disabled)
  )
  (q_a_layernorm): RMSNorm(hidden_size=1536, eps=1e-06)
  (q_b_proj): QuantColumnParallelLinear(
    in_features=1536, output_features=3072, bias=False, tp_size=8, gather_output=False
    (input_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=32.0000 calibrator=MaxCalibrator quant)
    (weight_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=0.1670 calibrator=MaxCalibrator quant)
    (output_quantizer): TensorQuantizer(disabled)
  )
  (kv_a_layernorm): RMSNorm(hidden_size=512, eps=1e-06)
  (kv_b_proj): QuantColumnParallelLinear(
    in_features=512, output_features=4096, bias=False, tp_size=8, gather_output=False
    (input_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=7.5312 calibrator=MaxCalibrator quant)
    (weight_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=0.2773 calibrator=MaxCalibrator quant)
    (output_quantizer): TensorQuantizer(disabled)
  )
  (rotary_emb): DeepseekScalingRotaryEmbedding()
  (o_proj): QuantRowParallelLinear(
    in_features=2048, output_features=5120, bias=False, tp_size=8, reduce_results=True
    (input_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=1.7188 calibrator=MaxCalibrator quant)
    (weight_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=0.4336 calibrator=MaxCalibrator quant)
    (output_quantizer): TensorQuantizer(disabled)
  )
  (mla_attn): QuantMLAAttention(
    (q_bmm_quantizer): TensorQuantizer(disabled)
    (kv_c_bmm_quantizer): TensorQuantizer((2, 1) bit fake block_sizes={-1: 16, 'type': 'dynamic', 'scale_bits': (4, 3)}, amax=7.5312 calibrator=MaxCalibrator quant)
  )
)
```

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: No
- **Did you add or update any necessary documentation?**: NA
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: NA

## Additional Information

---

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
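For readers skimming the diffs below, the core of the change is a key remapping in the KV-cache quantization config when the model uses MLA. A minimal sketch of the intended effect (the inner quantizer settings here are illustrative placeholders, not the actual `NVFP4_KV_CFG` contents):

```python
# Illustrative KV-cache quantizer config before MLA-aware remapping; the
# "*[kv]_bmm_quantizer" pattern targets the regular k/v attention quantizers.
kv_quant_cfg = {
    "*[kv]_bmm_quantizer": {"num_bits": (2, 1), "enable": True},  # placeholder settings
}

# On a model containing vLLM MLAAttention modules, update_kv_cfg_for_mla()
# (added by this PR in fakequant_worker.py) copies that entry so the
# MLA-specific quantizers are covered too:
#   "*kv_c_bmm_quantizer" -> compressed KV latent
#   "*k_pe_bmm_quantizer" -> rotary positional key component
expected_after_remap = {
    "*[kv]_bmm_quantizer": {"num_bits": (2, 1), "enable": True},
    "*kv_c_bmm_quantizer": {"num_bits": (2, 1), "enable": True},
    "*k_pe_bmm_quantizer": {"num_bits": (2, 1), "enable": True},
}
```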
1 parent 8426c36 commit 4eb1835

3 files changed

Lines changed: 67 additions & 8 deletions


examples/vllm_serve/fakequant_worker.py

Lines changed: 38 additions & 7 deletions
```diff
@@ -149,12 +149,35 @@ def disable_compilation(model):
 quant_config: dict[str, Any] = {
     "dataset": os.environ.get("QUANT_DATASET", "cnn_dailymail"),
     "calib_size": int(os.environ.get("QUANT_CALIB_SIZE", 512)),
-    "quant_cfg": os.environ.get("QUANT_CFG", "NVFP4_DEFAULT_CFG"),
+    "quant_cfg": os.environ.get("QUANT_CFG", None),
     "kv_quant_cfg": os.environ.get("KV_QUANT_CFG", None),
     "amax_file_path": os.environ.get("AMAX_FILE_PATH", None),
 }
 
 
+def update_kv_cfg_for_mla(model: torch.nn.Module, kv_quant_cfg: dict[str, Any]) -> dict[str, Any]:
+    """Update KV cache quantization config for MLA models.
+
+    MLA uses `kv_c_bmm_quantizer` (compressed KV) instead of separate
+    `k_bmm_quantizer` and `v_bmm_quantizer`. This function copies the
+    config from `*[kv]_bmm_quantizer` to also cover `*kv_c_bmm_quantizer`.
+    """
+    try:
+        from vllm.attention.layer import MLAAttention
+    except ImportError:
+        return kv_quant_cfg
+
+    if not any(isinstance(m, MLAAttention) for m in model.modules()):
+        return kv_quant_cfg
+
+    if kv_config := kv_quant_cfg.get("*[kv]_bmm_quantizer"):
+        kv_quant_cfg["*kv_c_bmm_quantizer"] = kv_config
+        kv_quant_cfg["*k_pe_bmm_quantizer"] = kv_config
+        print("MLA detected: added *kv_c_bmm_quantizer and k_pe_bmm_quantizer config")
+
+    return kv_quant_cfg
+
+
 def _create_new_data_cls(data_cls, **kwargs):
     """vLLM's low-level API changes frequently. This function creates a class with parameters
     compatible with the different vLLM versions."""
@@ -236,16 +259,24 @@ def calibrate_loop(model: Any = None) -> None:
             if output is None:  # TODO: make this default when vllm <= 0.11 is outdated
                 self.sample_tokens(None)
 
-    quant_cfg = getattr(mtq, quant_config["quant_cfg"])
-    if quant_config["kv_quant_cfg"] is not None:
-        quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
-            quant_cfg, getattr(mtq, quant_config["kv_quant_cfg"])["quant_cfg"]
-        )
+    quant_cfg = {} if quant_config["quant_cfg"] is None else getattr(mtq, quant_config["quant_cfg"])
+    quant_kv_cfg = (
+        {} if quant_config["kv_quant_cfg"] is None else getattr(mtq, quant_config["kv_quant_cfg"])
+    )
 
     model = self.model_runner.model
     if hasattr(model, "unwrap"):
         model = model.unwrap()
 
+    # Check if model has MLA and update KV config accordingly
+    if quant_kv_cfg:
+        quant_kv_cfg["quant_cfg"] = update_kv_cfg_for_mla(model, quant_kv_cfg["quant_cfg"])
+
+    if quant_kv_cfg:
+        quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
+            quant_cfg, quant_kv_cfg["quant_cfg"]
+        )
+
     with disable_compilation(model):
         print("quantizing model...")
         mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
@@ -314,6 +345,6 @@ def determine_available_memory(self) -> int:
         return super().determine_available_memory()
 
     def compile_or_warm_up_model(self) -> None:
-        if quant_config["quant_cfg"]:
+        if quant_config["quant_cfg"] or quant_config["kv_quant_cfg"]:
             _fakequant_run_prolog_worker(self)
         super().compile_or_warm_up_model()
```
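Because the prolog now also runs when only `KV_QUANT_CFG` is set, KV-cache-only fake quantization becomes possible. A rough sketch of the resulting flow, assuming `model`, `calibrate_loop`, and `update_kv_cfg_for_mla` are in scope as in the worker above:

```python
import modelopt.torch.quantization as mtq

# KV-cache-only path: QUANT_CFG is unset, so the weight/activation config stays empty.
quant_cfg = {}
quant_kv_cfg = getattr(mtq, "NVFP4_KV_CFG")

# Remap KV keys for MLA models, then merge into the (empty) base config,
# mirroring what _fakequant_run_prolog_worker does.
quant_kv_cfg["quant_cfg"] = update_kv_cfg_for_mla(model, quant_kv_cfg["quant_cfg"])
quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(quant_cfg, quant_kv_cfg["quant_cfg"])

mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
```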

examples/vllm_serve/vllm_serve_fakequant.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -70,7 +70,13 @@
 
 
 # Adding the envs you want to pass to the workers
-additional_env_vars = {"QUANT_DATASET", "QUANT_CALIB_SIZE", "QUANT_CFG", "AMAX_FILE_PATH"}
+additional_env_vars = {
+    "QUANT_DATASET",
+    "QUANT_CALIB_SIZE",
+    "QUANT_CFG",
+    "AMAX_FILE_PATH",
+    "KV_QUANT_CFG",
+}
 
 RayDistributedExecutor.ADDITIONAL_ENV_VARS.update(additional_env_vars)
 
```
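On the worker side these forwarded variables are read back via `os.environ` into the `quant_config` dict shown in `fakequant_worker.py`; a minimal sketch of why the new entry matters:

```python
import os

# If the launcher did not forward KV_QUANT_CFG to the Ray worker, this lookup
# falls back to None and KV-cache fake quantization is silently skipped.
kv_quant_cfg_name = os.environ.get("KV_QUANT_CFG", None)
```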

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -40,6 +40,11 @@
     except ImportError:
         continue
 
+try:
+    from vllm.attention.layer import MLAAttention as VllmMLAAttention
+except ImportError:
+    VllmMLAAttention = None
+
 vllm_fused_moe_package = importlib.import_module("vllm.model_executor.layers.fused_moe.fused_moe")
 
 
@@ -281,3 +286,20 @@ class _QuantVLLMCrossAttention(_QuantVLLMAttention):
 @QuantModuleRegistry.register({EncoderOnlyAttention: "vllm_EncoderOnlyAttention"})
 class _QuantVLLMEncoderOnlyAttention(_QuantVLLMAttention):
     pass
+
+
+if VllmMLAAttention is not None:
+
+    @QuantModuleRegistry.register({VllmMLAAttention: "vllm_MLAAttention"})
+    class _QuantVLLMMLAAttention(QuantModule):
+        def _setup(self):
+            self.q_bmm_quantizer = TensorQuantizer()
+            self.kv_c_bmm_quantizer = TensorQuantizer()
+            self.k_pe_bmm_quantizer = TensorQuantizer()
+            self.parallel_state = create_parallel_state()
+
+        def forward(self, query, kv_c, k_pe, *args, **kwargs):
+            query = self.q_bmm_quantizer(query)
+            kv_c = self.kv_c_bmm_quantizer(kv_c)
+            k_pe = self.k_pe_bmm_quantizer(k_pe)
+            return super().forward(query, kv_c, k_pe, *args, **kwargs)
```
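After `mtq.quantize(...)` swaps in the registered wrapper, the new quantizers can be spotted directly on the model; a small sanity-check sketch (the printed reprs should resemble the Testing output in the PR description):

```python
# List every module carrying the MLA-specific quantizers added by this PR.
for name, module in model.named_modules():
    if hasattr(module, "kv_c_bmm_quantizer"):
        print(name, module.q_bmm_quantizer, module.kv_c_bmm_quantizer, module.k_pe_bmm_quantizer)
```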
