@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,15 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Mirrors the published nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 hf_quant_config.json:
+# Approximately mirrors the published nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 hf_quant_config.json:
 # - MoE routed experts (mixer.experts.<N>.{up,down}_proj): NVFP4 W4A4 weight MSE, group_size 16
 # - MoE shared experts (mixer.shared_experts.{up,down}_proj): FP8 per-tensor
 # - Mamba mixer linears (mixer.{in,out}_proj): FP8 per-tensor
 # - KV cache: FP8
 # - Attention linears ({q,k,v}_proj): BF16 (not quantized)
-# - MTP head, lm_head, output, mamba conv1d: BF16 (not quantized)
-# - Latent MOE (fc1_latent_proj, fc2_latent_proj): BF16 (not quantized)
-# - SSM cache: FP32 (can be set to FP16 in VLLM)
+# - MTP head, lm_head, output, mamba conv1d: BF16 (not quantized)
+# - Latent MoE (fc1_latent_proj, fc2_latent_proj): BF16 (not quantized)
+# - SSM cache: FP32 (can be set to FP16 in vLLM)
 #
 # Calibration: weight MSE with FP8-scale sweep over the 128 e4m3 scale values
 # (NVFP4 weights use static block scales selected by MSE; FP8 per-tensor scales
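For intuition, here is a minimal Python sketch of the per-group scale search the calibration comment describes: for each group of 16 weights, try every positive finite e4m3 value as the block scale, fake-quantize the group to e2m1 (FP4), and keep the scale with the lowest weight MSE. The naive nearest rounding, the omission of a global per-tensor scale, and all names below are illustrative assumptions, not the calibrator's actual implementation.

import torch

# Candidate block scales: the positive, finite e4m3 values.
codes = torch.arange(256, dtype=torch.uint8).view(torch.float8_e4m3fn).float()
E4M3_SCALES = codes[(codes > 0) & torch.isfinite(codes)]

# Magnitudes representable in e2m1 (FP4).
FP4_GRID = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0])

def fake_quant_fp4(group: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Round group/scale to the nearest e2m1 magnitude, then rescale."""
    mags = (group / scale).abs().unsqueeze(-1)
    nearest = FP4_GRID[(mags - FP4_GRID).abs().argmin(dim=-1)]
    return nearest * group.sign() * scale

def mse_best_scale(group: torch.Tensor) -> torch.Tensor:
    """Sweep every candidate scale for one group of 16 weights; keep the MSE winner."""
    errs = torch.stack([(fake_quant_fp4(group, s) - group).pow(2).mean()
                        for s in E4M3_SCALES])
    return E4M3_SCALES[errs.argmin()]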
@@ -35,6 +35,8 @@ quantize:
   method: mse
   fp8_scale_sweep: true
   quant_cfg:
+    # Disable all quantizers by default so that the following layers stay in
+    # their original precision (BF16/FP32): lm_head, output projection, MoE
+    # routers/gates, MTP head, SSM state, mamba conv1d.
     - quantizer_name: '*'
       enable: false
 
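The entries that follow in the file re-enable specific quantizers on top of this default-off rule. As a rough illustration of those semantics, here is a Python sketch assuming wildcard patterns applied in file order with the last matching entry winning; the keys mirror the YAML, but the matching logic and the example pattern are assumptions, not the tool's source.

import fnmatch

# Rule list in file order; keys mirror the YAML above. The second entry is a
# hypothetical example of re-enabling one family of quantizers as FP8.
RULES = [
    {"quantizer_name": "*", "enable": False},
    {"quantizer_name": "*mixer.shared_experts*_proj*", "enable": True,
     "cfg": {"num_bits": "e4m3"}},
]

def effective_rule(quantizer_name: str) -> dict:
    """Walk the rules in order; assume the last matching entry wins."""
    chosen = {"enable": False}
    for rule in RULES:
        if fnmatch.fnmatchcase(quantizer_name, rule["quantizer_name"]):
            chosen = rule
    return chosen

# effective_rule("model.layers.3.mixer.shared_experts.up_proj.weight_quantizer")
# -> FP8 per-tensor; any name that only matches '*' stays disabled (BF16/FP32).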
@@ -97,5 +99,3 @@ quantize:
       cfg:
         num_bits: e4m3
 
-# Stay BF16: lm_head, output projection, MoE routers/gates, MTP head.
-# SSM state / mamba conv1d stay FP16.
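For the FP8 per-tensor entries (num_bits: e4m3), a minimal Python sketch of amax-based per-tensor scaling, assuming the scale simply maps a tensor's observed max magnitude onto e4m3's largest finite value (448); the calibrator may derive these scales differently.

import torch

E4M3_MAX = 448.0  # largest finite e4m3 magnitude

def fp8_per_tensor_scale(t: torch.Tensor) -> torch.Tensor:
    """One scale per tensor: map the observed max magnitude onto e4m3's range."""
    return t.abs().amax().clamp_min(1e-12) / E4M3_MAX

def fake_quant_fp8(t: torch.Tensor) -> torch.Tensor:
    """Round-trip through float8_e4m3fn and back, as a quick error check."""
    s = fp8_per_tensor_scale(t)
    return (t / s).to(torch.float8_e4m3fn).float() * s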