28 | 28 | # - Latent MOE (fc1_latent_proj, fc2_latent_proj): BF16 (not quantized) |
29 | 29 | # - SSM cache: FP32 (can be set to FP16 in VLLM) |
30 | 30 | # |
31 | | -# Calibration: amax/max calibration comparison variant. This skips MSE weight |
32 | | -# scale search and uses max calibration for enabled quantizers. |
| 31 | +# Calibration: amax/max calibration comparison variant |
33 | 32 | metadata: |
34 | 33 | recipe_type: ptq |
35 | | - description: Super NVFP4 mixed precision — sparse MoE experts NVFP4 (W4A4, group_size 16); shared experts, mamba in/out_proj, and Latent MOE fc1_latent_proj/fc2_latent_proj |
36 | | - FP8 per-tensor; FP8 KV cache; lm_head/MTP/SSM stay BF16/FP16. Amax calibration comparison variant. |
| 34 | + description: Super NVFP4 mixed precision — sparse MoE experts NVFP4 (W4A4, group_size 16); shared experts, mamba in/out_proj |
| 35 | + FP8 per-tensor; FP8 KV cache; everything else (lm_head/MTP/Latent MOE) stays BF16. Amax calibration comparison variant.
37 | 36 | quantize: |
38 | 37 | algorithm: |
39 | 38 | method: max |
40 | 39 | quant_cfg: |
| 40 | + # Disable all quantizers by default so that the following layers stay in their original BF16 precision:
| 41 | + # lm_head, output projection, MoE routers/gates, Latent MOE, MTP head, mamba conv1d. |
41 | 42 | - quantizer_name: '*' |
42 | 43 | enable: false |
43 | 44 |
44 | 45 | # MoE routed experts -> NVFP4 W4A4, block_size 16, e4m3 scale. |
45 | | - # Max/amax calibration uses dynamic block scales for both weight and activation. |
46 | 46 | # HF/export names: backbone.layers.*.mixer.experts.*.{up,down}_proj. |
47 | 47 | - quantizer_name: '*mixer.experts.*weight_quantizer' |
48 | 48 | enable: true |
@@ -129,6 +129,3 @@ quantize: |
129 | 129 | enable: true |
130 | 130 | cfg: |
131 | 131 | num_bits: e4m3 |
132 | | - |
133 | | - # Stay BF16: lm_head, output projection, MoE routers/gates, MTP head. |
134 | | - # SSM state / mamba conv1d stay FP16. |
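For readers skimming the diff, here is a minimal sketch of what the routed-expert NVFP4 entry described in the comments could look like once written out in full. This is an assumption-laden illustration, not a copy of the recipe: only `quantizer_name`, `enable`, `cfg`, and `num_bits: e4m3` (for the KV cache) appear verbatim in the visible hunks, and the FP4 `num_bits`/`block_sizes` encodings below are modeled on common NVFP4 quantizer configs.

```yaml
# Hypothetical expansion of the routed-expert weight quantizer entry
# (field names under cfg are assumptions, not taken from this file).
- quantizer_name: '*mixer.experts.*weight_quantizer'
  enable: true
  cfg:
    num_bits: [2, 1]      # FP4 (E2M1) weights -> the W4 half of W4A4
    block_sizes:
      -1: 16              # group_size 16 along the last dimension
    # per-block scales stored in e4m3, per the recipe comments above
```

The same pattern applied to `'*mixer.experts.*input_quantizer'` would give the A4 half of W4A4, while the leading `'*'` wildcard entry with `enable: false` keeps every quantizer that is not explicitly re-enabled in BF16.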