28 | 28 | # - Latent MOE (fc1_latent_proj, fc2_latent_proj): BF16 (not quantized) |
29 | 29 | # - SSM cache: FP32 (can be set to FP16 in VLLM) |
30 | 30 | # |
31 | | -# Calibration: amax/max calibration comparison variant. This skips MSE weight |
32 | | -# scale search and uses max calibration for enabled quantizers. |
| 31 | +# Calibration: amax/max calibration comparison variant |
33 | 32 | metadata: |
34 | 33 | recipe_type: ptq |
35 | | - description: Super NVFP4 mixed precision — sparse MoE experts NVFP4 (W4A4, group_size 16); shared experts, mamba in/out_proj, and Latent MOE fc1_latent_proj/fc2_latent_proj |
36 | | - FP8 per-tensor; FP8 KV cache; lm_head/MTP/SSM stay BF16/FP16. Amax calibration comparison variant. |
| 34 | + description: Super NVFP4 mixed precision — sparse MoE experts NVFP4 (W4A4, group_size 16); shared experts, mamba in/out_proj |
| 35 | + FP8 per-tensor; FP8 KV cache; everything else (lm_head/MTP/Latent MOE) stays BF16. Amax calibration comparison variant.
37 | 36 | quantize: |
38 | 37 | algorithm: |
39 | 38 | method: max |
40 | 39 | quant_cfg: |
| 40 | + # Disable all quantizers by default so that the following layers stay in their original BF16 precision:
| 41 | + # lm_head, output projection, MoE routers/gates, Latent MOE, MTP head, mamba conv1d. |
41 | 42 | - quantizer_name: '*' |
42 | 43 | enable: false |
43 | 44 |
44 | 45 | # MoE routed experts -> NVFP4 W4A4, block_size 16, e4m3 scale. |
45 | | - # Max/amax calibration uses dynamic block scales for both weight and activation. |
46 | 46 | # HF/export names: backbone.layers.*.mixer.experts.*.{up,down}_proj. |
47 | 47 | - quantizer_name: '*mixer.experts.*weight_quantizer' |
48 | 48 | enable: true |
@@ -129,6 +129,3 @@ quantize: |
129 | 129 | enable: true |
130 | 130 | cfg: |
131 | 131 | num_bits: e4m3 |
132 | | - |
133 | | - # Stay BF16: lm_head, output projection, MoE routers/gates, MTP head. |
134 | | - # SSM state / mamba conv1d stay FP16. |
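For readers skimming the diff, here is a minimal sketch of what the routed-expert NVFP4 entry described in the comments could look like once written out in full. This is an assumption-laden illustration, not a copy of the recipe: only `quantizer_name`, `enable`, `cfg`, and `num_bits: e4m3` (for the KV cache) appear verbatim in the visible hunks, and the FP4 `num_bits`/`block_sizes` encodings below are modeled on common NVFP4 quantizer configs.

```yaml
# Hypothetical expansion of the routed-expert weight quantizer entry
# (field names under cfg are assumptions, not taken from this file).
- quantizer_name: '*mixer.experts.*weight_quantizer'
  enable: true
  cfg:
    num_bits: [2, 1]      # FP4 (E2M1) weights -> the W4 half of W4A4
    block_sizes:
      -1: 16              # group_size 16 along the last dimension
    # per-block scales stored in e4m3, per the recipe comments above
```

The same pattern applied to `'*mixer.experts.*input_quantizer'` would give the A4 half of W4A4, while the leading `'*'` wildcard entry with `enable: false` keeps every quantizer that is not explicitly re-enabled in BF16.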