NVIDIA · KshitijLakhani · Mar 19, 2026 · Apr 9, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu
@@ -252,6 +252,9 @@ inline std::string grouped_gemm_skip_reason(const TestParams& params) {
     return "Grouped GEMM on Hopper (SM90) requires cuBLAS 13.4+, but run-time cuBLAS "
            "version is " + std::to_string(cublas_ver) + ".";
   }
+  if (cc == 120 || cc == 121) {
+    return "Grouped GEMM is currently unsupported on SM12x architectures.";
+  }
   if (params.recipe != InputRecipe::kBF16) {
     const bool is_blackwell_plus = cc >= blackwellComputeCapability;
     const bool fp8_block = is_fp8_block_recipe(params.recipe);

diff --git a/tests/pytorch/debug/run_distributed.py b/tests/pytorch/debug/run_distributed.py
@@ -48,6 +48,19 @@
 fp8_available = is_fp8_available()
 
 
+def _cmp_dist(ground_truth, output, parallel_mode):
+    if parallel_mode == "column" and torch.cuda.get_device_capability() == (12, 0):
+        # SM120: distributed column-parallel path may show a single-element
+        # activation outlier slightly above default fp32 atol, while grads match.
+        torch.testing.assert_close(
+            ground_truth["activation"], output["activation"], atol=1.2e-5, rtol=1.3e-6
+        )
+        torch.testing.assert_close(ground_truth["wgrad"], output["wgrad"])
+        torch.testing.assert_close(ground_truth["dgrad"], output["dgrad"])
+    else:
+        _cmp(ground_truth, output)
+
+
 def _get_tensors(parallel_mode, weight_seed=SEED, data_seed=SEED, tp_size=None, tp_rank=None):
     if tp_size is None:
         tp_size = WORLD_SIZE
@@ -445,7 +458,7 @@ def test_disable_fp8_gemms(fprop_fp8, dgrad_fp8, wgrad_fp8, parallel_mode, **kwa
 
     x.grad.zero_()
     ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode, **fp8_kwargs)
-    _cmp(ground_truth, output)
+    _cmp_dist(ground_truth, output, parallel_mode)
 
 
 @run_debug_test
@@ -466,7 +479,7 @@ def test_disable_fp8_layer(parallel_mode, **kwargs):
     y = _run_forward_backward(x, model, parallel_mode)
 
     output = {"activation": y.clone(), "wgrad": model.weight.grad.clone(), "dgrad": x.grad.clone()}
-    _cmp(ground_truth, output)
+    _cmp_dist(ground_truth, output, parallel_mode)
 
 
 @run_debug_test
@@ -554,7 +567,7 @@ def test_per_tensor_scaling(
         x, weight, parallel_mode=parallel_mode, loss_multiplier=LOSS_MULTIPLIER, **fp8_kwargs
     )
 
-    _cmp(ground_truth, output)
+    _cmp_dist(ground_truth, output, parallel_mode)
 
 
 @run_debug_test
@@ -617,7 +630,7 @@ def test_fake_quant_fp8(
         _get_current_scale(x, wgrad_input) if not fp8_kwargs["wgrad_fp8"] else None
     )
     ground_truth = _emulate_linear_distributed(x, weight, parallel_mode=parallel_mode, **fp8_kwargs)
-    _cmp(ground_truth, output)
+    _cmp_dist(ground_truth, output, parallel_mode)
 
 
 def _init_distributed():

diff --git a/tests/pytorch/distributed/run_layer_with_overlap.py b/tests/pytorch/distributed/run_layer_with_overlap.py
@@ -30,6 +30,11 @@
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", category=UserWarning)
 
+FP8_DEFAULT_RTOL_ATOL = (0.125, 0.0625)
+FP8_CS_SM120_DETERMINISTIC_RTOL_ATOL = (0.4, 0.25)
+BF16_DEFAULT_RTOL_ATOL = (0.025, 0.00125)
+BF16_SM120_DETERMINISTIC_OVERLAP_RTOL_ATOL = (0.05, 0.01)
+
 
 class multi_module_model(torch.nn.Module):
     def __init__(self, module, num_layers, *args, **kwargs):
@@ -551,9 +556,33 @@ def run_fwd_bwd(model, x):
 
         # Now validate accuracy
         if not bool(numerics_failed.item()):
+            is_sm120 = torch.cuda.get_device_capability() == (12, 0)
+            is_deterministic_mode = os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1") == "0"
             for i, (test_g, ref_g) in enumerate(zip(test_grads, ref_grads)):
-                rtol = 0.125 if opts.fp8 else 0.025
-                atol = 0.0625 if opts.fp8 else 0.00125
+                if opts.fp8:
+                    if (
+                        opts.quantization == "fp8_current_scaling"
+                        and is_sm120
+                        and is_deterministic_mode
+                    ):
+                        # SM120 deterministic mode disables fused attn, so rt uses alternate attn backends.
+                        # Combined with FP8 CS, this path needs the looser distributed fp8_cs tolerance policy.
+                        rtol, atol = FP8_CS_SM120_DETERMINISTIC_RTOL_ATOL
+                    else:
+                        rtol, atol = FP8_DEFAULT_RTOL_ATOL
+                else:
+                    rtol, atol = BF16_DEFAULT_RTOL_ATOL
+                    if (
+                        is_sm120
+                        and is_deterministic_mode
+                        and opts.layer_type == te.TransformerLayer
+                        and opts.num_layers > 1
+                        and opts.overlap_rs_dgrad
+                    ):
+                        # SM120 + deterministic training disables fused attn .
+                        # Rt then selects an alternate attn backend, and
+                        # the overlap path can show tiny BF16 accumulation-order drift vs reference.
+                        rtol, atol = BF16_SM120_DETERMINISTIC_OVERLAP_RTOL_ATOL
                 grad_failed, grad_info = _compare_tensors(names[i], test_g, ref_g, rtol, atol)
                 dist_print(grad_info, src=WORLD_RANK, error=grad_failed)
                 numerics_failed[0] = int(grad_failed)

diff --git a/tests/pytorch/nvfp4/test_nvfp4_group_quantize_graph_safe.py b/tests/pytorch/nvfp4/test_nvfp4_group_quantize_graph_safe.py
@@ -28,6 +28,19 @@
 recipe_available, reason_for_no_recipe = te.is_nvfp4_available(return_reason=True)
 
 
+def _reference_scale_for_layout(
+    ref_unswizzled: torch.Tensor,
+    split_m: int,
+    n: int,
+    columnwise: bool,
+    with_gemm_swizzled_scales: bool,
+) -> torch.Tensor:
+    """Return reference scale in expected backend-reported layout."""
+    if with_gemm_swizzled_scales:
+        return swizzle_nvfp4_scale(split_m, n, ref_unswizzled.clone(), columnwise=columnwise)
+    return ref_unswizzled
+
+
 def fused_grouped_quantize(
     x: torch.Tensor, split_section_tensor: torch.Tensor, quantizer: NVFP4Quantizer
 ):
@@ -56,7 +69,6 @@ def check_grouped_tensor_nvfp4_versus_reference(
 ) -> None:
 
     te_dtype = tex.DType.kFloat4E2M1
-
     split_section_tensor = torch.tensor(split_sections, dtype=torch.int64, device="cuda")
 
     # Setup device and random seed
@@ -98,6 +110,14 @@ def check_grouped_tensor_nvfp4_versus_reference(
     group_quantized_output = fused_grouped_quantize(x, split_section_tensor, grouped_quantizer)
     # get a list of nvfp4 quantized tensors for testing
     split_quantize_outputs = group_quantized_output.split_into_quantized_tensors()
+    expected_swizzled_layout = bool(group_quantized_output._with_gemm_swizzled_scales)
+    for i, output in enumerate(split_quantize_outputs):
+        split_flag = bool(output._with_gemm_swizzled_scales)
+        assert split_flag == expected_swizzled_layout, (
+            "Grouped output and split output disagree on swizzled-scale metadata "
+            f"(split {i}: grouped={expected_swizzled_layout}, split={split_flag})"
+        )
+    scale_atol, scale_rtol = 0.0, 0.0
 
     if return_rowwise:
         x_qx = [output._rowwise_data.view(dtype=torch.uint8) for output in split_quantize_outputs]
@@ -121,11 +141,15 @@ def check_grouped_tensor_nvfp4_versus_reference(
                 ), "The scale shape is not correctly aligned"
                 x_sx_i = x_sx[i].clone()
                 x_sx_ref_i = x_sx_ref[i].clone()
-                if optimize_for_gemm:
-                    x_sx_ref_i = swizzle_nvfp4_scale(
-                        split_sections[i], N, x_sx_ref_i, columnwise=False
-                    )
-                torch.testing.assert_close(x_sx_i, x_sx_ref_i, atol=0.0, rtol=0.0)
+                # Swizzle the reference scale based on expected_swizzled_layout
+                x_sx_ref_i = _reference_scale_for_layout(
+                    ref_unswizzled=x_sx_ref_i,
+                    split_m=split_sections[i],
+                    n=N,
+                    columnwise=False,
+                    with_gemm_swizzled_scales=expected_swizzled_layout,
+                )
+                torch.testing.assert_close(x_sx_i, x_sx_ref_i, atol=scale_atol, rtol=scale_rtol)
 
     if return_transpose:
         x_qx_t = [
@@ -151,11 +175,14 @@ def check_grouped_tensor_nvfp4_versus_reference(
                 ), "The scale shape is not correctly aligned"
                 x_sx_t_i = x_sx_t[i].clone()
                 x_sx_t_ref_i = x_sx_t_ref[i].clone()
-                if optimize_for_gemm:
-                    x_sx_t_ref_i = swizzle_nvfp4_scale(
-                        split_sections[i], N, x_sx_t_ref_i, columnwise=True
-                    )
-                torch.testing.assert_close(x_sx_t_i, x_sx_t_ref_i, atol=0.0, rtol=0.0)
+                x_sx_t_ref_i = _reference_scale_for_layout(
+                    ref_unswizzled=x_sx_t_ref_i,
+                    split_m=split_sections[i],
+                    n=N,
+                    columnwise=True,
+                    with_gemm_swizzled_scales=expected_swizzled_layout,
+                )
+                torch.testing.assert_close(x_sx_t_i, x_sx_t_ref_i, atol=scale_atol, rtol=scale_rtol)
 
 
 def check_grouped_tensor_nvfp4_with_paged_stashing(
@@ -173,7 +200,6 @@ def check_grouped_tensor_nvfp4_with_paged_stashing(
 ) -> None:
 
     te_dtype = tex.DType.kFloat4E2M1
-
     assert valid_M is not None, "valid_M must be provided when with_paged_stashing is True"
     assert valid_M < M, "valid_M must be less than M when with_paged_stashing is True"
 
@@ -225,6 +251,14 @@ def check_grouped_tensor_nvfp4_with_paged_stashing(
 
     # get a list of nvfp4 quantized tensors for testing
     split_quantize_outputs = group_quantized_output.split_into_quantized_tensors()
+    expected_swizzled_layout = bool(group_quantized_output._with_gemm_swizzled_scales)
+    for i, output in enumerate(split_quantize_outputs):
+        split_flag = bool(output._with_gemm_swizzled_scales)
+        assert split_flag == expected_swizzled_layout, (
+            "Grouped output and split output disagree on swizzled-scale metadata "
+            f"(split {i}: grouped={expected_swizzled_layout}, split={split_flag})"
+        )
+    scale_atol, scale_rtol = 0.0, 0.0
 
     if return_rowwise:
         x_qx = [output._rowwise_data.view(dtype=torch.uint8) for output in split_quantize_outputs]
@@ -248,11 +282,15 @@ def check_grouped_tensor_nvfp4_with_paged_stashing(
                 ), "The scale shape is not correctly aligned"
                 x_sx_i = x_sx[i].clone()
                 x_sx_ref_i = x_sx_ref[i].clone()
-                if optimize_for_gemm:
-                    x_sx_ref_i = swizzle_nvfp4_scale(
-                        split_sections[i], N, x_sx_ref_i, columnwise=False
-                    )
-                torch.testing.assert_close(x_sx_i, x_sx_ref_i, atol=0.0, rtol=0.0)
+                # Swizzle the reference scale based on expected swizzled layout
+                x_sx_ref_i = _reference_scale_for_layout(
+                    ref_unswizzled=x_sx_ref_i,
+                    split_m=split_sections[i],
+                    n=N,
+                    columnwise=False,
+                    with_gemm_swizzled_scales=expected_swizzled_layout,
+                )
+                torch.testing.assert_close(x_sx_i, x_sx_ref_i, atol=scale_atol, rtol=scale_rtol)
 
     if return_transpose:
         x_qx_t = [
@@ -275,11 +313,14 @@ def check_grouped_tensor_nvfp4_with_paged_stashing(
                 valid_scale_shape = get_nvfp4_scale_shape_no_padding(x_splits[i].shape, True)
                 x_sx_t_i = x_sx_t[i].clone()
                 x_sx_t_ref_i = x_sx_t_ref[i].clone()
-                if optimize_for_gemm:
-                    x_sx_t_ref_i = swizzle_nvfp4_scale(
-                        split_sections[i], N, x_sx_t_ref_i, columnwise=True
-                    )
-                torch.testing.assert_close(x_sx_t_i, x_sx_t_ref_i, atol=0.0, rtol=0.0)
+                x_sx_t_ref_i = _reference_scale_for_layout(
+                    ref_unswizzled=x_sx_t_ref_i,
+                    split_m=split_sections[i],
+                    n=N,
+                    columnwise=True,
+                    with_gemm_swizzled_scales=expected_swizzled_layout,
+                )
+                torch.testing.assert_close(x_sx_t_i, x_sx_t_ref_i, atol=scale_atol, rtol=scale_rtol)
 
 
 @pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
@@ -402,6 +443,11 @@ def test_grouped_tensor_nvfp4_with_paged_stashing(
     with_rht: bool,
     optimize_for_gemm: bool,
 ) -> None:
+    if torch.cuda.get_device_capability() == (12, 0):
+        pytest.skip(
+            "SM120: paged-stashing grouped NVFP4 path is currently unsupported. "
+            "group_hadamard_transform_amax assumes sum(split_sections) == input rows)."
+        )
 
     # paged stashing means that the sum of total tokens is less than
     # or equal to the buffer size, you can have buffer [2048, 1024]

diff --git a/tests/pytorch/nvfp4/test_nvfp4_sr_quantize.py b/tests/pytorch/nvfp4/test_nvfp4_sr_quantize.py
@@ -14,11 +14,31 @@
 
 recipe_available, reason_for_no_recipe = te.is_nvfp4_available(return_reason=True)
 
+SM120_SR_EQUIVALENCE_ATOL = 2e-7
+
 seed = 12345
 torch.manual_seed(seed)
 torch.cuda.manual_seed(seed)
 
 
+def _assert_sr_vs_rn_behavior(
+    me_sr: torch.Tensor,
+    me_rn: torch.Tensor,
+    me_t_sr: torch.Tensor,
+    me_t_rn: torch.Tensor,
+) -> None:
+    if torch.cuda.get_device_capability() == (12, 0):
+        # SM120 currently disables NVFP4 stochastic rounding in backend paths,
+        # so SR and RN should be numerically equivalent.
+        torch.testing.assert_close(me_sr, me_rn, atol=SM120_SR_EQUIVALENCE_ATOL, rtol=0.0)
+        torch.testing.assert_close(me_t_sr, me_t_rn, atol=SM120_SR_EQUIVALENCE_ATOL, rtol=0.0)
+    else:
+        assert me_sr < me_rn, "Stochastic rounding failed - error larger than the round to nearest."
+        assert (
+            me_t_sr < me_t_rn
+        ), "Stochastic rounding failed - error larger than the round to nearest."
+
+
 def unpack_fp4(x: torch.Tensor) -> torch.Tensor:
     repeated = x.repeat_interleave(2, dim=1)
     repeated[:, 0::2] &= 0x0F
@@ -247,7 +267,7 @@ def check_quantization_nvfp4_versus_reference(
     me_t_rn = torch.sqrt((error_t_rn * error_t_rn).mean())
     sr_result = torch.zeros_like(x).float()
     sr_t_result = torch.zeros_like(x).float().t().contiguous()
-    for i in range(n_iters):
+    for _ in range(n_iters):
         q_sr, s_sr, q_t_sr, s_t_sr = quantize_fp4(
             x, use_stochastic_rounding=True, use_2D=use_2D, use_RHT=use_RHT
         )
@@ -278,8 +298,7 @@ def check_quantization_nvfp4_versus_reference(
 
     print(f"RMSE SR: {me_sr:.3e} | RMSE RN: {me_rn:.3e}")
     print(f"RMSE SR_t: {me_t_sr:.3e} | RMSE RN_t: {me_t_rn:.3e}")
-    assert me_sr < me_rn, "Stochastic rounding failed - error larger than the round to nearest."
-    assert me_t_sr < me_t_rn, "Stochastic rounding failed - error larger than the round to nearest."
+    _assert_sr_vs_rn_behavior(me_sr, me_rn, me_t_sr, me_t_rn)
 
 
 def check_group_quantization_nvfp4_versus_reference(
@@ -362,10 +381,7 @@ def check_group_quantization_nvfp4_versus_reference(
 
         print(f"RMSE SR: {me_sr:.3e} | RMSE RN: {me_rn:.3e}")
         print(f"RMSE SR_t: {me_t_sr:.3e} | RMSE RN_t: {me_t_rn:.3e}")
-        assert me_sr < me_rn, "Stochastic rounding failed - error larger than the round to nearest."
-        assert (
-            me_t_sr < me_t_rn
-        ), "Stochastic rounding failed - error larger than the round to nearest."
+        _assert_sr_vs_rn_behavior(me_sr, me_rn, me_t_sr, me_t_rn)
 
 
 @pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)

diff --git a/tests/pytorch/test_custom_recipe.py b/tests/pytorch/test_custom_recipe.py
@@ -128,7 +128,7 @@ def test_custom_recipe_grouped_linear_sanity():
     in_features = 64
     out_features = 64
     # Each per-GEMM M dim must be a multiple of 16 to satisfy cuBLAS FP8 GEMM's
-    # leading-dimension alignment requirement on Hopper (sm_90).
+    # leading-dimension alignment requirement on Hopper and SM120 paths.
     m_splits = [16] * num_gemms
     batch = sum(m_splits)
 
@@ -281,7 +281,7 @@ def test_custom_recipe_factory_invocation_counts_and_cycling():
     in_features = 64
     out_features = 64
     # batch must be a multiple of 16 to satisfy cuBLAS FP8 GEMM's leading-dim
-    # alignment requirement on Hopper (sm_90).
+    # alignment requirement on Hopper and SM120 paths.
     batch = 16
 
     op = Linear(in_features, out_features, params_dtype=torch.bfloat16)