NVIDIA · denera · Dec 2, 2025 · Dec 2, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/build_tools/pytorch.py b/build_tools/pytorch.py
@@ -5,6 +5,7 @@
 """PyTorch related extensions."""
 import os
 from pathlib import Path
+from importlib import metadata
 
 import setuptools
 
@@ -87,6 +88,15 @@ def setup_pytorch_extension(
         libraries.append("nvshmem_host")
         cxx_flags.append("-DNVTE_ENABLE_NVSHMEM")
 
+    if bool(int(os.getenv("NVTE_WITH_CUBLASMP", "0"))):
+        # Creating a cuBlasMp context requires direct access to the underlying NCCL
+        # communicator in a tensor-parallel process group. The header for ProcessGroupNCCL
+        # needs this CPP directive to be included properly.
+        cxx_flags.append("-DNVTE_WITH_CUBLASMP")
+        torch_lib_path = metadata.distribution("torch").locate_file("torch/lib")
+        library_dirs.append(str(torch_lib_path))
+        libraries.append("torch_cuda")
+
     # Construct PyTorch CUDA extension
     sources = [str(path) for path in sources]
     include_dirs = [str(path) for path in include_dirs]

diff --git a/examples/jax/collective_gemm/common.py b/examples/jax/collective_gemm/common.py
@@ -128,6 +128,7 @@ def _initialize_distributed(args):
         num_devices_per_process=devices_per_process,
         process_id=args.process_id,
         tensor_parallel_size=args.tensor_parallel_size,
+        use_cublasmp=args.use_cublasmp,
     )
 
 
@@ -224,5 +225,11 @@ def cgemm_parser(description="Collective GEMM test on multi-GPU with tensor para
     parser.add_argument(
         "--enable-result-check", action="store_true", default=True, help="Enable result checking"
     )
+    parser.add_argument(
+        "--use-cublasmp",
+        action="store_true",
+        default=False,
+        help="Use the cuBLASMp backend for overlapping collective operations with GEMM computation",
+    )
 
     return parser
diff --git a/examples/jax/collective_gemm/run_test_cgemm.sh b/examples/jax/collective_gemm/run_test_cgemm.sh
@@ -93,50 +93,69 @@ for TEST_CASE in "${TEST_CASES[@]}"; do
   # Clear PIDs array for this test case
   PIDS=()
 
-  for i in $(seq 0 $(($NUM_GPUS - 1))); do
-    # Define output file for logs
-    LOG_FILE="${TEST_NAME}_gpu_${i}.log"
-
-    if [ $i -eq 0 ]; then
-      # For process 0: show live output AND save to log file using tee
-      echo "=== Live output from process 0 ==="
-      pytest -s -c "$TE_PATH/tests/jax/pytest.ini" \
-        -vs --junitxml=$XML_LOG_DIR/collective_gemm_${TEST_NAME}.xml \
-        "$TE_PATH/examples/jax/collective_gemm/$TEST_CASE" \
-        --num-processes=$NUM_GPUS \
-        --process-id=$i 2>&1 | tee "$LOG_FILE" &
-      PID=$!
-      PIDS+=($PID)
+  BACKENDS=("userbuffers" "cublasmp")
+  for BACKEND in "${BACKENDS[@]}"; do
+    echo "Setting backend to $BACKEND for test $TEST_NAME"
+
+    for i in $(seq 0 $(($NUM_GPUS - 1))); do
+      # Define output file for logs
+      LOG_FILE="${TEST_NAME}_gpu_${i}_${BACKEND}.log"
+
+      test_case_args=(
+
+        "--num-processes=$NUM_GPUS"
+        "--process-id=$i"
+      )
+      if [ "$BACKEND" == "cublasmp" ]; then
+        test_case_args+=("--use-cublasmp")
+      fi
+
+      pytest_args=(
+        "-s"
+        "-c" "$TE_PATH/tests/jax/pytest.ini"
+        "-vs"
+      )
+      if [ $i -eq 0 ]; then
+        # For process 0: show live output AND save to log file using tee
+        echo "=== Live output from process 0 ==="
+        pytest_args+=("--junitxml=${XML_LOG_DIR}/${TEST_NAME}_gpu_${i}_${BACKEND}.xml")
+        pytest "${pytest_args[@]}" \
+          "$TE_PATH/examples/jax/collective_gemm/$TEST_CASE" \
+          "${test_case_args[@]}" 2>&1 | tee "$LOG_FILE" &
+        PID=$!
+        PIDS+=($PID)
+      else
+        # For other processes: redirect to log files only
+        pytest "${pytest_args[@]}" \
+          "$TE_PATH/examples/jax/collective_gemm/$TEST_CASE" \
+          "${test_case_args[@]}" > "$LOG_FILE" 2>&1 &
+        PID=$!
+        PIDS+=($PID)
+      fi
+    done
+
+    # Wait for all processes to finish
+    wait
+
+    # Check and print the log content from process 0
+    if grep -q "SKIPPED" "${TEST_NAME}_gpu_0_${BACKEND}.log"; then
+      echo "... $TEST_CASE SKIPPED"
+    elif grep -q "FAILED" "${TEST_NAME}_gpu_0_${BACKEND}.log"; then
+      echo "... $TEST_CASE FAILED"
+      HAS_FAILURE=1
+    elif grep -q "PASSED" "${TEST_NAME}_gpu_0_${BACKEND}.log"; then
+      echo "... $TEST_CASE PASSED"
     else
-      # For other processes: redirect to log files only
-      pytest -s -c "$TE_PATH/tests/jax/pytest.ini" \
-        -vs "$TE_PATH/examples/jax/collective_gemm/$TEST_CASE" \
-        --num-processes=$NUM_GPUS \
-        --process-id=$i > "$LOG_FILE" 2>&1 &
-      PID=$!
-      PIDS+=($PID)
+      echo "... $TEST_CASE INVALID"
+      HAS_FAILURE=1
     fi
-  done
 
-  # Wait for all processes to finish
-  wait
-
-  # Check and print the log content from process 0
-  if grep -q "SKIPPED" "${TEST_NAME}_gpu_0.log"; then
-    echo "... $TEST_CASE SKIPPED"
-  elif grep -q "FAILED" "${TEST_NAME}_gpu_0.log"; then
-    echo "... $TEST_CASE FAILED"
-    HAS_FAILURE=1
-  elif grep -q "PASSED" "${TEST_NAME}_gpu_0.log"; then
-    echo "... $TEST_CASE PASSED"
-  else
-    echo "... $TEST_CASE INVALID"
-    HAS_FAILURE=1
-  fi
-
-  # Remove the log files after processing them
-  wait
-  rm ${TEST_NAME}_gpu_*.log
+
+    # Remove the log files after processing them
+    wait
+    rm ${TEST_NAME}_gpu_*_${BACKEND}.log
+
+  done
 done
 
 wait

diff --git a/tests/pytorch/distributed/run_gemm_with_overlap.py b/tests/pytorch/distributed/run_gemm_with_overlap.py
@@ -151,6 +151,9 @@ def _parse_args(argv=None, namespace=None):
     parser.add_argument(
         "--use-cuda-graphs", action="store_true", default=False, help="Use CUDA graphs."
     )
+    parser.add_argument(
+        "--use-cublasmp", action="store_true", default=False, help="Use cuBLASMp backend."
+    )
     parser.add_argument(
         "-v", "--verbose", action="store_true", default=False, help="Verbose info messages."
     )
@@ -203,6 +206,7 @@ def _main(opts):
         capture_output=True,
         text=True,
         shell=True,
+        check=False,
     )
 
     if result.stdout == "0":  # Extra checks for non-MNNVL platforms
@@ -306,7 +310,7 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
     helper = (
         tex.CommOverlapHelper()
         if tex.ubuf_built_with_mpi()
-        else tex.CommOverlapHelper(bootstrap_pg)
+        else tex.CommOverlapHelper(bootstrap_pg, tp_group)
     )
 
     # Initialize userbuffers with (M, N) buffer
@@ -323,47 +327,75 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
     ):
         buffer_dtype = torch.uint8
     ub_obj = (
-        tex.CommOverlapP2P(
-            (outer_size, hidden_size),
-            buffer_dtype,
-            helper,
-            tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
-            opts.comm_type,
-            set_sm_margin=opts.comm_type == tex.CommOverlapType.RS or opts.atomic,
-            atomic_gemm=opts.atomic,
-            aggregate=opts.aggregate,
-            use_ce=not (opts.atomic and bool(int(os.getenv("NVTE_AG_P2P_MULTI_ATOMIC", "0")))),
+        (
+            tex.CommOverlapP2P(
+                (outer_size, hidden_size),
+                buffer_dtype,
+                helper,
+                tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
+                opts.comm_type,
+                set_sm_margin=opts.comm_type == tex.CommOverlapType.RS or opts.atomic,
+                atomic_gemm=opts.atomic,
+                aggregate=opts.aggregate,
+                use_ce=not (opts.atomic and bool(int(os.getenv("NVTE_AG_P2P_MULTI_ATOMIC", "0")))),
+            )
+            if not opts.use_cublasmp
+            else tex.CommOverlapP2P(
+                helper,
+                tp_rank,
+                tp_size,
+                num_comm_sm=3,
+                atomic_gemm=opts.atomic,
+            )
         )
         if opts.p2p
-        else tex.CommOverlap(
-            (outer_size, hidden_size),
-            buffer_dtype,
-            helper,
-            tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
-            atomic_gemm=opts.atomic,
+        else (
+            tex.CommOverlap(
+                (outer_size, hidden_size),
+                buffer_dtype,
+                helper,
+                tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
+                atomic_gemm=opts.atomic,
+            )
+            if not opts.use_cublasmp
+            else tex.CommOverlap(
+                helper,
+                tp_rank,
+                tp_size,
+                num_comm_sm=16,
+                atomic_gemm=opts.atomic,
+            )
         )
     )
 
     # Numerical check on AG + atomic GEMM requires testing an AG+RS pair
     ub_obj2 = None
     if opts.atomic and opts.comm_type == tex.CommOverlapType.AG and opts.check_numerics:
         ub_obj2 = (
-            tex.CommOverlapP2P(
-                (outer_size, hidden_size),
-                torch.uint8 if opts.fp8_output else torch.bfloat16,
-                helper,
-                tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
-                tex.CommOverlapType.RS,
-                set_sm_margin=True,
-                atomic_gemm=True,
+            (
+                tex.CommOverlapP2P(
+                    (outer_size, hidden_size),
+                    torch.uint8 if opts.fp8_output else torch.bfloat16,
+                    helper,
+                    tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
+                    tex.CommOverlapType.RS,
+                    set_sm_margin=True,
+                    atomic_gemm=True,
+                )
+                if not opts.use_cublasmp
+                else tex.CommOverlapP2P(helper, tp_rank, tp_size, num_comm_sm=3, atomic_gemm=True)
             )
             if opts.atomic_rs_p2p
-            else tex.CommOverlap(
-                (outer_size, hidden_size),
-                torch.uint8 if opts.fp8_output else torch.bfloat16,
-                helper,
-                tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
-                atomic_gemm=True,
+            else (
+                tex.CommOverlap(
+                    (outer_size, hidden_size),
+                    torch.uint8 if opts.fp8_output else torch.bfloat16,
+                    helper,
+                    tp_size,  # Tensor-parallel group size (may be different than LOCAL_SIZE)
+                    atomic_gemm=True,
+                )
+                if not opts.use_cublasmp
+                else tex.CommOverlap(helper, tp_rank, tp_size, num_comm_sm=16, atomic_gemm=True)
             )
         )
 
@@ -408,7 +440,7 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
         mean=0.0,
         std=opts.std,
     )
-    if ub_obj2 is not None:
+    if opts.comm_type == tex.CommOverlapType.AG and ub_obj2 is not None:
         kernel2_t = torch.nn.init.normal_(
             torch.empty(local_kernel2_t_shape, dtype=torch.bfloat16, device="cuda"),
             mean=0.0,
@@ -429,22 +461,22 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
             # AG Kernel: (K/P, N) -> gather -> (K, N) -> T -> (N, K)
             ker_g = torch.transpose(
                 te.distributed.gather_along_first_dim(kernel_t, tp_group)[0], 0, 1
-            ).to(dtype=torch.float32)
+            )
             # AG Input: (M/P, N) -> gather -> (M, N)
-            inp_g = te.distributed.gather_along_first_dim(inp, tp_group)[0].to(dtype=torch.float32)
+            inp_g = te.distributed.gather_along_first_dim(inp, tp_group)[0]
             if ub_obj2 is not None:
                 ker2_g = te.distributed.gather_along_first_dim(
                     torch.transpose(kernel2_t, 0, 1), tp_group
-                )[0].to(dtype=torch.float32)
+                )[0]
         else:
             # RS Kernel: (N, K/P) -> T -> (K/P, N) -> gather -> (K, N)
             ker_g = te.distributed.gather_along_first_dim(
                 torch.transpose(kernel_t, 0, 1), tp_group
-            )[0].to(dtype=torch.float32)
+            )[0]
             # RS Input: (M, K/P) -> T -> (K/P, M) -> gather -> (K, M) -> T -> (M, K)
             inp_g = torch.transpose(
                 te.distributed.gather_along_first_dim(torch.transpose(inp, 0, 1), tp_group)[0], 0, 1
-            ).to(dtype=torch.float32)
+            )
 
     if opts.bulk_overlap:
         if opts.comm_type == tex.CommOverlapType.AG:
@@ -456,10 +488,20 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
             # Sum the list together for final global result
             ref_g = torch.stack(bulk_inp_list).sum(dim=0)
     else:
-        ref_g = torch.matmul(inp_g, ker_g)
+        ref_g, *_ = tex.general_gemm(
+            torch.transpose(ker_g, 0, 1),
+            inp_g,
+            out_dtype=torch.bfloat16,
+            use_split_accumulator=te.module.base._2X_ACC_FPROP,
+        )
         if ub_obj2 is not None:
             inp2_g = torch.nn.functional.gelu(ref_g)  # pylint: disable=not-callable
-            ref2_g = torch.matmul(inp2_g, ker2_g)
+            ref2_g, *_ = tex.general_gemm(
+                torch.transpose(ker2_g, 0, 1),
+                inp2_g,
+                out_dtype=torch.bfloat16,
+                use_split_accumulator=te.module.base._2X_ACC_FPROP,
+            )
 
     # Initialize quantizers
     with_quantized_compute = opts.quantization != "none"
@@ -580,14 +622,16 @@ def dist_print(msg, src=None, info=False, error=False, section=False, group=None
                 tp_group,
             )
             gemm_inp = inp
-        else:
+        elif not opts.use_cublasmp:
             ag_out, _ = fill_userbuffers_buffer_for_all_gather(
                 ub_obj,
                 inp_fp8 if with_quantized_compute else inp,
                 inp_quantizer,
                 tp_group,
             )
             gemm_inp = ag_out
+        else:
+            gemm_inp = inp_fp8 if with_quantized_compute else inp
         if ub_obj2 is not None:
             rs_out2 = torch.empty(
                 (outer_size // tp_size, hidden_size), dtype=torch.bfloat16, device="cuda"

diff --git a/tests/pytorch/distributed/run_layer_with_overlap.py b/tests/pytorch/distributed/run_layer_with_overlap.py
@@ -258,6 +258,12 @@ def _parse_args(argv=None, namespace=None):
         default=0,
         help="Number of layers at the end to run in bf16.",
     )
+    parser.add_argument(
+        "--use-cublasmp",
+        action="store_true",
+        default=False,
+        help="Use cuBLASMp backend.",
+    )
     args = parser.parse_args(argv, namespace)
 
     if args.use_cuda_graphs and args.layer_type in [te.MultiheadAttention, te.TransformerLayer]:
@@ -436,6 +442,7 @@ def dist_print(msg, src=None, end="\n", debug=False, error=False):
         dtype=torch.bfloat16,
         bootstrap_backend=opts.bootstrap_backend,
         ub_cfgs=ub_cfgs if opts.ub_cfg is None else opts.ub_cfg,
+        with_cublasmp=opts.use_cublasmp,
     )
 
     with te.quantized_model_init(enabled=opts.fp8_init):