deepseek-ai · leavelet · Apr 26, 2026 · Apr 28, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/csrc/apis/attention.hpp b/csrc/apis/attention.hpp
@@ -6,6 +6,7 @@
 #include "../jit_kernels/impls/sm90_fp8_gemm_1d1d.hpp"
 #include "../jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp"
 #include "../jit_kernels/impls/sm100_fp8_fp4_gemm_1d1d.hpp"
+#include "../jit_kernels/impls/sm120_fp8_fp4_gemm_1d1d.hpp"
 #include "../jit_kernels/impls/smxx_fp8_fp4_mqa_logits.hpp"
 #include "../jit_kernels/impls/smxx_fp8_fp4_paged_mqa_logits.hpp"
 #include "../jit_kernels/impls/smxx_clean_logits.hpp"
@@ -68,6 +69,9 @@ static void fp8_gemm_nt_skip_head_mid(const std::pair<torch::Tensor, torch::Tens
         // NOTES: Only granularity 128 and FP8 are exposed in the API
         sm100_fp8_fp4_gemm_1d1d(a.first, sfa, b.first, sfb, std::nullopt, d, m, n, k,
                                 128, 128, major_a, major_b, compiled_dims, epilogue_type);
+    } else if (arch_major == 12 and sfa.scalar_type() == torch::kInt) {
+        sm120_fp8_fp4_gemm_1d1d(a.first, sfa, b.first, sfb, std::nullopt, d, m, n, k,
+                                128, 128, major_a, major_b, compiled_dims, epilogue_type);
     } else {
         DG_HOST_UNREACHABLE("Unsupported architecture or scaling factor types");
     }
@@ -162,7 +166,7 @@ static torch::Tensor fp8_fp4_mqa_logits(const std::tuple<torch::Tensor, std::opt
 
     // Allocate output
     constexpr int block_qh = 128;
-    constexpr int block_kv = 256;
+    const int block_kv = (device_runtime->get_arch_major() == 12) ? 128 : 256;
     const int block_q = block_qh / num_heads;
     DG_HOST_ASSERT(block_qh % num_heads == 0);
 
@@ -192,11 +196,14 @@ static torch::Tensor fp8_fp4_mqa_logits(const std::tuple<torch::Tensor, std::opt
     if (is_fp4 and arch_major == 10) {
         sm100_fp4_mqa_logits(q_fp, q_sf.value(), kv_fp, kv_sf, weights, cu_seq_len_k_start, cu_seq_len_k_end, logits, logits_dtype,
                              seq_len, seq_len_kv, max_seqlen_k, stride_logits, num_heads, head_dim, block_q, block_kv);
+    } else if (is_fp4 and arch_major == 12) {
+        sm120_fp4_mqa_logits(q_fp, q_sf.value(), kv_fp, kv_sf, weights, cu_seq_len_k_start, cu_seq_len_k_end, logits, logits_dtype,
+                             seq_len, seq_len_kv, max_seqlen_k, stride_logits, num_heads, head_dim, block_q, block_kv);
     } else if (not is_fp4 and weights_is_f16) {
         // FP16 weights -> FP16 MMA accumulator (Q*K score + per-head reduction in FP16); see note above
         sm100_fp8_mqa_logits_f16_weights(q_fp, kv_fp, kv_sf, weights, cu_seq_len_k_start, cu_seq_len_k_end, logits, logits_dtype,
                                          seq_len, seq_len_kv, max_seqlen_k, stride_logits, num_heads, head_dim, block_q, block_kv);
-    } else if (not is_fp4 and (arch_major == 9 or arch_major == 10)) {
+    } else if (not is_fp4 and (arch_major == 9 or arch_major == 10 or arch_major == 12)) {
         smxx_fp8_mqa_logits(q_fp, kv_fp, kv_sf, weights, cu_seq_len_k_start, cu_seq_len_k_end, logits, logits_dtype,
                             seq_len, seq_len_kv, max_seqlen_k, stride_logits, num_heads, head_dim, block_q, block_kv);
     } else {
@@ -228,15 +235,15 @@ static torch::Tensor get_paged_mqa_logits_metadata(const torch::Tensor& context_
     const auto arch_major = device_runtime->get_arch_major();
     if (is_varlen) {
         const auto& indices_tensor = indices.value();
-        DG_HOST_ASSERT(arch_major == 10 and next_n == 1 and (block_kv == 64 or block_kv == 32));
+        DG_HOST_ASSERT((arch_major == 10 or arch_major == 12) and next_n == 1 and (block_kv == 64 or block_kv == 32));
         DG_HOST_ASSERT(indices_tensor.dim() == 1 and indices_tensor.size(0) == batch_size);
         DG_HOST_ASSERT(indices_tensor.is_contiguous());
         DG_HOST_ASSERT(indices_tensor.scalar_type() == torch::kInt);
-        // Varlen runs on SM100 with next_n=1: no atomization (num_next_n_atoms=1).
+        // Varlen runs on SM100/SM120 with next_n=1: no atomization (num_next_n_atoms=1).
         smxx_paged_mqa_logits_metadata(context_lens, schedule_metadata, batch_size, next_n, block_kv,
                                        num_sms, is_context_lens_2d, /*num_next_n_atoms=*/1,
                                        /*is_varlen=*/true, indices_tensor.data_ptr<int>());
-    } else if (arch_major == 9 or arch_major == 10) {
+    } else if (arch_major == 9 or arch_major == 10 or arch_major == 12) {
         DG_HOST_ASSERT(block_kv == 32 or block_kv == 64);
         // SM90 schedules in units of `kComputeBlockKV = 64` regardless of physical
         // `block_kv`; pass the compute block size to the metadata kernel.
@@ -245,6 +252,7 @@ static torch::Tensor get_paged_mqa_logits_metadata(const torch::Tensor& context_
         //   kNextNAtom = (kIsVarlen or kNextN >= 2) ? 2 : 1
         //   kNumNextNAtoms = ceil_div(kNextN, kNextNAtom)
         // SM90 cluster multicast hard-codes kNumNextNAtoms = 1 (one q per cluster).
+        // SM100/SM120 atomize next_n in time: kNextNAtom = (next_n >= 2) ? 2 : 1.
         int num_next_n_atoms;
         if (arch_major == 9) {
             num_next_n_atoms = 1;
@@ -375,7 +383,7 @@ static torch::Tensor fp8_fp4_paged_mqa_logits(const std::tuple<torch::Tensor, st
     const auto arch_major = device_runtime->get_arch_major();
     const auto indices_tensor = indices.value_or(torch::Tensor());
     if (is_varlen) {
-        DG_HOST_ASSERT(arch_major == 10 and next_n == 1);
+        DG_HOST_ASSERT((arch_major == 10 or arch_major == 12) and next_n == 1);
         DG_HOST_ASSERT(indices_tensor.dim() == 1 and indices_tensor.size(0) == batch_size);
         DG_HOST_ASSERT(indices_tensor.is_contiguous());
         DG_HOST_ASSERT(indices_tensor.scalar_type() == torch::kInt);
@@ -399,7 +407,8 @@ static torch::Tensor fp8_fp4_paged_mqa_logits(const std::tuple<torch::Tensor, st
     DG_HOST_ASSERT(context_lens.scalar_type() == torch::kInt);
 
     // Allocate output
-    constexpr int split_kv = 256;
+    // SM120a: 2 groups × 64 KV rows = 128; SM90/100: 256
+    const int split_kv = (arch_major == 12) ? 128 : 256;
     const auto aligned_max_context_len = align(max_context_len, split_kv);
     auto logits = torch::empty({batch_size * next_n, aligned_max_context_len}, q_fp.options().dtype(logits_dtype));
     logits = logits.slice(-1, 0, max_context_len);
@@ -410,7 +419,11 @@ static torch::Tensor fp8_fp4_paged_mqa_logits(const std::tuple<torch::Tensor, st
         sm100_fp4_paged_mqa_logits(q_fp, q_sf.value(), kv_cache, kv_cache_sf, weights, context_lens, logits, block_table, indices_tensor, schedule_meta,
                                    logits_dtype, batch_size, next_n, num_heads, head_dim, num_kv_blocks, block_kv, is_context_lens_2d,
                                    is_varlen, aligned_max_context_len, block_table_stride, num_sms, split_kv);
-    } else if (not is_fp4 and (arch_major == 9 or arch_major == 10)) {
+    } else if (is_fp4 and arch_major == 12) {
+        sm120_fp4_paged_mqa_logits(q_fp, q_sf.value(), kv_cache, kv_cache_sf, weights, context_lens, logits, block_table, indices_tensor, schedule_meta,
+                                   logits_dtype, batch_size, next_n, num_heads, head_dim, num_kv_blocks, block_kv, is_context_lens_2d,
+                                   is_varlen, aligned_max_context_len, block_table_stride, num_sms, split_kv);
+    } else if (not is_fp4 and (arch_major == 9 or arch_major == 10 or arch_major == 12)) {
         smxx_fp8_paged_mqa_logits(q_fp, kv_cache, kv_cache_sf, weights, context_lens, logits, block_table, indices_tensor, schedule_meta,
                                   logits_dtype, batch_size, next_n, num_heads, head_dim, num_kv_blocks, block_kv, is_context_lens_2d,
                                   is_varlen, aligned_max_context_len, block_table_stride, num_sms, split_kv);

diff --git a/csrc/apis/einsum.hpp b/csrc/apis/einsum.hpp
@@ -12,8 +12,11 @@
 #if DG_FP8_COMPATIBLE and DG_TENSORMAP_COMPATIBLE
 #include "../jit_kernels/impls/sm90_bmk_bnk_mn.hpp"
 #include "../jit_kernels/impls/sm100_bmk_bnk_mn.hpp"
+#include "../jit_kernels/impls/sm120_bmk_bnk_mn.hpp"
 #include "../jit_kernels/impls/sm90_bf16_gemm.hpp"
 #include "../jit_kernels/impls/sm100_bf16_gemm.hpp"
+#include "../jit_kernels/impls/sm120_bf16_gemm.hpp"
+#include "../jit_kernels/impls/sm120_fp8_fp4_gemm_1d1d.hpp"
 #include "../jit_kernels/impls/smxx_cublaslt.hpp"
 #endif
 
@@ -51,6 +54,8 @@ static void bmk_bnk_mn(const torch::Tensor& a, const torch::Tensor& b, const tor
     const auto arch_major = device_runtime->get_arch_major();
     if (arch_major == 9) {
         sm90_bmn_bnk_mn_gemm(a, b, d, s, m, n, k);
+    } else if (arch_major == 12) {
+        sm120_bmn_bnk_mn_gemm(a, b, d, s, m, n, k);
     } else if (arch_major == 10) {
         sm100_bmn_bnk_mn_gemm(a, b, d, s, m, n, k);
     } else {
@@ -74,6 +79,8 @@ static void bhr_hdr_bhd(const torch::Tensor& A, const torch::Tensor& B, const to
         cublaslt_bhr_hdr_bhd(A, B, D, b, h, r, d);
     } else if (arch_major == 9) {
         sm90_bf16_bhr_hdr_bhd(A, B, D, b, h, r, d);
+    } else if (arch_major == 12) {
+        sm120_bf16_bhr_hdr_bhd(A, B, D, b, h, r, d);
     } else if (arch_major == 10) {
         sm100_bf16_bhr_hdr_bhd(A, B, D, b, h, r, d);
     } else {
@@ -97,6 +104,8 @@ static void bhd_hdr_bhr(const torch::Tensor& A, const torch::Tensor& B, const to
         cublaslt_bhd_hdr_bhr(A, B, D, b, h, r, d);
     } else if (arch_major == 9) {
         sm90_bf16_bhd_hdr_bhr(A, B, D, b, h, r, d);
+    } else if (arch_major == 12) {
+        sm120_bf16_bhd_hdr_bhr(A, B, D, b, h, r, d);
     } else if (arch_major == 10) {
         sm100_bf16_bhd_hdr_bhr(A, B, D, b, h, r, d);
     } else {
@@ -161,13 +170,59 @@ static void fp8_bmm(const torch::Tensor& a, const torch::Tensor& sfa,
     if (batch_size == 0 or gemm::early_return(m, n, k, d, c))
         return;
 
-    // Transform scaling factors
+    // AB-swap small-M decode path. Mirrors TRT-LLM's runGemmSwapAB
+    // (cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh): SM120 1d1d has
+    // BLOCK_M ≥ 64, so M_orig ≤ 32 wastes lanes. Swapping A↔B moves the
+    // small dim to N where BLOCK_N can shrink to {16, 32}. The swap runs
+    // BEFORE the SF layout transform so transform_sf_pair_into_required_layout
+    // sees operands in their post-swap roles. The kernel writes back into the
+    // caller's (B, M_orig, N_orig) buffer directly via runtime stride_cd_m/n
+    // (no temp buffer); see sm120_fp8_fp4_bmm for the stride remap.
+    const auto arch_major = device_runtime->get_arch_major();
+    constexpr int kSwapAbMMax = 32;
+    const bool swap_ab_eligible =
+        arch_major == 12 and m >= 1 and m <= kSwapAbMMax
+        and major_a == cute::UMMA::Major::K and major_b == cute::UMMA::Major::K
+        and d.stride(-1) == 1    // d's innermost dim must be contiguous
+        and not c.has_value();   // swap's strided/transposed output is incompatible with
+                                 // the batched accumulation epilogue (both the REDUCE_ADD
+                                 // TMA-store path and the direct-store path mishandle the
+                                 // batch offset + swapped strides). Mirrors the dense GEMM
+                                 // swap exclusion in gemm.hpp.
+
+    if (swap_ab_eligible) {
+        // Swap the recipe's gran_mn entries too: (gran_mn_a, gran_mn_b, gran_k)
+        // describes the original A/B roles, so after operand swap the per-tensor
+        // granularities must follow. Without this, asymmetric recipes
+        // like (1, 128, 128) trip the SF layout shape check.
+        const auto eff_recipe = recipe.has_value()
+            ? recipe.value()
+            : get_default_recipe(sfa.scalar_type(), sfb.scalar_type());
+        const auto& [ga, gb, gk] = eff_recipe;
+        std::optional<std::tuple<int, int, int>> swap_recipe = std::nullopt;
+        std::optional<std::tuple<int, int>> swap_recipe_a = std::make_tuple(gb, gk);
+        std::optional<std::tuple<int, int>> swap_recipe_b = std::make_tuple(ga, gk);
+        const auto [transformed_sfa_swap, transformed_sfb_swap, gran_k_a_swap, gran_k_b_swap]
+            = layout::transform_sf_pair_into_required_layout(
+                sfb, sfa, /*m=*/n, /*n=*/m, k, swap_recipe,
+                swap_recipe_a, swap_recipe_b, batch_size, batch_size, false);
+        sm120_fp8_fp4_bmm(
+            b, transformed_sfa_swap, a, transformed_sfb_swap, c, d,
+            batch_size, /*m=*/n, /*n=*/m, k,
+            gran_k_a_swap, gran_k_b_swap,
+            major_b, major_a, compiled_dims,
+            /*swap_ab=*/true);
+        return;
+    }
+
+    // Transform scaling factors (non-swap path)
     const auto [transformed_sfa, transformed_sfb, gran_k_a, gran_k_b] = layout::transform_sf_pair_into_required_layout(
         sfa, sfb, m, n, k, recipe, std::nullopt, std::nullopt, batch_size, batch_size, false);
 
     // Dispatch implementation
-    const auto arch_major = device_runtime->get_arch_major();
-    if (arch_major == 10) {
+    if (arch_major == 12) {
+        sm120_fp8_fp4_bmm(a, transformed_sfa, b, transformed_sfb, c, d, batch_size, m, n, k, gran_k_a, gran_k_b, major_a, major_b, compiled_dims);
+    } else if (arch_major == 10) {
         sm100_fp8_bmm(a, transformed_sfa, b, transformed_sfb, c, d, batch_size, m, n, k, gran_k_a, gran_k_b, major_a, major_b, compiled_dims);
     } else {
         const auto major_sfb = get_major_type_ab(sfb);
@@ -192,21 +247,30 @@ static void fp8_einsum(const std::string& expr,
         const auto perm_d = d.permute({1, 0, 2});
         const auto perm_c = c.has_value() ? std::make_optional(c.value().permute({1, 0, 2})) : std::nullopt;
         fp8_bmm(perm_a, perm_sfa, b.first, b.second, perm_d, perm_c, recipe, "nk");
-    } else if (expr == "bhd,hdr->bhr" and arch_major == 10) {
+    } else if (expr == "bhd,hdr->bhr") {
         // (batch_size, m, n, k): (h, b, r, d)
         const auto perm_a = a.first.permute({1, 0, 2});
         const auto perm_sfa = a.second.permute({1, 0, 2});
-        const auto perm_b = b.first.permute({0, 2, 1});
-        const auto perm_sfb = b.second.permute({0, 2, 1});
+        auto perm_b = b.first.permute({0, 2, 1});
+        auto perm_sfb = b.second.permute({0, 2, 1});
+        // SM120: B is MN-major after the permute; make it contiguous (K-major) because the scalar MN-major path is ~3x slower.
+        if (arch_major == 12) {
+            perm_b = perm_b.contiguous();
+        }
         const auto perm_d = d.permute({1, 0, 2});
         const auto perm_c = c.has_value() ? std::make_optional(c.value().permute({1, 0, 2})) : std::nullopt;
         fp8_bmm(perm_a, perm_sfa, perm_b, perm_sfb, perm_d, perm_c, recipe, "nk");
-    } else if (expr == "bhd,bhr->hdr" and arch_major == 10) {
+    } else if (expr == "bhd,bhr->hdr") {
         // (batch_size, m, n, k): (h, d, r, b)
-        const auto perm_a = a.first.permute({1, 2, 0});
-        const auto perm_sfa = a.second.permute({1, 2, 0});
-        const auto perm_b = b.first.permute({1, 2, 0});
-        const auto perm_sfb = b.second.permute({1, 2, 0});
+        auto perm_a = a.first.permute({1, 2, 0});
+        auto perm_sfa = a.second.permute({1, 2, 0});
+        auto perm_b = b.first.permute({1, 2, 0});
+        auto perm_sfb = b.second.permute({1, 2, 0});
+        // SM120: A and B are MN-major after the permute; force both to K-major (MN-major A is unsupported, and the scalar MN-major path is ~3x slower).
+        if (arch_major == 12) {
+            perm_a = perm_a.contiguous();
+            perm_b = perm_b.contiguous();
+        }
         fp8_bmm(perm_a, perm_sfa, perm_b, perm_sfb, d, c, recipe, "mn");
     } else {
         DG_HOST_UNREACHABLE(fmt::format("Unsupported einsum expression: {}", expr));