[3/n] Migrate cutlass/scaled_mm_entry.cu torch stable ABI (#37221)

Signed-off-by: Mikayla Gawarecki <mikaylagawarecki@gmail.com>

[3/n] Migrate cutlass/scaled_mm_entry.cu torch stable ABI (#37221)
Signed-off-by: Mikayla Gawarecki <mikaylagawarecki@gmail.com>
ab1a6a43 · mikaylagawarecki · GitHub · b5e60825 · ab1a6a43 · ab1a6a43
Unverified commit ab1a6a43, authored Mar 30, 2026 by mikaylagawarecki; committed by GitHub on Mar 30, 2026.
4 changed files
--- a/csrc/libtorch_stable/torch_utils.h
+++ b/csrc/libtorch_stable/torch_utils.h
 #pragma once

 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+#include <torch/csrc/stable/accelerator.h>
+#include <torch/csrc/stable/tensor.h>
 #include <torch/headeronly/util/shim_utils.h>

 #include <cuda_runtime.h>

+// Stable ABI stand-in for TORCH_CHECK_NOT_IMPLEMENTED: forwards to
+// STD_TORCH_CHECK, prefixing the message with "NotImplementedError: ".
+#define STD_TORCH_CHECK_NOT_IMPLEMENTED(cond, ...) \
+  STD_TORCH_CHECK(cond, "NotImplementedError: ", __VA_ARGS__)
+
 // Utility to get the current CUDA stream for a given device using stable APIs.
 // Returns a cudaStream_t for use in kernel launches.
 inline cudaStream_t get_current_cuda_stream(int32_t device_index = -1) {

--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -228,63 +228,18 @@ int64_t ggml_moe_get_block_size(int64_t type);
 #ifndef USE_ROCM

 bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability);
-bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
-bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability);
-bool cutlass_group_gemm_supported(int64_t cuda_device_capability);

 void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
                           torch::Tensor const& B, torch::Tensor const& A_sf,
                           torch::Tensor const& B_sf,
                           torch::Tensor const& alpha);

-void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
-                       torch::Tensor const& b, torch::Tensor const& a_scales,
-                       torch::Tensor const& b_scales,
-                       std::optional<torch::Tensor> const& bias);
-
-void cutlass_moe_mm(
-    torch::Tensor& out_tensors, torch::Tensor const& a_tensors,
-    torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
-    torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
-    torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch);
-
 void cutlass_fp4_group_mm(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets);

-void get_cutlass_moe_mm_data(
-    const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
-    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
-    torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets,
-    const bool is_gated);
-
-void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
-    const torch::Tensor& expert_first_token_offset,
-    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
-    const int64_t n, const int64_t k, const bool swap_ab);
-
-void get_cutlass_batched_moe_mm_data(torch::Tensor& expert_offsets,
-                                     torch::Tensor& problem_sizes1,
-                                     torch::Tensor& problem_sizes2,
-                                     const torch::Tensor& expert_num_tokens,
-                                     const int64_t num_local_experts,
-                                     const int64_t padded_m, const int64_t n,
-                                     const int64_t k);
-
-void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
-                           torch::Tensor const& b,
-                           torch::Tensor const& a_scales,
-                           torch::Tensor const& b_scales,
-                           torch::Tensor const& azp_adj,
-                           std::optional<torch::Tensor> const& azp,
-                           std::optional<torch::Tensor> const& bias);
-
 std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
    torch::Tensor const& input, torch::Tensor const& input_scale,
    bool is_sf_swizzled_layout);

--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
-#pragma once
-
-#include <torch/all.h>
-
-namespace vllm {
-
-void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
-                                torch::Tensor const& b,
-                                torch::Tensor const& a_scales,
-                                torch::Tensor const& b_scales,
-                                std::optional<torch::Tensor> const& bias);
-
-void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
-                                 torch::Tensor const& b,
-                                 torch::Tensor const& a_scales,
-                                 torch::Tensor const& b_scales,
-                                 std::optional<torch::Tensor> const& bias);
-
-void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     torch::Tensor const& a_scales,
-                                     torch::Tensor const& b_scales,
-                                     torch::Tensor const& azp_adj,
-                                     std::optional<torch::Tensor> const& azp,
-                                     std::optional<torch::Tensor> const& bias);
-
-void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
-                                          torch::Tensor const& a,
-                                          torch::Tensor const& b,
-                                          torch::Tensor const& a_scales,
-                                          torch::Tensor const& b_scales);
-
-void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
-                                 torch::Tensor const& b,
-                                 torch::Tensor const& a_scales,
-                                 torch::Tensor const& b_scales,
-                                 std::optional<torch::Tensor> const& bias);
-
-void cutlass_scaled_mm_sm120_fp8(torch::Tensor& out, torch::Tensor const& a,
-                                 torch::Tensor const& b,
-                                 torch::Tensor const& a_scales,
-                                 torch::Tensor const& b_scales,
-                                 std::optional<torch::Tensor> const& bias);
-
-void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
-                                           torch::Tensor const& a,
-                                           torch::Tensor const& b,
-                                           torch::Tensor const& a_scales,
-                                           torch::Tensor const& b_scales);
-
-void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
-                                           torch::Tensor const& a,
-                                           torch::Tensor const& b,
-                                           torch::Tensor const& a_scales,
-                                           torch::Tensor const& b_scales);
-}  // namespace vllm
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -439,90 +439,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      " -> ()");
  // conditionally compiled so impl registration is in source file

-  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
-  // quantization, as well as bias
-  ops.def(
-      "cutlass_scaled_mm(Tensor! out, Tensor a,"
-      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor? bias) -> ()");
-  ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
-
-  // CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
-  // quantization.
-  ops.def(
-      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
-      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor azp_adj,"
-      "                  Tensor? azp, Tensor? bias) -> ()");
-  ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp);
-
-  // Check if cutlass scaled_mm is supported for CUDA devices of the given
-  // capability
-  ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
-  ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
-
-  // Check if cutlass grouped gemm is supported for CUDA devices of the given
-  // capability
-  ops.def("cutlass_group_gemm_supported(int cuda_device_capability) -> bool");
-  ops.impl("cutlass_group_gemm_supported", &cutlass_group_gemm_supported);
-
-  // CUTLASS w8a8 grouped GEMM
-  ops.def(
-      "cutlass_moe_mm(Tensor! out_tensors, Tensor a_tensors, Tensor b_tensors, "
-      "               Tensor a_scales, Tensor b_scales, Tensor expert_offsets, "
-      "               Tensor problem_sizes, Tensor a_strides, "
-      "               Tensor b_strides, Tensor c_strides, bool per_act_token, "
-      "               bool per_out_ch) -> ()");
-  ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm);
-
-  // A function that computes data required to run fused MoE with w8a8 grouped
-  // GEMM. It takes topk_ids as an input, and computes expert_offsets
-  // (token start indices of each expert). In addition to this, it computes
-  // problem sizes for each expert's multiplication used by the two mms called
-  // from fused MoE operation, and arrays with permutations required to shuffle
-  // and de-shuffle the input/output of the fused operation.
-  ops.def(
-      "get_cutlass_moe_mm_data(Tensor topk_ids, Tensor! expert_offsets, "
-      "                        Tensor! problem_sizes1, Tensor! problem_sizes2, "
-      "                        Tensor! input_permutation, "
-      "                        Tensor! output_permutation, int num_experts, "
-      "                        int n, int k, Tensor? blockscale_offsets, "
-      "                        bool is_gated) -> ()");
-  ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
-
-  // compute per-expert problem sizes from expert_first_token_offset
-  // produced by vLLM's moe_permute kernel
-  ops.def(
-      "get_cutlass_moe_mm_problem_sizes_from_expert_offsets("
-      "    Tensor expert_first_token_offset, "
-      "    Tensor! problem_sizes1, "
-      "    Tensor! problem_sizes2, "
-      "    int n, int k, bool swap_ab) -> ()");
-  ops.impl("get_cutlass_moe_mm_problem_sizes_from_expert_offsets", torch::kCUDA,
-           &get_cutlass_moe_mm_problem_sizes_from_expert_offsets);
-
-  // A function that computes data required to run fused MoE with w8a8 grouped
-  // GEMM in batched expert format. It takes expert_num_tokens
-  // as an input, and computes expert_offsets (token start indices of each
-  // expert). In addition to this, it computes problem sizes for each expert's
-  // multiplication used by the two mms called from fused MoE operation.
-  ops.def(
-      "get_cutlass_batched_moe_mm_data(Tensor! expert_offsets, "
-      "                             Tensor! problem_sizes1, "
-      "                             Tensor! problem_sizes2, "
-      "                             Tensor expert_num_tokens, "
-      "                             int num_local_experts, int padded_m, "
-      "                             int n, int k) -> ()");
-  ops.impl("get_cutlass_batched_moe_mm_data", torch::kCUDA,
-           &get_cutlass_batched_moe_mm_data);
-
-  // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
-  ops.def(
-      "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
-      "bool");
-  ops.impl("cutlass_scaled_mm_supports_block_fp8",
-           &cutlass_scaled_mm_supports_block_fp8);
-
  // SM100 CUTLASS MLA decode
  ops.def(
      "sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"