still working on cutlass

60781119 · yanyan · 23d9faaf · 60781119 · 60781119 · 60781119
Commit 60781119 authored Jun 09, 2020 by yanyan
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,11 +44,13 @@ set(ALL_LIBS ${TORCH_LIBRARIES})

 set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
 set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
+set(CUTLASS_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include)

 if (SPCONV_BuildCUDA)
    set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
    set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
    add_subdirectory(src/cuhash)
+    add_subdirectory(src/spgemm)
 endif()
 add_subdirectory(src/spconv)
 add_subdirectory(src/utils)

--- a/codeai-devops.yaml
+++ b/codeai-devops.yaml
@@ -65,7 +65,7 @@ observers:
        std: c++14
        options: [
            -Wno-deprecated-declarations,
-            "-gencode=arch=compute_52,code=sm_61",
+            "-gencode=arch=compute_52,code=sm_52",
            "-gencode=arch=compute_61,code=sm_61",
            "-gencode=arch=compute_60,code=sm_60",
            "-gencode=arch=compute_70,code=sm_70",
@@ -76,7 +76,7 @@ observers:
        type: CPPDevObserver
        main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
        pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
-        compiler: clang++
+        compiler: nvcc
        executable: build/codeai_dev_torch
        run_cmd: [$(executable)]
        fail_cmds: # run cmd when pervious run fail with retcode
@@ -97,13 +97,20 @@ observers:
            /home/yy/anaconda3/lib,
        ]
        libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread, 
-                    "-Wl,--no-as-needed,-lc10", 
-                    "-Wl,--no-as-needed,-ltorch", 
-                    "-Wl,--no-as-needed,-ltorch_cpu", 
-                    "-Wl,--no-as-needed,-lc10_cuda", 
-                    "-Wl,--no-as-needed,-ltorch_cuda"]
-        std: c++2a
-        options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
+                    "-Xcompiler=\"-Wl,--no-as-needed,-lc10\"", 
+                    "-Xcompiler=\"-Wl,--no-as-needed,-ltorch\"", 
+                    "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cpu\"", 
+                    "-Xcompiler=\"-Wl,--no-as-needed,-lc10_cuda\"", 
+                    "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cuda\""]
+        std: c++14
+        # options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
+
+        options: [
+            -Wno-deprecated-declarations,
+            --expt-relaxed-constexpr,
+            "-gencode=arch=compute_61,code=sm_61",
+            -D_GLIBCXX_USE_CXX11_ABI=0,
+        ]



--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -20,8 +20,8 @@
 #include <tensorview/tensorview.h>

 namespace spconv {
-template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
-          typename Index1D = int>
+template <typename Index, unsigned NDim, bool UseDeconv,
+          int KernelMaxVolume = 256, typename Index1D = int>
 __global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
@@ -47,54 +47,19 @@ __global__ void prepareIndicePairsKernel(
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
-      indicePairs(0, offset, oldNum) = ix;
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
-                  pointPtr, outSpatialShape.data(), 0) +
-              spatialVolume * indicesIn(ix, 0);
-      indicePairs(1, offset, oldNum) = index;
-      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
+    if (UseDeconv) {
+      // nvcc will optimize this fake "if constexpr"
+      // after cuda 11 released, we will start to use real if constexpr.
+      numValidPoints = getValidOutPosTranspose<Index, NDim>(
+          indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
+          stride.data(), padding.data(), dilation.data(),
+          outSpatialShape.data(), validPoints);
+    } else {
+      numValidPoints = getValidOutPos<Index, NDim>(
+          indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
+          stride.data(), padding.data(), dilation.data(),
+          outSpatialShape.data(), validPoints);
    }
-  }
-}
-
-template <typename Index, unsigned NDim, int KernelMaxVolume = 256>
-__global__ void prepareDeConvIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
-  auto numActIn = indicesIn.dim(0);
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  Index validPoints[KernelMaxVolume * (NDim + 1)];
-  Index *pointPtr = nullptr;
-  auto indicePairsDim2 = indicePairs.dim(2);
-  Index index;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPosTranspose<Index, NDim>(
-        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
-        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
@@ -191,6 +156,29 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
  }
 }

+template <typename Index, typename IndexGrid, unsigned NDim>
+__global__ void
+assignIndicePairsLimitedKernel(tv::TensorView<IndexGrid> gridsOut, int numActIn,
+                               tv::TensorView<Index> indicePairs,
+                               tv::TensorView<Index> indiceNum) {
+
+  Index index, val;
+  int kernelVolume = indicePairs.dim(0);
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    for (int i = 0; i < kernelVolume; ++i) {
+      index = indicePairs(i, 1, ix);
+      if (index != -1) {
+        val = gridsOut[index];
+        if (val != -1) {
+          auto oldNum = atomicAdd(indiceNum.data() + i, Index(1));
+          indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix);
+          indicePairs(i, 1, oldNum) = val;
+        }
+      }
+    }
+  }
+}
+
 template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
@@ -542,7 +530,6 @@ __global__ void getSubMIndicePairsHashUnrollKernel2(
  }
 }

-
 template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
@@ -567,8 +554,8 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
  Index index;
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    indsPtr = indices + ix * (NDim + 1);
-    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
-                                                        outSpatialShape.data(), 0);
+    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
+        indsPtr + 1, outSpatialShape.data(), 0);
    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
  }
 }

--- a/include/spconv/reordering.h
+++ b/include/spconv/reordering.h
@@ -14,9 +14,9 @@

 #ifndef SPARSE_REORDERING_FUNCTOR_H_
 #define SPARSE_REORDERING_FUNCTOR_H_
+#include <cuda_runtime_api.h>
 #include <tensorview/tensorview.h>
 #include <torch/script.h>
-
 namespace spconv {

 void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
@@ -35,6 +35,13 @@ void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
 void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
                            torch::Tensor indices, int size);

+void sparse_gather_cuda(cudaStream_t s, torch::Tensor buffer,
+                        torch::Tensor features, torch::Tensor indices,
+                        int size);
+void sparse_scatter_add_cuda(cudaStream_t s, torch::Tensor buffer,
+                             torch::Tensor outFeatures, torch::Tensor indices,
+                             int size);
+
 } // namespace spconv

 #endif
\ No newline at end of file
--- a/include/spgemm/gemm.h
+++ b/include/spgemm/gemm.h
+#include <cutlass/gemm/device/gemm.h>
+#include <type_traits>
+namespace spconv {
+
+template <typename T>
+using determine_acc_t =
+    std::conditional_t<std::is_same<T, cutlass::half_t>::value, float, T>;
+
+template <typename T, bool TransA, bool TransB, bool TransC>
+cudaError_t cutlassGemm(cudaStream_t s, int M, int N, int K, T alpha,
+                        T const *A, int lda, T const *B, int ldb, T beta, T *C,
+                        int ldc) {
+
+  // Define type definition for single-precision CUTLASS GEMM with column-major
+  // input matrices and 128x128x8 threadblock tile size (chosen by default).
+  //
+  // To keep the interface manageable, several helpers are defined for plausible
+  // compositions including the following example for single-precision GEMM.
+  // Typical values are used as default template arguments. See
+  // `cutlass/gemm/device/default_gemm_configuration.h` for more details.
+  //
+  // To view the full gemm device API interface, see
+  // `cutlass/gemm/device/gemm.h`
+  using TAcc = determine_acc_t<T>;
+  using ColumnMajor = cutlass::layout::ColumnMajor;
+  using RowMajor = cutlass::layout::RowMajor;
+  using LayoutA = std::conditional_t<TransA, ColumnMajor, RowMajor>;
+  using LayoutB = std::conditional_t<TransB, ColumnMajor, RowMajor>;
+  using LayoutC = std::conditional_t<TransC, ColumnMajor, RowMajor>;
+
+  using CutlassGemm = cutlass::gemm::device::Gemm<T, // Data-type of A matrix
+                                                  LayoutA, // Layout of A matrix
+                                                  T, // Data-type of B matrix
+                                                  LayoutB, // Layout of B matrix
+                                                  T, // Data-type of C matrix
+                                                  LayoutC,
+                                                  TAcc>; // Layout of C matrix
+
+  // Define a CUTLASS GEMM type
+  CutlassGemm gemm_operator;
+
+  // Construct the CUTLASS GEMM arguments object.
+  //
+  // One of CUTLASS's design patterns is to define gemm argument objects that
+  // are constructible in host code and passed to kernels by value. These may
+  // include pointers, strides, scalars, and other arguments needed by Gemm and
+  // its components.
+  //
+  // The benefits of this pattern are (1.) a structured, composable strategy for
+  // passing host-constructible arguments to kernels and (2.) minimized
+  // initialization overhead on kernel entry.
+  //
+  typename CutlassGemm::Arguments args(
+      {M, N, K}, // Gemm Problem dimensions
+      {A, lda},  // Tensor-ref for source matrix A
+      {B, ldb},  // Tensor-ref for source matrix B
+      {C, ldc},  // Tensor-ref for source matrix C
+      {C, ldc},  // Tensor-ref for destination matrix D (may be different memory
+                 // than source C matrix)
+      {alpha, beta}); // Scalars used in the Epilogue
+
+  //
+  // Launch the CUTLASS GEMM kernel.
+  //
+
+  cutlass::Status status = gemm_operator(args, nullptr, s);
+
+  //
+  // Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
+  //
+
+  if (status != cutlass::Status::kSuccess) {
+    return cudaErrorUnknown;
+  }
+
+  // Return success, if no errors were encountered.
+  return cudaSuccess;
+}
+
+} // namespace spconv
--- a/include/spgemm/gemm_th.h
+++ b/include/spgemm/gemm_th.h
+
+#include <cuda_runtime_api.h>
+#include <tensorview/torch_utils.h>
+#include <torch/script.h>
+
+namespace spconv {
+void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b);
+void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
+                    torch::Tensor b);
+
+} // namespace spconv
\ No newline at end of file
--- a/include/sphash/hashmap.h
+++ b/include/sphash/hashmap.h
 #include <tensorview/tensor.h>

-
 namespace spconv {

-enum HashTypes {
-    kDenseMap = 0,
-    kCUDPPHash = 1
-};
-
-template <int Impl>
-struct HashMap;
+enum HashTypes { kDenseMap = 0, kCUDPPHash = 1 };

-template<>
-struct HashMap<kDenseMap>{
+template <int Impl> struct HashMap;

-};
+template <> struct HashMap<kDenseMap> {};

-}
\ No newline at end of file
+} // namespace spconv
\ No newline at end of file
--- a/include/tensorview/mp_helper.h
+++ b/include/tensorview/mp_helper.h
@@ -12,7 +12,6 @@ using mp_list_c = mp_list<std::integral_constant<T, I>...>;
 template <int... I>
 using mp_list_int_c = mp_list<std::integral_constant<int, I>...>;

-
 namespace detail {

 template <class... Ts, class F>

--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
@@ -524,7 +524,6 @@ struct DispatchIntNoexcept<T<Args...>> {
  }
 };

-
 constexpr size_t kTensorMaxDim = 10;
 using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;


--- a/spconv/__init__.py
+++ b/spconv/__init__.py
@@ -69,7 +69,7 @@ class SparseConvTensor(object):
        self.batch_size = batch_size
        self.indice_dict = {}
        if grid is None:
-            grid = torch.Tensor() # empty tensor
+            grid = torch.Tensor()  # empty tensor
        self.grid = grid

    @classmethod

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -89,11 +89,11 @@ def get_indice_pairs(indices,
        out_shape = spatial_shape
    if grid is None:
        grid = torch.Tensor()
-    res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size, out_shape,
-                                            spatial_shape, ksize, stride,
-                                            padding, dilation, out_padding,
-                                            int(subm), int(transpose),
-                                            int(use_hash))
+    res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size,
+                                            out_shape, spatial_shape, ksize,
+                                            stride, padding, dilation,
+                                            out_padding, int(subm),
+                                            int(transpose), int(use_hash))
    return res



--- a/spconv/pool.py
+++ b/spconv/pool.py
@@ -67,8 +67,16 @@ class SparseMaxPool(SparseModule):
        else:
            out_spatial_shape = spatial_shape
        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
-            indices, batch_size, spatial_shape, self.kernel_size, self.stride,
-            self.padding, self.dilation, 0, self.subm, grid=input.grid)
+            indices,
+            batch_size,
+            spatial_shape,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            0,
+            self.subm,
+            grid=input.grid)

        out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
                                          indice_pairs_num.to(device),

--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
@@ -15,7 +15,7 @@ set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
 set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
 set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 if (SPCONV_BuildCUDA)
-    target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash)
+    target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash spgemm)
 else()
    target_link_libraries(spconv PRIVATE ${ALL_LIBS})
 endif()

--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -61,17 +61,9 @@ int create_conv_indice_pair_p1_cuda(
      tv::DispatchInt<max_kernel_vol_t>()(
          kernelVolume, std::less_equal<int>(), [&](auto I2) {
            constexpr int MaxKernelVolume = decltype(I2)::value;
-            if (transpose) {
-              prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum),
-                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
-                                  pa, di, ou);
-              TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
-            } else {
-              prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
+            tv::dispatch_int<0, 1>(int(transpose), [&](auto I) {
+              constexpr bool UseDeconv = decltype(I)::value;
+              prepareIndicePairsKernel<Index, NDim, UseDeconv, MaxKernelVolume>
                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
                                  tv::torch2tv<Index>(indicePairs),
@@ -79,8 +71,7 @@ int create_conv_indice_pair_p1_cuda(
                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
                                  pa, di, ou);
              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
-            }
-        // tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
+            });
 #ifdef TV_LOG_KERNEL_INFO
            cudaFuncAttributes attr;
            checkCudaErrors(cudaFuncGetAttributes(
@@ -230,15 +221,17 @@ int create_submconv_indice_pair_cuda(
      namespace mp11 = boost::mp11;

      using kernel2_candidates_t =
-          mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
-                           tv::mp_list_int_c<1, 3, 5>>;
+          mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
+                           tv::mp_list_c<int, 1, 3, 5>>;
      using kernel3_candidates_t =
-          mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
-                           tv::mp_list_int_c<1, 3, 5>,
-                           tv::mp_list_int_c<1, 3, 5>>;
-      using kernel3_candidates_final_t = mp11::mp_push_back<kernel3_candidates_t>;
+          mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
+                           tv::mp_list_c<int, 1, 3, 5>,
+                           tv::mp_list_c<int, 1, 3, 5>>;
+      using kernel3_candidates_final_t =
+          mp11::mp_push_back<kernel3_candidates_t>;
      auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>();
-      auto dispatcher3 = tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
+      auto dispatcher3 =
+          tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();

      if (useHash) {
        auto table = cuhash::HashTable();

--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -37,12 +37,12 @@ using half_vec_sadd_t =
    std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
 using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>;

-void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
-                        torch::Tensor indices, int size) {
+void sparse_gather_cuda(cudaStream_t stream, torch::Tensor buffer,
+                        torch::Tensor features, torch::Tensor indices,
+                        int size) {
  if (size <= 0)
    return;
  int numPlanes = features.size(1);
-  auto stream = at::cuda::getCurrentCUDAStream();
  auto dtype = features.scalar_type();
  auto inds_dtype = indices.scalar_type();
  // auto timer = spconv::CudaContextTimer<>();
@@ -126,12 +126,18 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
  });
 }

-void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
-                             torch::Tensor indices, int size) {
+void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
+                        torch::Tensor indices, int size) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  return sparse_gather_cuda(stream, buffer, features, indices, size);
+}
+
+void sparse_scatter_add_cuda(cudaStream_t stream, torch::Tensor buffer,
+                             torch::Tensor outFeatures, torch::Tensor indices,
+                             int size) {
  if (size <= 0)
    return;
  int numPlanes = outFeatures.size(1);
-  auto stream = at::cuda::getCurrentCUDAStream();
  auto dtype = outFeatures.scalar_type();
  auto inds_dtype = indices.scalar_type();

@@ -216,6 +222,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
  });
 }

+void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
+                             torch::Tensor indices, int size) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  return sparse_scatter_add_cuda(stream, buffer, outFeatures, indices, size);
+}
+
 void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                              torch::Tensor indices, int size) {
  // indices: [volume, inds_stride]

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
 #include <spconv/spconv_ops.h>
+#include <spgemm/gemm_th.h>
+
 namespace spconv {

 std::vector<torch::Tensor>
@@ -48,9 +50,9 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    gridSize = batchSize;
  }
  bool resetGrid = gridOut.numel() != 0;
-  if (!resetGrid){
-    gridOut = torch::full(
-          {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
+  if (!resetGrid) {
+    gridOut = torch::full({gridSize}, -1,
+                          torch::dtype(torch::kInt32).device(indices.device()));
  }
  gridOut = gridOut.view({batchSize, -1});
  int64_t numActOut = -1;
@@ -104,7 +106,7 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
          padding, dilation, outSpatialShape, transpose, resetGrid, useHash);
    }
 #ifdef TV_CUDA
-    else if (indices.device().type() == torch::kCUDA) {      
+    else if (indices.device().type() == torch::kCUDA) {
      numActOut = create_conv_indice_pair_p1_cuda(
          indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
          padding, dilation, outSpatialShape, transpose);
@@ -191,7 +193,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
-  // tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(),
+  // tv::ssprint("first subm gemm time", timer.report() / 1000.0,
+  // std::vector<int>(indicePairNumCpu.data_ptr<int>(),
  //                      indicePairNumCpu.data_ptr<int>() + kernelVolume));

  for (int i = 0; i < kernelVolume; ++i) {

--- a/src/spgemm/CMakeLists.txt
+++ b/src/spgemm/CMakeLists.txt
+set(ALL_FILES ${ALL_FILES} gemm.cu)
+
+add_library(spgemm SHARED ${ALL_FILES})
+
+target_include_directories(spgemm PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE} ${CUTLASS_INCLUDE} )
+set_property(TARGET spgemm PROPERTY CUDA_STANDARD 14)
+set_property(TARGET spgemm PROPERTY CXX_STANDARD 14)
+set_target_properties(spgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(spgemm PRIVATE ${ALL_LIBS})
+install (TARGETS spgemm DESTINATION lib)
--- a/src/spgemm/gemm.cu
+++ b/src/spgemm/gemm.cu
+#include <spgemm/gemm.h>
+#include <spgemm/gemm_th.h>
+namespace spconv {
+template <typename T>
+using determine_half_t =
+    std::conditional_t<std::is_same<T, at::Half>::value, cutlass::half_t, T>;
+
+void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
+                    torch::Tensor b) {
+  TV_ASSERT_RT_ERR(c.dtype() == a.dtype() && c.dtype() == b.dtype(),
+                   "dtype must be same");
+  TV_ASSERT_RT_ERR(c.is_contiguous() && b.is_contiguous() && a.is_contiguous(),
+                   "error");
+  auto M = a.size(0);
+  auto K = a.size(1);
+  auto N = b.size(1);
+  TV_ASSERT_RT_ERR(b.size(0) == K && c.size(0) == M && c.size(1) == N, "error");
+  tv::dispatch_torch<float, at::Half>(c.scalar_type(), [&](auto I) {
+    using T = decltype(I);
+    using HalfT = determine_half_t<T>;
+    auto status = cutlassGemm<HalfT, false, false, false>(
+        stream, M, N, K, HalfT(1.0), reinterpret_cast<HalfT *>(a.data_ptr<T>()),
+        a.size(1), reinterpret_cast<HalfT *>(b.data_ptr<T>()), b.size(1),
+        HalfT(0.0), reinterpret_cast<HalfT *>(c.data_ptr<T>()), c.size(1));
+    TV_ASSERT_RT_ERR(status == cudaSuccess, "error");
+  });
+}
+
+void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  return cutlass_mm_out(stream, c, a, b);
+}
+
+} // namespace spconv
\ No newline at end of file
--- a/src/spgemm/torchdev_cutlass.cu
+++ b/src/spgemm/torchdev_cutlass.cu
+#define TV_CUDA
+#include <cutlass/gemm/device/gemm.h>
+#include <spgemm/gemm.h>
+#include <tensorview/cuda_utils.h>
+#include <tensorview/kernel_utils.h>
+#include <tensorview/tensor.h>
+#include <tensorview/torch_utils.h>
+#include <torch/script.h>
+#include <utility/timer.h>
+
+int main() {
+  auto M = 100000;
+  auto N = 128;
+  auto K = 128;
+  auto a =
+      torch::rand({M, K}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
+  auto b = torch::rand({K, N}, a.options());
+  auto c = torch::zeros({a.size(0), b.size(1)}, a.options());
+  auto c2 = torch::zeros({a.size(0), b.size(1)}, a.options());
+  torch::mm_out(c, a, b);
+  auto status = spconv::cutlassGemm<float, false, false, false>(
+      0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
+      b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
+  auto err = torch::norm(c2 - c);
+  tv::ssprint(status, "linalg norm", err);
+  tv::ssprint((c.view({-1}) == 0).sum());
+  auto timer = spconv::CudaContextTimer<>();
+  for (int i = 0; i < 10; ++i) {
+    torch::mm_out(c, a, b);
+    tv::ssprint("mm", timer.report() / 1000.0);
+    spconv::cutlassGemm<float, false, false, false>(
+        0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
+        b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
+    tv::ssprint("cutlass_mm", timer.report() / 1000.0);
+  }
+
+  return 0;
+}
\ No newline at end of file
--- a/test/benchmark.py
+++ b/test/benchmark.py
-import torch 
-import spconv 
-import numpy as np 
-from spconv.utils import VoxelGeneratorV2
-from pathlib import Path 
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
 from torch import nn
-import time 
+
+import spconv
+from spconv.utils import VoxelGeneratorV2
+

 def waymo_data(batch_size=1):
-    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1, 150000)
+    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
+                           150000)
    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    pc = data["pc"]
    data = gen.generate(pc)
@@ -17,9 +21,9 @@ def waymo_data(batch_size=1):
    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
    return voxels, coors, gen.grid_size

+
 class Net(nn.Module):
-    def __init__(self,
-                 shape):
+    def __init__(self, shape):
        super().__init__()
        self.net = spconv.SparseSequential(
            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0"),
@@ -57,7 +61,8 @@ class Net(nn.Module):
        )
        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
-        self.grid = torch.full([max_batch_size, *shape], -1, dtype=torch.int32).cuda()
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32).cuda()
        # self.grid = None
        self.shape = shape

@@ -87,5 +92,6 @@ def main():
    # print("spconv time", time.time() - t)
    print("spconv time", np.mean(times[10:]))

+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()