try to increase half performance

02eb1d56 · yanyan · 75b46f90 · 02eb1d56 · 02eb1d56 · 02eb1d56
Commit 02eb1d56 authored May 29, 2020 by yanyan
10 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,6 +17,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_compile_definitions(TV_DEBUG)
 endif()
+# add_compile_definitions(TV_LOG_KERNEL_INFO)
 find_package(Torch REQUIRED)
 # set(CMAKE_VERBOSE_MAKEFILE ON)

--- a/PERFORMANCE_GUIDE.md
+++ b/PERFORMANCE_GUIDE.md
+## Performance Guide
+### 1. Regular sparse conv is very slow
+Regular sparse convolution will greatly increase the number of active points. for 3x3x3 3D convolution, we can get at most 27x active points, which means next convolution will perform 27x slower!
+This problem can be solved by using submanifold convolution (SubMConv3d). This kind of sparse convolution doesn't generate new active points.
+**NEVER** use SparseConv3d except downsample data, **NEVER** use SparseConv3dTranspose, use SparseInverseConv3d instead.
+### 2. Large Spatial Shape cost too much GPU memory
+Our implementation use dense map to generate indices in GPU for sparse convolution, which means if your spatial shape is ```[batchSize=4, 1600, 1600, 40]```, it will cost ~2GB GPU memory.
+To solve this problem, you can use CPU algorithm (hash map) for first layer that has large shape, then convert generated indices to GPU and use GPU algorithm for downsampled data.
+Another way is use cuda hash. Unfortunately this library isn't stable enough, it should only be used when the spatial shape is very large.
+### 3. Stacked submanifold convolution can share same indice data
+When you using stacked subm convolution, there is no need to generate indice data again, but this can't be done automatically. you need to specify a unique key ```indice_key="c0"``` and use it for all stacked subm convolution.
+### 4. Different convolution algorithm may lead to different performance
+There are three kind of algorithm: ```Native```, ```Batch```, ```BatchGemmGather```. 
+* ```Native```: should be used for all submanifold convolutions. should be used when there are too much active points.
+* ```Batch```: **cost more GPU memory** should be used when number of active points is small.
+* ```BatchGemmGather```: **cost more GPU memory** can be used for regular convolution.
\ No newline at end of file
--- a/include/spconv/reordering.cu.h
+++ b/include/spconv/reordering.cu.h
@@ -272,6 +272,8 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
                                         int numPlanes) {
  int ILPStrideX[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
+  constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2);
 #pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
@@ -288,16 +290,65 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
+      if (std::is_same<T, at::Half>::value) {
+#pragma unroll
+        for (int i = 0; i < vecloadHalf2Factor; i++) {
+          reinterpret_cast<__half2 *>(buf)[i] =
+              __hadd2(reinterpret_cast<__half2 *>(buf)[i],
+                      reinterpret_cast<__half2 *>(buf2)[i]);
+        }
+      } else {
 #pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          buf[i] += buf2[i];
        }
+      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
 }
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
+                                         const Index *indices, int size,
+                                         int numPlanes) {
+  int ILPStrideX[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  outFeatures += blockIdx.y * NumTLP;
+  buffer += blockIdx.y * NumTLP;
+  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ++ilp) {
+      outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] += buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
+    }
+  }
+}
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
+                                         const Index *indices, int size,
+                                         int numPlanes) {
+  int ILPStrideX[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  outFeatures += blockIdx.y * NumTLP;
+  buffer += blockIdx.y * NumTLP;
+  Index idx;
+  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ++ilp) {
+      idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
+      reinterpret_cast<__half2 *>(outFeatures)[idx] =
+          __hadd2(reinterpret_cast<__half2 *>(outFeatures)[idx],
+                  reinterpret_cast<__half2 *>(buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
+    }
+  }
+}
 template <typename T, typename Index, int NumTLP, int NumILP>
 __global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
                                             const Index *indices, int size,

--- a/include/tensorrt/inference.h
+++ b/include/tensorrt/inference.h
+#include "NvInfer.h"
+#include <memory>
+#include <tensorview/tensor.h>
+#include <unordered_map>
+#include <vector>
+namespace trt {
+template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
+  switch (trt_dtype) {
+  case nvinfer1::DataType::kFLOAT:
+    return tv::float32;
+  case nvinfer1::DataType::kHALF:
+    return tv::float16;
+  case nvinfer1::DataType::kINT32:
+    return tv::int32;
+  case nvinfer1::DataType::kINT8:
+    return tv::int8;
+  default:;
+  }
+  TV_THROW_INVALID_ARG("unknown trt dtype");
+}
+struct InferDeleter {
+  template <typename T> void operator()(T *obj) const {
+    if (obj) {
+      obj->destroy();
+    }
+  }
+};
+template <typename T> using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
+class Logger : public nvinfer1::ILogger {
+public:
+  Logger(Severity severity = Severity::kWARNING)
+      : reportableSeverity(severity) {}
+  void log(Severity severity, const char *msg) override {
+    // suppress messages with severity enum value greater than the reportable
+    if (severity > reportableSeverity)
+      return;
+    switch (severity) {
+    case Severity::kINTERNAL_ERROR:
+      std::cerr << "INTERNAL_ERROR: ";
+      break;
+    case Severity::kERROR:
+      std::cerr << "ERROR: ";
+      break;
+    case Severity::kWARNING:
+      std::cerr << "WARNING: ";
+      break;
+    case Severity::kINFO:
+      std::cerr << "INFO: ";
+      break;
+    default:
+      std::cerr << "UNKNOWN: ";
+      break;
+    }
+    std::cerr << msg << std::endl;
+  }
+  Severity reportableSeverity;
+};
+class InferenceContext {
+public:
+  explicit InferenceContext(const std::string& engine_bin, int device)
+      : logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
+    TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    if (device >= deviceCount) {
+      TV_THROW_INVALID_ARG("you provide device ", device, " but you only have ",
+                           deviceCount, " device.");
+    }
+    cudaSetDevice(device);
+    auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
+        nvinfer1::createInferRuntime(logger_));
+    engine_ =
+        trt_unique_ptr_t<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(
+            engine_bin.c_str(), engine_bin.size(), nullptr));
+    ctx_ = trt_unique_ptr_t<nvinfer1::IExecutionContext>(
+        engine_->createExecutionContext());
+    max_batch_size_ = engine_->getMaxBatchSize();
+    for (int i = 0; i < engine_->getNbBindings(); ++i) {
+      auto dims = engine_->getBindingDimensions(i);
+      std::vector<int> shape_vec(dims.d, dims.d + dims.nbDims);
+      shape_vec.insert(shape_vec.begin(), {max_batch_size_});
+      tv::TensorShape shape(shape_vec);
+      std::string name = engine_->getBindingName(i);
+      auto trt_dtype = engine_->getBindingDataType(i);
+      auto tv_dtype = trt_dtype_to_tv(trt_dtype);
+      bool isInput = engine_->bindingIsInput(i);
+      name_to_idx_[name] = i;
+      idx_to_name_[i] = name;
+      name_to_host_mem_.insert({name, tv::Tensor(shape, tv_dtype, -1)});
+      name_to_dev_mem_.insert({name, tv::Tensor(shape, tv_dtype, 0)});
+      if (isInput)
+        inp_idxes_.push_back(i);
+      else
+        out_idxes_.push_back(i);
+      bindings_.push_back(name_to_dev_mem_[name].raw_data());
+    }
+    checkCudaErrors(cudaStreamCreate(&stream_));
+  }
+  std::unordered_map<std::string, tv::Tensor>
+  operator()(std::vector<tv::Tensor> inputs) {
+    TV_ASSERT_INVALID_ARG(inputs.size() == inp_idxes_.size(), "must provide",
+                          inp_idxes_.size(), "inputs, but got", inputs.size());
+    // inference batch size
+    int bs = inputs[0].dim(0);
+    for (auto &inp : inputs) {
+      TV_ASSERT_INVALID_ARG(inp.dim(0) == bs,
+                            "batch sizes of all input must same");
+    }
+    TV_ASSERT_INVALID_ARG(bs <= max_batch_size_, "your batchsize too large", bs,
+                          max_batch_size_);
+    for (int i = 0; i < inputs.size(); ++i) {
+      auto &dev_mem = name_to_dev_mem_[idx_to_name_[i]];
+      auto shape_inp = inputs[i].shape().subshape(1);
+      auto shape_dev = dev_mem.shape().subshape(1);
+      TV_ASSERT_INVALID_ARG(shape_inp == shape_dev,
+                            "shape except batch must same", shape_inp,
+                            shape_dev);
+      dev_mem.slice_first_axis(0, bs).copy_(inputs[i].slice_first_axis(0, bs),
+                                            stream_);
+    }
+    ctx_->enqueue(bs, bindings_.data(), stream_, nullptr);
+    for (int i : out_idxes_) {
+      name_to_host_mem_[idx_to_name_[i]].slice_first_axis(0, bs).copy_(
+          name_to_dev_mem_[idx_to_name_[i]].slice_first_axis(0, bs), stream_);
+    }
+    checkCudaErrors(cudaStreamSynchronize(stream_));
+    std::unordered_map<std::string, tv::Tensor> output_map;
+    for (int i = 0; i < out_idxes_.size(); ++i) {
+      auto name = idx_to_name_[out_idxes_[i]];
+      output_map[name] = name_to_host_mem_[name].slice_first_axis(0, bs);
+    }
+    return output_map;
+  }
+  std::unordered_map<std::string, tv::Tensor>
+  operator()(std::unordered_map<std::string, tv::Tensor> inputs) {
+    std::vector<tv::Tensor> inputs_vec(inp_idxes_.size());
+    int count = 0;
+    for (auto &p : inputs) {
+      auto iter = name_to_idx_.find(p.first);
+      TV_ASSERT_INVALID_ARG(iter != name_to_idx_.end(), "cant find your name",
+                            p.first);
+      inputs_vec[name_to_idx_[p.first]] = p.second;
+    }
+    TV_ASSERT_INVALID_ARG(count == inp_idxes_.size(), "your inp not enough");
+    return (*this)(inputs_vec);
+  }
+  tv::Tensor operator[](std::string name){
+    auto iter = name_to_host_mem_.find(name);
+    if (iter == name_to_host_mem_.end()){
+      TV_THROW_INVALID_ARG(name, "not found.");
+    }
+    return iter->second;
+  }
+  std::string repr() {
+    std::stringstream ss;
+    ss << "InferenceContext[gpu=" << device_ << "]";
+    ss << "\n  Inputs:";
+    std::string name;
+    for (auto &i : inp_idxes_) {
+      name = idx_to_name_[i];
+      auto &mem = name_to_host_mem_[name];
+      ss << "\n    " << name << "[" << tv::detail::typeString(mem.dtype())
+         << "]: " << mem.shape();
+    }
+    ss << "\n  Outputs:";
+    for (auto &i : out_idxes_) {
+      name = idx_to_name_[i];
+      auto &mem = name_to_host_mem_[name];
+      ss << "\n    " << name << "[" << tv::detail::typeString(mem.dtype())
+         << "]: " << mem.shape();
+    }
+    return ss.str();
+  }
+private:
+  Logger logger_;
+  trt_unique_ptr_t<nvinfer1::ICudaEngine> engine_;
+  trt_unique_ptr_t<nvinfer1::IExecutionContext> ctx_;
+  std::unordered_map<std::string, tv::Tensor> name_to_dev_mem_;
+  std::unordered_map<std::string, tv::Tensor> name_to_host_mem_;
+  std::unordered_map<std::string, int> name_to_idx_;
+  std::unordered_map<int, std::string> idx_to_name_;
+  std::vector<int> inp_idxes_;
+  std::vector<int> out_idxes_;
+  std::vector<void *> bindings_;
+  cudaStream_t stream_;
+  int max_batch_size_;
+  int device_;
+};
+} // namespace trtplus
--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,7 @@ class CMakeBuild(build_ext):
            # must add following flags to use at::Half
            # but will remove raw half operators.
            cuda_flags += ["-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__"]
-            cuda_flags += ["-D__CUDA_NO_HALF2_OPERATORS__"] 
+            # cuda_flags += ["-D__CUDA_NO_HALF2_OPERATORS__"] 
            cmake_args += ['-DCMAKE_CUDA_FLAGS=' + " ".join(cuda_flags)]
        cfg = 'Debug' if self.debug else 'Release'
        assert cfg == "Release", "pytorch ops don't support debug build."

--- a/spconv/__init__.py
+++ b/spconv/__init__.py
@@ -61,8 +61,6 @@ class SparseConvTensor(object):
        """
        self.features = features
        self.indices = indices
-        if self.indices.dtype != torch.int32:
-            self.indices = self.indices.int()
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size
        self.indice_dict = {}

--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -29,7 +29,7 @@
 namespace spconv {
-using max_kernel_vol_t = tv::mp_list_c<int, 16, 32, 256, 4096>;
+using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
 int create_conv_indice_pair_p1_cuda(
    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
@@ -76,6 +76,12 @@ int create_conv_indice_pair_p1_cuda(
                                  pa, di, ou);
              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
            }
+#ifdef TV_LOG_KERNEL_INFO
+            cudaFuncAttributes attr;
+            checkCudaErrors(cudaFuncGetAttributes(&attr, prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
+            tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim, MaxKernelVolume, ">", attr.numRegs);
+#endif
          });
    });
  });
@@ -158,6 +164,15 @@ int create_conv_indice_pair_p2_cuda(
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), ou);
        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
+#ifdef TV_LOG_KERNEL_INFO
+        cudaFuncAttributes attr;
+        checkCudaErrors(cudaFuncGetAttributes(&attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
+        tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim, ">", attr.numRegs);
+        cudaFuncAttributes attr2;
+        checkCudaErrors(cudaFuncGetAttributes(&attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
+        tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">", attr2.numRegs);
+#endif
      }
      if (resetGrid && (!useHash)) {

--- a/src/spconv/reordering.cc
+++ b/src/spconv/reordering.cc
@@ -20,7 +20,6 @@
 namespace spconv {
 using float_types_t = tv::mp_list<float, double, at::Half>;
 using int_types_t = tv::mp_list<int32_t, int64_t>;
 void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
                       torch::Tensor indices, int size) {
  int numPlanes = features.size(1);
@@ -57,9 +56,10 @@ void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
      Index *indices_data = indices.data_ptr<Index>();
      const T *buffer_data = buffer.data_ptr<T>();
      T *features_data = outFeatures.data_ptr<T>();
+      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
        const T *buf = buffer.data_ptr<T>();
        T *out = outFeatures.data_ptr<T>();
-      for (int i = 0; i < size; ++i) {
+        for (int i = begin; i < end; ++i) {
          buf = buffer_data + i * numPlanes;
          out = features_data + indices_data[i] * numPlanes;
          for (int j = 0; j < numPlanes; ++j) {
@@ -68,6 +68,7 @@ void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
        }
      });
    });
+  });
 }
 } // namespace spconv
--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -29,6 +29,11 @@ namespace spconv {
 using float_types_t = tv::mp_list<float, double, at::Half>;
 using int_types_t = tv::mp_list<int32_t, int64_t>;
+template <typename T>
+using half_vec_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
+template <typename T>
+using half_vec_sadd_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
+  using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
 void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                        torch::Tensor indices, int size) {
@@ -40,9 +45,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
  auto inds_dtype = indices.scalar_type();
  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
    using T = decltype(TValue);
-    using vecload_type_t =
+    using vecload_type_t = half_vec_t<T>;
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
      using Index = decltype(IndexValue);
      bool notFound = true;
@@ -63,7 +66,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                         stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
                                   indices.data_ptr<Index>(), nHotBlock,
                                   numPlanes / vecloadFactor);
+#ifdef TV_LOG_KERNEL_INFO
+                  cudaFuncAttributes attr;
+                  checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
+                                       vecload_type_t>));
+                  tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+#endif
                  TV_CHECK_CUDA_ERR();
                }
                if (size - nHotBlock > 0) {
@@ -74,6 +82,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                                   features.data_ptr<T>(),
                                   indices.data_ptr<Index>() + nHotBlock,
                                   size - nHotBlock, numPlanes / vecloadFactor);
+#ifdef TV_LOG_KERNEL_INFO
+                  cudaFuncAttributes attr;
+                  checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>));
+                  tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+#endif
                  TV_CHECK_CUDA_ERR();
                }
                notFound = false;
@@ -89,6 +103,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
                buffer.data_ptr<T>(), features.data_ptr<T>(),
                indices.data_ptr<Index>(), size, numPlanes);
+#ifdef TV_LOG_KERNEL_INFO
+        cudaFuncAttributes attr;
+        checkCudaErrors(cudaFuncGetAttributes(&attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
+        tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+#endif
        TV_CHECK_CUDA_ERR();
      }
    });
@@ -106,9 +126,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
    using T = decltype(TValue);
-    using vecload_type_t =
+    using vecload_type_t = half_vec_sadd_t<T>;
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
      using Index = decltype(IndexValue);
      bool notFound = true;
@@ -131,6 +149,13 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
                     stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
                               indices.data_ptr<Index>(), nHotBlock,
                               numPlanes / vecloadFactor);
+#ifdef TV_LOG_KERNEL_INFO
+              cudaFuncAttributes attr;
+              checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
+                                       vecload_type_t>));
+              tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+#endif
              TV_CHECK_CUDA_ERR();
            }
            if (size - nHotBlock > 0) {
@@ -140,6 +165,11 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
                                  buffer.data_ptr<T>() + nHotBlock * numPlanes,
                                  indices.data_ptr<Index>() + nHotBlock,
                                  size - nHotBlock, numPlanes);
+#ifdef TV_LOG_KERNEL_INFO
+              cudaFuncAttributes attr;
+              checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
+              tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+#endif
              TV_CHECK_CUDA_ERR();
            }
            notFound = false;
@@ -155,6 +185,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
                outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
                indices.data_ptr<Index>(), size, numPlanes);
+#ifdef TV_LOG_KERNEL_INFO
+        cudaFuncAttributes attr;
+        checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
+        tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
+#endif
        TV_CHECK_CUDA_ERR();
      }
    });
@@ -176,9 +212,7 @@ void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
  int feature_stride = buffer.size(1);
  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
    using T = decltype(TValue);
-    using vecload_type_t =
+    using vecload_type_t = half_vec_t<T>;
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
      using Index = decltype(IndexValue);
      bool notFound = true;
@@ -251,9 +285,7 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
    using T = decltype(TValue);
-    using vecload_type_t =
+    using vecload_type_t = half_vec_sadd_t<T>;
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
      using Index = decltype(IndexValue);
      bool notFound = true;

--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module):
                 stride,
                 padding,
                 dilation,
-                 algo=spconv.ConvAlgo.BatchGemmGather):
+                 algo=spconv.ConvAlgo.Native):
        super().__init__()
        layers = [
            spconv.SparseConv3d(in_channels,
@@ -349,7 +349,7 @@ def scatter_nd(indices, updates, shape):
 class TestSpConv(TestCase):
    def testSpConv3d(self):
        np.random.seed(484)
-        devices = ["cuda:0"]
+        devices = ["cpu:0"]
        shapes = [[19, 18, 17]]
        batchsizes = [1, 2]
@@ -615,13 +615,13 @@ class TestSpConv(TestCase):
            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
-def main(algo=spconv.ConvAlgo.Native):
+def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
    # function for develop.
    np.random.seed(484)
    # devices = ["cuda:0"]
    devices = ["cuda:0"]
    shapes = [[400, 400, 15]]
-    batchsizes = [1]
+    batchsizes = [2]
    in_channels = [32]
    out_channels = [64]
@@ -648,15 +648,15 @@ def main(algo=spconv.ConvAlgo.Native):
        indices_t = torch.from_numpy(indices)
        filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC,
                                                OC]).astype(np.float32)
-        indices_t = torch.from_numpy(indices).int().to(device).float()
+        indices_t = torch.from_numpy(indices).int().to(device).to(dtype)
-        features_t = torch.from_numpy(features).to(device).float()
+        features_t = torch.from_numpy(features).to(device).to(dtype)
-        features_dense_t = torch.from_numpy(features_dense).to(device).float()
+        features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
        net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d,
-                                    algo=algo).to(device).float()
+                                    algo=algo).to(device).to(dtype)
        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
-                                  d).to(device).float()
+                                  d).to(device).to(dtype)
-        filters_t = torch.from_numpy(filters).to(device).float()
+        filters_t = torch.from_numpy(filters).to(device).to(dtype)
        net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
                                                     2).contiguous()
        net.net[0].weight[:] = filters_t
@@ -682,7 +682,7 @@ def main(algo=spconv.ConvAlgo.Native):
              out_numpy.sum())
-def main_subm(algo):
+def main_subm(algo, dtype=torch.float32):
    # function for develop.
    np.random.seed(484)
    torch.manual_seed(50051)
@@ -703,7 +703,7 @@ def main_subm(algo):
        if all([s > 1, d > 1]):
            continue
        device = torch.device(dev)
-        num_points = [240000] * bs
+        num_points = [120000] * bs
        sparse_dict = generate_sparse_data(shape, num_points, IC)
@@ -715,15 +715,15 @@ def main_subm(algo):
        indices_t = torch.from_numpy(indices)
        filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC,
                                                OC]).astype(np.float32)
-        indices_t = torch.from_numpy(indices).int().to(device).float()
+        indices_t = torch.from_numpy(indices).int().to(device).to(dtype)
-        features_t = torch.from_numpy(features).to(device).float()
+        features_t = torch.from_numpy(features).to(device).to(dtype)
-        features_dense_t = torch.from_numpy(features_dense).to(device).float()
+        features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
        net = SubMConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d,
-                                  algo=algo).to(device).float()
+                                  algo=algo).to(device).to(dtype)
        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
-                                  d).to(device).float()
+                                  d).to(device).to(dtype)
-        filters_t = torch.from_numpy(filters).to(device).float()
+        filters_t = torch.from_numpy(filters).to(device).to(dtype)
        net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
                                                     2).contiguous()
        net.net[0].weight[:] = filters_t
@@ -741,17 +741,17 @@ def main_subm(algo):
        # print(out.indices)
        out = out.dense()
        out_numpy = out.detach().cpu().numpy()
-        print(
+        # print(
-            np.linalg.norm(out.detach().cpu().numpy() -
+        #     np.linalg.norm(out.detach().cpu().numpy() -
-                           out_ref.detach().cpu().numpy()))
+        #                    out_ref.detach().cpu().numpy()))
        print(out_numpy.min(), out_numpy.max(), out_numpy.mean(),
              out_numpy.sum())
    return out_numpy
 if __name__ == '__main__':
-    # main_subm(algo=spconv.ConvAlgo.BatchGemmGather)
+    main(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
-    # out_ref = main_subm(algo=spconv.ConvAlgo.Native)
+    main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
    # TestCase().assertAllClose(out_my, out_ref)
    # unittest.main()
-    TestSpConv().testSpConv3d()
+    # TestSpConv().testSpConv3d()