spconv v1.1 release:

1. add cuda hash support for cuda indice generation. 2. use hash table instead of dense table in CPU code. 3. add CPU-only build support.

spconv v1.1 release:
1. add cuda hash support for cuda indice generation. 2. use hash table instead of dense table in CPU code. 3. add CPU-only build support.
a6ae8967 · traveller59 · 0757c45b · a6ae8967 · a6ae8967 · a6ae8967
Commit a6ae8967 authored May 24, 2019 by traveller59
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
-project(SparseConv LANGUAGES CXX CUDA VERSION 1.0)

-option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." OFF)
-set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
-
-set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}")
-get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
-get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
-if(WIN32) # true if windows (32 and 64 bit)
-    set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
-    add_compile_definitions(TV_WINDOWS)
+option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." ON)
+option(SPCONV_BuildCUDA "Build cuda code when BUILD_TESTING is enabled." ON)
+if (SPCONV_BuildCUDA)
+    project(SparseConv LANGUAGES CXX CUDA VERSION 1.0)
 else()
-    set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+project(SparseConv LANGUAGES CXX VERSION 1.0)
 endif()
-# set(CMAKE_VERBOSE_MAKEFILE ON)
-find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
-find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
+
+
+set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_compile_definitions(TV_DEBUG)
 endif()

 find_package(Torch REQUIRED)

-torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
-string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
-
+if (SPCONV_BuildCUDA)
+    set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}")
+    get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
+    get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
+    if(WIN32) # true if windows (32 and 64 bit)
+        set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
+        add_compile_definitions(TV_WINDOWS)
+    else()
+        set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+    endif()
+    # set(CMAKE_VERBOSE_MAKEFILE ON)
+    find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
+    find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
+    torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
+    string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
+    add_compile_definitions(SPCONV_CUDA)
+endif()
 # add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
-
-add_compile_definitions(SPCONV_CUDA)
 add_subdirectory(third_party/pybind11)

-set(ALL_LIBS ${CUDA_CUDART} ${CUDA_CUBLAS} ${TORCH_LIBRARIES}) 
+set(ALL_LIBS ${TORCH_LIBRARIES}) 

-set(ALL_INCLUDE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 
-    ${PROJECT_SOURCE_DIR}/include)
+set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)

+if (SPCONV_BuildCUDA)
+    set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
+    set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    add_subdirectory(src/cuhash)
+endif()

 add_subdirectory(src/spconv)
 add_subdirectory(src/utils)
-add_subdirectory(src/hash)
+

 if (SPCONV_BuildTests)
    include(CTest) #adds option BUILD_TESTING (default ON)

--- a/README.md
+++ b/README.md
@@ -2,15 +2,15 @@

 This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool.

-If you need more kinds of spatial layers such as avg pool, please implement it by yourself, I don't have time to do this.
-
 The GPU Indice Generation algorithm is a unofficial implementation of paper [SECOND](http://www.mdpi.com/1424-8220/18/10/3337). That algorithm (don't include GPU SubM indice generation algorithm) may be protected by patent.

-This project only support CUDA 9.0+. If you are using cuda 8.0, please update it to 9.0.
+This project only support CUDA 9.0+ or CPU only. If you are using cuda 8.0, please update it to 9.0.
+
+This project only support tensors with spatial volume less than ```std::numeric_limits<int>::max()``` (~2e9). if someone really need very large space, open an issue.

 ## News:

-2019-5-22: spconv v1.1 alpha released, now cuda hash implementation will be default. you can use ```use_hash=False``` to use dense implementation. you may see some message during running, they will be removed in future.
+2019-5-24: spconv v1.1 released, now indice generation will use hash table as default (CPU code only support hash table). you can use ```use_hash=False``` to use dense table when using CUDA. In addition, add CPU only build support.

 ## Install on Ubuntu 16.04/18.04

@@ -20,7 +20,7 @@ This project only support CUDA 9.0+. If you are using cuda 8.0, please update it

 2. Download cmake >= 3.13.2, then add cmake executables to PATH.

-3. Ensure you have installed pytorch 1.0 in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
+3. Ensure you have installed pytorch 1.0+ in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).

 4. Run ```cd ./dist```, use pip to install generated whl file.

@@ -152,6 +152,20 @@ This implementation use gather-gemm-scatter framework to do sparse convolution.

 * **Bo Li** - *gpu indice generation idea, owner of patent of the sparse conv gpu indice generation algorithm (don't include subm)* - [prclibo](https://github.com/prclibo)

+## Third party libraries
+
+* [CUDPP](https://github.com/cudpp/cudpp): A cuda library. contains a cuda hash implementation.
+
+* [robin-map](https://github.com/Tessil/robin-map): A fast c++ hash library. almost 2x faster than std::unordered_map in this project.
+
+* [pybind11](https://github.com/pybind/pybind11): A head-only python c++ binding library.
+
+* [prettyprint](https://github.com/louisdx/cxx-prettyprint): A head-only library for container print.
+
 ## License

 This project is licensed under the Apache license 2.0 License - see the [LICENSE.md](LICENSE.md) file for details
+
+The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.
+
+The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
\ No newline at end of file
--- a/include/hash/cuda_util.h
+++ b/include/hash/cuda_util.h
--- a/include/hash/debugging.h
+++ b/include/hash/debugging.h
@@ -24,7 +24,7 @@

 #include <algorithm>

-namespace cudahash {
+namespace cuhash {

 //! @name Debugging functions
 /// @{

--- a/include/hash/definitions.h
+++ b/include/hash/definitions.h
@@ -58,7 +58,7 @@ inline void PrintMessage(const char *message, const bool error = false) {
 //! Prints a message out to the console.
 inline void PrintMessage(const char *message, const bool error = false) {
  if (error) {
-    printf("!!! %s\n", message);
+    printf("cudahash: %s\n", message);
  } else {
    printf("%s\n", message);
  }
@@ -68,7 +68,7 @@ inline void PrintMessage(const char *message, const bool error = false) {
 /* -------------------------------------------------------------------------
   Hash table constants and definitions.
   ------------------------------------------------------------------------- */
-namespace cudahash {
+namespace cuhash {

 /**
 * \addtogroup cudpp_hash_data_structures
@@ -111,6 +111,6 @@ const float kMinimumSpaceUsages[] = {std::numeric_limits<float>::max(),

 /** @} */ // end cudpp_hash_data_structures

-}; // namespace cudahash
+}; // namespace cuhash

 #endif
--- a/include/hash/hash_functions.h
+++ b/include/hash/hash_functions.h
@@ -9,7 +9,7 @@
 #include <vector_types.h>
 #include "definitions.h"

-namespace cudahash {
+namespace cuhash {

 //! Prime number larger than the largest practical hash table size.
 const unsigned kPrimeDivisor = 4294967291u;
@@ -89,6 +89,7 @@ unsigned stash_hash_function(const uint2 stash_constants,
  return (stash_constants.x ^ key + stash_constants.y) % kStashSize;
 }

+unsigned generate_random_uint32();

 };  // namespace CuckooHashing


--- a/include/hash/hash_table.cuh
+++ b/include/hash/hash_table.cuh
@@ -22,7 +22,7 @@
 #include <tensorview/tensorview.h>
 #include <driver_types.h>

-namespace cudahash {
+namespace cuhash {

 //! Makes an 64-bit Entry out of a key-value pair for the hash table.
 TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) {

--- a/include/hash/hash_table.h
+++ b/include/hash/hash_table.h
@@ -45,7 +45,7 @@
 /* -------------------------------------------------------------------------
   Hash table code.
   ------------------------------------------------------------------------- */
-namespace cudahash {
+namespace cuhash {

 //! Compute how many thread blocks are required for the given number of threads.
 dim3 ComputeGridDim(unsigned threads);
@@ -113,7 +113,7 @@ class HashTable {
   *  The input keys are expected to be completely unique.
   *  To reduce the chance of a failure, increase the space usage or number of 
   *  functions.
-   *  Keys are not allowed to be equal to cudahash::kKeyEmpty.
+   *  Keys are not allowed to be equal to cuhash::kKeyEmpty.
   */
  virtual bool Build(const unsigned  input_size,
                     const unsigned *d_keys,

--- a/include/hash/mt19937ar.h
+++ b/include/hash/mt19937ar.h
-void init_genrand(unsigned long s);
-void init_by_array(unsigned long init_key[], int key_length);
-unsigned long genrand_int32(void);
-long genrand_int31(void);
-double genrand_real1(void);
-double genrand_real2(void);
-double genrand_real3(void);
-double genrand_res53(void);
--- a/include/paramsgrid.h
+++ b/include/paramsgrid.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+// This file is used for c++ unit test, but pytorch jit ops don't support c++ debug build.
+
 #ifndef PARAMS_GRID_H_
 #define PARAMS_GRID_H_
 #include <tuple>

--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
@@ -15,7 +15,6 @@
 #ifndef FUSED_SPARSE_CONV_OP_H_
 #define FUSED_SPARSE_CONV_OP_H_

-#include <cuda_runtime_api.h>
 #include <spconv/indice.h>
 #include <spconv/reordering.h>
 #include <torch/script.h>
@@ -84,7 +83,9 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
-    } else {
+    } 
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
@@ -97,6 +98,11 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
      torch::index_select_out(inputBufferBlob, features, 0,
      indicePairBlob);*/
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
    // totalGEMMTime += timer.report() / 1000.0;
@@ -107,7 +113,9 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                  true);
-    } else {
+    }
+#ifdef SPCONV_CUDA 
+    else if (device == torch::kCUDA) {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
@@ -115,6 +123,11 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
                  true);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
    // totalSAddTime += timer.report() / 1000.0;
  }
  // std::cout << "gather time " << totalGatherTime << std::endl;

--- a/include/spconv/geometry.h
+++ b/include/spconv/geometry.h
@@ -18,8 +18,51 @@
 #include <iostream>
 #include <limits>
 #include <tensorview/tensorview.h>
-
+#include <tsl/robin_map.h>
+#include <unordered_map>
 namespace spconv {
+
+namespace detail {
+
+template <typename T> struct ToUnsigned;
+
+template <> struct ToUnsigned<int>{
+  using type = uint32_t;
+};
+
+template <> struct ToUnsigned<long>{
+  using type = uint64_t;
+};
+
+template <typename T> struct FNVInternal;
+template <> struct FNVInternal<uint32_t>
+{
+  constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
+  constexpr static uint32_t prime = 0x01000193;
+};
+
+template <> struct FNVInternal<uint64_t>
+{
+  constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
+  constexpr static uint64_t prime = 0x100000001b3;
+};
+
+}
+template <typename T>
+using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
+
+template <typename T>
+struct FNV1a : detail::FNVInternal<T>{
+  std::size_t operator()(const T* data, std::size_t size){
+    to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
+    for (std::size_t i = 0; i < size; ++i) {
+      hash *= detail::FNVInternal<T>::prime;
+      hash ^= static_cast<to_unsigned_t<T>>(data[i]);
+    }
+    return hash;
+  }
+};
+
 template <typename Index, unsigned NDim>
 TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
                                    const Index *kernelSize,
@@ -169,6 +212,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
  Index* validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
+  tsl::robin_map<Index, Index> hash;
  for (int j = 0; j < numActIn; ++j) {
    batchIdx = indicesIn(j, 0);
    numValidPoints = getValidOutPos<Index, NDim>(
@@ -179,16 +223,16 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
      auto offset = pointPtr[NDim];
      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
                   spatialVolume * batchIdx;
-      if (gridsOut[index] == -1) {
+      if (hash.find(index) == hash.end()) {
        for (unsigned k = 1; k < NDim + 1; ++k) {
          indicesOut(numAct, k) = pointPtr[k - 1];
        }
        indicesOut(numAct, 0) = batchIdx;
-        gridsOut[index] = numAct++;
+        hash[index] = numAct++;
      }
      // indicePairs: [K, 2, L]
      indicePairs(offset, 0, indiceNum[offset]) = j;
-      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+      indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
    }
  }
  return numAct;
@@ -220,6 +264,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
  Index* validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
+  tsl::robin_map<Index, Index> hash;
  for (int j = 0; j < numActIn; ++j) {
    batchIdx = indicesIn(j, 0);
    numValidPoints = getValidOutPosTranspose<Index, NDim>(
@@ -230,16 +275,16 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
      auto offset = pointPtr[NDim];
      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
                   spatialVolume * batchIdx;
-      if (gridsOut[index] == -1) {
+      if (hash.find(index) == hash.end()) {
        for (unsigned k = 1; k < NDim + 1; ++k) {
          indicesOut(numAct, k) = pointPtr[k - 1];
        }
        indicesOut(numAct, 0) = batchIdx;
-        gridsOut[index] = numAct++;
+        hash[index] = numAct++;
      }
      // indicePairs: [K, 2, L]
      indicePairs(offset, 0, indiceNum[offset]) = j;
-      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+      indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
    }
  }
  return numAct;
@@ -271,12 +316,13 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
  Index* validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
+  tsl::robin_map<Index, Index> hash;
  for (int j = 0; j < numActIn; ++j) {
    Index index = 0;
    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
                                         outSpatialShape) +
            spatialVolume * indicesIn(j, 0);
-    gridsOut[index] = j;
+    hash[index] = j;
  }
  Index index = 0;
  for (int j = 0; j < numActIn; ++j) {
@@ -288,9 +334,9 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
      auto offset = pointPtr[NDim];
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
              spatialVolume * indicesIn(j, 0);
-      if (gridsOut[index] > -1) {
+      if (hash.find(index) == hash.end()) {
        indicePairs(offset, 0, indiceNum[offset]) = j;
-        indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+        indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
      }
    }
  }

--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -14,18 +14,18 @@

 #ifndef INDICE_CU_H_
 #define INDICE_CU_H_
-#include <hash/hash_table.cuh>
+#include <cuhash/hash_table.cuh>
 #include <spconv/geometry.h>
 #include <tensorview/helper_kernel.cu.h>
 #include <tensorview/tensorview.h>

 namespace spconv {
 template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume = 256>
+          int KernelMaxVolume = 256, typename Index1D=int>
 __global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
+    tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
@@ -151,8 +151,8 @@ __global__ void
 assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
                            tv::TensorView<Index> indicePairs,
                            tv::TensorView<Index> indicePairUnique,
-                            unsigned table_size, const cudahash::Entry *table,
-                            cudahash::Functions<kNumHashFunctions> constants,
+                            unsigned table_size, const cuhash::Entry *table,
+                            cuhash::Functions<kNumHashFunctions> constants,
                            uint2 stash_constants, unsigned stash_count) {

  Index index;
@@ -162,9 +162,9 @@ assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
      index = indicePairs(i, 1, ix);
      if (index > -1) {
        auto val =
-            cudahash::retrieve((unsigned)(index), table_size,
+            cuhash::retrieve((unsigned)(index), table_size,
                               table, constants, stash_constants, stash_count);
-        assert(val != cudahash::kNotFound);
+        assert(val != cuhash::kNotFound);
        indicePairs(i, 1, ix) = (unsigned)val;
      }
    }
@@ -283,8 +283,8 @@ __global__ void getSubMIndicePairsHashKernel(
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape,
-    unsigned table_size, const cudahash::Entry *table,
-    cudahash::Functions<kNumHashFunctions> constants,
+    unsigned table_size, const cuhash::Entry *table,
+    cuhash::Functions<kNumHashFunctions> constants,
    uint2 stash_constants, unsigned stash_count) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
@@ -307,9 +307,9 @@ __global__ void getSubMIndicePairsHashKernel(
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      auto val =
-          cudahash::retrieve((unsigned)(index), table_size,
+          cuhash::retrieve((unsigned)(index), table_size,
                              table, constants, stash_constants, stash_count);
-      if (val != cudahash::kNotFound) {
+      if (val != cuhash::kNotFound) {
        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, oldNum) = val;
        indicePairs(offset, 0, oldNum) = ix;

--- a/include/spconv/nms.h
+++ b/include/spconv/nms.h
@@ -181,7 +181,7 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
  }
  return keep;
 }
-
+#ifdef SPCONV_CUDA
 constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;

 template <typename DType>
@@ -196,6 +196,7 @@ int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
                                          boxes.shape(0), boxes.shape(1),
                                          nms_overlap_thresh, device_id);
 }
+#endif

 } // namespace spconv
 #endif
--- a/include/spconv/nms_ops.h
+++ b/include/spconv/nms_ops.h
@@ -15,7 +15,6 @@
 #ifndef NMS_TORCH_OP_H_
 #define NMS_TORCH_OP_H_

-#include <cuda_runtime_api.h>
 #include <spconv/indice.h>
 #include <spconv/reordering.h>
 #include <torch/script.h>

--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
@@ -15,7 +15,6 @@
 #ifndef PILLAR_SCATTER_OP_H_
 #define PILLAR_SCATTER_OP_H_

-#include <cuda_runtime_api.h>
 #include <spconv/pillar_scatter_functor.h>
 #include <torch/script.h>
 #include <torch_utils.h>
@@ -28,6 +27,7 @@ template <typename T>
 torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
                                 torch::Tensor shape) {
  TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
+  TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
  TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
  TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
  TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
@@ -42,10 +42,11 @@ torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
                   features.options());
  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
+#ifdef SPCONV_CUDA
  functor::PointPillarScatter<tv::GPU, T, int> ftor;
  ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas), tv::torch2tv<const T>(features.squeeze()),
       tv::torch2tv<const T>(coors.squeeze()));
-
+#endif
  return canvas;
 }


--- a/include/spconv/pool_ops.h
+++ b/include/spconv/pool_ops.h
@@ -15,7 +15,6 @@
 #ifndef SPARSE_POOL_OP_H_
 #define SPARSE_POOL_OP_H_

-#include <cuda_runtime_api.h>
 #include <spconv/maxpool.h>
 #include <torch/script.h>
 #include <torch_utils.h>
@@ -44,13 +43,19 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
      forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i), nHot);
-    } else {
+    } 
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
      forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i), nHot);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else{
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
    // totalTime += timer.report() / 1000.0;
  }
  // std::cout << "maxpool forward time " << totalTime << std::endl;
@@ -80,7 +85,9 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                   tv::torch2tv<const T>(features),
                   tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
                   tv::torch2tv<const int>(indicePairs).subview(i), nHot);
-    } else {
+    } 
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
      backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
                   tv::torch2tv<const T>(features),
@@ -88,6 +95,11 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                   tv::torch2tv<const int>(indicePairs).subview(i), nHot);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else{
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
  }
  return inputGrad;
 }

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,7 +15,6 @@
 #ifndef SPARSE_CONV_OP_H_
 #define SPARSE_CONV_OP_H_

-#include <cuda_runtime_api.h>
 #include <spconv/indice.h>
 #include <spconv/reordering.h>
 #include <torch/script.h>
@@ -27,15 +26,17 @@ namespace spconv {
 template <unsigned NDim>
 std::vector<torch::Tensor>
 getIndicePair(torch::Tensor indices, int64_t batchSize,
-        std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
-        std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-        std::vector<int64_t> padding, std::vector<int64_t> dilation,
-        std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) {
-  
+              std::vector<int64_t> outSpatialShape,
+              std::vector<int64_t> spatialShape,
+              std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+              std::vector<int64_t> padding, std::vector<int64_t> dilation,
+              std::vector<int64_t> outPadding, int64_t _subM,
+              int64_t _transpose, int64_t _useHash) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
-  bool useHash = _useHash != 0;
+  // CPU always use hash (tsl::robin_map).
+  bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
@@ -54,31 +55,29 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
-  std::string msg = "due to limits of cuda hash, the volume of dense space include batch size ";
-  msg += "must less than std::numeric_limits<int>::max()";
-  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(), msg);
+  std::string msg = "due to limits of cuda hash, the volume of dense space "
+                    "include batch size ";
+  msg += "must less than std::numeric_limits<int>::max() = 2e9";
+  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
+                   msg);
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
-                   torch::dtype(torch::kInt32).device(indices.device()));
+                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  auto gridSize = batchSize * outputVolume;
-  if (useHash){
-    gridSize = 1;
+  if (useHash) {
+    gridSize = batchSize;
  }
-  torch::Tensor gridOut =
-      torch::full({gridSize}, -1,
-                  torch::dtype(torch::kInt32).device(indices.device()));
-  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
+  torch::Tensor gridOut = torch::full(
+      {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
+  gridOut = gridOut.view({batchSize, -1});
  int64_t numActOut = -1;
  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
-  auto indicePairUnique =
-      torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
-                  torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
@@ -97,40 +96,70 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose, useHash);
-    } else {
+          tv::CPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
+          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
+          dilation32, outSpatialShape32, transpose, false, useHash);
+    }
+#ifdef SPCONV_CUDA
+    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose, useHash);
+          tv::TorchGPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
+          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
+          dilation32, outSpatialShape32, transpose, false, useHash);
+      if (numActOut == -1){
+        // build hash failed. use CPU algorithm
+        auto device = indices.device();
+        indicePairs = indicePairs.to({torch::kCPU});
+        indiceNum = indiceNum.to({torch::kCPU});
+        indices = indices.to({torch::kCPU});
+        auto getIndicePairFtor =
+            functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
+        numActOut = getIndicePairFtor(
+            tv::CPU(), tv::torch2tv<const int>(indices),
+            tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
+            tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
+            dilation32, outSpatialShape32, transpose, false, useHash);
+        return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
+      }
+    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }
    return {indices, indicePairs, indiceNum};
  } else {
+    auto indicePairUnique = torch::full(
+        {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
+        torch::dtype(torch::kInt32).device(indices.device()));
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
-                    torch::dtype(torch::kInt32).device(indices.device()));
+                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
+      auto getIndicePairFtor =
+          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose);
-    } else {
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
+          transpose);
+    }
+#ifdef SPCONV_CUDA
+    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
-      numActOut =
-          getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
-                        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-                        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-                        tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
-                        padding32, dilation32, outSpatialShape32, transpose);
+      numActOut = getIndicePairFtorP1(
+          tv::TorchGPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
+          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
@@ -138,20 +167,43 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, useHash);
+            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
+            false, useHash);
+        if (numActOut == -1){
+          // build hash failed. use CPU algorithm
+          auto getIndicePairFtor =
+              functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
+          auto device = indices.device();
+          outInds = outInds.to({torch::kCPU});
+          indicePairs = indicePairs.to({torch::kCPU});
+          indiceNum = indiceNum.to({torch::kCPU});
+          indices = indices.to({torch::kCPU});
+          numActOut = getIndicePairFtor(
+              tv::CPU(), tv::torch2tv<const int>(indices),
+              tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+              tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+              kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
+              transpose);
+          return {outInds.to(device).slice(0, 0, numActOut), indicePairs.to(device), indiceNum.to(device)};
+        }
      }
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
 }

 template <unsigned NDim>
-std::vector<torch::Tensor>
-getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
-        std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
-        std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-        std::vector<int64_t> padding, std::vector<int64_t> dilation,
-        std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) {
+std::vector<torch::Tensor> getIndicePairPreGrid(
+    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
+    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
+    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
+    std::vector<int64_t> padding, std::vector<int64_t> dilation,
+    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose,
+    int64_t _useHash) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
@@ -178,7 +230,7 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
-                   torch::dtype(torch::kInt32).device(indices.device()));
+                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
@@ -188,9 +240,9 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
-  auto indicePairUnique =
-      torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
-                  torch::dtype(torch::kInt32).device(indices.device()));
+  auto indicePairUnique = torch::full(
+      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
+      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
@@ -209,42 +261,54 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
-          tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose);
+          tv::CPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
+          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
+          dilation32, outSpatialShape32, transpose);
      gridOut.fill_(-1);
-    } else {
+    }
+#ifdef SPCONV_CUDA
+    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
-          tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose, true);
+          tv::TorchGPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
+          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
+          dilation32, outSpatialShape32, transpose, true);
+    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }
    return {indices, indicePairs, indiceNum};
  } else {
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
-                    torch::dtype(torch::kInt32).device(indices.device()));
+                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
-      auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
+      auto getIndicePairFtor =
+          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose, true);
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
+          transpose, true);
      gridOut.fill_(-1);
-    } else {
+    }
+#ifdef SPCONV_CUDA
+    else if (indices.device().type() == torch::kCUDA) {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
-      numActOut =
-          getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices),
-                        tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-                        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-                        tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
-                        padding32, dilation32, outSpatialShape32, transpose);
+      numActOut = getIndicePairFtorP1(
+          tv::TorchGPU(), tv::torch2tv<const int>(indices),
+          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
+          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
+          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
@@ -252,19 +316,22 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, true);
+            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
+            true);
      }
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
-  
 }

-
 template <typename T>
 torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
-                       torch::Tensor indicePairs, torch::Tensor indiceNum,
-                       int64_t numActOut, int64_t _inverse, int64_t _subM) {
+                         torch::Tensor indicePairs, torch::Tensor indiceNum,
+                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
@@ -273,13 +340,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data<int>(),
+                       indicePairNumCpu.data<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
-  
+
  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
+    std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
+  indicePairNumCpu.data<int>() + kernelVolume);
    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);

    auto indicePairVecMaxSizeIter = std::max_element(
@@ -293,7 +363,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());

  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
-  torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
@@ -319,20 +390,28 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
-    } else {
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                 nHot);
+    }
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                 nHot);
      TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot},
-      indicePairOptions); 
-      torch::index_select_out(inputBufferBlob, features, 0,
-      indicePairBlob);*/
+      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
+      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
+      features, 0, indicePairBlob);*/
+    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }
+
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
    // totalGEMMTime += timer.report() / 1000.0;
@@ -341,16 +420,23 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                  true);
-    } else {
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                  nHot, true);
+    }
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                  true);
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
    // totalSAddTime += timer.report() / 1000.0;
  }
  // std::cout << "gather time " << totalGatherTime << std::endl;
@@ -362,8 +448,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
 template <typename T>
 std::vector<torch::Tensor>
 indiceConvBackward(torch::Tensor features, torch::Tensor filters,
-                 torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor indiceNum,
-                 int64_t _inverse, int64_t _subM) {
+                   torch::Tensor outGrad, torch::Tensor indicePairs,
+                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;

@@ -373,16 +459,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data<int>(),
+                       indicePairNumCpu.data<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
-  torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);

@@ -403,22 +492,33 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                 nHot);
      gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot);
-    } else {
+                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                    nHot);
+    }
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                 nHot);
      TV_CHECK_CUDA_ERR();
      gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
-                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot);
+                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                    nHot);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
    auto filterGradSub = filtersGrad[i];
    auto outputBufferBlob =
        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
@@ -431,20 +531,29 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
-    } else {
+                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                  nHot);
+    }
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                  nHot);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
  }
  return {inputGrad, filtersGrad.view(filterShape)};
 }

 template <typename T>
-torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
+torch::Tensor
+indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
@@ -458,18 +567,22 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto totalActsTen = indicePairNumCpu.sum();
  auto totalActs = indicePairNumCpu.data<int>()[0];
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data<int>(),
+                       indicePairNumCpu.data<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
-                              indicePairNumCpu.data<int>() + kernelVolume);
+                                    indicePairNumCpu.data<int>() +
+                                        kernelVolume);
  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-  int subRuleMaxSize = *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
+  int subRuleMaxSize =
+      *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
  if (subM) {
    indicePairMaxSize = subRuleMaxSize;
  }
-  auto timer = spconv::CudaContextTimer<>();
+  // auto timer = spconv::CudaContextTimer<>();
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  // auto indicePairOptions =
@@ -481,7 +594,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
  torch::Tensor outputBuffer =
      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
-  std::cout << "create time " << timer.report()/1000.0 << std::endl;
+  // std::cout << "create time " << timer.report() / 1000.0 << std::endl;
  if (subM) { // the center index of subm conv don't need gather and scatter
              // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
@@ -495,7 +608,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
-    // 
+    //
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
@@ -504,20 +617,28 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
-    } else {
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                 nHot);
+    } 
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
-                 tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
+                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                 nHot);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
    // }
    // for (int i = 0; i < kernelVolume; ++i) {
    // totalGatherTime += timer.report() / 1000.0;
    // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
    // {nHot, numOutPlanes}, options);
-
  }
  // totalGatherTime += timer.report() / 1000.0;
  for (int i = 0; i < kernelVolume; ++i) {
@@ -548,16 +669,23 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                  true);
-    } else {
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                  nHot, true);
+    } 
+#ifdef SPCONV_CUDA
+    else if (device == torch::kCUDA) {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                  true);
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
+#endif
+    else {
+      TV_ASSERT_INVALID_ARG(false, "unknown device type");
+    }
+
    // totalSAddTime += timer.report() / 1000.0;
  }
  // totalSAddTime += timer.report() / 1000.0;

--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
@@ -16,17 +16,22 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
-#include <cuda_runtime_api.h>
+
 #include <iostream>
 #include <memory>
 // #include <prettyprint.h>
 #include <sstream>
 #include <type_traits>
 #include <vector>
+#ifdef SPCONV_CUDA
+#include <cuda_runtime_api.h>
+#endif
+

 namespace tv {

 #ifdef __NVCC__
+
 #define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
 #define TV_DEVICE_INLINE __forceinline__ __device__
 #define TV_HOST_DEVICE __device__ __host__
@@ -113,12 +118,13 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
    }                                                                          \
  }

-
+#ifdef SPCONV_CUDA
 struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
 };
+#endif
 struct CPU {};

 #define TV_MAX_DIM 6

--- a/include/torch_utils.h
+++ b/include/torch_utils.h
@@ -16,16 +16,19 @@
 #include <tensorview/tensorview.h>
 #include <torch/script.h>
 #include <ATen/ATen.h>
+#ifdef SPCONV_CUDA
 #include <ATen/cuda/CUDAContext.h>
+#endif

 namespace tv {
-
+  
+#ifdef SPCONV_CUDA
 struct TorchGPU: public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
 };
-
+#endif
 template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
  switch (tensor.type().scalarType()) {
  case at::ScalarType::Double: {