spconv v1.1 release:

1. Add CUDA hash support for CUDA indice generation. 2. Use a hash table instead of a dense table in CPU code. 3. Add CPU-only build support.

spconv v1.1 release:
1. Add CUDA hash support for CUDA indice generation. 2. Use a hash table instead of a dense table in CPU code. 3. Add CPU-only build support.
a6ae8967 · traveller59 · 0757c45b · a6ae8967 · a6ae8967 · a6ae8967
Commit a6ae8967 authored May 24, 2019 by traveller59
6 changed files
--- a/src/spconv/maxpool.cu
+++ b/src/spconv/maxpool.cu
@@ -464,7 +464,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {

 DECLARE_GPU_SPECS(float);
 DECLARE_GPU_SPECS(double);
-// DECLARE_GPU_SPECS(at::Half); // currently have problem
+DECLARE_GPU_SPECS(at::Half);

 #undef DECLARE_GPU_SPECS
 #undef DECLARE_GPU_SPECS_T_INDEX

--- a/src/spconv/reordering.cc
+++ b/src/spconv/reordering.cc
@@ -14,6 +14,7 @@

 #include <spconv/reordering.h>
 #include <torch/script.h>
+#include <ATen/Parallel.h>

 namespace spconv {
 namespace functor {
@@ -22,11 +23,13 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = features.dim(1);
-    for (int i = 0; i < size; ++i) {
-      std::memcpy(buffer.data() + i * numPlanes,
-                  features.data() + indices[i] * numPlanes,
-                  sizeof(T) * numPlanes);
-    }
+    at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){  // each invocation gathers rows [begin, end); NOTE(review): grain size 0 presumably means "always parallelize" -- confirm against ATen
+      for (int i = begin; i < end; ++i) {  // int64_t bounds narrow to int here; size itself is an int
+        std::memcpy(buffer.data() + i * numPlanes,  // destination: row i of buffer, a different row for every i
+                    features.data() + indices[i] * numPlanes,  // source: row indices[i] of features
+                    sizeof(T) * numPlanes);  // one full row of numPlanes elements
+      }
+    });
  }
 };

@@ -38,13 +41,13 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
    int numPlanes = outFeatures.dim(1);
-    const T* buf = buffer.data();
-    T* out = outFeatures.data();
-    for (int i = 0; i < size; ++i) {
-      buf = buffer.data() + i * numPlanes;
-      out = outFeatures.data() + indices[i] * numPlanes;
-      for (int j = 0; j < numPlanes; ++j){
-        out[j] += buf[j];
+    at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){  // each invocation accumulates rows [begin, end) of buffer into outFeatures
+      for (int i = begin; i < end; ++i) {
+        const T* src = buffer.data() + i * numPlanes;  // row i of buffer; declared per iteration so workers never share a cursor through the [&] capture
+        T* dst = outFeatures.data() + indices[i] * numPlanes;  // row indices[i] of outFeatures; NOTE(review): assumes indices[0..size) are distinct, otherwise the += below races -- confirm with caller
+        for (int j = 0; j < numPlanes; ++j){
+          dst[j] += src[j];
+        }
      }
-    }
+    });
  }
 };


--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
-add_library(spconv_nms STATIC nms.cu)
-set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
-set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
-target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
-set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
-set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
-set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(spconv_nms ${CUDA_CUDART})
-
-install (TARGETS spconv_nms DESTINATION lib)
+if (SPCONV_BuildCUDA)  # nms.cu is a CUDA source, so the spconv_nms target is only defined for CUDA builds
+    add_library(spconv_nms STATIC nms.cu)
+    set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
+    set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
+    target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
+    set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
+    set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
+    set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(spconv_nms ${CUDA_CUDART})
+    install (TARGETS spconv_nms DESTINATION lib)
+endif()  # SPCONV_BuildCUDA

 add_library(spconv_utils SHARED all.cc)
 set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
@@ -18,6 +19,9 @@ set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
 set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
 set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
                                         SUFFIX "${PYTHON_MODULE_EXTENSION}")
-target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
-
+if (SPCONV_BuildCUDA)  # CUDA build: link the CUDA runtime and the spconv_nms static library as well
+    target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
+else()  # CPU-only build: spconv_nms is not defined, so link pybind11 only
+    target_link_libraries(spconv_utils pybind11::module)
+endif()  # SPCONV_BuildCUDA
 install (TARGETS spconv_utils DESTINATION lib)
--- a/src/utils/all.cc
+++ b/src/utils/all.cc
@@ -20,12 +20,14 @@ using namespace pybind11::literals;

 PYBIND11_MODULE(spconv_utils, m) {
  m.doc() = "util pybind11 functions for spconv";
+#ifdef SPCONV_CUDA
  m.def("non_max_suppression", &spconv::non_max_suppression<double>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
  m.def("non_max_suppression", &spconv::non_max_suppression<float>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
+#endif
  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);

--- a/test/src/test_conv_rule.cpp
+++ b/test/src/test_conv_rule.cpp
@@ -8,7 +8,6 @@
 #include <string>
 #include <vector>
 #include <exception>
-#include <fmt/format.h>
 #include <numeric>
 #include <pybind11/embed.h> // everything needed for embedding
 #include <pybind11/functional.h>
@@ -118,7 +117,7 @@ TEST_CASE("GetConvIndPair", "[SpConvNet]")
        auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
        
        auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
-            {1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, false, false);
+            {1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, 0, 0, 0);
        // std::cout << outputs[2] << std::endl;
        /*
        auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);

--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -33,7 +33,8 @@ class SparseConv3dTestTorch(nn.Module):
                stride,
                padding=padding,
                dilation=dilation,
-                bias=False)]
+                bias=False,
+                use_hash=True)]
        for i in range(1, num_layers):
            layers.append(spconv.SparseConv3d(
                out_channels,
@@ -561,14 +562,15 @@ class TestSpConv(TestCase):
 def main():
    # function for develop.
    np.random.seed(484)
+    # devices = ["cuda:0"]
    devices = ["cuda:0"]
    shapes = [[50, 30, 30]]
-    batchsizes = [3]
+    batchsizes = [2]

    
    in_channels = [256]
    out_channels = [256]
-    ksizes = [3]
+    ksizes = [(3, 1, 1)]
    strides = [1]
    paddings = [0]
    dilations = [1]
@@ -579,7 +581,7 @@ def main():
        if all([s > 1, d > 1]):
            continue
        device = torch.device(dev)
-        num_points = [5000] * bs
+        num_points = [5] * bs

        sparse_dict = generate_sparse_data(shape, num_points, IC)

@@ -587,19 +589,19 @@ def main():
        indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        features_dense = sparse_dict["features_dense"].astype(np.float32)
        indices_t = torch.from_numpy(indices)
-        filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
-        indices_t = torch.from_numpy(indices).int().to(device).half()
-        features_t = torch.from_numpy(features).to(device).half()
+        filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, OC]).astype(np.float32)
+        indices_t = torch.from_numpy(indices).int().to(device).float()
+        features_t = torch.from_numpy(features).to(device).float()
        
-        features_dense_t = torch.from_numpy(features_dense).to(device).half()
-        net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
-        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
-        filters_t = torch.from_numpy(filters).to(device).half()
+        features_dense_t = torch.from_numpy(features_dense).to(device).float()
+        net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
+        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
+        filters_t = torch.from_numpy(filters).to(device).float()
        net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
        net.net[0].weight[:] = filters_t
        out_ref = net_ref(features_dense_t)
        times = []
-        for i in range(30):
+        for i in range(0):
            t = time.time()
            out = net(features_t, indices_t, bs)
            torch.cuda.synchronize()
@@ -607,7 +609,9 @@ def main():
        # print((net.grid == -1).float().sum(), net.grid.numel())
            # print("spconv time", time.time() - t)
        print("spconv time", np.mean(times[2:]))
-        out = net(features_t, indices_t, bs).dense()
+        out = net(features_t, indices_t, bs)
+        # print(out.indices)
+        out = out.dense()
        print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy()))