Commit a6ae8967 authored by traveller59
Browse files

spconv v1.1 release:

1. Add CUDA hash-table support for indice generation on the GPU.
2. Use a hash table instead of a dense table in the CPU code.
3. Add CPU-only build support.
parent 0757c45b
......@@ -464,7 +464,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
// DECLARE_GPU_SPECS(at::Half); // currently have problem
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
......
......@@ -14,6 +14,7 @@
#include <spconv/reordering.h>
#include <torch/script.h>
#include <ATen/Parallel.h>
namespace spconv {
namespace functor {
......@@ -22,11 +23,13 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes);
}
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){
for (int i = begin; i < end; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes);
}
});
}
};
......@@ -38,13 +41,15 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data();
T* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){
out[j] += buf[j];
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){
for (int i = begin; i < end; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){
out[j] += buf[j];
}
}
}
});
}
};
......
add_library(spconv_nms STATIC nms.cu)
set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(spconv_nms ${CUDA_CUDART})
install (TARGETS spconv_nms DESTINATION lib)
if (SPCONV_BuildCUDA)
add_library(spconv_nms STATIC nms.cu)
set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(spconv_nms ${CUDA_CUDART})
install (TARGETS spconv_nms DESTINATION lib)
endif()
add_library(spconv_utils SHARED all.cc)
set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
......@@ -18,6 +19,9 @@ set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}")
target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
if (SPCONV_BuildCUDA)
target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
else()
target_link_libraries(spconv_utils pybind11::module)
endif()
install (TARGETS spconv_utils DESTINATION lib)
......@@ -20,12 +20,14 @@ using namespace pybind11::literals;
PYBIND11_MODULE(spconv_utils, m) {
m.doc() = "util pybind11 functions for spconv";
#ifdef SPCONV_CUDA
m.def("non_max_suppression", &spconv::non_max_suppression<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression", &spconv::non_max_suppression<float>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
#endif
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
......
......@@ -8,7 +8,6 @@
#include <string>
#include <vector>
#include <exception>
#include <fmt/format.h>
#include <numeric>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
......@@ -118,7 +117,7 @@ TEST_CASE("GetConvIndPair", "[SpConvNet]")
auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
{1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, false, false);
{1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, 0, 0, 0);
// std::cout << outputs[2] << std::endl;
/*
auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
......
......@@ -33,7 +33,8 @@ class SparseConv3dTestTorch(nn.Module):
stride,
padding=padding,
dilation=dilation,
bias=False)]
bias=False,
use_hash=True)]
for i in range(1, num_layers):
layers.append(spconv.SparseConv3d(
out_channels,
......@@ -561,14 +562,15 @@ class TestSpConv(TestCase):
def main():
# function for develop.
np.random.seed(484)
# devices = ["cuda:0"]
devices = ["cuda:0"]
shapes = [[50, 30, 30]]
batchsizes = [3]
batchsizes = [2]
in_channels = [256]
out_channels = [256]
ksizes = [3]
ksizes = [(3, 1, 1)]
strides = [1]
paddings = [0]
dilations = [1]
......@@ -579,7 +581,7 @@ def main():
if all([s > 1, d > 1]):
continue
device = torch.device(dev)
num_points = [5000] * bs
num_points = [5] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
......@@ -587,19 +589,19 @@ def main():
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).half()
features_t = torch.from_numpy(features).to(device).half()
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).float()
features_t = torch.from_numpy(features).to(device).float()
features_dense_t = torch.from_numpy(features_dense).to(device).half()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
filters_t = torch.from_numpy(filters).to(device).half()
features_dense_t = torch.from_numpy(features_dense).to(device).float()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
filters_t = torch.from_numpy(filters).to(device).float()
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t)
times = []
for i in range(30):
for i in range(0):
t = time.time()
out = net(features_t, indices_t, bs)
torch.cuda.synchronize()
......@@ -607,7 +609,9 @@ def main():
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print("spconv time", np.mean(times[2:]))
out = net(features_t, indices_t, bs).dense()
out = net(features_t, indices_t, bs)
# print(out.indices)
out = out.dense()
print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy()))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment