Commit a6ae8967 authored by traveller59
Browse files

spconv v1.1 release:

1. Add CUDA hash support for CUDA index generation.
2. Use a hash table instead of a dense table in the CPU code.
3. Add CPU-only build support.
parent 0757c45b
...@@ -464,7 +464,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -464,7 +464,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
DECLARE_GPU_SPECS(float); DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double); DECLARE_GPU_SPECS(double);
// DECLARE_GPU_SPECS(at::Half); // currently have problem DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS #undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX #undef DECLARE_GPU_SPECS_T_INDEX
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <spconv/reordering.h> #include <spconv/reordering.h>
#include <torch/script.h> #include <torch/script.h>
#include <ATen/Parallel.h>
namespace spconv { namespace spconv {
namespace functor { namespace functor {
...@@ -22,11 +23,13 @@ struct SparseGatherFunctor<tv::CPU, T, Index> { ...@@ -22,11 +23,13 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features, void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1); int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) { at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){
for (int i = begin; i < end; ++i) {
std::memcpy(buffer.data() + i * numPlanes, std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes, features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes); sizeof(T) * numPlanes);
} }
});
} }
}; };
...@@ -38,13 +41,15 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> { ...@@ -38,13 +41,15 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
int numPlanes = outFeatures.dim(1); int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data(); const T* buf = buffer.data();
T* out = outFeatures.data(); T* out = outFeatures.data();
for (int i = 0; i < size; ++i) { at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end){
for (int i = begin; i < end; ++i) {
buf = buffer.data() + i * numPlanes; buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes; out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){ for (int j = 0; j < numPlanes; ++j){
out[j] += buf[j]; out[j] += buf[j];
} }
} }
});
} }
}; };
......
add_library(spconv_nms STATIC nms.cu) if (SPCONV_BuildCUDA)
set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION}) add_library(spconv_nms STATIC nms.cu)
set_target_properties(spconv_nms PROPERTIES SOVERSION 1) set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE}) set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14) target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14) set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
target_link_libraries(spconv_nms ${CUDA_CUDART}) set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(spconv_nms ${CUDA_CUDART})
install (TARGETS spconv_nms DESTINATION lib) install (TARGETS spconv_nms DESTINATION lib)
endif()
add_library(spconv_utils SHARED all.cc) add_library(spconv_utils SHARED all.cc)
set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION}) set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
...@@ -18,6 +19,9 @@ set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14) ...@@ -18,6 +19,9 @@ set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14) set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}" set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}") SUFFIX "${PYTHON_MODULE_EXTENSION}")
target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms) if (SPCONV_BuildCUDA)
target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
else()
target_link_libraries(spconv_utils pybind11::module)
endif()
install (TARGETS spconv_utils DESTINATION lib) install (TARGETS spconv_utils DESTINATION lib)
...@@ -20,12 +20,14 @@ using namespace pybind11::literals; ...@@ -20,12 +20,14 @@ using namespace pybind11::literals;
PYBIND11_MODULE(spconv_utils, m) { PYBIND11_MODULE(spconv_utils, m) {
m.doc() = "util pybind11 functions for spconv"; m.doc() = "util pybind11 functions for spconv";
#ifdef SPCONV_CUDA
m.def("non_max_suppression", &spconv::non_max_suppression<double>, m.def("non_max_suppression", &spconv::non_max_suppression<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1, py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4); "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression", &spconv::non_max_suppression<float>, m.def("non_max_suppression", &spconv::non_max_suppression<float>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1, py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4); "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
#endif
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>, m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1, py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4); "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <exception> #include <exception>
#include <fmt/format.h>
#include <numeric> #include <numeric>
#include <pybind11/embed.h> // everything needed for embedding #include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h> #include <pybind11/functional.h>
...@@ -118,7 +117,7 @@ TEST_CASE("GetConvIndPair", "[SpConvNet]") ...@@ -118,7 +117,7 @@ TEST_CASE("GetConvIndPair", "[SpConvNet]")
auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0)); auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3}, auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
{1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, false, false); {1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, 0, 0, 0);
// std::cout << outputs[2] << std::endl; // std::cout << outputs[2] << std::endl;
/* /*
auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false); auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
......
...@@ -33,7 +33,8 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -33,7 +33,8 @@ class SparseConv3dTestTorch(nn.Module):
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)] bias=False,
use_hash=True)]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append(spconv.SparseConv3d( layers.append(spconv.SparseConv3d(
out_channels, out_channels,
...@@ -561,14 +562,15 @@ class TestSpConv(TestCase): ...@@ -561,14 +562,15 @@ class TestSpConv(TestCase):
def main(): def main():
# function for develop. # function for develop.
np.random.seed(484) np.random.seed(484)
# devices = ["cuda:0"]
devices = ["cuda:0"] devices = ["cuda:0"]
shapes = [[50, 30, 30]] shapes = [[50, 30, 30]]
batchsizes = [3] batchsizes = [2]
in_channels = [256] in_channels = [256]
out_channels = [256] out_channels = [256]
ksizes = [3] ksizes = [(3, 1, 1)]
strides = [1] strides = [1]
paddings = [0] paddings = [0]
dilations = [1] dilations = [1]
...@@ -579,7 +581,7 @@ def main(): ...@@ -579,7 +581,7 @@ def main():
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue continue
device = torch.device(dev) device = torch.device(dev)
num_points = [5000] * bs num_points = [5] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
...@@ -587,19 +589,19 @@ def main(): ...@@ -587,19 +589,19 @@ def main():
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices) indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).half() indices_t = torch.from_numpy(indices).int().to(device).float()
features_t = torch.from_numpy(features).to(device).half() features_t = torch.from_numpy(features).to(device).float()
features_dense_t = torch.from_numpy(features_dense).to(device).half() features_dense_t = torch.from_numpy(features_dense).to(device).float()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half() net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half() net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).float()
filters_t = torch.from_numpy(filters).to(device).half() filters_t = torch.from_numpy(filters).to(device).float()
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous() net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
net.net[0].weight[:] = filters_t net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
times = [] times = []
for i in range(30): for i in range(0):
t = time.time() t = time.time()
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
torch.cuda.synchronize() torch.cuda.synchronize()
...@@ -607,7 +609,9 @@ def main(): ...@@ -607,7 +609,9 @@ def main():
# print((net.grid == -1).float().sum(), net.grid.numel()) # print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t) # print("spconv time", time.time() - t)
print("spconv time", np.mean(times[2:])) print("spconv time", np.mean(times[2:]))
out = net(features_t, indices_t, bs).dense() out = net(features_t, indices_t, bs)
# print(out.indices)
out = out.dense()
print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy())) print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy()))
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment