Commit a6abf55d authored by yan.yan's avatar yan.yan
Browse files

Merge branch 'develop'

parents fad30002 79a3eaf2
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
// Copies every feature column into a 4-D canvas.
//
// For column `pillar` and channel `channel` the destination is
// canvas(coors(0, pillar), channel, coors(2, pillar), coors(3, pillar));
// row 1 of `coors` is not read here. Coordinates are stored as T and are
// truncated to int before being used as indices.
//
// The X kernel loop walks columns (features.dim(1)); the Y kernel loop walks
// channels (features.dim(0)). `Index` is not used inside the kernel body.
template <typename T, typename Index>
__global__ void pointPillarsScatterKernel(tv::TensorView<T> canvas,
                                          tv::TensorView<const T> features,
                                          tv::TensorView<const T> coors) {
  const auto channelCount = features.dim(0);
  const auto pillarCount = features.dim(1);
  for (int pillar : tv::KernelLoopX<int>(pillarCount)) {
    for (int channel : tv::KernelLoopY<int>(channelCount)) {
      const int dst0 = int(coors(0, pillar));
      const int dst2 = int(coors(2, pillar));
      const int dst3 = int(coors(3, pillar));
      canvas(dst0, channel, dst2, dst3) = features(channel, pillar);
    }
  }
}
namespace functor {
// GPU specialization of the pillar scatter functor.
//
// Launches pointPillarsScatterKernel on the stream of `d` with 32x32 thread
// blocks; the grid covers features.dim(1) along X and features.dim(0) along
// Y. A CUDA error check follows the launch.
template <typename T, typename Index>
struct PointPillarScatter<tv::GPU, T, Index> {
  void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
                  tv::TensorView<const T> features,
                  tv::TensorView<const T> coors) {
    const dim3 threadsPerBlock(32, 32);
    const dim3 blockCount(tv::cuda::DivUp(features.dim(1), 32),
                          tv::cuda::DivUp(features.dim(0), 32));
    pointPillarsScatterKernel<T, Index>
        <<<blockCount, threadsPerBlock, 0, d.getStream()>>>(canvas, features,
                                                            coors);
    TV_CHECK_CUDA_ERR();
  }
};
} // namespace functor
// Explicit instantiations of the GPU PointPillarScatter functor.
// DECLARE_GPU_SPECS_T_INDEX(T, Index) instantiates one (T, Index) pair;
// DECLARE_GPU_SPECS(T) fixes Index to int.
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::PointPillarScatter<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
// Element types for which the functor is instantiated.
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
// The helper macros are only needed for the instantiations above.
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
\ No newline at end of file
#include <spconv/pool_ops.h>
namespace spconv {
// Forward pass of indice-based max pooling.
//
// features:    input features; size(1) is used as the channel count.
// indicePairs: indexed as indicePairs[0][k] and indicePairs[1][k] for each
//              kernel offset k; both slices are forwarded to the backend.
// indiceNum:   per-offset pair counts, read on the host as int.
// numAct:      number of rows of the returned tensor.
//
// Returns a zero-initialised {numAct, channels} tensor with the dtype/device
// of `features`, updated in place by maxpool_fwd_cpu / maxpool_fwd_cuda for
// every offset with a positive pair count. Any other device type triggers
// TV_ASSERT_INVALID_ARG.
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
                            torch::Tensor indiceNum, int64_t numAct) {
  const auto deviceType = features.device().type();
  const auto offsetCount = indiceNum.size(0);
  const auto channels = features.size(1);
  // Pair counts are consumed by the host loop below.
  auto countsOnHost = indiceNum.to({torch::kCPU});
  torch::Tensor output = torch::zeros(
      {numAct, channels},
      torch::TensorOptions().dtype(features.dtype()).device(features.device()));
  for (int k = 0; k < offsetCount; ++k) {
    const auto pairCount = countsOnHost.data_ptr<int>()[k];
    if (pairCount > 0) {
      if (deviceType == torch::kCPU) {
        maxpool_fwd_cpu(output, features, indicePairs[0][k], indicePairs[1][k],
                        pairCount);
      }
#ifdef TV_CUDA
      else if (deviceType == torch::kCUDA) {
        maxpool_fwd_cuda(output, features, indicePairs[0][k],
                         indicePairs[1][k], pairCount);
      }
#endif
      else {
        TV_ASSERT_INVALID_ARG(false, "unknown device type");
      }
    }
  }
  return output;
}
// Backward pass of indice-based max pooling.
//
// features / outFeatures: forward input and forward output.
// outGrad:                gradient with respect to the forward output.
// indicePairs, indiceNum: same pair table and per-offset counts as in the
//                         forward pass; counts are read on the host as int.
//
// Returns a zero-initialised tensor shaped like `features`, updated in place
// by maxpool_bwd_cpu / maxpool_bwd_cuda for every offset with a positive pair
// count. Any other device type triggers TV_ASSERT_INVALID_ARG.
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum) {
  const auto deviceType = features.device().type();
  // The value is not used below; the call is kept so that any failure it
  // raises for unexpected `features` shapes is preserved.
  const auto channels = features.size(1);
  (void)channels;
  auto countsOnHost = indiceNum.to({torch::kCPU});
  torch::Tensor inputGrad = torch::zeros(
      features.sizes(),
      torch::TensorOptions().dtype(features.dtype()).device(features.device()));
  const auto offsetCount = indiceNum.size(0);
  for (int k = 0; k < offsetCount; ++k) {
    const auto pairCount = countsOnHost.data_ptr<int>()[k];
    if (pairCount > 0) {
      if (deviceType == torch::kCPU) {
        maxpool_bwd_cpu(outFeatures, features, outGrad, inputGrad,
                        indicePairs[0][k], indicePairs[1][k], pairCount);
      }
#ifdef TV_CUDA
      else if (deviceType == torch::kCUDA) {
        maxpool_bwd_cuda(outFeatures, features, outGrad, inputGrad,
                         indicePairs[0][k], indicePairs[1][k], pairCount);
      }
#endif
      else {
        TV_ASSERT_INVALID_ARG(false, "unknown device type");
      }
    }
  }
  return inputGrad;
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
// Element types the CPU gather/scatter routines dispatch over.
using float_types_t = tv::mp_list<float, double, at::Half>;
// Index types the CPU gather/scatter routines dispatch over.
using int_types_t = tv::mp_list<int32_t, int64_t>;
// Gathers rows of `features` into `buffer` on the CPU.
//
// For every i in [0, size): buffer[i, :] = features[indices[i], :].
//
// buffer:   destination, written as contiguous rows of features.size(1)
//           elements (assumes a contiguous tensor with the same dtype as
//           `features` -- not checked here).
// features: source rows; dtype must be float, double or at::Half.
// indices:  int32 or int64 row indices; only the first `size` are read.
// size:     number of rows to copy.
//
// All row offsets are computed in 64-bit so that tensors with more than
// 2^31 elements (size * numPlanes or index * numPlanes) do not overflow.
void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
                       torch::Tensor indices, int size) {
  const int64_t numPlanes = features.size(1);
  auto dtype = features.scalar_type();
  auto int_dtype = indices.scalar_type();
  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
    using T = TV_DECLTYPE(TValue);
    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
      using Index = TV_DECLTYPE(IndexValue);
      const Index *indices_data = indices.data_ptr<Index>();
      T *buffer_data = buffer.data_ptr<T>();
      const T *features_data = features.data_ptr<T>();
      const size_t rowBytes = sizeof(T) * static_cast<size_t>(numPlanes);
      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          // Widen before multiplying: Index may be int32.
          const int64_t srcRow = static_cast<int64_t>(indices_data[i]);
          std::memcpy(buffer_data + i * numPlanes,
                      features_data + srcRow * numPlanes, rowBytes);
        }
      });
    });
  });
}
// Scatter-adds rows of `buffer` into `outFeatures` on the CPU.
//
// For every i in [0, size): outFeatures[indices[i], :] += buffer[i, :].
//
// buffer:      source rows of outFeatures.size(1) elements each (assumes a
//              contiguous tensor with the same dtype as `outFeatures` -- not
//              checked here).
// outFeatures: destination, updated in place; dtype must be float, double or
//              at::Half.
// indices:     int32 or int64 destination row indices; only the first `size`
//              are read.
// size:        number of rows to accumulate.
//
// Row offsets are computed in 64-bit to avoid int overflow on large tensors.
// NOTE(review): rows are processed through at::parallel_for without
// synchronisation, so this presumably relies on the first `size` indices
// being unique -- confirm against callers.
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
                            torch::Tensor indices, int size) {
  const int64_t numPlanes = outFeatures.size(1);
  auto dtype = outFeatures.scalar_type();
  auto int_dtype = indices.scalar_type();
  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
    using T = TV_DECLTYPE(TValue);
    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
      using Index = TV_DECLTYPE(IndexValue);
      const Index *indices_data = indices.data_ptr<Index>();
      const T *buffer_data = buffer.data_ptr<T>();
      T *features_data = outFeatures.data_ptr<T>();
      at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          // Widen before multiplying: Index may be int32.
          const int64_t dstRow = static_cast<int64_t>(indices_data[i]);
          const T *buf = buffer_data + i * numPlanes;
          T *out = features_data + dstRow * numPlanes;
          for (int64_t j = 0; j < numPlanes; ++j) {
            out[j] += buf[j];
          }
        }
      });
    });
  });
}
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
// Element types the CUDA gather/scatter launchers dispatch over.
using float_types_t = tv::mp_list<float, double, at::Half>;
// Index types the CUDA gather/scatter launchers dispatch over.
using int_types_t = tv::mp_list<int32_t, int64_t>;
// Selects the vector type used for wide loads in the gather kernels.
// Both branches of the conditional currently name int4, so the result does
// not depend on T.
template <typename T>
struct half_vec{
using type = typename std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
};
// Same selection for the scatter-add kernels; also int4 for every T.
template <typename T>
struct half_vec_sadd{
using type = typename std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
};
// Candidate tile sizes (NumTLP) tried in this order by the launchers below.
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
// Launches the CUDA gather of `size` rows from `features` into `buffer`,
// using `indices` as the row table, on the current CUDA stream.
//
// Dispatches on the dtype of `features` (float_types_t) and of `indices`
// (int_types_t). Kernel choice:
//   * the first NumTLP in kernel_block_t (64, 32, 16) that divides numPlanes
//     is used: gatherVecBlockKernel handles the leading nHotBlock rows (the
//     largest multiple of NumTLP not exceeding size) and gatherVecKernel
//     handles the remaining rows;
//   * if no NumTLP divides numPlanes, gatherGenericKernel handles everything.
// Does nothing when size <= 0.
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
if (size <= 0)
return;
int numPlanes = features.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
// NOTE(review): this uses half_vec_sadd although half_vec exists; both
// currently resolve to int4.
using vecload_type_t = typename half_vec_sadd<T>::type;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
// Stays true until one of the vectorised configurations has been launched.
bool notFound = true;
// Number of T elements that fit in one vecload_type_t.
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
// Rows covered by whole NumTLP-sized tiles.
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
// Full tiles: one block per (row tile, channel tile).
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
// Tail rows: buffer and indices are advanced past the rows handled above.
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data_ptr<T>(),
indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, gatherVecKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>,
int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
// Prevents the smaller tile sizes from launching as well.
notFound = false;
}
}
});
if (notFound) {
// Fallback when numPlanes is not a multiple of 16, 32 or 64.
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>,
int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
});
});
}
// Launches the CUDA scatter-add of `size` rows from `buffer` into
// `outFeatures`, using `indices` as the destination row table, on the
// current CUDA stream.
//
// Dispatches on the dtype of `outFeatures` (float_types_t) and of `indices`
// (int_types_t). Kernel choice:
//   * the first NumTLP in kernel_block_t (64, 32, 16) that divides numPlanes
//     is used: scatterAddVecBlockKernel handles the leading nHotBlock rows
//     and scatterAddGenericKernel handles the remaining rows;
//   * if no NumTLP divides numPlanes, scatterAddGenericKernel handles
//     everything.
// Does nothing when size <= 0.
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size) {
if (size <= 0)
return;
int numPlanes = outFeatures.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = outFeatures.scalar_type();
auto inds_dtype = indices.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
using vecload_type_t = typename half_vec_sadd<T>::type;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
// Stays true until one of the tiled configurations has been launched.
bool notFound = true;
// Number of T elements that fit in one vecload_type_t.
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr int NumILP = NumTLP / 4;
// Rows covered by whole NumTLP-sized tiles.
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
// Full tiles: one block per (row tile, channel tile).
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
// Tail rows use the generic (non-vectorised) kernel with buffer and
// indices advanced past the rows handled above.
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr,
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
// Prevents the smaller tile sizes from launching as well.
notFound = false;
}
}
});
if (notFound) {
// Fallback when numPlanes is not a multiple of 16, 32 or 64.
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
});
});
}
// Batched variant of the CUDA gather: fills `buffer` for several kernel
// offsets in one launch sequence on the current CUDA stream.
//
// inds_stride is indices.size(1) and feature_stride is buffer.size(1); both
// are forwarded to the kernels together with the flat element count `size`.
// Kernel choice mirrors the non-batched launcher: the first NumTLP in
// kernel_block_t that divides numPlanes selects batchGatherVecBlockKernel
// for the leading nHotBlock rows and batchGatherVecKernel for the rest;
// otherwise batchGatherGenericKernel handles everything.
// Does nothing when size <= 0.
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if (size <= 0)
return;
int numPlanes = features.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type();
int inds_stride = indices.size(1);
int feature_stride = buffer.size(1);
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
using vecload_type_t = typename half_vec<T>::type;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
// Stays true until one of the vectorised configurations has been launched.
bool notFound = true;
// Number of T elements that fit in one vecload_type_t.
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>(
[=, &buffer, &features, &indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
// Rows covered by whole NumTLP-sized tiles.
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
batchGatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR_V2("batchGatherVecBlockKernel");
}
if (size - nHotBlock > 0) {
// Tail rows: the buffer pointer is advanced, while the indices pointer
// is passed unshifted together with nHotBlock as an explicit offset.
batchGatherVecKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data_ptr<T>(),
indices.data_ptr<Index>(), size - nHotBlock,
nHotBlock, numPlanes / vecloadFactor,
inds_stride, feature_stride);
TV_CHECK_CUDA_ERR_V2("batchGatherVecKernel");
}
// Prevents the smaller tile sizes from launching as well.
notFound = false;
}
}
});
if (notFound) {
// Fallback when numPlanes is not a multiple of 16, 32 or 64.
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
batchGatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
});
});
}
// Batched variant of the CUDA scatter-add: accumulates `buffer` rows for
// several kernel offsets into `outFeatures` on the current CUDA stream.
//
// inds_stride is indices.size(1) and feature_stride is buffer.size(1); both
// are forwarded to the kernels together with the flat element count `size`.
// The first NumTLP in kernel_block_t that divides numPlanes selects
// batchScatterAddBlockKernel for the leading nHotBlock rows and
// batchScatterAddGenericKernel for the rest; otherwise
// batchScatterAddGenericKernel handles everything with an offset of 0.
// Does nothing when size <= 0.
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
torch::Tensor outFeatures,
torch::Tensor indices, int size) {
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if (size <= 0)
return;
int numPlanes = outFeatures.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = outFeatures.scalar_type();
auto inds_dtype = indices.scalar_type();
int inds_stride = indices.size(1);
int feature_stride = buffer.size(1);
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
// NOTE(review): vecload_type_t is not referenced by the launches below.
using vecload_type_t = typename half_vec_sadd<T>::type;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
// Stays true until one of the tiled configurations has been launched.
bool notFound = true;
// Fixed to 1 here, so the block dimensions and numPlanes are not divided.
constexpr int vecloadFactor = 1; // important for half.
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr int NumILP = NumTLP / 4;
// Rows covered by whole NumTLP-sized tiles.
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
batchScatterAddBlockKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
// Tail rows: the buffer pointer is advanced, while the indices pointer
// is passed unshifted together with nHotBlock as an explicit offset.
batchScatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>(), size - nHotBlock,
nHotBlock, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
// Prevents the smaller tile sizes from launching as well.
notFound = false;
}
}
});
if (notFound) {
// Fallback when numPlanes is not a multiple of 16, 32 or 64.
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
batchScatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), size, 0, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
});
});
}
} // namespace spconv
#include <spconv/spconv_ops.h>
namespace spconv {
// Builds the indice-pair tables used by the sparse convolution routines.
//
// indices:         active coordinates; size(1) - 1 must equal the number of
//                  kernel dimensions (the existing comment below describes
//                  the columns as batchIdx + xyz).
// batchSize:       number of batches; sizes the grid buffer.
// outSpatialShape: output spatial extent per dimension.
// spatialShape:    not read by this function.
// kernelSize / stride / padding / dilation / outPadding: per-dimension
//                  settings; each must have one entry per dimension.
//                  outPadding is only size-checked here.
// _subM:           non-zero selects the submanifold path, which overrides
//                  padding to kernelSize / 2 and stride to 1.
// _transpose:      non-zero is forwarded as the transpose flag.
// _useHash:        non-zero requests the hash path; always on for CPU input.
//
// Returns {outIndices, indicePairs, indiceNum}. In the submanifold path
// outIndices is `indices` itself; otherwise it is the first numActOut rows
// of the generated output index tensor. indicePairs is an int32
// {2, kernelVolume, numAct} tensor initialised to -1 and indiceNum is an
// int32 {kernelVolume} tensor initialised to 0; both are filled by the
// create_*_indice_pair_* backends.
//
// When a CUDA backend returns -1 the work is redone on the CPU and the
// results are moved back to the original device.
std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto NDim = kernelSize.size();
// CPU always use hash (tsl::robin_map).
bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
// Every per-dimension argument must match the coordinate dimensionality.
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
// kernelVolume = product of all kernel extents.
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
// outputVolume = product of all output spatial extents.
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
std::string msg = "due to limits of cuda hash, the volume of dense space "
"include batch size ";
msg += "must less than std::numeric_limits<int>::max() = 2e9";
TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
msg);
torch::Tensor indicePairs =
torch::full({2, kernelVolume, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
// The dense grid is only allocated at full size when hashing is off; with
// hashing it shrinks to one entry per batch.
auto gridSize = batchSize * outputVolume;
if (useHash) {
gridSize = batchSize;
}
torch::Tensor gridOut = torch::full(
{gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
gridOut = gridOut.view({batchSize, -1});
int64_t numActOut = -1;
// Submanifold convolution ignores the caller's padding and stride.
for (int i = 0; i < NDim; ++i) {
if (subM) {
padding[i] = kernelSize[i] / 2;
stride[i] = 1;
}
}
// tv::ssprint("prepare", timer.report() / 1000.0);
if (subM) {
if (indices.device().type() == torch::kCPU) {
numActOut = create_submconv_indice_pair_cpu(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
dilation, outSpatialShape, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
numActOut = create_submconv_indice_pair_cuda(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
dilation, outSpatialShape, transpose, false, useHash);
if (numActOut == -1) {
// The CUDA backend reported -1: redo the work on the CPU and move the
// results back to the original device.
auto device = indices.device();
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = create_submconv_indice_pair_cpu(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride,
padding, dilation, outSpatialShape, transpose, false, useHash);
return {indices.to(device), indicePairs.to(device),
indiceNum.to(device)};
}
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// tv::ssprint("subm", timer.report() / 1000.0);
return {indices, indicePairs, indiceNum};
} else {
// Scratch tensor for the CUDA two-phase path; one extra slot beyond the
// number of pair entries, pre-filled with INT_MAX.
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
// Upper bound on output rows: every input may hit every kernel offset.
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
numActOut = create_conv_indice_pair_cpu(
indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
padding, dilation, outSpatialShape, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
// Phase 1 fills indicePairs/indicePairUnique; phase 2 runs only when it
// reports a positive count.
numActOut = create_conv_indice_pair_p1_cuda(
indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = create_conv_indice_pair_p2_cuda(
indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
outSpatialShape, transpose, false, useHash);
if (numActOut == -1) {
// Phase 2 reported -1: redo the work on the CPU and move the results
// back to the original device.
auto device = indices.device();
outInds = outInds.to({torch::kCPU});
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = create_conv_indice_pair_cpu(
indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,
stride, padding, dilation, outSpatialShape, transpose, false,
useHash);
return {outInds.to(device).slice(0, 0, numActOut),
indicePairs.to(device), indiceNum.to(device)};
}
}
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// Only the first numActOut rows of outInds are returned.
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
// Sparse convolution forward pass driven by precomputed indice pairs.
//
// features:    input features; size(1) is the number of input planes.
// filters:     weights; the last dimension is the number of output planes
//              and the tensor is viewed as {-1, inPlanes, outPlanes}.
// indicePairs: pair table, indexed as indicePairs[inverse][k] for gathering
//              and indicePairs[!inverse][k] for scattering.
// indiceNum:   per-offset pair counts (read on the host as int).
// numActOut:   number of output rows.
// _inverse:    non-zero swaps the gather and scatter sides of indicePairs.
// _subM:       non-zero enables the submanifold shortcut: the centre offset
//              (kernelVolume / 2) is computed with a single matrix multiply
//              and skipped in the gather/GEMM/scatter loop.
// algo:        kNative runs the per-offset loop below; kBatch and
//              kBatchGemmGather delegate to indiceConvBatch unless
//              kernelVolume == 1. Any other value throws.
//
// Returns a {numActOut, outPlanes} tensor with the dtype/device of
// `features`.
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM,
                         int64_t algo) {
  auto kernelVolume = indiceNum.size(0);
  switch (algo) {
  case kBatchGemmGather:
  case kBatch: {
    if (kernelVolume != 1) {
      return indiceConvBatch(features, filters, indicePairs, indiceNum,
                             numActOut, _inverse, _subM,
                             algo != kBatchGemmGather);
    } else {
      // A single offset gains nothing from batching; use the native path.
      break;
    }
  }
  case kNative:
    break;
  default:
    TV_THROW_RT_ERR("unknown algo");
  }
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  // Host-side view of the per-offset pair counts; valid while
  // indicePairNumCpu is alive.
  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});

  // Largest pair count among the first `count` offsets; 0 for an empty range
  // so that the end iterator is never dereferenced.
  auto maxPairCount = [pairCounts](int64_t count) {
    return count > 0 ? *std::max_element(pairCounts, pairCounts + count) : 0;
  };

  // init for subM
  const int indicePairMaxOffset = kernelVolume / 2;
  int indicePairMaxSize = 0;
  if (subM) { // the center index of subm conv don't need gather and scatter
              // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
    // get indice pair second max size based on subM symmetric property
    indicePairMaxSize = maxPairCount(indicePairMaxOffset);
    if (indicePairMaxSize == 0) {
      return output;
    }
  } else {
    indicePairMaxSize = maxPairCount(kernelVolume);
  }
  // Scratch buffers sized for the busiest offset and reused for all offsets.
  torch::Tensor inputBuffer =
      torch::empty({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::empty({indicePairMaxSize, numOutPlanes}, options);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = pairCounts[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // Views over the first nHot rows of the scratch buffers.
    // TODO torch::from_blob is a little slow
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
    if (device == torch::kCPU) {
      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
                             nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse][i],
                              nHot);
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
  }
  return output;
}
// Batched sparse convolution forward pass (CUDA only).
//
// Gathers the inputs of several kernel offsets into one 3-D buffer, runs a
// single batched matrix multiply per part, then scatter-adds the results.
//
// features / filters / indicePairs / indiceNum / numActOut / _inverse /
// _subM: same meaning as in indiceConv. kernelVolume must be > 1.
// batchScatter: true uses the batched scatter-add kernel; false scatters
//               each offset separately with its own pair count.
//
// In submanifold mode the offset with the largest pair count is computed
// with one plain matrix multiply, and the remaining offsets are processed as
// two parts (before and after that offset) using the second-largest pair
// count as the buffer size.
//
// Returns a {numActOut, outPlanes} tensor with the dtype/device of
// `features`. Throws an invalid-argument error for non-CUDA tensors.
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
                              torch::Tensor indicePairs,
                              torch::Tensor indiceNum, int64_t numActOut,
                              int64_t _inverse, int64_t _subM,
                              bool batchScatter) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indiceNum.size(0);
  TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  // Host-side view of the per-offset pair counts; valid while
  // indicePairNumCpu is alive.
  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
  auto indicePairNumVec =
      std::vector<int>(pairCounts, pairCounts + kernelVolume);
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  // Partially sort in descending order so that element 1 holds the
  // second-largest pair count.
  std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
                   indicePairNumVec.end(), std::greater<int>());
  int indicePairTop2Size = indicePairNumVec[1];
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  // we cant use batch conv in subm directly because
  // number of indice in the center of filter is much more than other
  // filter location.
  // so we first use top2 indice num to do batch conv, then
  // do native conv (gemm) in center.
  int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
  int maxKernelVolumePart = kernelVolume;
  std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
  filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
  if (subM) {
    maxKernelVolumePart = std::max(indicePairMaxOffset,
                                   int(kernelVolume - indicePairMaxOffset - 1));
    part_ranges = {{0, indicePairMaxOffset},
                   {indicePairMaxOffset + 1, kernelVolume}};
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
    if (indicePairTop2Size == 0) {
      return output;
    }
  }
  torch::Tensor inputBuffer =
      torch::empty({maxKernelVolumePart, bufferSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::empty({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
  for (const auto &range : part_ranges) {
    const int start = range.first;
    const int end = range.second;
    const int length = end - start;
    // Multiply in 64-bit: length * bufferSize can exceed the int range, and
    // the CUDA launchers below take the element count as int.
    const int64_t size = static_cast<int64_t>(length) * bufferSize;
    TV_ASSERT_RT_ERR(size <= std::numeric_limits<int>::max(),
                     "batch buffer element count exceeds int range");
    auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
    auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
    auto indicePairs1Part =
        tv::torch_slice_first_axis(indicePairs[inverse], start, end);
    auto indicePairs2Part =
        tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
    auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
    if (device == torch::kCPU) {
      // There is no CPU implementation of the batched gather.
      TV_THROW_INVALID_ARG("batch algo is only implemented for CUDA tensors");
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
                               size);
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    torch::bmm_out(outputBufferPart, inputBufferPart, filtersPart);
    if (batchScatter) {
      if (device == torch::kCPU) {
        TV_THROW_INVALID_ARG("unknown device type");
      }
#ifdef TV_CUDA
      else if (device == torch::kCUDA) {
        batch_sparse_scatter_add_cuda(outputBufferPart, output,
                                      indicePairs2Part, size);
      }
#endif
      else {
        TV_THROW_INVALID_ARG("unknown device type");
      }
    } else {
      // Per-offset scatter: each offset uses its own pair count.
      for (int i = 0; i < length; ++i) {
        auto nHot = pairCounts[i + start];
        if (nHot <= 0) {
          continue;
        }
        if (device == torch::kCPU) {
          sparse_scatter_add_cpu(outputBufferPart[i], output,
                                 indicePairs2Part[i], nHot);
        }
#ifdef TV_CUDA
        else if (device == torch::kCUDA) {
          sparse_scatter_add_cuda(outputBufferPart[i], output,
                                  indicePairs2Part[i], nHot);
        }
#endif
        else {
          TV_THROW_INVALID_ARG("unknown device type");
        }
      }
    }
  }
  return output;
}
/// Backward pass of the sparse ("indice") convolution.
///
/// For `kBatch` / `kBatchGemmGather` with more than one kernel offset the work
/// is delegated to `indiceConvBackwardBatch` (batch scatter is enabled for
/// every algo except `kBatchGemmGather`). Otherwise each kernel offset is
/// processed on its own: gather the active rows of `features` and `outGrad`,
/// compute the filter gradient and the input gradient with two matrix
/// multiplies, then scatter-add the input gradient back into `inputGrad`.
///
/// @param features    input features; `features.size(1)` is the input channel
///                    count.
/// @param filters     filter tensor whose last two dims are (in, out) channels.
/// @param outGrad     gradient w.r.t. the convolution output.
/// @param indicePairs index pairs; `indicePairs[inverse]` addresses `features`,
///                    `indicePairs[!inverse]` addresses `outGrad`.
/// @param indiceNum   int tensor with the number of active pairs per kernel
///                    offset.
/// @param _inverse    non-zero swaps the roles of the two index-pair planes.
/// @param _subM       non-zero enables the submanifold shortcut for the centre
///                    offset.
/// @param algo        one of `kNative`, `kBatch`, `kBatchGemmGather`.
/// @return `{inputGrad, filtersGrad}` with `filtersGrad` in the original
///         filter shape.
/// @throws runtime error on unknown `algo`, invalid-argument error on an
///         unsupported device.
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
                   int64_t algo) {
  auto kernelVolume = indiceNum.size(0);
  switch (algo) {
  case kBatchGemmGather:
  case kBatch: {
    if (kernelVolume != 1) {
      return indiceConvBackwardBatch(features, filters, outGrad, indicePairs,
                                     indiceNum, _inverse, _subM,
                                     algo != kBatchGemmGather);
    } else {
      // A single kernel offset gains nothing from batching; use native path.
      break;
    }
  }
  case kNative:
    break;
  default:
    TV_THROW_RT_ERR("unknown algo");
  }
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  // Must be zero-initialised: offsets with no active pairs are skipped below
  // and would otherwise return uninitialised memory as their gradient.
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
  // init for subM
  int indicePairMaxOffset = kernelVolume / 2;
  int indicePairMaxSize = 0;
  if (subM) {
    // The centre offset maps every input onto itself, so its contribution is
    // a plain dense matmul without gather/scatter.
    auto filterGradSub = filtersGrad[indicePairMaxOffset];
    torch::mm_out(filterGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
    // get indice pair second max size based on subM symmetric property
    indicePairMaxSize =
        *std::max_element(pairCounts, pairCounts + indicePairMaxOffset);
    if (indicePairMaxSize == 0) {
      return {inputGrad, filtersGrad.view(filterShape)};
    }
  } else {
    indicePairMaxSize =
        *std::max_element(pairCounts, pairCounts + kernelVolume);
  }
  // Scratch buffers sized for the largest offset; reused by every iteration.
  torch::Tensor inputBuffer =
      torch::empty({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::empty({indicePairMaxSize, numOutPlanes}, options);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = pairCounts[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    if (device == torch::kCPU) {
      sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
      sparse_gather_cpu(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
      sparse_gather_cuda(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    auto filterGradSub = filtersGrad[i];
    // Views over the first nHot rows of the scratch buffers (no copy).
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
    // Overwrites the gathered inputs with the per-pair input gradient, which
    // is what gets scattered below.
    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
    if (device == torch::kCPU) {
      sparse_scatter_add_cpu(inputBuffer, inputGrad, indicePairs[inverse][i],
                             nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_scatter_add_cuda(inputBuffer, inputGrad, indicePairs[inverse][i],
                              nHot);
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
  }
  return {inputGrad, filtersGrad.view(filterShape)};
}
/// Batched backward pass of the sparse ("indice") convolution.
///
/// All kernel offsets of a part are gathered into 3-D buffers and processed
/// with two batched matrix multiplies. With `_subM` set, the offset with the
/// most pairs is handled by a dense matmul and the remaining offsets are split
/// into the two ranges on either side of it.
///
/// @param features     input features; `features.size(1)` is the input channel
///                     count.
/// @param filters      filter tensor whose last two dims are (in, out)
///                     channels.
/// @param outGrad      gradient w.r.t. the convolution output.
/// @param indicePairs  index pairs; `indicePairs[inverse]` addresses
///                     `features`, `indicePairs[!inverse]` addresses `outGrad`.
/// @param indiceNum    int tensor with the pair count per kernel offset; must
///                     have more than one entry.
/// @param _inverse     non-zero swaps the roles of the two index-pair planes.
/// @param _subM        non-zero enables the submanifold shortcut.
/// @param batchScatter true: one batched scatter-add per part; false: one
///                     scatter-add per kernel offset.
/// @return `{inputGrad, filtersGrad}` with `filtersGrad` in the original
///         filter shape.
/// @throws invalid-argument error when `kernelVolume <= 1` or when the device
///         is not CUDA (the batched gather has no CPU implementation here).
std::vector<torch::Tensor>
indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
                        torch::Tensor outGrad, torch::Tensor indicePairs,
                        torch::Tensor indiceNum, int64_t _inverse,
                        int64_t _subM, bool batchScatter) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indiceNum.size(0);
  TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairNumVec =
      std::vector<int>(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  // Partially sort (descending) so that element [1] is the second-largest
  // pair count. This reorders indicePairNumVec, which is not used afterwards.
  std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
                   indicePairNumVec.end(), std::greater<int>());
  int indicePairTop2Size = indicePairNumVec[1];
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  // With subM the largest offset is handled densely, so the buffers only need
  // to hold the second-largest pair count.
  int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
  std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
  int maxKernelVolumePart = kernelVolume;
  if (subM) {
    maxKernelVolumePart = std::max(indicePairMaxOffset,
                                   int(kernelVolume - indicePairMaxOffset - 1));
    part_ranges = {{0, indicePairMaxOffset},
                   {indicePairMaxOffset + 1, kernelVolume}};
    auto filtersGradSub = filtersGrad[indicePairMaxOffset];
    auto filtersSub = filters[indicePairMaxOffset];
    torch::mm_out(filtersGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filtersSub.t());
    if (indicePairTop2Size == 0) {
      return {inputGrad, filtersGrad.view(filterShape)};
    }
  }
  torch::Tensor inputBuffer =
      torch::zeros({maxKernelVolumePart, bufferSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
  for (auto &range : part_ranges) {
    int start = range.first;
    int end = range.second;
    int length = end - start;
    // Widen before multiplying so the element count cannot overflow int.
    int64_t size = static_cast<int64_t>(length) * bufferSize;
    auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
    auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
    auto indicePairs1Part =
        tv::torch_slice_first_axis(indicePairs[inverse], start, end);
    auto indicePairs2Part =
        tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
    auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
    auto filtersGradPart = tv::torch_slice_first_axis(filtersGrad, start, end);
    if (device == torch::kCPU) {
      TV_THROW_INVALID_ARG("unknown device type");
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
                               size);
      batch_sparse_gather_cuda(outputBufferPart, outGrad, indicePairs2Part,
                               size);
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    // filters: KV, I, O, inputBuffer: [KV, buffer, I]
    // outputBuffer: [KV, buffer, O]
    torch::bmm_out(filtersGradPart, inputBufferPart.permute({0, 2, 1}),
                   outputBufferPart);
    // Write into the slice that matches this part's length. Targeting the
    // full inputBuffer mismatches the result shape whenever
    // length < maxKernelVolumePart.
    torch::bmm_out(inputBufferPart, outputBufferPart,
                   filtersPart.permute({0, 2, 1}));
    if (batchScatter) {
      if (device == torch::kCPU) {
        TV_THROW_INVALID_ARG("unknown device type");
      }
#ifdef TV_CUDA
      else if (device == torch::kCUDA) {
        batch_sparse_scatter_add_cuda(inputBufferPart, inputGrad,
                                      indicePairs1Part, size);
      }
#endif
      else {
        TV_THROW_INVALID_ARG("unknown device type");
      }
    } else {
      for (int i = 0; i < length; ++i) {
        auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
        if (nHot <= 0) {
          continue;
        }
        if (device == torch::kCPU) {
          sparse_scatter_add_cpu(inputBufferPart[i], inputGrad,
                                 indicePairs1Part[i], nHot);
        }
#ifdef TV_CUDA
        else if (device == torch::kCUDA) {
          sparse_scatter_add_cuda(inputBufferPart[i], inputGrad,
                                  indicePairs1Part[i], nHot);
        }
#endif
        else {
          TV_THROW_INVALID_ARG("unknown device type");
        }
      }
    }
  }
  return {inputGrad, filtersGrad.view(filterShape)};
}
} // namespace spconv
# CUDA-only helper library: the NMS kernels in nms.cu are built as a static,
# position-independent archive so they can be linked into the shared module
# defined further down.
if (SPCONV_BuildCUDA)
add_library(spconv_nms STATIC nms.cu)
set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(spconv_nms ${CUDA_CUDART})
install (TARGETS spconv_nms DESTINATION lib)
endif()
# pybind11 extension module built from all.cc.
add_library(spconv_utils SHARED all.cc)
set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_utils PROPERTIES SOVERSION 1)
target_include_directories(spconv_utils PRIVATE ${ALL_INCLUDE}
${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
# Use the Python-specific prefix/suffix for the output file name.
set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}")
# The CUDA runtime and the NMS archive are only linked in CUDA builds.
if (SPCONV_BuildCUDA)
target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
else()
target_link_libraries(spconv_utils pybind11::module)
endif()
install (TARGETS spconv_utils DESTINATION lib)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/box_iou.h>
#include <spconv/nms.h>
#include <spconv/point2voxel.h>
namespace py = pybind11;
using namespace pybind11::literals;
// Python bindings for the spconv utility functions (NMS, rotated-box IoU and
// point-to-voxel conversion). Each function is registered once per floating
// point type so pybind11 overload resolution can pick the instantiation that
// matches the dtype of the arrays passed from Python. The GPU NMS binding is
// only available when built with TV_CUDA.
PYBIND11_MODULE(spconv_utils, m) {
  m.doc() = "util pybind11 functions for spconv";
#ifdef TV_CUDA
  m.def("non_max_suppression", &spconv::non_max_suppression<double>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
  m.def("non_max_suppression", &spconv::non_max_suppression<float>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
#endif
  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
  m.def("rotate_non_max_suppression_cpu",
        &spconv::rotate_non_max_suppression_cpu<float>,
        py::return_value_policy::reference_internal, "bbox iou",
        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
        "thresh"_a = 4);
  m.def("rotate_non_max_suppression_cpu",
        &spconv::rotate_non_max_suppression_cpu<double>,
        py::return_value_policy::reference_internal, "bbox iou",
        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
        "thresh"_a = 4);
  m.def("rbbox_iou", &spconv::rbbox_iou<double>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("rbbox_iou", &spconv::rbbox_iou<float>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
        "max_points"_a = 9, "max_voxels"_a = 10);
  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
        "max_points"_a = 9, "max_voxels"_a = 10);
  m.def("points_to_voxel_3d_np_mean",
        &spconv::points_to_voxel_3d_np_mean<float, 3>, "matrix tensor_square",
        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
        "max_voxels"_a = 11);
  m.def("points_to_voxel_3d_np_mean",
        &spconv::points_to_voxel_3d_np_mean<double, 3>, "matrix tensor_square",
        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
        "max_voxels"_a = 11);
  m.def("points_to_voxel_3d_with_filtering",
        &spconv::points_to_voxel_3d_with_filtering<float, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
        "block_size"_a = 15, "height_threshold"_a = 16,
        "height_high_threshold"_a = 17);
  // Double overload. This was previously a second <float, 3> registration,
  // which left double-precision inputs without a matching overload.
  m.def("points_to_voxel_3d_with_filtering",
        &spconv::points_to_voxel_3d_with_filtering<double, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
        "block_size"_a = 15, "height_threshold"_a = 16,
        "height_high_threshold"_a = 17);
}
\ No newline at end of file
// ------------------------------------------------------------------
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <cuda_runtime.h>
#include <iostream>
#include <spconv/nms_gpu.h>
#include <vector>
// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the error string to std::cout. It only reports: execution continues after a
// failure (no abort, no exception).
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
// Integer division rounded up: the quotient plus one when there is a positive
// remainder. Both arguments are evaluated twice, so pass side-effect-free
// expressions.
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Number of bits in an unsigned long long; passed as BLOCK_THREADS to the
// explicit _nms_gpu instantiations at the end of this file.
int const threadsPerBlock = sizeof(unsigned long long) * 8;
// Intersection-over-union of two axis-aligned boxes. Each pointer is read at
// indices 0..3 as (x1, y1, x2, y2); extents are computed with the "+ 1"
// inclusive-pixel convention and clamped at zero when the boxes are disjoint.
template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b) {
  // Corners of the overlap rectangle.
  DType left = max(a[0], b[0]);
  DType top = max(a[1], b[1]);
  DType right = min(a[2], b[2]);
  DType bottom = min(a[3], b[3]);
  DType width = max(right - left + 1, 0.f);
  DType height = max(bottom - top + 1, 0.f);
  DType interS = width * height;
  // Areas of the two input boxes.
  DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  DType unionS = Sa + Sb - interS;
  return interS / unionS;
}
// Computes the pairwise suppression bitmask used by NMS.
//
// The grid is indexed by (col_start = blockIdx.x, row_start = blockIdx.y);
// each block compares up to BLOCK_THREADS "row" boxes (one per thread) with up
// to BLOCK_THREADS "column" boxes staged in shared memory. For row box r and
// column block c, bit i of dev_mask[r * col_blocks + c] is set when the IoU of
// box r and box (c * BLOCK_THREADS + i) exceeds nms_overlap_thresh.
//
// Boxes are read with a fixed stride of 5 values per box; devIoU uses the
// first four of them.
template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
// The last block in each direction may be only partially filled.
const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5];
// Each thread copies one column box into shared memory.
if (threadIdx.x < col_size) {
#pragma unroll
for (int i = 0; i < 5; ++i) {
block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
}
}
// All column boxes must be staged before any thread starts comparing.
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0;
int start = 0;
// On a diagonal block only compare against later boxes, which skips the
// self-comparison and the already-covered symmetric pairs.
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (int i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// Makes `device_id` the current CUDA device, skipping the switch when it is
// already current. Failures of the runtime calls are reported by CUDA_CHECK.
void _set_device(int device_id) {
  int active_device;
  CUDA_CHECK(cudaGetDevice(&active_device));
  if (active_device != device_id) {
    // Switch before any later call that may initialise state on the GPU.
    CUDA_CHECK(cudaSetDevice(device_id));
  }
}
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id) {
_set_device(device_id);
DType *boxes_dev = NULL;
unsigned long long *mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(DType)));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(DType),
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc(&mask_dev,
boxes_num * col_blocks * sizeof(unsigned long long)));
dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), DIVUP(boxes_num, BLOCK_THREADS));
dim3 threads(BLOCK_THREADS);
nms_kernel<DType, BLOCK_THREADS>
<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / BLOCK_THREADS;
int inblock = i % BLOCK_THREADS;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
return num_to_keep;
}
// Explicit instantiations of _nms_gpu for float and double boxes, both with
// BLOCK_THREADS = threadsPerBlock, so the definitions are emitted in this
// translation unit.
// template<>
template int _nms_gpu<float, threadsPerBlock>(int *keep_out,
const float *boxes_host,
int boxes_num, int boxes_dim,
float nms_overlap_thresh,
int device_id);
// template<>
template int _nms_gpu<double, threadsPerBlock>(int *keep_out,
const double *boxes_host,
int boxes_num, int boxes_dim,
double nms_overlap_thresh,
int device_id);
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Recorded timing log, one "<tag> <value>" entry per line, all tagged "BWG".
# This constant is not passed to print_str anywhere in this script.
# NOTE(review): values look like durations in seconds - confirm.
STR = """
BWG 0.0008761882781982422
BWG 0.0008311271667480469
BWG 0.002079486846923828
BWG 0.002329587936401367
BWG 0.0025458335876464844
BWG 0.0026700496673583984
BWG 0.002583742141723633
BWG 0.0025262832641601562
BWG 0.003481149673461914
BWG 0.003238201141357422
BWG 0.005095958709716797
BWG 0.0037899017333984375
BWG 0.003931283950805664
BWG 0.003300189971923828
"""
# Bare string literal (never assigned or read): a second list of untagged
# values kept only as a note alongside the data above.
"""
0.003921985626220703
0.0049707889556884766
0.0052530765533447266
0.0060312747955322266
0.0036766529083251953
0.00421142578125
0.002129793167114258
0.0023038387298583984
0.0013151168823242188
0.0015285015106201172
0.0008392333984375
0.0008127689361572266
0.0002486705780029297
0.00030994415283203125
"""
# First recorded timing log, one "<tag> <value>" entry per line with the tags
# SUBM, G, REGU and M. It is summed per tag by print_str at the end of the
# script.
# NOTE(review): values look like durations in seconds - confirm.
STR1 = """
SUBM 0.00036716461181640625
G 0.0010955333709716797
G 0.0010745525360107422
REGU 0.0006923675537109375
M 0.0005242824554443359
SUBM 0.0003108978271484375
G 0.0010905265808105469
G 0.0011067390441894531
REGU 0.00058746337890625
M 0.0005304813385009766
SUBM 0.0002682209014892578
G 0.0010945796966552734
G 0.0011165142059326172
REGU 0.0005419254302978516
M 0.0005164146423339844
SUBM 0.00021505355834960938
G 0.0010805130004882812
G 0.0010516643524169922
REGU 0.00052642822265625
M 0.0004677772521972656
SUBM 0.0002262592315673828
G 0.0010986328125
G 0.0010256767272949219
REGU 0.0005693435668945312
M 0.00048661231994628906
SUBM 0.0002319812774658203
G 0.0011110305786132812
G 0.0011196136474609375
REGU 0.0005295276641845703
M 0.0005729198455810547
SUBM 0.00023889541625976562
G 0.0005326271057128906
G 0.0005140304565429688
"""
# Second recorded timing log, in the same "<tag> <value>" format and with the
# same tags as STR1. It is summed per tag by print_str at the end of the
# script.
# NOTE(review): values look like durations in seconds - confirm.
STR2 = """
SUBM 0.0003352165222167969
G 0.001149892807006836
G 0.0017066001892089844
REGU 0.0006349086761474609
M 0.00048804283142089844
SUBM 0.00029850006103515625
G 0.001767873764038086
G 0.0020656585693359375
REGU 0.0005462169647216797
M 0.0005753040313720703
SUBM 0.0002789497375488281
G 0.0012230873107910156
G 0.0014438629150390625
REGU 0.0005102157592773438
M 0.0005676746368408203
SUBM 0.00020241737365722656
G 0.00102996826171875
G 0.0011174678802490234
REGU 0.0005424022674560547
M 0.0005102157592773438
SUBM 0.0001976490020751953
G 0.0010385513305664062
G 0.0010204315185546875
REGU 0.0005321502685546875
M 0.00047278404235839844
SUBM 0.00021529197692871094
G 0.0010280609130859375
G 0.0010151863098144531
REGU 0.0004942417144775391
M 0.0004811286926269531
SUBM 0.00020694732666015625
G 0.0005142688751220703
G 0.0005171298980712891
"""
def _handle_lines(s: str):
arr = s.split(" ")
return (arr[0], float(arr[-1]))
from cumm.gemm.codeops import group_by
def print_str(s: str):
    """Print a dict mapping each tag in the log text ``s`` to its summed value.

    ``s`` is stripped and split into lines, each line is parsed with
    ``_handle_lines``, and the parsed entries are grouped by tag.
    """
    parsed = [_handle_lines(line) for line in s.strip().split("\n")]
    grouped = group_by(lambda entry: entry[0], parsed)
    totals = {}
    for tag, entries in grouped.items():
        totals[tag] = sum(value for _, value in entries)
    print(totals)
# Print the per-tag totals of the two recorded logs so they can be compared.
print_str(STR1)
print_str(STR2)
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from cumm import tensorview as tv
import spconv.pytorch as spconv
from spconv.utils import Point2VoxelCPU3d
def waymo_data(batch_size=1):
    """Voxelize the bundled benchmark point cloud.

    Loads ``data/benchmark-pc.npz`` next to this file, runs it through a
    ``Point2VoxelCPU3d`` generator and returns ``(voxels, coors, grid_size)``.
    ``voxels`` is reshaped to ``(-1, 3)`` and ``coors`` gets a leading column
    of zeros prepended. ``batch_size`` is accepted but not used.
    """
    voxel_size = [0.1, 0.1, 0.1]
    point_range = [-80, -80, -2, 80, 80, 6]
    generator = Point2VoxelCPU3d(voxel_size, point_range, 3, 150000, 1)
    # gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
    #                        150000)
    npz_path = Path(__file__).parent / "data" / "benchmark-pc.npz"
    points = np.ascontiguousarray(np.load(npz_path)["pc"])
    print(points.shape)
    voxels_tv, indices_tv, _ = generator.point_to_voxel(tv.from_numpy(points))
    voxels = voxels_tv.numpy().reshape(-1, 3)
    indices = indices_tv.numpy()
    # Prepend a constant-zero column to every coordinate row.
    zero_column = np.full([indices.shape[0], 1], 0, indices.dtype)
    coors = np.concatenate([zero_column, indices], axis=1)
    return voxels, coors, generator.grid_size
class Net(nn.Module):
    """Seven-stage submanifold sparse-conv network used for benchmarking.

    Every stage holds two bias-free 3x3x3 ``SubMConv3d`` layers that share one
    ``indice_key`` (``c0`` .. ``c6``); consecutive stages are separated by a
    ``SparseMaxPool3d(2, 2)``. Channel widths grow
    3 -> 64 -> 96 -> 128 -> 160 -> 192 -> 224 -> 256.
    """

    def __init__(self, shape, algo):
        super().__init__()
        # (in_channels, out_channels) of the first conv of each stage; the
        # second conv of a stage keeps the channel count.
        stage_channels = [(3, 64), (64, 96), (96, 128), (128, 160),
                          (160, 192), (192, 224), (224, 256)]
        layers = []
        for stage, (c_in, c_out) in enumerate(stage_channels):
            if stage > 0:
                layers.append(spconv.SparseMaxPool3d(2, 2))
            key = f"c{stage}"
            layers.append(
                spconv.SubMConv3d(c_in, c_out, 3, bias=False, indice_key=key,
                                  algo=algo))
            layers.append(
                spconv.SubMConv3d(c_out, c_out, 3, bias=False, indice_key=key,
                                  algo=algo))
        self.net = spconv.SparseSequential(*layers)
        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated
        # grid can run faster.
        self.grid = torch.full([max_batch_size, *shape], -1,
                               dtype=torch.int32).cuda()
        # self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the network."""
        sparse_input = spconv.SparseConvTensor(features, coors, self.shape,
                                               batch_size, self.grid)
        return self.net(sparse_input)
class Net2(nn.Module):
    """Two-stage, wide submanifold sparse-conv network used for benchmarking.

    Stage ``c0`` maps 3 -> 256 -> 256 channels and stage ``c1`` maps
    256 -> 512 -> 512, with a ``SparseMaxPool3d(2, 2)`` between the stages.
    All convolutions are bias-free 3x3x3 ``SubMConv3d`` layers.
    """

    def __init__(self, shape, algo):
        super().__init__()
        # (in_channels, out_channels) of the first conv of each stage; the
        # second conv of a stage keeps the channel count.
        stage_channels = [(3, 256), (256, 512)]
        layers = []
        for stage, (c_in, c_out) in enumerate(stage_channels):
            if stage > 0:
                layers.append(spconv.SparseMaxPool3d(2, 2))
            key = f"c{stage}"
            layers.append(
                spconv.SubMConv3d(c_in, c_out, 3, bias=False, indice_key=key,
                                  algo=algo))
            layers.append(
                spconv.SubMConv3d(c_out, c_out, 3, bias=False, indice_key=key,
                                  algo=algo))
        self.net = spconv.SparseSequential(*layers)
        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated
        # grid can run faster.
        self.grid = torch.full([max_batch_size, *shape], -1,
                               dtype=torch.int32).cuda()
        # self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the network."""
        sparse_input = spconv.SparseConvTensor(features, coors, self.shape,
                                               batch_size, self.grid)
        return self.net(sparse_input)
def main():
    """Benchmark forward and backward passes of ``Net`` on pickled voxel data.

    Loads ``(voxels, coors, spatial_shape)`` from ``data/test_spconv.pkl``,
    runs 20 timed forward passes under ``torch.no_grad()`` (reporting the mean
    of the last 10) and 10 timed backward passes (reporting the mean of the
    last 5). Every timed region is bracketed by ``torch.cuda.synchronize()``
    so that queued GPU work is included in the measurement.
    """
    import pickle
    np.random.seed(50051)
    torch.manual_seed(50051)
    # voxels, coors, spatial_shape = waymo_data()
    # with open("/home/yy/test_spconv.pkl", "wb") as f:
    #     pickle.dump((voxels, coors, spatial_shape), f)
    # NOTE: pickle.load can execute arbitrary code; only use a data file that
    # comes from a trusted source.
    with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
        (voxels, coors, spatial_shape) = pickle.load(f)
    print(spatial_shape)
    print(voxels.shape)
    # voxels = voxels[:100]
    # coors = coors[:100]
    dtype = torch.float32
    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
    coors_th = torch.from_numpy(coors).cuda().int()
    voxels_th.requires_grad = True
    algo = spconv.ConvAlgo.Native
    net = Net(spatial_shape, algo).cuda().eval().to(dtype)
    print(coors_th.shape)
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)
    print(voxels.mean(), voxels.max(), voxels.min())
    dout = np.random.uniform(-0.2, 0.2,
                             out.features.shape).astype(np.float32)
    dout_t = torch.from_numpy(dout).cuda().to(dtype)
    print(out.spatial_shape, out.features.mean(), out.features.max(),
          out.features.min())
    # Forward benchmark. perf_counter is monotonic and high-resolution, which
    # makes it the appropriate clock for short intervals.
    times = []
    with torch.no_grad():
        for i in range(20):
            print("------------")
            torch.cuda.synchronize()
            t = time.perf_counter()
            net(voxels_th, coors_th, 1)
            torch.cuda.synchronize()
            times.append(time.perf_counter() - t)
    # The first half of the runs is treated as warm-up.
    print("spconv time", np.mean(times[10:]))
    # Backward benchmark: only the backward call is timed, so a fresh forward
    # pass is run before each measurement.
    times = []
    for i in range(10):
        out = net(voxels_th, coors_th, 1)
        print("------------")
        torch.cuda.synchronize()
        t = time.perf_counter()
        out.features.backward(dout_t)
        torch.cuda.synchronize()
        times.append(time.perf_counter() - t)
    # print((net.grid == -1).float().sum(), net.grid.numel())
    # print("spconv time", time.time() - t)
    print("spconv bw time", np.mean(times[5:]))


if __name__ == "__main__":
    main()
// 000-CatchMain.cpp
// In a Catch project with multiple files, dedicate one file to compile the
// source code of Catch itself and reuse the resulting object file for linking.
// Let Catch provide main():
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
// That's it
// Compile implementation of Catch for use with files that do contain tests:
// - g++ -std=c++11 -Wall -I$(CATCH_SINGLE_INCLUDE) -c 000-CatchMain.cpp
// - cl -EHsc -I%CATCH_SINGLE_INCLUDE% -c 000-CatchMain.cpp
#include <algorithm>
#include <iostream>
#include <map>
#include "catch.hpp"
#include <prettyprint.h>
#include <string>
#include <vector>
#include <exception>
#include <numeric>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <tuple>
#include <pybind11_utils.h>
#include <spconv/spconv_ops.h>
namespace py = pybind11;
// Smoke test for spconv::getIndicePair<3>.
// An embedded Python interpreter generates random sparse test data with numpy
// (features, indices and filters as interpreter globals). The "DebugTest"
// section converts those arrays to torch tensors and calls getIndicePair; no
// assertions are made on the result.
TEST_CASE("GetConvIndPair", "[SpConvNet]")
{
using namespace py::literals;
py::scoped_interpreter guard{}; // start the interpreter and keep it alive
// Python helper code; it defines the globals `features`, `indices` and
// `filters` that are read back below.
py::exec(R"(
from __future__ import print_function
import numpy as np
import math
# import spconv
# import torch
def get_convolution_output_size(input_size,
kernel_size,
stride,
padding=None,
rate=None):
ndim = len(input_size)
if padding is None:
padding = [0] * ndim
output_size = []
for i in range(ndim):
output_size.append((input_size[i] + 2 * padding[i] - (
(kernel_size[i] - 1) + 1)) // stride[i] + 1)
return output_size
def get_test_sparse_data(shape,
num_points,
num_channels,
integer=False,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode="constant", constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
20, 100, size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
-1, 1, size=[num_points.sum(), num_channels]).astype(dtype)
# sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
dense_data = np.zeros(
[batch_size, num_channels, *dense_shape], dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
batch_indices = np.concatenate(batch_indices, axis=0)
return {
"features": sparse_data.astype(dtype),
"indices": batch_indices.astype(np.int32),
"features_dense": dense_data.astype(dtype),
}
shape = [50, 30, 30]
num_points = [5000] * 1
# np.random.seed(np.random.randint(1, 100000))
in_channels = 64
sparse_dict = get_test_sparse_data(shape, num_points, in_channels)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"]
# indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[3, 3, 3, 64, 64]).astype(np.float32)
# print(outids.shape)
)");
SECTION("DebugTest"){
// Wrap the numpy buffers as torch tensors without copying (from_blob), then
// move them to their target devices.
auto inds = array2TensorView<int>(py::array(py::globals()["indices"]));
auto inds_tensor = torch::from_blob(inds.data(), {inds.dim(0), inds.dim(1)}, torch::dtype(torch::kInt32));
// NOTE(review): despite its name, inds_gpu is placed on torch::kCPU while the
// features and filters below go to CUDA device 0 - confirm this is intended.
auto inds_gpu = inds_tensor.to(torch::Device(torch::kCPU));
auto features = array2TensorView<float>(py::array(py::globals()["features"]));
auto features_tensor = torch::from_blob(features.data(), {features.dim(0), features.dim(1)}, torch::dtype(torch::kFloat));
auto features_gpu = features_tensor.to(torch::Device(torch::kCUDA, 0));
auto filters = array2TensorView<float>(py::array(py::globals()["filters"]));
auto filters_tensor = torch::from_blob(filters.data(), {filters.dim(0), filters.dim(1), filters.dim(2), filters.dim(3), filters.dim(4)}, torch::dtype(torch::kFloat));
auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
// Only the index-pair generation is exercised; the convolution call further
// down is commented out.
auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
{1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, 0, 0, 0);
// std::cout << outputs[2] << std::endl;
/*
auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
std::cout << output << std::endl;*/
}
}
\ No newline at end of file
# Copyright 2019 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -20,9 +20,9 @@ import numpy as np ...@@ -20,9 +20,9 @@ import numpy as np
import torch import torch
from torch import nn from torch import nn
import spconv import spconv.pytorch as spconv
from spconv.test_utils import TestCase, generate_sparse_data, params_grid from spconv.test_utils import TestCase, generate_sparse_data, params_grid
from spconv.constants import FILTER_HWIO
# import sparseconvnet as scn # import sparseconvnet as scn
...@@ -47,7 +47,6 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -47,7 +47,6 @@ class SparseConv3dTestTorch(nn.Module):
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False, bias=False,
use_hash=False,
algo=algo) algo=algo)
] ]
for i in range(1, num_layers): for i in range(1, num_layers):
...@@ -59,7 +58,6 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -59,7 +58,6 @@ class SparseConv3dTestTorch(nn.Module):
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False, bias=False,
use_hash=False,
algo=algo)) algo=algo))
self.net = spconv.SparseSequential(*layers, ) self.net = spconv.SparseSequential(*layers, )
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda() # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
...@@ -349,16 +347,19 @@ def scatter_nd(indices, updates, shape): ...@@ -349,16 +347,19 @@ def scatter_nd(indices, updates, shape):
class TestSpConv(TestCase): class TestSpConv(TestCase):
def testSpConv3d(self): def testSpConv3d(self):
np.random.seed(484) np.random.seed(484)
devices = ["cpu:0"] devices = ["cuda:0"]
shapes = [[19, 18, 17]] shapes = [[19, 18, 17]]
batchsizes = [1, 2] batchsizes = [1, 2]
in_channels = [64] in_channels = [32]
out_channels = [32, 48, 64] out_channels = [32, 48, 64]
ksizes = [2, 3] ksizes = [2, 3]
strides = [1, 2, 3] strides = [1, 2, 3]
paddings = [0, 1, 2] paddings = [0, 1, 2]
dilations = [1, 2, 3] dilations = [1, 2, 3]
# strides = [1]
# paddings = [0]
# dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
...@@ -367,7 +368,6 @@ class TestSpConv(TestCase): ...@@ -367,7 +368,6 @@ class TestSpConv(TestCase):
continue # don't support this. continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype( features = np.ascontiguousarray(sparse_dict["features"]).astype(
...@@ -375,23 +375,36 @@ class TestSpConv(TestCase): ...@@ -375,23 +375,36 @@ class TestSpConv(TestCase):
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, if FILTER_HWIO:
OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32)
dtype = torch.float16
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device).to(dtype)
features_t.requires_grad = True features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device) features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
features_dense_t.requires_grad = True features_dense_t.requires_grad = True
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device).to(dtype)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device).to(dtype)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device).to(dtype)
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, if FILTER_HWIO:
2).contiguous() net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
else:
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense() out = net(features_t, indices_t, bs).dense()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout = np.random.uniform(-0.2, 0.2, dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype) out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device) dout_t = torch.from_numpy(dout).to(device)
...@@ -401,18 +414,21 @@ class TestSpConv(TestCase): ...@@ -401,18 +414,21 @@ class TestSpConv(TestCase):
1).contiguous() 1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long()) din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() din = features_t.grad.detach()
din_np = din.cpu().numpy() din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy() din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net): for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy() dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy() dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
dw = dw.transpose(4, 3, 0, 1, 2) if FILTER_HWIO:
dw = dw.transpose(4, 3, 0, 1, 2)
else:
dw = dw.transpose(3, 4, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4) self.assertAllClose(dw, dw_ref, atol=1e-4)
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpDeConv3d(self): def testSpDeConv3d(self):
np.random.seed(484) np.random.seed(484)
...@@ -426,6 +442,11 @@ class TestSpConv(TestCase): ...@@ -426,6 +442,11 @@ class TestSpConv(TestCase):
strides = [2, 3] strides = [2, 3]
paddings = [0, 1, 2] paddings = [0, 1, 2]
dilations = [1, 2, 3] dilations = [1, 2, 3]
ksizes = [3]
strides = [1]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
...@@ -442,8 +463,13 @@ class TestSpConv(TestCase): ...@@ -442,8 +463,13 @@ class TestSpConv(TestCase):
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, if FILTER_HWIO:
OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
...@@ -454,11 +480,20 @@ class TestSpConv(TestCase): ...@@ -454,11 +480,20 @@ class TestSpConv(TestCase):
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1, print(net_ref.net[0].weight.shape)
2).contiguous() if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
2).contiguous()
else:
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense() out = net(features_t, indices_t, bs).dense()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout = np.random.uniform(-0.2, 0.2, dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype) out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device) dout_t = torch.from_numpy(dout).to(device)
...@@ -474,12 +509,12 @@ class TestSpConv(TestCase): ...@@ -474,12 +509,12 @@ class TestSpConv(TestCase):
for layer, layer_ref in zip(net.net, net_ref.net): for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy() dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy() dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
dw = dw.transpose(3, 4, 0, 1, 2) if FILTER_HWIO:
dw = dw.transpose(3, 4, 0, 1, 2)
else:
dw = dw.transpose(4, 3, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4) self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpCpConv3d(self): def testSpCpConv3d(self):
np.random.seed(484) np.random.seed(484)
...@@ -551,12 +586,16 @@ class TestSpConv(TestCase): ...@@ -551,12 +586,16 @@ class TestSpConv(TestCase):
shapes = [[19, 18, 17]] shapes = [[19, 18, 17]]
batchsizes = [1, 2] batchsizes = [1, 2]
in_channels = [62] in_channels = [64]
out_channels = [62] out_channels = [64]
ksizes = [2, 3] ksizes = [2, 3]
strides = [1, 2, 3] strides = [1, 2, 3]
paddings = [0, 1] paddings = [0, 1]
dilations = [1, 2, 3] dilations = [1, 2, 3]
ksizes = [2]
strides = [2]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
...@@ -565,6 +604,7 @@ class TestSpConv(TestCase): ...@@ -565,6 +604,7 @@ class TestSpConv(TestCase):
continue # don't support this. continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
# when data contains negative, sparse maxpool is not equal to dense maxpool. # when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape, sparse_dict = generate_sparse_data(shape,
num_points, num_points,
...@@ -576,8 +616,8 @@ class TestSpConv(TestCase): ...@@ -576,8 +616,8 @@ class TestSpConv(TestCase):
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, filters = np.random.uniform(0, 1, size=[k, k, k, OC,
OC]).astype(np.float32) IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
...@@ -588,11 +628,15 @@ class TestSpConv(TestCase): ...@@ -588,11 +628,15 @@ class TestSpConv(TestCase):
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
outids = out.indices outids = out.indices
outfeatures = out.features outfeatures = out.features
outids_dev = outids.float() outids_dev = outids.float()
out_dense = out.dense(channels_first=False) out_dense = out.dense(channels_first=False)
out = out_dense.permute(0, 4, 1, 2, 3).contiguous() out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout_sparse = np.random.uniform( dout_sparse = np.random.uniform(
-0.2, 0.2, outfeatures.shape).astype(features.dtype) -0.2, 0.2, outfeatures.shape).astype(features.dtype)
...@@ -607,9 +651,6 @@ class TestSpConv(TestCase): ...@@ -607,9 +651,6 @@ class TestSpConv(TestCase):
din_sparse = gather_nd(din_dense, indices_t.long()) din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() din = features_t.grad.detach()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
din_np = din.cpu().numpy() din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy() din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4) self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
...@@ -623,8 +664,8 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32): ...@@ -623,8 +664,8 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
shapes = [[400, 400, 15]] shapes = [[400, 400, 15]]
batchsizes = [2] batchsizes = [2]
in_channels = [32] in_channels = [19]
out_channels = [64] out_channels = [17]
ksizes = [(3, 3, 3)] ksizes = [(3, 3, 3)]
strides = [1] strides = [1]
paddings = [0] paddings = [0]
...@@ -752,8 +793,8 @@ def main_subm(algo, dtype=torch.float32): ...@@ -752,8 +793,8 @@ def main_subm(algo, dtype=torch.float32):
if __name__ == '__main__': if __name__ == '__main__':
main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32) # main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half) # main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# TestCase().assertAllClose(out_my, out_ref) # TestCase().assertAllClose(out_my, out_ref)
# unittest.main() # unittest.main()
# TestSpConv().testSpConv3d() TestSpConv().testSpConv3d()
This source diff could not be displayed because it is too large. You can view the blob instead.
Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879
<!--
Copyright 2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
## How to debug manylinux build
```Bash
docker run --rm -it -e PLAT=manylinux2014_x86_64 -v `pwd`:/io -v $HOME:/myhome scrin/manylinux2014-cuda:cu114-devel bash
/io/tools/build-wheels.sh
```
## Windows C++ Tips
* CUDA attributes such as ```__device__``` must be placed before the return type. If you see ```warning: __declspec attributes ignored```, it means ```__device__``` was ignored because it was placed after the return type, which then causes an error.
#!/bin/bash
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e -u -x
# repair_wheel WHEEL OUTDIR
#
# Runs `auditwheel repair` on WHEEL for the platform named by $PLAT and writes
# the result to OUTDIR. A wheel that `auditwheel show` rejects is left alone
# and only reported.
function repair_wheel {
    wheel="$1"
    outpath="$2"
    if auditwheel show "$wheel"; then
        auditwheel repair "$wheel" --plat "$PLAT" -w "$outpath"
    else
        echo "Skipping non-platform wheel $wheel"
    fi
}
export SPCONV_DISABLE_JIT="1"
export CUMM_CUDA_ARCH_LIST="all"
# export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
# Compile wheels, we only support 3.6-3.10.
# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
# Build one wheel per interpreter named in SPCONV_PYTHON_LIST, a ";"-separated
# list of versions such as "3.7;3.8".
for PYVER in ${SPCONV_PYTHON_LIST//;/ }
do
    # Drop the first dot: "3.8" -> "38".
    PYVER2="${PYVER/./}"
    # 3.6 and 3.7 use an "m"-suffixed ABI tag in their /opt/python directory name.
    case "$PYVER2" in
        36|37) PYVER_CP="cp$PYVER2-cp${PYVER2}m" ;;
        *)     PYVER_CP="cp$PYVER2-cp$PYVER2" ;;
    esac
    "/opt/python/$PYVER_CP/bin/pip" wheel /io/ -v --no-deps -w /io/wheelhouse_tmp
done
# Bundle external shared libraries into the wheels
# Every wheel built into /io/wheelhouse_tmp is passed through repair_wheel,
# which writes the repaired copy to /io/dist.
for whl in /io/wheelhouse_tmp/*.whl; do
repair_wheel "$whl" /io/dist
done
# The unrepaired wheels are only an intermediate product; remove them.
rm -rf /io/wheelhouse_tmp
\ No newline at end of file
## -------------------
## Constants
## -------------------
# Dictionary of known CUDA versions and their download URLs, which do not follow a consistent pattern :(
# Keys are "<major>.<minor>" strings, matching the format required of $env:cuda.
# Every entry uses HTTPS: the downloaded file is an installer that gets executed,
# so it must not be fetched over an unauthenticated plain-HTTP connection.
$CUDA_KNOWN_URLS = @{
    "10.2" = "https://developer.download.nvidia.com/compute/cuda/10.2/Prod/network_installers/cuda_10.2.89_win10_network.exe";
    "11.0" = "https://developer.download.nvidia.com/compute/cuda/11.0.3/network_installers/cuda_11.0.3_win10_network.exe";
    "11.1" = "https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe";
    "11.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe";
    "11.3" = "https://developer.download.nvidia.com/compute/cuda/11.3.1/network_installers/cuda_11.3.1_win10_network.exe";
    "11.4" = "https://developer.download.nvidia.com/compute/cuda/11.4.2/network_installers/cuda_11.4.2_win10_network.exe";
}
# Base names of the CUDA installer sub-packages to install. Each name later gets
# a "_<major>.<minor>" suffix appended before being passed to the installer.
# cuda_runtime.h is in nvcc <= 10.2, but cudart >= 11.0
# @todo - make this easier to vary per CUDA version.
$CUDA_PACKAGES_IN = @(
"nvcc";
"visual_studio_integration";
"curand_dev";
"nvrtc_dev";
"cudart";
)
## -------------------
## Select CUDA version
## -------------------
# Get the cuda version from the environment as env:cuda.
$CUDA_VERSION_FULL = $env:cuda
# Make sure CUDA_VERSION_FULL is set and valid, otherwise error.
# Validate CUDA version, extracting components via regex
# The pattern accepts exactly "<major>.<minor>" (for example "11.4"); a patch
# component such as "11.4.2" is rejected. On success the named groups are
# available through the automatic $Matches variable.
$cuda_ver_matched = $CUDA_VERSION_FULL -match "^(?<major>[1-9][0-9]*)\.(?<minor>[0-9]+)$"
if(-not $cuda_ver_matched){
Write-Output "Invalid CUDA version specified, <major>.<minor> required. '$CUDA_VERSION_FULL'."
exit 1
}
# Must be read before any other -match runs, since that would overwrite $Matches.
$CUDA_MAJOR=$Matches.major
$CUDA_MINOR=$Matches.minor
## ------------------------------------------------
## Select CUDA packages to install from environment
## ------------------------------------------------
# Space-separated installer arguments, one " <name>_<major>.<minor>" per package
# (note that every entry, including the first, is preceded by a space).
$CUDA_PACKAGES = ""

# for CUDA >= 11 cudart is a required package.
# if([version]$CUDA_VERSION_FULL -ge [version]"11.0") {
#     if(-not $CUDA_PACKAGES_IN -contains "cudart") {
#         $CUDA_PACKAGES_IN += 'cudart'
#     }
# }

# The compiler package is called "compiler" before CUDA 9.1 and "nvcc" from 9.1 on.
$usesLegacyCompilerName = [version]$CUDA_VERSION_FULL -lt [version]"9.1"
foreach ($entry in $CUDA_PACKAGES_IN) {
    $package = $entry
    # Translate whichever spelling was listed into the one this version expects.
    if ($usesLegacyCompilerName) {
        if ($package -eq "nvcc") {
            $package = "compiler"
        }
    } elseif ($package -eq "compiler") {
        $package = "nvcc"
    }
    $CUDA_PACKAGES += " {0}_{1}.{2}" -f $package, $CUDA_MAJOR, $CUDA_MINOR
}
Write-Output "$($CUDA_PACKAGES)"
## -----------------
## Prepare download
## -----------------
# Select the download link if known, otherwise have a guess.
$CUDA_REPO_PKG_REMOTE=""
if($CUDA_KNOWN_URLS.ContainsKey($CUDA_VERSION_FULL)){
    $CUDA_REPO_PKG_REMOTE=$CUDA_KNOWN_URLS[$CUDA_VERSION_FULL]
} else{
    # Guess what the url is given the most recent pattern (at the time of writing, 10.1)
    # Use $(...) here: "${$CUDA_VERSION_FULL}" would look up a variable whose name
    # literally contains the dollar sign and therefore print nothing.
    Write-Output "note: URL for CUDA $($CUDA_VERSION_FULL) not known, estimating."
    # HTTPS, because the downloaded installer is executed afterwards.
    $CUDA_REPO_PKG_REMOTE="https://developer.download.nvidia.com/compute/cuda/$($CUDA_MAJOR).$($CUDA_MINOR)/Prod/network_installers/cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
}
# File name the installer is saved under in the current directory.
$CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
## ------------
## Install CUDA
## ------------
# Get CUDA network installer
Write-Output "Downloading CUDA Network Installer for $($CUDA_VERSION_FULL) from: $($CUDA_REPO_PKG_REMOTE)"
Invoke-WebRequest $CUDA_REPO_PKG_REMOTE -OutFile $CUDA_REPO_PKG_LOCAL | Out-Null
if(Test-Path -Path $CUDA_REPO_PKG_LOCAL){
    Write-Output "Downloading Complete"
} else {
    Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) from $($CUDA_REPO_PKG_REMOTE)"
    exit 1
}

# Invoke silent install of CUDA (via network installer)
Write-Output "Installing CUDA $($CUDA_VERSION_FULL). Subpackages $($CUDA_PACKAGES)"
# -PassThru returns the process object so the installer's own exit code can be
# inspected; Start-Process does not populate $LASTEXITCODE.
$cuda_installer = Start-Process -Wait -PassThru -FilePath .\"$($CUDA_REPO_PKG_LOCAL)" -ArgumentList "-s $($CUDA_PACKAGES)"
# $? only reports whether the process could be launched, so also require a
# process object and a zero exit code from the installer itself.
$cuda_install_ok = $? -and ($null -ne $cuda_installer) -and ($cuda_installer.ExitCode -eq 0)

# Check the return status of the CUDA installer.
if (-not $cuda_install_ok) {
    Write-Output "Error: CUDA installer reported error. $($cuda_installer.ExitCode)"
    exit 1
}
# Store the CUDA_PATH in the environment for the current session, to be forwarded in the action.
$CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$($CUDA_MAJOR).$($CUDA_MINOR)"
# Note: this holds the NAME of the versioned variable (e.g. "CUDA_PATH_V11_4"),
# not a filesystem path.
$CUDA_PATH_VX_Y = "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)"
# Set environmental variables in this session
$env:CUDA_PATH = "$($CUDA_PATH)"
# The environment variable CUDA_PATH_VX_Y likewise stores that variable name; it is
# expanded below to produce the "<name>=<path>" line written to GITHUB_ENV.
$env:CUDA_PATH_VX_Y = "$($CUDA_PATH_VX_Y)"
Write-Output "CUDA_PATH $($CUDA_PATH)"
Write-Output "CUDA_PATH_VX_Y $($CUDA_PATH_VX_Y)"
# PATH needs updating elsewhere, anything in here won't persist.
# Append $CUDA_PATH/bin to path.
# Set CUDA_PATH as an environmental variable
# If executing on github actions, emit the appropriate echo statements to update environment variables
if (Test-Path "env:GITHUB_ACTIONS") {
# Set paths for subsequent steps, using $env:CUDA_PATH
echo "Adding CUDA to CUDA_PATH, CUDA_PATH_X_Y and PATH"
# Appends "CUDA_PATH=<install dir>" to the file named by GITHUB_ENV.
echo "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
# Appends "CUDA_PATH_V<major>_<minor>=<install dir>" (the left side is the stored name).
echo "$env:CUDA_PATH_VX_Y=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
# Appends the CUDA bin directory to the file named by GITHUB_PATH.
echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment