"vscode:/vscode.git/clone" did not exist on "5ecced399b0d78e32ff952d01d217a1098d17201"
Unverified Commit 2f88c124 authored by Wenhao Wu's avatar Wenhao Wu Committed by GitHub
Browse files

[Enhance] Replace mmdet3d ops with mmcv ops (#1240)

* import some ops from mmcv instead of mmdet3d

* use mmcv ops in primitive_head.py

* use mmcv ops in PAConv

* remove ops in mmdet3d & fix some bugs

* remove spconv & fix some bugs

* fix voxelization unittest

* remove spconv in ops/__init__.py

* refine ops/__init__.py

* recover sparse_block in ops/__init__

* fix parta2_bbox_head unittest

* remove remaining ops

* recover ops/__init__.py for bc breaking

* add source of ops from mmcv

* recover the unittest for voxelization

* remove unittest
parent 41d77dad
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
if (transpose)
prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR();
}
return numAct;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0) return 0;
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto doutData = dout.data();
auto dinData = din.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
dinData[idxi + plane] += doutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
T in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
dout += blockIdx.y * NumTLP;
din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
din[idxi] += dout[idxo];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
T bufdi[vecloadFactor];
T bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
}
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
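  // kernel_block_t enumerates candidate tile widths (NumTLP). mp_for_each below
  // tries 64/32/16 in order and, for the first width that divides numPlanes,
  // launches the vectorized block kernel plus a generic kernel for the tail;
  // if none divides numPlanes, the NumTLP = 64 generic fallback is used.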
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/reordering.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes);
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data();
T* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/mp_helper.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully when
indice repeats, don't support repeat add which is supported in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
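# A minimal usage sketch of scatter_nd (the shapes below are illustrative only):
#
#   >>> indices = torch.tensor([[0, 0], [1, 1]])         # (N, ndim) target cells
#   >>> updates = torch.rand(2, 4)                        # (N, C) values to write
#   >>> dense = scatter_nd(indices, updates, [2, 2, 4])   # zeros everywhere else
#   >>> dense.shape
#   torch.Size([2, 2, 4])
#   >>> torch.equal(dense[0, 0], updates[0]) and torch.equal(dense[1, 1], updates[1])
#   True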
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None):
"""
Args:
grid: pre-allocated grid tensor.
should be used when the volume of spatial shape
is very large.
"""
self.features = features
self.indices = indices
        if self.indices.dtype != torch.int32:
            self.indices = self.indices.int()
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
self.grid = grid
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long(), self.features, output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return (self.indices.shape[0] / np.prod(self.spatial_shape) /
self.batch_size)
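# A minimal usage sketch of SparseConvTensor (toy sizes, chosen only for
# illustration):
#
#   >>> features = torch.rand(3, 16)                      # (N, C) per-voxel features
#   >>> indices = torch.tensor([[0, 0, 0, 0],
#   ...                         [0, 1, 2, 3],
#   ...                         [1, 5, 5, 5]], dtype=torch.int32)  # (N, 1 + ndim): batch idx + spatial coords
#   >>> x = SparseConvTensor(features, indices, spatial_shape=[8, 8, 8], batch_size=2)
#   >>> x.dense().shape                                   # channels-first by default
#   torch.Size([2, 16, 8, 8, 8])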
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
class TestCase(unittest.TestCase):
def _GetNdArray(self, a):
if not isinstance(a, np.ndarray):
a = np.array(a)
return a
def assertAllEqual(self, a, b):
"""Asserts that two numpy arrays have the same values.
Args:
            a: the expected numpy ndarray or anything that can be converted to one.
            b: the actual numpy ndarray or anything that can be converted to one.
"""
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
same = (a == b)
if a.dtype == np.float32 or a.dtype == np.float64:
same = np.logical_or(same,
np.logical_and(np.isnan(a), np.isnan(b)))
if not np.all(same):
# Prints more details than np.testing.assert_array_equal.
diff = np.logical_not(same)
if a.ndim:
x = a[np.where(diff)]
y = b[np.where(diff)]
print('not equal where = ', np.where(diff))
else:
# np.where is broken for scalars
x, y = a, b
print('not equal lhs = ', x)
print('not equal rhs = ', y)
np.testing.assert_array_equal(a, b)
def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
"""Asserts that two numpy arrays, or dicts of same, have near values.
This does not support nested dicts.
Args:
            a: The expected numpy ndarray (or anything that can be converted to
                one), or a dict of same. Must be a dict iff `b` is a dict.
            b: The actual numpy ndarray (or anything that can be converted to
                one), or a dict of same. Must be a dict iff `a` is a dict.
rtol: relative tolerance.
atol: absolute tolerance.
Raises:
ValueError: if only one of `a` and `b` is a dict.
"""
is_a_dict = isinstance(a, dict)
if is_a_dict != isinstance(b, dict):
raise ValueError("Can't compare dict to non-dict, %s vs %s." %
(a, b))
if is_a_dict:
self.assertCountEqual(
a.keys(),
b.keys(),
msg='mismatched keys, expected %s, got %s' %
(a.keys(), b.keys()))
for k in a:
self._assertArrayLikeAllClose(
a[k],
b[k],
rtol=rtol,
atol=atol,
msg='%s: expected %s, got %s.' % (k, a, b))
else:
self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
a = self._GetNdArray(a)
b = self._GetNdArray(b)
self.assertEqual(
a.shape, b.shape,
'Shape mismatch: expected %s, got %s.' % (a.shape, b.shape))
if not np.allclose(a, b, rtol=rtol, atol=atol):
# Prints more details than np.testing.assert_allclose.
#
# NOTE: numpy.allclose (and numpy.testing.assert_allclose)
# checks whether two arrays are element-wise equal within a
# tolerance. The relative difference (rtol * abs(b)) and the
# absolute difference atol are added together to compare against
# the absolute difference between a and b. Here, we want to
# print out which elements violate such conditions.
cond = np.logical_or(
np.abs(a - b) > atol + rtol * np.abs(b),
np.isnan(a) != np.isnan(b))
if a.ndim:
x = a[np.where(cond)]
y = b[np.where(cond)]
print('not close where = ', np.where(cond))
else:
# np.where is broken for scalars
x, y = a, b
print('not close lhs = ', x)
print('not close rhs = ', y)
print('not close dif = ', np.abs(x - y))
print('not close tol = ', atol + rtol * np.abs(y))
print('dtype = %s, shape = %s' % (a.dtype, a.shape))
np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
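# A minimal usage sketch of the assertions above (a toy test case written only
# for illustration):
#
#   >>> class _ToyTest(TestCase):
#   ...     def test_values(self):
#   ...         self.assertAllEqual([1, 2, 3], np.array([1, 2, 3]))
#   ...         self.assertAllClose({'x': [1.0]}, {'x': [1.0 + 1e-8]})  # dicts compared key by key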
def params_grid(*params):
size = len(params)
length = 1
for p in params:
length *= len(p)
sizes = [len(p) for p in params]
counter = [0] * size
total = []
for i in range(length):
total.append([0] * size)
for i in range(length):
for j in range(size):
total[i][j] = params[j][counter[j]]
counter[size - 1] += 1
for c in range(size - 1, -1, -1):
if (counter[c] == sizes[c] and c > 0):
counter[c - 1] += 1
counter[c] = 0
return total
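# A minimal usage sketch of params_grid (it enumerates the cartesian product,
# with the last parameter varying fastest):
#
#   >>> params_grid([1, 2], ['a', 'b'])
#   [[1, 'a'], [1, 'b'], [2, 'a'], [2, 'b']]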
def generate_sparse_data(shape,
num_points,
num_channels,
integer=False,
data_range=(-1, 1),
with_dense=True,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode='constant', constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
data_range[0],
data_range[1],
size=[num_points.sum(), num_channels]).astype(dtype)
res = {
'features': sparse_data.astype(dtype),
}
if with_dense:
dense_data = np.zeros([batch_size, num_channels, *dense_shape],
dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
res['features_dense'] = dense_data.astype(dtype)
batch_indices = np.concatenate(batch_indices, axis=0)
res['indices'] = batch_indices.astype(np.int32)
return res
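# A minimal usage sketch of generate_sparse_data (toy sizes for illustration):
#
#   >>> data = generate_sparse_data([8, 8, 8], num_points=[30, 20], num_channels=4)
#   >>> data['features'].shape          # all points of the two samples stacked
#   (50, 4)
#   >>> data['indices'].shape           # spatial coords plus batch index as last column
#   (50, 4)
#   >>> data['features_dense'].shape    # (batch_size, num_channels, *shape)
#   (2, 4, 8, 8, 8)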
# Copyright (c) OpenMMLab. All rights reserved.
from .scatter_points import DynamicScatter, dynamic_scatter
from .voxelize import Voxelization, voxelization
__all__ = ['Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch.autograd import Function
from .voxel_layer import (dynamic_point_to_voxel_backward,
dynamic_point_to_voxel_forward)
class _dynamic_scatter(Function):
@staticmethod
def forward(ctx, feats, coors, reduce_type='max'):
"""convert kitti points(N, >=3) to voxels.
Args:
feats: [N, C] float tensor. points features to be reduced
into voxels.
coors: [N, ndim] int tensor. corresponding voxel coordinates
(specifically multi-dim voxel index) of each points.
reduce_type: str. reduce op. support 'max', 'sum' and 'mean'
Returns:
tuple
voxel_feats: [M, C] float tensor. reduced features. input features
that shares the same voxel coordinates are reduced to one row
coordinates: [M, ndim] int tensor, voxel coordinates.
"""
results = dynamic_point_to_voxel_forward(feats, coors, reduce_type)
(voxel_feats, voxel_coors, point2voxel_map,
voxel_points_count) = results
ctx.reduce_type = reduce_type
ctx.save_for_backward(feats, voxel_feats, point2voxel_map,
voxel_points_count)
ctx.mark_non_differentiable(voxel_coors)
return voxel_feats, voxel_coors
@staticmethod
def backward(ctx, grad_voxel_feats, grad_voxel_coors=None):
(feats, voxel_feats, point2voxel_map,
voxel_points_count) = ctx.saved_tensors
grad_feats = torch.zeros_like(feats)
# TODO: whether to use index put or use cuda_backward
# To use index put, need point to voxel index
dynamic_point_to_voxel_backward(grad_feats,
grad_voxel_feats.contiguous(), feats,
voxel_feats, point2voxel_map,
voxel_points_count, ctx.reduce_type)
return grad_feats, None, None
dynamic_scatter = _dynamic_scatter.apply
class DynamicScatter(nn.Module):
def __init__(self, voxel_size, point_cloud_range, average_points: bool):
super(DynamicScatter, self).__init__()
"""Scatters points into voxels, used in the voxel encoder with
dynamic voxelization
**Note**: The CPU and GPU implementation get the same output, but
have numerical difference after summation and division (e.g., 5e-7).
Args:
average_points (bool): whether to use avg pooling to scatter
points into voxel voxel_size (list): list [x, y, z] size
of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.average_points = average_points
def forward_single(self, points, coors):
reduce = 'mean' if self.average_points else 'max'
return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce)
def forward(self, points, coors):
"""
Args:
input: NC points
"""
if coors.size(-1) == 3:
return self.forward_single(points, coors)
else:
batch_size = coors[-1, 0] + 1
voxels, voxel_coors = [], []
for i in range(batch_size):
inds = torch.where(coors[:, 0] == i)
voxel, voxel_coor = self.forward_single(
points[inds], coors[inds][:, 1:])
coor_pad = nn.functional.pad(
voxel_coor, (1, 0), mode='constant', value=i)
voxel_coors.append(coor_pad)
voxels.append(voxel)
features = torch.cat(voxels, dim=0)
feature_coors = torch.cat(voxel_coors, dim=0)
return features, feature_coors
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', average_points=' + str(self.average_points)
tmpstr += ')'
return tmpstr
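# A minimal usage sketch of DynamicScatter (assumes the compiled voxel_layer
# extension and a CUDA device, since dynamic_point_to_voxel has no CPU kernel;
# all sizes below are illustrative only):
#
#   >>> scatter = DynamicScatter(voxel_size=[0.1, 0.1, 0.1],
#   ...                          point_cloud_range=[0, -1, -1, 1, 1, 1],
#   ...                          average_points=True)
#   >>> feats = torch.rand(100, 4, device='cuda')                     # (N, C) point features
#   >>> coors = torch.randint(0, 10, (100, 3), device='cuda').int()   # (N, 3) voxel coords
#   >>> voxel_feats, voxel_coors = scatter(feats, coors)
#   >>> voxel_feats.shape[1], voxel_coors.shape[1]
#   (4, 3)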
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T_int>
void determin_max_points_kernel(
torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
int& max_points, const int num_points) {
int voxelidx, num;
for (int i = 0; i < num_points; ++i) {
if (coor[i][0] == -1) continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
point_to_voxelidx[i] = num;
num_points_per_voxel[voxelidx] += 1;
// update max points per voxel
max_points = std::max(max_points, num + 1);
}
return;
}
template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
const int num_points, const int NDim) {
for (int i = 0; i < num_points; ++i) {
int num = point_to_voxelidx[i];
int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
for (int k = 0; k < NDim; ++k) {
voxel_coors[voxelidx][k] = coor[i][k];
}
}
}
} // namespace
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) {
  // the current version takes about 0.02s~0.03s per frame on CPU
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
const int NDim = voxel_mapping.size(1);
const int num_points = points.size(0);
const int num_features = points.size(1);
std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
at::Tensor num_points_per_voxel = at::zeros(
{
num_points,
},
voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones(
{grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
int voxel_num = 0;
int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t, 2>(),
point_to_voxelidx.accessor<scalar_t, 1>(),
num_points_per_voxel.accessor<scalar_t, 1>(),
coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
num_points);
});
at::Tensor voxels =
at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors =
at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
point_to_voxelidx.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
});
at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out};
}
} // namespace voxelization
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = 512;
int const maxGridDim = 50000;
} // namespace
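// reduceMax implements an atomic floating-point max on top of atomicCAS (CUDA
// has no native atomicMax for float/double): the destination word is
// reinterpreted as an integer and the compare-and-swap is retried until the
// stored value is at least `val`.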
__device__ __forceinline__ static void reduceMax(float *address, float val) {
int *address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
old = atomicCAS(address_as_i, assumed,
__float_as_int(fmaxf(val, __int_as_float(assumed))));
} while (assumed != old || __int_as_float(old) < val);
}
__device__ __forceinline__ static void reduceMax(double *address, double val) {
unsigned long long *address_as_ull =
reinterpret_cast<unsigned long long *>(address);
unsigned long long old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(
address_as_ull, assumed,
__double_as_longlong(fmax(val, __longlong_as_double(assumed))));
} while (assumed != old || __longlong_as_double(old) < val);
}
// get rid of meaningless warnings when compiling host code
#ifdef __CUDA_ARCH__
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
#warning \
"compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
int *address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
old = atomicCAS(address_as_i, assumed,
__float_as_int(val + __int_as_float(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
#if (__CUDA_ARCH__ < 600)
#warning \
"compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
unsigned long long *address_as_ull =
reinterpret_cast<unsigned long long *>(address);
unsigned long long old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
#endif
template <typename T>
__global__ void
feats_reduce_kernel(const T *feats, const int32_t *coors_map,
T *reduced_feats, // shall be 0 at initialization
const int num_input, const int num_feats,
const reduce_t reduce_type) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
if (reduce_to == -1) continue;
const T *feats_offset = feats + x * num_feats;
T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
if (reduce_type == reduce_t::MAX) {
for (int i = 0; i < num_feats; i++) {
reduceMax(&reduced_feats_offset[i], feats_offset[i]);
}
} else {
for (int i = 0; i < num_feats; i++) {
reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
}
}
}
}
template <typename T>
__global__ void add_reduce_traceback_grad_kernel(
T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
const int32_t *reduce_count, const int num_input, const int num_feats,
const reduce_t reduce_type) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
if (reduce_to == -1) {
continue;
}
const int input_offset = x * num_feats;
T *grad_feats_offset = grad_feats + input_offset;
const int reduced_offset = reduce_to * num_feats;
const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
if (reduce_type == reduce_t::SUM) {
for (int i = 0; i < num_feats; i++) {
grad_feats_offset[i] = grad_reduced_feats_offset[i];
}
} else if (reduce_type == reduce_t::MEAN) {
for (int i = 0; i < num_feats; i++) {
grad_feats_offset[i] = grad_reduced_feats_offset[i] /
static_cast<T>(reduce_count[reduce_to]);
}
}
}
}
template <typename T>
__global__ void max_reduce_traceback_scatter_idx_kernel(
const T *feats, const T *reduced_feats, int32_t *reduce_from,
const int32_t *coors_map, const int num_input, const int num_feats) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
x += gridDim.x * blockDim.x) {
int32_t reduce_to = coors_map[x];
const int input_offset = x * num_feats;
const T *feats_offset = feats + input_offset;
if (reduce_to == -1) {
continue;
}
const int reduced_offset = reduce_to * num_feats;
const T *reduced_feats_offset = reduced_feats + reduced_offset;
int32_t *reduce_from_offset = reduce_from + reduced_offset;
for (int i = 0; i < num_feats; i++) {
if (feats_offset[i] == reduced_feats_offset[i]) {
atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
}
}
}
}
template <typename T>
__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
const T *grad_reduced_feats,
const int32_t *reduce_from,
const int num_reduced,
const int num_feats) {
for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
x += gridDim.x * blockDim.x) {
const int reduced_offset = x * num_feats;
const int32_t *scatter_to_offset = reduce_from + reduced_offset;
const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
for (int i = 0; i < num_feats; i++) {
grad_feats[scatter_to_offset[i] * num_feats + i] =
grad_reduced_feats_offset[i];
}
}
}
namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
const at::Tensor &feats, const at::Tensor &coors,
const reduce_t reduce_type) {
CHECK_INPUT(feats);
CHECK_INPUT(coors);
const int num_input = feats.size(0);
const int num_feats = feats.size(1);
if (num_input == 0)
return {feats.clone().detach(),
coors.clone().detach(),
coors.new_empty({0}, torch::kInt32),
coors.new_empty({0}, torch::kInt32)};
at::Tensor out_coors;
at::Tensor coors_map;
at::Tensor reduce_count;
auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);
std::tie(out_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, true);
if (out_coors.index({0, 0}).lt(0).item<bool>()) {
    // the first element of out_coors is (-1, -1, -1) and should be removed
out_coors = out_coors.slice(0, 1);
reduce_count = reduce_count.slice(0, 1);
coors_map = coors_map - 1;
}
coors_map = coors_map.to(torch::kInt32);
reduce_count = reduce_count.to(torch::kInt32);
auto reduced_feats =
at::empty({out_coors.size(0), num_feats}, feats.options());
AT_DISPATCH_FLOATING_TYPES(
feats.scalar_type(), "feats_reduce_kernel", ([&] {
if (reduce_type == reduce_t::MAX)
reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
else
reduced_feats.fill_(static_cast<scalar_t>(0));
dim3 blocks(std::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
maxGridDim));
dim3 threads(threadsPerBlock);
feats_reduce_kernel<<<blocks, threads>>>(
feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
reduced_feats.data_ptr<scalar_t>(), num_input, num_feats, reduce_type);
if (reduce_type == reduce_t::MEAN)
reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
}));
AT_CUDA_CHECK(cudaGetLastError());
return {reduced_feats, out_coors, coors_map, reduce_count};
}
void dynamic_point_to_voxel_backward_gpu(at::Tensor &grad_feats,
const at::Tensor &grad_reduced_feats,
const at::Tensor &feats,
const at::Tensor &reduced_feats,
const at::Tensor &coors_map,
const at::Tensor &reduce_count,
const reduce_t reduce_type) {
CHECK_INPUT(grad_feats);
CHECK_INPUT(grad_reduced_feats);
CHECK_INPUT(feats);
CHECK_INPUT(reduced_feats);
CHECK_INPUT(coors_map);
CHECK_INPUT(reduce_count);
const int num_input = feats.size(0);
const int num_reduced = reduced_feats.size(0);
const int num_feats = feats.size(1);
grad_feats.fill_(0);
// copy voxel grad to points
if (num_input == 0 || num_reduced == 0) return;
if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
add_reduce_traceback_grad_kernel<<<blocks, threads>>>(
grad_feats.data_ptr<scalar_t>(),
grad_reduced_feats.data_ptr<scalar_t>(),
coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
num_input, num_feats, reduce_type);
}));
AT_CUDA_CHECK(cudaGetLastError());
} else {
auto reduce_from = at::full({num_reduced, num_feats}, num_input,
coors_map.options().dtype(torch::kInt32));
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(),
"max_reduce_traceback_scatter_idx_kernel", ([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
max_reduce_traceback_scatter_idx_kernel<<<blocks, threads>>>(
feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
num_input, num_feats);
}));
AT_CUDA_CHECK(cudaGetLastError());
AT_DISPATCH_FLOATING_TYPES(
grad_reduced_feats.scalar_type(),
"max_reduce_traceback_scatter_idx_kernel", ([&] {
dim3 blocks(std::min(
at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
dim3 threads(threadsPerBlock);
max_reduce_scatter_grad_kernel<<<blocks, threads>>>(
grad_feats.data_ptr<scalar_t>(),
grad_reduced_feats.data_ptr<scalar_t>(),
reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
return;
}
} // namespace voxelization
#include <torch/extension.h>
#include "voxelization.h"
namespace voxelization {
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
m.def("dynamic_voxelize", &dynamic_voxelize, "dynamic voxelization");
m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, "dynamic point to voxel forward");
m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward, "dynamic point to voxel backward");
}
} // namespace voxelization
#pragma once
#include <torch/extension.h>
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
namespace voxelization {
int hard_voxelize_cpu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_cpu(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor &points, const at::Tensor &voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
#ifdef WITH_CUDA
int hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
int nondisterministic_hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_gpu(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_gpu(const torch::Tensor &feats,
const torch::Tensor &coors,
const reduce_t reduce_type);
void dynamic_point_to_voxel_backward_gpu(torch::Tensor &grad_feats,
const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats,
const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx,
const torch::Tensor &reduce_count,
const reduce_t reduce_type);
#endif
// Interface for Python
inline int hard_voxelize(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3, const bool deterministic = true) {
if (points.device().is_cuda()) {
#ifdef WITH_CUDA
if (deterministic) {
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
}
return nondisterministic_hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return hard_voxelize_cpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels,
NDim);
}
inline void dynamic_voxelize(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
if (points.device().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return dynamic_voxelize_cpu(points, coors, voxel_size, coors_range, NDim);
}
inline reduce_t convert_reduce_type(const std::string &reduce_type) {
if (reduce_type == "max")
return reduce_t::MAX;
else if (reduce_type == "sum")
return reduce_t::SUM;
else if (reduce_type == "mean")
return reduce_t::MEAN;
else TORCH_CHECK(false, "unsupported reduce type: " + reduce_type);
return reduce_t::SUM;
}
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(const torch::Tensor &feats,
const torch::Tensor &coors,
const std::string &reduce_type) {
if (feats.device().is_cuda()) {
#ifdef WITH_CUDA
return dynamic_point_to_voxel_forward_gpu(feats, coors, convert_reduce_type(reduce_type));
#else
TORCH_CHECK(false, "Not compiled with GPU support");
#endif
}
TORCH_CHECK(false, "do not support cpu yet");
return std::vector<torch::Tensor>();
}
inline void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats,
const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx,
const torch::Tensor &reduce_count,
const std::string &reduce_type) {
if (grad_feats.device().is_cuda()) {
#ifdef WITH_CUDA
dynamic_point_to_voxel_backward_gpu(
grad_feats, grad_reduced_feats, feats, reduced_feats, coors_idx, reduce_count,
convert_reduce_type(reduce_type));
return;
#else
TORCH_CHECK(false, "Not compiled with GPU support");
#endif
}
TORCH_CHECK(false, "do not support cpu yet");
}
} // namespace voxelization
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {
template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int num_points, const int num_features,
const int NDim) {
const int ndim_minus_1 = NDim - 1;
bool failed = false;
// int coor[NDim];
int* coor = new int[NDim]();
int c;
for (int i = 0; i < num_points; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
// necessary to remove points that are out of range
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
for (int k = 0; k < NDim; ++k) {
if (failed)
coors[i][k] = -1;
else
coors[i][k] = coor[k];
}
}
delete[] coor;
return;
}
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int, 2> coors,
torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
int& voxel_num, const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const std::vector<int> grid_size,
const int max_points, const int max_voxels,
const int num_points, const int num_features,
const int NDim) {
// declare a temp coors
at::Tensor temp_coors = at::zeros(
{num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors,
// then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size,
num_points, num_features, NDim);
int voxelidx, num;
auto coor = temp_coors.accessor<int, 2>();
for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1) continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel
if (voxelidx == -1) {
voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors[voxelidx][k] = coor[i][k];
}
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
if (max_points == -1 || num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
num_points_per_voxel[voxelidx] += 1;
}
}
return;
}
} // namespace
namespace voxelization {
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// the current version takes about 0.02s~0.03s per frame on cpu
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
// printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
// grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx =
-at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
coors_range, grid_size, max_points, max_voxels, num_points,
num_features, NDim);
});
return voxel_num;
}
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
std::vector<int> grid_size(NDim);
const int num_points = points.size(0);
const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
voxel_size, coors_range, grid_size, num_points, num_features, NDim);
});
return;
}
} // namespace voxelization
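# Reference sketch (illustration only): the coordinate computation performed by
# dynamic_voxelize_kernel above, re-expressed with torch ops. It mirrors the
# CPU kernel's behaviour of storing coordinates in (z, y, x) order and writing
# -1 for every dimension of an out-of-range point. The inputs below are
# assumed example values.
import torch

def dynamic_voxelize_reference(points, voxel_size, coors_range):
    vsize = torch.tensor(voxel_size, dtype=torch.float32)
    vmin = torch.tensor(coors_range[:3], dtype=torch.float32)
    vmax = torch.tensor(coors_range[3:], dtype=torch.float32)
    grid_size = torch.round((vmax - vmin) / vsize).long()
    c = torch.floor((points[:, :3] - vmin) / vsize).long()  # (N, 3) in x, y, z
    in_range = ((c >= 0) & (c < grid_size)).all(dim=1)
    coors = torch.flip(c, dims=[1])                          # reorder to z, y, x
    coors[~in_range] = -1                                    # drop out-of-range points
    return coors

pts = torch.rand(8, 4) * torch.tensor([70.4, 80.0, 4.0, 1.0]) + \
    torch.tensor([0.0, -40.0, -3.0, 0.0])
print(dynamic_voxelize_reference(pts, [0.05, 0.05, 0.1], [0, -40, -3, 70.4, 40, 1]))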
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
const T* points, T_int* coors, const float voxel_x, const float voxel_y,
const float voxel_z, const float coors_x_min, const float coors_y_min,
const float coors_z_min, const float coors_x_max, const float coors_y_max,
const float coors_z_max, const int grid_x, const int grid_y,
const int grid_z, const int num_points, const int num_features,
const int NDim) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
CUDA_1D_KERNEL_LOOP(index, num_points) {
// To save some computation
auto points_offset = points + index * num_features;
auto coors_offset = coors + index * NDim;
int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
if (c_x < 0 || c_x >= grid_x) {
coors_offset[0] = -1;
return;
}
int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
if (c_y < 0 || c_y >= grid_y) {
coors_offset[0] = -1;
coors_offset[1] = -1;
return;
}
int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
if (c_z < 0 || c_z >= grid_z) {
coors_offset[0] = -1;
coors_offset[1] = -1;
coors_offset[2] = -1;
} else {
coors_offset[0] = c_z;
coors_offset[1] = c_y;
coors_offset[2] = c_x;
}
}
}
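// assign_point_to_voxel: one thread per (point, feature) element; copies the
// features of every point that was assigned a slot (point_to_voxelidx) and a
// voxel (coor_to_voxelidx) into voxels[voxelidx][slot][feature].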
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels,
const int max_points,
const int num_features,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
int index = thread_idx / num_features;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num > -1 && voxelidx > -1) {
auto voxels_offset =
voxels + voxelidx * max_points * num_features + num * num_features;
int k = thread_idx % num_features;
voxels_offset[k] = points[thread_idx];
}
}
}
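// assign_voxel_coors: one thread per (point, NDim) element; the first point of
// each voxel (slot 0) writes the voxel's integer coordinates into voxel_coors.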
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T_int* voxel_coors,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
// const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
// if (index >= num_points) return;
int index = thread_idx / NDim;
int num = point_to_voxelidx[index];
int voxelidx = coor_to_voxelidx[index];
if (num == 0 && voxelidx > -1) {
auto coors_offset = voxel_coors + voxelidx * NDim;
int k = thread_idx % NDim;
coors_offset[k] = coor[thread_idx];
}
}
}
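// point_to_voxelidx_kernel: for every point, scan all earlier points and count
// how many share its voxel coordinate. The count becomes the point's slot
// inside its voxel (point_to_voxelidx, capped at max_points), and the index of
// the first point with the same coordinate is stored in point_to_pointidx.
// One thread per point.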
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
T_int* point_to_voxelidx,
T_int* point_to_pointidx,
const int max_points,
const int max_voxels,
const int num_points, const int NDim) {
CUDA_1D_KERNEL_LOOP(index, num_points) {
auto coor_offset = coor + index * NDim;
// skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return;
int num = 0;
int coor_x = coor_offset[0];
int coor_y = coor_offset[1];
int coor_z = coor_offset[2];
// only calculate the coors before this coor[index]
for (int i = 0; i < index; ++i) {
auto prev_coor = coor + i * NDim;
if (prev_coor[0] == -1) continue;
// Find all previous points that have the same coor;
// if the same coor is found, record it
if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
(prev_coor[2] == coor_z)) {
num++;
if (num == 1) {
// point to the first point that has the same coor
point_to_pointidx[index] = i;
} else if (num >= max_points) {
// out of boundary
return;
}
}
}
if (num == 0) {
point_to_pointidx[index] = index;
}
if (num < max_points) {
point_to_voxelidx[index] = num;
}
}
}
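// determin_voxel_num: launched with a single thread (<<<1, 1>>>); walks over
// the points in order, allocates a new voxel index whenever a point is the
// first one of its voxel (slot 0) until max_voxels is reached, and accumulates
// num_points_per_voxel through the first-point index recorded above.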
template <typename T_int>
__global__ void determin_voxel_num(
// const T_int* coor,
T_int* num_points_per_voxel, T_int* point_to_voxelidx,
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
const int max_points, const int max_voxels, const int num_points) {
// walk over all points serially and assign voxel indices in order
for (int i = 0; i < num_points; ++i) {
// if (coor[i][0] == -1)
// continue;
int point_pos_in_voxel = point_to_voxelidx[i];
// record voxel
if (point_pos_in_voxel == -1) {
// out of max_points or invalid point
continue;
} else if (point_pos_in_voxel == 0) {
// record new voxel
int voxelidx = voxel_num[0];
if (voxel_num[0] >= max_voxels) continue;
voxel_num[0] += 1;
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] = 1;
} else {
int point_idx = point_to_pointidx[i];
int voxelidx = coor_to_voxelidx[point_idx];
if (voxelidx != -1) {
coor_to_voxelidx[i] = voxelidx;
num_points_per_voxel[voxelidx] += 1;
}
}
}
}
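// Non-deterministic path, step 1: atomically count how many points fall into
// each unique voxel (reduce_count), record every point's arrival position
// inside its voxel (pts_id), and give each voxel an output slot (coors_order)
// in first-come order via atomicAdd on coors_count.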
__global__ void nondisterministic_get_assign_pos(
const int nthreads, const int32_t *coors_map, int32_t *pts_id,
int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
int coors_idx = coors_map[thread_idx];
if (coors_idx > -1) {
int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
pts_id[thread_idx] = coors_pts_pos;
if (coors_pts_pos == 0) {
coors_order[coors_idx] = atomicAdd(coors_count, 1);
}
}
}
}
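// Non-deterministic path, step 2: one thread per point; copies the point's
// features into the (voxel slot, point slot) chosen above when both fit within
// max_voxels / max_points. The point that arrived first in a voxel also writes
// the voxel's coordinates and its clamped point count.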
template<typename T>
__global__ void nondisterministic_assign_point_voxel(
const int nthreads, const T *points, const int32_t *coors_map,
const int32_t *pts_id, const int32_t *coors_in,
const int32_t *reduce_count, const int32_t *coors_order,
T *voxels, int32_t *coors, int32_t *pts_count, const int max_voxels,
const int max_points, const int num_features, const int NDim) {
CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
int coors_idx = coors_map[thread_idx];
int coors_pts_pos = pts_id[thread_idx];
if (coors_idx > -1) {
int coors_pos = coors_order[coors_idx];
if (coors_pos < max_voxels && coors_pts_pos < max_points) {
auto voxels_offset =
voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
auto points_offset = points + thread_idx * num_features;
for (int k = 0; k < num_features; k++) {
voxels_offset[k] = points_offset[k];
}
if (coors_pts_pos == 0) {
pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
auto coors_offset = coors + coors_pos * NDim;
auto coors_in_offset = coors_in + coors_idx * NDim;
for (int k = 0; k < NDim; k++) {
coors_offset[k] = coors_in_offset[k];
}
}
}
}
}
}
namespace voxelization {
int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
at::Tensor& coors, at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
// the current version takes about 0.04s per frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 2. map point to the idx of the corresponding voxel, find duplicate coor
// create some temporary variables
auto point_to_pointidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto point_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512);
AT_DISPATCH_ALL_TYPES(
temp_coors.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(), max_points,
max_voxels, num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 3. determine voxel num and voxel's coor index
// running this logic on the CUDA device accelerates it by roughly 10x
auto coor_to_voxelidx = -at::ones(
{
num_points,
},
points.options().dtype(at::kInt));
auto voxel_num = at::zeros(
{
1,
},
points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES(
temp_coors.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
point_to_pointidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
num_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 4. copy point features to voxels
// Step 4 & 5 could be parallel
auto pts_output_size = num_points * num_features;
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_point_to_voxel<float, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<float>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
voxels.contiguous().data_ptr<float>(), max_points, num_features,
num_points, NDim);
}));
// cudaDeviceSynchronize();
// AT_CUDA_CHECK(cudaGetLastError());
// 5. copy coors of each voxels
auto coors_output_size = num_points * NDim;
dim3 coors_cp_grid(
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(),
coor_to_voxelidx.contiguous().data_ptr<int>(),
coors.contiguous().data_ptr<int>(), num_points, NDim);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
return voxel_num_int;
}
int nondisterministic_hard_voxelize_gpu(
const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int max_points, const int max_voxels,
const int NDim = 3) {
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
if (num_points == 0)
return 0;
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(torch::kInt32));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
num_features, NDim);
}));
at::Tensor coors_map;
at::Tensor coors_count;
at::Tensor coors_order;
at::Tensor reduce_count;
at::Tensor pts_id;
auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
std::tie(temp_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, false);
if (temp_coors.index({0, 0}).lt(0).item<bool>()) {
// the first element of temp_coors is (-1,-1,-1) and should be removed
temp_coors = temp_coors.slice(0, 1);
coors_map = coors_map - 1;
}
int num_coors = temp_coors.size(0);
temp_coors = temp_coors.to(torch::kInt32);
coors_map = coors_map.to(torch::kInt32);
coors_count = coors_map.new_zeros(1);
coors_order = coors_map.new_empty(num_coors);
reduce_count = coors_map.new_zeros(num_coors);
pts_id = coors_map.new_zeros(num_points);
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "get_assign_pos", ([&] {
nondisterministic_get_assign_pos<<<cp_grid, cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>(
num_points,
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
coors_count.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>());
}));
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
nondisterministic_assign_point_voxel<scalar_t>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points, points.contiguous().data_ptr<scalar_t>(),
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
temp_coors.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>(),
voxels.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int32_t>(),
num_points_per_voxel.contiguous().data_ptr<int32_t>(),
max_voxels, max_points,
num_features, NDim);
}));
AT_CUDA_CHECK(cudaGetLastError());
return max_voxels < num_coors ? max_voxels : num_coors;
}
void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
// the current version takes about 0.04s per frame
// check device
CHECK_INPUT(points);
at::cuda::CUDAGuard device_guard(points.device());
const int num_points = points.size(0);
const int num_features = points.size(1);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
dim3 blocks(col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return;
}
} // namespace voxelization
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair
from .voxel_layer import dynamic_voxelize, hard_voxelize
class _Voxelization(Function):
@staticmethod
def forward(ctx,
points,
voxel_size,
coors_range,
max_points=35,
max_voxels=20000,
deterministic=True):
"""convert kitti points(N, >=3) to voxels.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz points
and points[:, 3:] contain other information like reflectivity
voxel_size: [3] list/tuple or array, float. xyz, indicate voxel
size
coors_range: [6] list/tuple or array, float. indicate voxel
range. format: xyzxyz, minmax
max_points: int. indicate the maximum number of points a voxel can
hold. if max_points=-1, it means using dynamic_voxelize
max_voxels: int. indicate the maximum number of voxels this
function creates. for SECOND, 20000 is a good choice. Users
should shuffle points before calling this function because
max_voxels may drop points.
deterministic: bool. whether to use the deterministic version of
hard voxelization. the non-deterministic version is
considerably faster but is not deterministic. only
affects hard voxelization. default True. for more information
of this argument and the implementation insights, please refer
to the following links:
https://github.com/open-mmlab/mmdetection3d/issues/894
https://github.com/open-mmlab/mmdetection3d/pull/904
it is an experimental feature and we will appreciate it if
you could share with us the failing cases.
Returns:
voxels: [M, max_points, ndim] float tensor. only contain points
and returned when max_points != -1.
coordinates: [M, 3] int32 tensor, always returned.
num_points_per_voxel: [M] int32 tensor. Only returned when
max_points != -1.
"""
if max_points == -1 or max_voxels == -1:
coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
dynamic_voxelize(points, coors, voxel_size, coors_range, 3)
return coors
else:
voxels = points.new_zeros(
size=(max_voxels, max_points, points.size(1)))
coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
num_points_per_voxel = points.new_zeros(
size=(max_voxels, ), dtype=torch.int)
voxel_num = hard_voxelize(points, voxels, coors,
num_points_per_voxel, voxel_size,
coors_range, max_points, max_voxels, 3,
deterministic)
# select the valid voxels
voxels_out = voxels[:voxel_num]
coors_out = coors[:voxel_num]
num_points_per_voxel_out = num_points_per_voxel[:voxel_num]
return voxels_out, coors_out, num_points_per_voxel_out
voxelization = _Voxelization.apply
class Voxelization(nn.Module):
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000,
deterministic=True):
super(Voxelization, self).__init__()
"""
Args:
voxel_size (list): list [x, y, z] size of three dimension
point_cloud_range (list):
[x_min, y_min, z_min, x_max, y_max, z_max]
max_num_points (int): max number of points per voxel
max_voxels (tuple or int): max number of voxels in
(training, testing) time
deterministic: bool. whether to use the deterministic version of
hard voxelization. the non-deterministic version is
considerably faster but is not deterministic. only
affects hard voxelization. default True. for more information
of this argument and the implementation insights, please refer
to the following links:
https://github.com/open-mmlab/mmdetection3d/issues/894
https://github.com/open-mmlab/mmdetection3d/pull/904
it is an experimental feature and we will appreciate it if
you could share with us the failing cases.
"""
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.max_num_points = max_num_points
if isinstance(max_voxels, tuple):
self.max_voxels = max_voxels
else:
self.max_voxels = _pair(max_voxels)
self.deterministic = deterministic
point_cloud_range = torch.tensor(
point_cloud_range, dtype=torch.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = torch.round(grid_size).long()
input_feat_shape = grid_size[:2]
self.grid_size = grid_size
# the original shape is [x-len, y-len, z-len]
# [w, h, d] -> [d, h, w]
self.pcd_shape = [*input_feat_shape, 1][::-1]
def forward(self, input):
"""
Args:
input: (N, C) float tensor of points
"""
if self.training:
max_voxels = self.max_voxels[0]
else:
max_voxels = self.max_voxels[1]
return voxelization(input, self.voxel_size, self.point_cloud_range,
self.max_num_points, max_voxels,
self.deterministic)
def __repr__(self):
tmpstr = self.__class__.__name__ + '('
tmpstr += 'voxel_size=' + str(self.voxel_size)
tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
tmpstr += ', max_num_points=' + str(self.max_num_points)
tmpstr += ', max_voxels=' + str(self.max_voxels)
tmpstr += ', deterministic=' + str(self.deterministic)
tmpstr += ')'
return tmpstr
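# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the module above). The voxel
# size, point-cloud range and the random point tensor are assumed values.
# ---------------------------------------------------------------------------
if torch.cuda.is_available():
    points = torch.rand(1000, 4).cuda()  # (N, C) points: xyz + reflectivity
    voxel_generator = Voxelization(
        voxel_size=[0.05, 0.05, 0.1],
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],  # xyzxyz, min-max
        max_num_points=35,
        max_voxels=(16000, 40000))
    voxels, coors, num_points_per_voxel = voxel_generator(points)
    # voxels: (M, 35, C) padded point features per voxel
    # coors:  (M, 3) integer voxel coordinates in (z, y, x) order
    # num_points_per_voxel: (M,) number of valid points in each voxel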
...@@ -224,97 +224,5 @@ if __name__ == '__main__':
'build': parse_requirements('requirements/build.txt'),
'optional': parse_requirements('requirements/optional.txt'),
},
ext_modules=[
make_cuda_ext(
name='sparse_conv_ext',
module='mmdet3d.ops.spconv',
extra_include_path=[
# PyTorch 1.5 uses ninjia, which requires absolute path
# of included files, relative path will cause failure.
os.path.abspath(
os.path.join(*'mmdet3d.ops.spconv'.split('.'),
'include/'))
],
sources=[
'src/all.cc',
'src/reordering.cc',
'src/reordering_cuda.cu',
'src/indice.cc',
'src/indice_cuda.cu',
'src/maxpool.cc',
'src/maxpool_cuda.cu',
],
extra_args=['-w', '-std=c++14']),
make_cuda_ext(
name='iou3d_cuda',
module='mmdet3d.ops.iou3d',
sources=[
'src/iou3d.cpp',
'src/iou3d_kernel.cu',
]),
make_cuda_ext(
name='voxel_layer',
module='mmdet3d.ops.voxel',
sources=[
'src/voxelization.cpp',
'src/scatter_points_cpu.cpp',
'src/scatter_points_cuda.cu',
'src/voxelization_cpu.cpp',
'src/voxelization_cuda.cu',
]),
make_cuda_ext(
name='roiaware_pool3d_ext',
module='mmdet3d.ops.roiaware_pool3d',
sources=[
'src/roiaware_pool3d.cpp',
'src/points_in_boxes_cpu.cpp',
],
sources_cuda=[
'src/roiaware_pool3d_kernel.cu',
'src/points_in_boxes_cuda.cu',
]),
make_cuda_ext(
name='roipoint_pool3d_ext',
module='mmdet3d.ops.roipoint_pool3d',
sources=['src/roipoint_pool3d.cpp'],
sources_cuda=['src/roipoint_pool3d_kernel.cu']),
make_cuda_ext(
name='ball_query_ext',
module='mmdet3d.ops.ball_query',
sources=['src/ball_query.cpp'],
sources_cuda=['src/ball_query_cuda.cu']),
make_cuda_ext(
name='knn_ext',
module='mmdet3d.ops.knn',
sources=['src/knn.cpp'],
sources_cuda=['src/knn_cuda.cu']),
make_cuda_ext(
name='assign_score_withk_ext',
module='mmdet3d.ops.paconv',
sources=['src/assign_score_withk.cpp'],
sources_cuda=['src/assign_score_withk_cuda.cu']),
make_cuda_ext(
name='group_points_ext',
module='mmdet3d.ops.group_points',
sources=['src/group_points.cpp'],
sources_cuda=['src/group_points_cuda.cu']),
make_cuda_ext(
name='interpolate_ext',
module='mmdet3d.ops.interpolate',
sources=['src/interpolate.cpp'],
sources_cuda=[
'src/three_interpolate_cuda.cu', 'src/three_nn_cuda.cu'
]),
make_cuda_ext(
name='furthest_point_sample_ext',
module='mmdet3d.ops.furthest_point_sample',
sources=['src/furthest_point_sample.cpp'],
sources_cuda=['src/furthest_point_sample_cuda.cu']),
make_cuda_ext(
name='gather_points_ext',
module='mmdet3d.ops.gather_points',
sources=['src/gather_points.cpp'],
sources_cuda=['src/gather_points_cuda.cu'])
],
cmdclass={'build_ext': BuildExtension},
zip_safe=False)
...@@ -2,190 +2,7 @@
import pytest
import torch
from mmdet3d.ops import PAConv, PAConvCUDA, assign_score_withk
from mmdet3d.ops import PAConv, PAConvCUDA
def test_paconv_assign_scores():
if not torch.cuda.is_available():
pytest.skip()
scores = torch.tensor([[[[0.06947571, 0.6065746], [0.28462553, 0.8378516],
[0.7595994, 0.97220325], [0.519155, 0.766185]],
[[0.15348864, 0.6051019], [0.21510637, 0.31916398],
[0.00236845, 0.5842595], [0.6783676, 0.5216348]]],
[[[0.23089725, 0.5568468], [0.7405102, 0.06438422],
[0.6887394, 0.22089851], [0.0502342, 0.79228795]],
[[0.44883424, 0.15427643],
[0.13817799, 0.34856772], [0.7989621, 0.33788306],
[0.15699774, 0.7693662]]]]).float().cuda()
scores.requires_grad_()
points = torch.tensor([[[[0.06001121, 0.92963666, 0.5753327, 0.7251477],
[0.53563064, 0.23129565, 0.92366195, 0.44261628]],
[[0.5770022, 0.56625944, 0.23560429, 0.11178821],
[0.7735967, 0.95678777, 0.25468266, 0.02895975]],
[[0.0589869, 0.09017515, 0.5977862, 0.02797985],
[0.603862, 0.35991007, 0.85761684, 0.3096559]],
[[0.22359002, 0.13983732, 0.5544243, 0.68863827],
[0.85646236, 0.75651926, 0.8638947, 0.83600986]],
[[0.45424145, 0.27458847, 0.6456112, 0.47162914],
[0.15773582, 0.47645122, 0.79964715, 0.3323908]],
[[0.8351399, 0.84696376, 0.9431732, 0.29418713],
[0.77168906, 0.6996871, 0.19354361, 0.03392768]],
[[0.30976456, 0.7074133, 0.581795, 0.976677],
[0.69656056, 0.07199162, 0.4708506, 0.29117996]],
[[0.5829035, 0.30201727, 0.76556486, 0.0935446],
[0.88030535, 0.16129416, 0.9242525, 0.49545723]]],
[[[0.50899494, 0.06482804, 0.44939405, 0.37704808],
[0.47028124, 0.11969638, 0.62823206, 0.28560323]],
[[0.40690207, 0.689753, 0.51636654, 0.23040164],
[0.06935787, 0.00488842, 0.22462702, 0.09182382]],
[[0.26611632, 0.00184339, 0.7730655, 0.5228131],
[0.87776035, 0.77895886, 0.2787183, 0.16620636]],
[[0.502574, 0.04039001, 0.5368497, 0.98379374],
[0.40973026, 0.3238272, 0.9733018, 0.13988364]],
[[0.04586202, 0.20983845, 0.20662665, 0.22270602],
[0.60387236, 0.5155574, 0.51237285, 0.6528438]],
[[0.45735973, 0.86821306, 0.61054605, 0.8370336],
[0.45193362, 0.3734138, 0.7825672, 0.5699416]],
[[0.44591594, 0.12447512, 0.09282011, 0.7055254],
[0.25223452, 0.46696228, 0.7051136, 0.892151]],
[[0.49615085, 0.47321403, 0.93138885, 0.7652197],
[0.38766378, 0.30332977, 0.23131835,
0.02863514]]]]).float().cuda()
points.requires_grad_()
centers = torch.tensor([[[[0.83878064, 0.96658987, 0.8033424, 0.9598312],
[0.45035273, 0.8768925, 0.977736, 0.54547966]],
[[0.01041394, 0.597893, 0.36212963, 0.4410367],
[0.94879234, 0.8372817, 0.21237361, 0.67945415]],
[[0.5096087, 0.26401454, 0.60034937, 0.5417416],
[0.87591463, 0.546456, 0.4096033, 0.16373193]],
[[0.79547447, 0.1482386, 0.12840575, 0.45384115],
[0.5640288, 0.944541, 0.5745328, 0.73229736]],
[[0.93011934, 0.7406011, 0.62621707, 0.8677915],
[0.91563636, 0.3595413, 0.6678378, 0.6085383]],
[[0.22431666, 0.65617776, 0.7483924, 0.6263364],
[0.30968404, 0.78204364, 0.14899081,
0.09628749]],
[[0.73675203, 0.72104895, 0.4648038, 0.6101647],
[0.7817645, 0.16572917, 0.3311919, 0.43407398]],
[[0.8193154, 0.09559608, 0.05978829, 0.90262103],
[0.4256065, 0.8165596, 0.8206446, 0.6604721]]],
[[[0.7159653, 0.18600845, 0.21433902, 0.3159626],
[0.3921569, 0.33221376, 0.5061177, 0.7961841]],
[[0.95338356, 0.04785997, 0.67185795, 0.6538394],
[0.4729132, 0.33404195, 0.17750603, 0.8445621]],
[[0.6755793, 0.16193843, 0.75943846, 0.92123103],
[0.2781859, 0.03114432, 0.710638, 0.52729136]],
[[0.8376105, 0.10858494, 0.13208169, 0.365772],
[0.5930795, 0.27390373, 0.14036089, 0.170403]],
[[0.3479789, 0.89855295, 0.04844379, 0.9871029],
[0.29781651, 0.0244137, 0.9179047, 0.8081611]],
[[0.12460887, 0.44991326, 0.19382608, 0.35037738],
[0.2773472, 0.4362057, 0.36757517, 0.5993509]],
[[0.29630446, 0.90046406, 0.5417113, 0.13510644],
[0.09623539, 0.04226565, 0.32001644,
0.44358212]],
[[0.5274848, 0.82096446, 0.9415489, 0.7123748],
[0.7537517, 0.8086482, 0.85345286,
0.7472754]]]]).float().cuda()
centers.requires_grad_()
knn_idx = torch.tensor([[[6, 7, 4, 6], [2, 4, 2, 4]],
[[7, 1, 3, 2], [6, 0, 2, 6]]]).long().cuda()
aggregate = 'sum'
expected_output = torch.tensor(
[[[[-0.08134781, 0.03877336, -0.8212776, -0.2869547],
[-0.23378491, -0.24112664, -0.1600166, -0.4121864]],
[[-0.05780616, -0.12298299, -0.0370461, -0.07889931],
[-0.13956165, -0.02006848, -0.10940295, -0.0293439]],
[[0.09284145, 0.58250105, 0.5927749, 0.16774094],
[0.27070042, 0.13422406, 0.2617501, 0.23416464]],
[[-0.06121218, -0.09561322, -0.20408826, 0.08079343],
[0.00944228, 0.03874819, 0.08404065, 0.04041629]]],
[[[-0.2110898, -0.13335688, -0.09315082, 0.08512095],
[0.09121774, 0.15976946, 0.23994486, 0.14350912]],
[[-0.36167958, -0.14891288, -0.64470863, -0.0646704],
[-0.28276974, -0.08847666, -0.46904767, 0.20491874]],
[[-0.34877953, -0.35533834, -0.25225785, -0.4638189],
[-0.1420663, 0.09467781, 0.17088932, 0.22580585]],
[[-0.3879708, -0.3991068, 0.05276498, -0.46989647],
[0.32522714, -0.02163534, 0.21604237, 0.4346682]]]]).float()
# test forward
output = assign_score_withk(scores, points, centers, knn_idx, aggregate)
assert torch.allclose(output.detach().cpu(), expected_output, atol=1e-6)
# test backward
loss = output.sum()
loss.backward()
expected_scores_grad = torch.tensor([[[[0.04288036, -0.18217683],
[-0.78873926, 0.7485497],
[-0.6866992, 0.05346543],
[0.04288036, -0.18217683]],
[[-1.1407862, 0.13533896],
[-0.06964391, -0.22948086],
[-1.1407862, 0.13533896],
[-0.06964391, -0.22948086]]],
[[[-0.3363995, -2.212181],
[-1.1589496, -2.7724311],
[-0.9387654, -1.3163853],
[-1.4385346, -1.0614843]],
[[-0.5048497, 1.4143617],
[-0.47332114, 0.6017133],
[-0.30974793, 1.1995442],
[-0.5048497, 1.4143617]]]]).float()
expected_points_grad = torch.tensor(
[[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0.15585709, 0.15585709, 0.15585709, 0.15585709],
[1.1893613, 1.1893613, 1.1893613, 1.1893613]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[1.6530733, 1.6530733, 1.6530733, 1.6530733],
[1.8130021, 1.8130021, 1.8130021, 1.8130021]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0.58863074, 0.58863074, 0.58863074, 0.58863074],
[1.3727596, 1.3727596, 1.3727596, 1.3727596]],
[[0.28462553, 0.28462553, 0.28462553, 0.28462553],
[0.8378516, 0.8378516, 0.8378516, 0.8378516]]],
[[[0.13817799, 0.13817799, 0.13817799, 0.13817799],
[0.34856772, 0.34856772, 0.34856772, 0.34856772]],
[[0.7405102, 0.7405102, 0.7405102, 0.7405102],
[0.06438422, 0.06438422, 0.06438422, 0.06438422]],
[[0.8491963, 0.8491963, 0.8491963, 0.8491963],
[1.1301711, 1.1301711, 1.1301711, 1.1301711]],
[[0.6887394, 0.6887394, 0.6887394, 0.6887394],
[0.22089851, 0.22089851, 0.22089851, 0.22089851]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0.605832, 0.605832, 0.605832, 0.605832],
[0.92364264, 0.92364264, 0.92364264, 0.92364264]],
[[0.23089725, 0.23089725, 0.23089725, 0.23089725],
[0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float()
expected_centers_grad = torch.tensor(
[[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[-1.0493311, -1.0493311, -1.0493311, -1.0493311],
[-2.0301602, -2.0301602, -2.0301602, -2.0301602]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[-1.6328557, -1.6328557, -1.6328557, -1.6328557],
[-3.1828144, -3.1828144, -3.1828144, -3.1828144]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]]],
[[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[0., 0., 0., 0.], [0., 0., 0., 0.]],
[[-1.5429721, -1.5429721, -1.5429721, -1.5429721],
[-1.6100934, -1.6100934, -1.6100934, -1.6100934]],
[[-1.7103812, -1.7103812, -1.7103812, -1.7103812],
[-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float()
assert torch.allclose(
scores.grad.detach().cpu(), expected_scores_grad, atol=1e-6)
assert torch.allclose(
points.grad.detach().cpu(), expected_points_grad, atol=1e-6)
assert torch.allclose(
centers.grad.detach().cpu(), expected_centers_grad, atol=1e-6)
def test_paconv():
...
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmdet3d.ops import (ball_query, furthest_point_sample,
furthest_point_sample_with_dist, gather_points,
grouping_operation, knn, three_interpolate, three_nn)
def test_fps():
if not torch.cuda.is_available():
pytest.skip()
xyz = torch.tensor([[[-0.2748, 1.0020, -1.1674], [0.1015, 1.3952, -1.2681],
[-0.8070, 2.4137,
-0.5845], [-1.0001, 2.1982, -0.5859],
[0.3841, 1.8983, -0.7431]],
[[-1.0696, 3.0758,
-0.1899], [-0.2559, 3.5521, -0.1402],
[0.8164, 4.0081, -0.1839], [-1.1000, 3.0213, -0.8205],
[-0.0518, 3.7251, -0.3950]]]).cuda()
idx = furthest_point_sample(xyz, 3)
expected_idx = torch.tensor([[0, 2, 4], [0, 2, 1]]).cuda()
assert torch.all(idx == expected_idx)
def test_ball_query():
if not torch.cuda.is_available():
pytest.skip()
new_xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625],
[-2.2769, 2.7817, -0.2334],
[-0.4003, 2.4666, -0.5116],
[-0.0740, 1.3147, -1.3625],
[-0.0740, 1.3147, -1.3625]],
[[-2.0289, 2.4952, -0.1708],
[-2.0668, 6.0278, -0.4875],
[0.4066, 1.4211, -0.2947],
[-2.0289, 2.4952, -0.1708],
[-2.0289, 2.4952, -0.1708]]]).cuda()
xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
[-0.4003, 2.4666,
-0.5116], [-0.5251, 2.4379, -0.8466],
[-0.9691, 1.1418,
-1.3733], [-0.2232, 0.9561, -1.3626],
[-2.2769, 2.7817, -0.2334],
[-0.2822, 1.3192, -1.3645], [0.1533, 1.5024, -1.0432],
[0.4917, 1.1529, -1.3496]],
[[-2.0289, 2.4952,
-0.1708], [-0.7188, 0.9956, -0.5096],
[-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
[0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
[-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
[0.3220, 1.4447, 0.3548], [-0.9744, 2.3856,
-1.2000]]]).cuda()
idx = ball_query(0, 0.2, 5, xyz, new_xyz)
expected_idx = torch.tensor([[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6],
[2, 2, 2, 2, 2], [0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]],
[[0, 0, 0, 0, 0], [2, 2, 2, 2, 2],
[7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]]).cuda()
assert torch.all(idx == expected_idx)
# test dilated ball query
idx = ball_query(0.2, 0.4, 5, xyz, new_xyz)
expected_idx = torch.tensor([[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6],
[2, 3, 2, 2, 2], [0, 5, 7, 0, 0],
[0, 5, 7, 0, 0]],
[[0, 0, 0, 0, 0], [2, 2, 2, 2, 2],
[7, 7, 7, 7, 7], [0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]]).cuda()
assert torch.all(idx == expected_idx)
def test_knn():
if not torch.cuda.is_available():
pytest.skip()
new_xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625],
[-2.2769, 2.7817, -0.2334],
[-0.4003, 2.4666, -0.5116],
[-0.0740, 1.3147, -1.3625],
[-0.0740, 1.3147, -1.3625]],
[[-2.0289, 2.4952, -0.1708],
[-2.0668, 6.0278, -0.4875],
[0.4066, 1.4211, -0.2947],
[-2.0289, 2.4952, -0.1708],
[-2.0289, 2.4952, -0.1708]]]).cuda()
xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],
[-0.4003, 2.4666,
-0.5116], [-0.5251, 2.4379, -0.8466],
[-0.9691, 1.1418,
-1.3733], [-0.2232, 0.9561, -1.3626],
[-2.2769, 2.7817, -0.2334],
[-0.2822, 1.3192, -1.3645], [0.1533, 1.5024, -1.0432],
[0.4917, 1.1529, -1.3496]],
[[-2.0289, 2.4952,
-0.1708], [-0.7188, 0.9956, -0.5096],
[-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],
[0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],
[-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],
[0.3220, 1.4447, 0.3548], [-0.9744, 2.3856,
-1.2000]]]).cuda()
idx = knn(5, xyz, new_xyz)
new_xyz_ = new_xyz.unsqueeze(2).repeat(1, 1, xyz.shape[1], 1)
xyz_ = xyz.unsqueeze(1).repeat(1, new_xyz.shape[1], 1, 1)
dist = ((new_xyz_ - xyz_) * (new_xyz_ - xyz_)).sum(-1)
expected_idx = dist.topk(k=5, dim=2, largest=False)[1].transpose(2, 1)
assert torch.all(idx == expected_idx)
idx = knn(5,
xyz.transpose(1, 2).contiguous(),
new_xyz.transpose(1, 2).contiguous(), True)
assert torch.all(idx == expected_idx)
idx = knn(5, xyz, xyz)
xyz_ = xyz.unsqueeze(2).repeat(1, 1, xyz.shape[1], 1)
xyz__ = xyz.unsqueeze(1).repeat(1, xyz.shape[1], 1, 1)
dist = ((xyz_ - xyz__) * (xyz_ - xyz__)).sum(-1)
expected_idx = dist.topk(k=5, dim=2, largest=False)[1].transpose(2, 1)
assert torch.all(idx == expected_idx)
def test_grouping_points():
if not torch.cuda.is_available():
pytest.skip()
idx = torch.tensor([[[0, 0, 0], [3, 3, 3], [8, 8, 8], [0, 0, 0], [0, 0, 0],
[0, 0, 0]],
[[0, 0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0], [0, 0, 0],
[0, 0, 0]]]).int().cuda()
features = torch.tensor([[[
0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164, -1.8274,
0.9268, 0.8414
],
[
5.4247, 1.5113, 2.3944, 1.4740, 5.0300,
5.1030, 1.9360, 2.1939, 2.1581, 3.4666
],
[
-1.6266, -1.0281, -1.0393, -1.6931, -1.3982,
-0.5732, -1.0830, -1.7561, -1.6786, -1.6967
]],
[[
-0.0380, -0.1880, -1.5724, 0.6905, -0.3190,
0.7798, -0.3693, -0.9457, -0.2942, -1.8527
],
[
1.1773, 1.5009, 2.6399, 5.9242, 1.0962,
2.7346, 6.0865, 1.5555, 4.3303, 2.8229
],
[
-0.6646, -0.6870, -0.1125, -0.2224, -0.3445,
-1.4049, 0.4990, -0.7037, -0.9924, 0.0386
]]]).cuda()
output = grouping_operation(features, idx)
expected_output = torch.tensor([[[[0.5798, 0.5798, 0.5798],
[-1.3311, -1.3311, -1.3311],
[0.9268, 0.9268, 0.9268],
[0.5798, 0.5798, 0.5798],
[0.5798, 0.5798, 0.5798],
[0.5798, 0.5798, 0.5798]],
[[5.4247, 5.4247, 5.4247],
[1.4740, 1.4740, 1.4740],
[2.1581, 2.1581, 2.1581],
[5.4247, 5.4247, 5.4247],
[5.4247, 5.4247, 5.4247],
[5.4247, 5.4247, 5.4247]],
[[-1.6266, -1.6266, -1.6266],
[-1.6931, -1.6931, -1.6931],
[-1.6786, -1.6786, -1.6786],
[-1.6266, -1.6266, -1.6266],
[-1.6266, -1.6266, -1.6266],
[-1.6266, -1.6266, -1.6266]]],
[[[-0.0380, -0.0380, -0.0380],
[-0.3693, -0.3693, -0.3693],
[-1.8527, -1.8527, -1.8527],
[-0.0380, -0.0380, -0.0380],
[-0.0380, -0.0380, -0.0380],
[-0.0380, -0.0380, -0.0380]],
[[1.1773, 1.1773, 1.1773],
[6.0865, 6.0865, 6.0865],
[2.8229, 2.8229, 2.8229],
[1.1773, 1.1773, 1.1773],
[1.1773, 1.1773, 1.1773],
[1.1773, 1.1773, 1.1773]],
[[-0.6646, -0.6646, -0.6646],
[0.4990, 0.4990, 0.4990],
[0.0386, 0.0386, 0.0386],
[-0.6646, -0.6646, -0.6646],
[-0.6646, -0.6646, -0.6646],
[-0.6646, -0.6646, -0.6646]]]]).cuda()
assert torch.allclose(output, expected_output)
def test_gather_points():
if not torch.cuda.is_available():
pytest.skip()
features = torch.tensor([[[
-1.6095, -0.1029, -0.8876, -1.2447, -2.4031, 0.3708, -1.1586, -1.4967,
-0.4800, 0.2252
],
[
1.9138, 3.4979, 1.6854, 1.5631, 3.6776,
3.1154, 2.1705, 2.5221, 2.0411, 3.1446
],
[
-1.4173, 0.3073, -1.4339, -1.4340, -1.2770,
-0.2867, -1.4162, -1.4044, -1.4245, -1.4074
]],
[[
0.2160, 0.0842, 0.3661, -0.2749, -0.4909,
-0.6066, -0.8773, -0.0745, -0.9496, 0.1434
],
[
1.3644, 1.8087, 1.6855, 1.9563, 1.2746,
1.9662, 0.9566, 1.8778, 1.1437, 1.3639
],
[
-0.7172, 0.1692, 0.2241, 0.0721, -0.7540,
0.0462, -0.6227, 0.3223, -0.6944, -0.5294
]]]).cuda()
idx = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]]).int().cuda()
output = gather_points(features, idx)
expected_output = torch.tensor(
[[[-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],
[1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],
[-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173]],
[[0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],
[1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],
[-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]]).cuda()
assert torch.allclose(output, expected_output)
output_half = gather_points(features.half(), idx)
assert torch.allclose(output_half, expected_output.half())
def test_three_interpolate():
if not torch.cuda.is_available():
pytest.skip()
features = torch.tensor([[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350],
[3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236],
[2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732],
[0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124],
[0.3207, 0.0000, 0.3411, 0.3207, 0.3207,
0.3207]],
[[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000],
[0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346],
[0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000],
[0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414],
[0.5814, 0.0103, 0.0000, 0.5814, 0.5814,
0.5814]]]).cuda()
idx = torch.tensor([[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2],
[0, 1, 3]],
[[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4],
[0, 1, 2]]]).int().cuda()
weight = torch.tensor([[[3.3333e-01, 3.3333e-01, 3.3333e-01],
[1.0000e+00, 5.8155e-08, 2.2373e-08],
[1.0000e+00, 1.7737e-08, 1.7356e-08],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01]],
[[3.3333e-01, 3.3333e-01, 3.3333e-01],
[1.0000e+00, 1.3651e-08, 7.7312e-09],
[1.0000e+00, 1.7148e-08, 1.4070e-08],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01],
[3.3333e-01, 3.3333e-01, 3.3333e-01]]]).cuda()
output = three_interpolate(features, idx, weight)
expected_output = torch.tensor([[[
3.8953e+00, 4.4995e+00, 4.4995e+00, 3.8953e+00, 3.8953e+00, 3.2072e+00
], [
2.9320e+00, 3.0447e+00, 3.0447e+00, 2.9320e+00, 2.9320e+00, 2.9583e+00
], [
2.7281e+00, 2.6436e+00, 2.6436e+00, 2.7281e+00, 2.7281e+00, 2.7380e+00
], [
4.6824e+00, 7.0199e+00, 7.0199e+00, 4.6824e+00, 4.6824e+00, 2.3466e+00
], [
2.2060e-01, 3.4110e-01, 3.4110e-01, 2.2060e-01, 2.2060e-01, 2.1380e-01
]],
[[
8.1773e-01, 9.5440e-01, 2.4532e+00,
8.1773e-01, 8.1773e-01, 1.1359e+00
],
[
8.4689e-01, 1.9176e+00, 1.4715e+00,
8.4689e-01, 8.4689e-01, 1.3079e+00
],
[
6.9473e-01, 2.7440e-01, 2.0842e+00,
6.9473e-01, 6.9473e-01, 7.8619e-01
],
[
7.6789e-01, 1.5063e+00, 1.6209e+00,
7.6789e-01, 7.6789e-01, 1.1562e+00
],
[
3.8760e-01, 1.0300e-02, 8.3569e-09,
3.8760e-01, 3.8760e-01, 1.9723e-01
]]]).cuda()
assert torch.allclose(output, expected_output, 1e-4)
def test_three_nn():
if not torch.cuda.is_available():
pytest.skip()
known = torch.tensor([[[-1.8373, 3.5605,
-0.7867], [0.7615, 2.9420, 0.2314],
[-0.6503, 3.6637, -1.0622],
[-1.8373, 3.5605, -0.7867],
[-1.8373, 3.5605, -0.7867]],
[[-1.3399, 1.9991, -0.3698],
[-0.0799, 0.9698,
-0.8457], [0.0858, 2.4721, -0.1928],
[-1.3399, 1.9991, -0.3698],
[-1.3399, 1.9991, -0.3698]]]).cuda()
unknown = torch.tensor([[[-1.8373, 3.5605, -0.7867],
[0.7615, 2.9420, 0.2314],
[-0.6503, 3.6637, -1.0622],
[-1.5237, 2.3976, -0.8097],
[-0.0722, 3.4017, -0.2880],
[0.5198, 3.0661, -0.4605],
[-2.0185, 3.5019, -0.3236],
[0.5098, 3.1020, 0.5799],
[-1.6137, 3.8443, -0.5269],
[0.7341, 2.9626, -0.3189]],
[[-1.3399, 1.9991, -0.3698],
[-0.0799, 0.9698, -0.8457],
[0.0858, 2.4721, -0.1928],
[-0.9022, 1.6560, -1.3090],
[0.1156, 1.6901, -0.4366],
[-0.6477, 2.3576, -0.1563],
[-0.8482, 1.1466, -1.2704],
[-0.8753, 2.0845, -0.3460],
[-0.5621, 1.4233, -1.2858],
[-0.5883, 1.3114, -1.2899]]]).cuda()
dist, idx = three_nn(unknown, known)
expected_dist = torch.tensor([[[0.0000, 0.0000, 0.0000],
[0.0000, 2.0463, 2.8588],
[0.0000, 1.2229, 1.2229],
[1.2047, 1.2047, 1.2047],
[1.0011, 1.0845, 1.8411],
[0.7433, 1.4451, 2.4304],
[0.5007, 0.5007, 0.5007],
[0.4587, 2.0875, 2.7544],
[0.4450, 0.4450, 0.4450],
[0.5514, 1.7206, 2.6811]],
[[0.0000, 0.0000, 0.0000],
[0.0000, 1.6464, 1.6952],
[0.0000, 1.5125, 1.5125],
[1.0915, 1.0915, 1.0915],
[0.8197, 0.8511, 1.4894],
[0.7433, 0.8082, 0.8082],
[0.8955, 1.3340, 1.3340],
[0.4730, 0.4730, 0.4730],
[0.7949, 1.3325, 1.3325],
[0.7566, 1.3727, 1.3727]]]).cuda()
expected_idx = torch.tensor([[[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4],
[2, 1, 0], [1, 2, 0], [0, 3, 4], [1, 2, 0],
[0, 3, 4], [1, 2, 0]],
[[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4],
[2, 1, 0], [2, 0, 3], [1, 0, 3], [0, 3, 4],
[1, 0, 3], [1, 0, 3]]]).cuda()
assert torch.allclose(dist, expected_dist, 1e-4)
assert torch.all(idx == expected_idx)
def test_fps_with_dist():
if not torch.cuda.is_available():
pytest.skip()
xyz = torch.tensor([[[-0.2748, 1.0020, -1.1674], [0.1015, 1.3952, -1.2681],
[-0.8070, 2.4137,
-0.5845], [-1.0001, 2.1982, -0.5859],
[0.3841, 1.8983, -0.7431]],
[[-1.0696, 3.0758,
-0.1899], [-0.2559, 3.5521, -0.1402],
[0.8164, 4.0081, -0.1839], [-1.1000, 3.0213, -0.8205],
[-0.0518, 3.7251, -0.3950]]]).cuda()
expected_idx = torch.tensor([[0, 2, 4], [0, 2, 1]]).cuda()
xyz_square_dist = ((xyz.unsqueeze(dim=1) -
xyz.unsqueeze(dim=2))**2).sum(-1)
idx = furthest_point_sample_with_dist(xyz_square_dist, 3)
assert torch.all(idx == expected_idx)
import numpy as np
fps_idx = np.load('tests/data/ops/fps_idx.npy')
features_for_fps_distance = np.load(
'tests/data/ops/features_for_fps_distance.npy')
expected_idx = torch.from_numpy(fps_idx).cuda()
features_for_fps_distance = torch.from_numpy(
features_for_fps_distance).cuda()
idx = furthest_point_sample_with_dist(features_for_fps_distance, 16)
assert torch.all(idx == expected_idx)
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import pytest
import torch
from mmdet3d.ops.roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_all,
points_in_boxes_cpu,
points_in_boxes_part)
def test_RoIAwarePool3d():
# RoIAwarePool3d only support gpu version currently.
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
roiaware_pool3d_max = RoIAwarePool3d(
out_size=4, max_pts_per_voxel=128, mode='max')
roiaware_pool3d_avg = RoIAwarePool3d(
out_size=4, max_pts_per_voxel=128, mode='avg')
rois = torch.tensor(
[[1.0, 2.0, 3.0, 5.0, 4.0, 6.0, -0.3 - np.pi / 2],
[-10.0, 23.0, 16.0, 20.0, 10.0, 20.0, -0.5 - np.pi / 2]],
dtype=torch.float32).cuda(
) # boxes (m, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],
[-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]],
dtype=torch.float32).cuda() # points (n, 3) in lidar coordinate
pts_feature = pts.clone()
pooled_features_max = roiaware_pool3d_max(
rois=rois, pts=pts, pts_feature=pts_feature)
assert pooled_features_max.shape == torch.Size([2, 4, 4, 4, 3])
assert torch.allclose(pooled_features_max.sum(),
torch.tensor(51.100).cuda(), 1e-3)
pooled_features_avg = roiaware_pool3d_avg(
rois=rois, pts=pts, pts_feature=pts_feature)
assert pooled_features_avg.shape == torch.Size([2, 4, 4, 4, 3])
assert torch.allclose(pooled_features_avg.sum(),
torch.tensor(49.750).cuda(), 1e-3)
def test_points_in_boxes_part():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
boxes = torch.tensor(
[[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3]],
[[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
dtype=torch.float32).cuda(
) # boxes (b, t, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2]],
[[3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9], [-21.3, -52, -5],
[0, 0, 0], [6, 7, 8], [-2, -3, -4], [6, 4, 9]]],
dtype=torch.float32).cuda() # points (b, m, 3) in lidar coordinate
point_indices = points_in_boxes_part(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[0, 0, 0, 0, 0, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1]],
dtype=torch.int32).cuda()
assert point_indices.shape == torch.Size([2, 8])
assert (point_indices == expected_point_indices).all()
boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],
dtype=torch.float32).cuda() # 30 degrees
pts = torch.tensor(
[[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0],
[-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],
dtype=torch.float32).cuda()
point_indices = points_in_boxes_part(points=pts, boxes=boxes)
expected_point_indices = torch.tensor([[-1, -1, 0, -1, 0, -1, -1, -1]],
dtype=torch.int32).cuda()
assert (point_indices == expected_point_indices).all()
if torch.cuda.device_count() > 1:
pts = pts.to('cuda:1')
boxes = boxes.to('cuda:1')
expected_point_indices = expected_point_indices.to('cuda:1')
point_indices = points_in_boxes_part(points=pts, boxes=boxes)
assert point_indices.shape == torch.Size([2, 8])
assert (point_indices == expected_point_indices).all()
def test_points_in_boxes_cpu():
boxes = torch.tensor(
[[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
dtype=torch.float32
) # boxes (m, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [
-16, -18, 9
], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],
dtype=torch.float32) # points (n, 3) in lidar coordinate
point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
dtype=torch.int32)
assert point_indices.shape == torch.Size([1, 15, 2])
assert (point_indices == expected_point_indices).all()
boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],
dtype=torch.float32) # 30 degrees
pts = torch.tensor(
[[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0],
[-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],
dtype=torch.float32)
point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[[0], [0], [1], [0], [1], [0], [0], [0]]], dtype=torch.int32)
assert (point_indices == expected_point_indices).all()
def test_points_in_boxes_all():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
boxes = torch.tensor(
[[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],
dtype=torch.float32).cuda(
) # boxes (m, 7) with bottom center in lidar coordinate
pts = torch.tensor(
[[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
[0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
[4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [
-16, -18, 9
], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],
dtype=torch.float32).cuda() # points (n, 3) in lidar coordinate
point_indices = points_in_boxes_all(points=pts, boxes=boxes)
expected_point_indices = torch.tensor(
[[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
dtype=torch.int32).cuda()
assert point_indices.shape == torch.Size([1, 15, 2])
assert (point_indices == expected_point_indices).all()
if torch.cuda.device_count() > 1:
pts = pts.to('cuda:1')
boxes = boxes.to('cuda:1')
expected_point_indices = expected_point_indices.to('cuda:1')
point_indices = points_in_boxes_all(points=pts, boxes=boxes)
assert point_indices.shape == torch.Size([1, 15, 2])
assert (point_indices == expected_point_indices).all()