Commit 3f1316d5 authored by traveller59's avatar traveller59
Browse files

initial release

parent a347176a
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
T in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
T *din, const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
dout += blockIdx.y * NumTLP;
din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
din[idxi] += dout[idxo];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void
maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
T bufdi[vecloadFactor];
T bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] = reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] = reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
}
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.stream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.stream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
}
}
}
};
template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
&indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.stream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data(),
indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.stream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes);
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(1).data(),
numHotBlock, numPlanes);
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, size - numHotBlock,
numPlanes);
}
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/reordering.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(T) * numPlanes);
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data();
T* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0)
return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.stream()>>>(buffer.data(), features.data(), indices.data(),
nHotBlock, numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.stream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor);
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
}
}
};
template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0)
return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.stream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.stream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
\ No newline at end of file
add_library(spconv_nms SHARED nms.cu)
set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
target_link_libraries(spconv_nms ${CUDA_CUDART})
install (TARGETS spconv_nms DESTINATION lib)
add_library(spconv_utils SHARED all.cc)
set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_utils PROPERTIES SOVERSION 1)
target_include_directories(spconv_utils PRIVATE ${ALL_INCLUDE}
${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}")
target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module -Wl,--no-as-needed spconv_nms)
install (TARGETS spconv_utils DESTINATION lib)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/nms.h>
#include <spconv/point2voxel.h>
#include <spconv/box_iou.h>
namespace py = pybind11;
using namespace pybind11::literals;
PYBIND11_MODULE(spconv_utils, m)
{
m.doc() = "util pybind11 functions for spconv";
m.def("non_max_suppression", &spconv::non_max_suppression<double>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression", &spconv::non_max_suppression<float>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
"box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
"box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
m.def("rbbox_iou", &spconv::rbbox_iou<double>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("rbbox_iou", &spconv::rbbox_iou<float>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
"num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
"voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
"max_voxels"_a = 9);
m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
"num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
"voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
"max_voxels"_a = 9);
}
\ No newline at end of file
// ------------------------------------------------------------------
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <vector>
#include <iostream>
#include <cuda_runtime.h>
#include <spconv/nms_gpu.h>
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do \
{ \
cudaError_t error = condition; \
if (error != cudaSuccess) \
{ \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b)
{
DType left = max(a[0], b[0]), right = min(a[2], b[2]);
DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
DType interS = width * height;
DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes, unsigned long long *dev_mask)
{
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size =
min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size)
{
#pragma unroll
for (int i = 0; i < 5; ++i)
{
block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
}
}
__syncthreads();
if (threadIdx.x < row_size)
{
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start)
{
start = threadIdx.x + 1;
}
for (int i = start; i < col_size; i++)
{
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
{
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
void _set_device(int device_id)
{
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id)
{
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id)
{
_set_device(device_id);
DType *boxes_dev = NULL;
unsigned long long *mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
CUDA_CHECK(cudaMalloc(&boxes_dev,
boxes_num * boxes_dim * sizeof(DType)));
CUDA_CHECK(cudaMemcpy(boxes_dev,
boxes_host,
boxes_num * boxes_dim * sizeof(DType),
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc(&mask_dev,
boxes_num * col_blocks * sizeof(unsigned long long)));
dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS),
DIVUP(boxes_num, BLOCK_THREADS));
dim3 threads(BLOCK_THREADS);
nms_kernel<DType, BLOCK_THREADS><<<blocks, threads>>>(boxes_num,
nms_overlap_thresh,
boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0],
mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++)
{
int nblock = i / BLOCK_THREADS;
int inblock = i % BLOCK_THREADS;
if (!(remv[nblock] & (1ULL << inblock)))
{
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++)
{
remv[j] |= p[j];
}
}
}
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
return num_to_keep;
}
//template<>
template int _nms_gpu<float, threadsPerBlock>(int *keep_out, const float *boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id);
//template<>
template int _nms_gpu<double, threadsPerBlock>(int *keep_out, const double *boxes_host, int boxes_num,
int boxes_dim, double nms_overlap_thresh, int device_id);
\ No newline at end of file
set(CATCH_HEADER ${PROJECT_SOURCE_DIR}/third_party/catch2)
add_library(catch_main OBJECT src/catch_main.cpp)
# target_compile_features(catch_main PUBLIC cxx_std_2a)
set_property(TARGET catch_main PROPERTY CXX_STANDARD 14)
target_include_directories(catch_main PRIVATE ${CATCH_HEADER})
file(GLOB files "src/test_*.cpp")
foreach(file ${files})
get_filename_component(file_basename ${file} NAME_WE)
string(REGEX REPLACE "test_([^$]+)" "test-\\1" testcase ${file_basename})
add_executable(${testcase} ${file} $<TARGET_OBJECTS:catch_main>)
set_property(TARGET ${testcase} PROPERTY CXX_STANDARD 14)
# set_target_properties(${testcase} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# set_property(TARGET ${testcase} PROPERTY CUDA_STANDARD 14)
target_compile_definitions(${testcase} PRIVATE
CATCH_CONFIG_FAST_COMPILE
)
target_include_directories(${testcase} PRIVATE
${CATCH_HEADER} ${ALL_INCLUDE}
)
target_link_libraries(${testcase} ${ALL_LIBS} pybind11::embed -Wl,--no-as-needed spconv)
add_test(NAME "${testcase}"
COMMAND ${testcase}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
endforeach()
\ No newline at end of file
// 000-CatchMain.cpp
// In a Catch project with multiple files, dedicate one file to compile the
// source code of Catch itself and reuse the resulting object file for linking.
// Let Catch provide main():
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
// That's it
// Compile implementation of Catch for use with files that do contain tests:
// - g++ -std=c++11 -Wall -I$(CATCH_SINGLE_INCLUDE) -c 000-CatchMain.cpp
// - cl -EHsc -I%CATCH_SINGLE_INCLUDE% -c 000-CatchMain.cpp
#include <algorithm>
#include <iostream>
#include <map>
#include "catch.hpp"
#include <prettyprint.h>
#include <string>
#include <vector>
#include <exception>
#include <fmt/format.h>
#include <numeric>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <tuple>
#include <pybind11_utils.h>
#include <spconv/spconv_ops.h>
namespace py = pybind11;
TEST_CASE("GetConvIndPair", "[SpConvNet]")
{
using namespace py::literals;
py::scoped_interpreter guard{}; // start the interpreter and keep it alive
py::exec(R"(
from __future__ import print_function
import numpy as np
import math
# import spconv
# import torch
def get_convolution_output_size(input_size,
kernel_size,
stride,
padding=None,
rate=None):
ndim = len(input_size)
if padding is None:
padding = [0] * ndim
output_size = []
for i in range(ndim):
output_size.append((input_size[i] + 2 * padding[i] - (
(kernel_size[i] - 1) + 1)) // stride[i] + 1)
return output_size
def get_test_sparse_data(shape,
num_points,
num_channels,
integer=False,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode="constant", constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
20, 100, size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
-1, 1, size=[num_points.sum(), num_channels]).astype(dtype)
# sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
dense_data = np.zeros(
[batch_size, num_channels, *dense_shape], dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
batch_indices = np.concatenate(batch_indices, axis=0)
return {
"features": sparse_data.astype(dtype),
"indices": batch_indices.astype(np.int32),
"features_dense": dense_data.astype(dtype),
}
shape = [50, 30, 30]
num_points = [5000] * 1
# np.random.seed(np.random.randint(1, 100000))
in_channels = 64
sparse_dict = get_test_sparse_data(shape, num_points, in_channels)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"]
# indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[3, 3, 3, 64, 64]).astype(np.float32)
# print(outids.shape)
)");
SECTION("DebugTest"){
auto inds = array2TensorView<int>(py::array(py::globals()["indices"]));
auto inds_tensor = torch::from_blob(inds.data(), {inds.dim(0), inds.dim(1)}, torch::dtype(torch::kInt32));
auto inds_gpu = inds_tensor.to(torch::Device(torch::kCPU));
auto features = array2TensorView<float>(py::array(py::globals()["features"]));
auto features_tensor = torch::from_blob(features.data(), {features.dim(0), features.dim(1)}, torch::dtype(torch::kFloat));
auto features_gpu = features_tensor.to(torch::Device(torch::kCUDA, 0));
auto filters = array2TensorView<float>(py::array(py::globals()["filters"]));
auto filters_tensor = torch::from_blob(filters.data(), {filters.dim(0), filters.dim(1), filters.dim(2), filters.dim(3), filters.dim(4)}, torch::dtype(torch::kFloat));
auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
{1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, false, false);
// std::cout << outputs[2] << std::endl;
/*
auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
std::cout << output << std::endl;*/
}
}
\ No newline at end of file
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import spconv
import torch
from torch import nn
import numpy as np
import time
from spconv.test_utils import params_grid, generate_sparse_data, TestCase
import unittest
# import sparseconvnet as scn
class SparseConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
super().__init__()
layers = [spconv.SparseConv3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
for i in range(1, num_layers):
layers.append(spconv.SparseConv3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(
*layers,
)
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self.grid = None
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
return self.net(x)# .dense()
class SubMConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
super().__init__()
layers = [spconv.SubMConv3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
for i in range(1, num_layers):
layers.append(spconv.SubMConv3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(
*layers,
)
self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
return self.net(x)# .dense()
class Conv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
super().__init__()
layers = [nn.Conv3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
for i in range(1, num_layers):
layers.append(nn.Conv3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = nn.Sequential(
*layers,
)
self.shape = shape
def forward(self, x):
return self.net(x)# .dense()
class SparseDeConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
super().__init__()
layers = [spconv.SparseConvTranspose3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
for i in range(1, num_layers):
layers.append(spconv.SparseConvTranspose3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = spconv.SparseSequential(
*layers,
)
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size)
return self.net(x)# .dense()
class DeConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride, padding, dilation):
super().__init__()
layers = [nn.ConvTranspose3d(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False)]
for i in range(1, num_layers):
layers.append(nn.ConvTranspose3d(
out_channels,
out_channels,
kernel_size,
stride,
padding=padding,
dilation=dilation,
bias=False))
self.net = nn.Sequential(
*layers,
)
self.shape = shape
def forward(self, x):
return self.net(x)# .dense()
class SparseMaxPoolTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size,
stride, padding, dilation):
super().__init__()
layers = [spconv.SparseMaxPool3d(
kernel_size,
stride, padding, dilation)]
for i in range(1, num_layers):
layers.append(spconv.SparseMaxPool3d(
kernel_size,
stride, padding, dilation))
self.net = spconv.SparseSequential(
*layers,
)
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
return self.net(x)# .dense()
class MaxPool3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size,
stride, padding, dilation):
super().__init__()
layers = [nn.MaxPool3d(
kernel_size,
stride, padding, dilation)]
for i in range(1, num_layers):
layers.append(nn.MaxPool3d(
kernel_size,
stride, padding, dilation))
self.net = nn.Sequential(
*layers,
)
self.shape = shape
def forward(self, x):
return self.net(x)# .dense()
class SubmanifoldConvTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, stride):
super().__init__()
layers = [spconv.SubMConv3d(
in_channels, out_channels, kernel_size, bias=False, indice_key="subm0")]
for i in range(1, num_layers):
layers.append(spconv.SubMConv3d(
out_channels, out_channels, kernel_size, bias=False))
self.net = nn.Sequential(
*layers,
)
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
return self.net(x)
class SCNCoupleDeConvTest(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride):
super().__init__()
self.scn_input = scn.InputLayer(ndim, shape, mode=0)
self.net = nn.Sequential(
scn.Convolution(
ndim,
in_channels,
out_channels,
kernel_size,
stride,
bias=False),
scn.Deconvolution(
ndim,
out_channels,
in_channels,
kernel_size,
stride,
bias=False),
scn.SparseToDense(ndim, in_channels),
)
def forward(self, features, coors, batch_size):
coors = coors.long().cpu()
x = self.scn_input((coors, features))
return self.net(x)
class SparseCoupleDeConvTest(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
stride):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SparseConv3d(
in_channels,
out_channels,
kernel_size,
stride,
indice_key="cp0",
bias=False),
spconv.SparseConvCoupleTranspose3d(
out_channels,
in_channels,
kernel_size,
stride,
indice_key="cp0",
bias=False),
)
self.todense = spconv.ToDense()
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors,self.shape, batch_size )
return self.todense(self.net(x))# .dense()
def gather_nd(params, indices):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + list(params.shape[indices.shape[-1]:])
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
return params[slices].view(*output_shape)
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
when indice repeats, don't support repeat add which is supported
in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
class TestSpConv(TestCase):
def testSpConv3d(self):
np.random.seed(484)
devices = ["cpu:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
out_channels = [32, 48, 64]
ksizes = [2, 3]
strides = [1, 2, 3]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense().permute(0, 4, 1, 2, 3).contiguous()
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
dw = dw.transpose(4, 3, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpDeConv3d(self):
np.random.seed(484)
devices = ["cuda:0", "cpu:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
out_channels = [32, 48, 64]
ksizes = [2, 3]
strides = [2, 3]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1, 2).contiguous()
net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense().permute(0, 4, 1, 2, 3).contiguous()
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
dw = dw.transpose(3, 4, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpCpConv3d(self):
np.random.seed(484)
devices = ["cuda:0", "cpu:0"]
shapes = [[20, 20, 20]]
batchsizes = [1, 2]
in_channels = [64]
out_channels = [32, 48, 64]
ksizes = [2]
strides = [2]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides):
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
indices_scn_t = torch.from_numpy(indices[:, [1, 2, 3, 0]]).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_ref_t = torch.from_numpy(features).to(device)
features_ref_t.requires_grad = True
net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(*net_ref.net[0].weight.shape)
net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(*net_ref.net[1].weight.shape)
out_ref = net_ref(features_ref_t, indices_scn_t, bs)
out = net(features_t, indices_t, bs)
dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
din = features_t.grad.detach()
din_ref = features_ref_t.grad.detach()
din_np = din.cpu().numpy()
din_ref_np = din_ref.cpu().numpy()
self.assertAllClose(din_ref_np, din_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().view(*dw.shape).numpy()
self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpMaxPool3d(self):
np.random.seed(484)
devices = ["cuda:0", "cpu:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
out_channels = [64]
ksizes = [2, 3]
strides = [1, 2, 3]
paddings = [0, 1]
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape, num_points, IC, data_range=[0.1, 1])
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseMaxPoolTestTorch(1, 3, shape, k, s, p, d).to(device)
net_ref = MaxPool3dTestTorch(1, 3, shape, k, s, p, d).to(device)
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs)
outids = out.indices
outfeatures = out.features
out_dense = out.dense()
out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
dout_sparse = np.random.uniform(-0.2, 0.2, outfeatures.shape).astype(features.dtype)
dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
dout_t = scatter_nd(outids.long(), dout_sparse_t, list(out_dense.shape))
dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def main():
# function for develop.
np.random.seed(484)
devices = ["cuda:0"]
shapes = [[50, 30, 30]]
batchsizes = [3]
in_channels = [256]
out_channels = [256]
ksizes = [3]
strides = [1]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue
device = torch.device(dev)
num_points = [5000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).half()
features_t = torch.from_numpy(features).to(device).half()
features_dense_t = torch.from_numpy(features_dense).to(device).half()
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
filters_t = torch.from_numpy(filters).to(device).half()
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t)
times = []
for i in range(30):
t = time.time()
out = net(features_t, indices_t, bs)
torch.cuda.synchronize()
times.append(time.time() - t)
print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print("spconv time", np.mean(times[2:]))
out = net(features_t, indices_t, bs).dense().permute(0, 4, 1, 2, 3).contiguous()
print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy()))
if __name__ == '__main__':
# main()
unittest.main()
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment