Commit 01ed382c authored by yan.yan's avatar yan.yan
Browse files

working on tensor core test

parent 3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/point2voxel_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
static auto registry =
torch::RegisterOperators()
.op("spconv::points_to_voxel", &spconv::pointsToVoxel)
.op("spconv::get_indice_pairs", &spconv::getIndicePairs)
.op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
.op("spconv::indice_maxpool", &spconv::indiceMaxPool)
.op("spconv::indice_maxpool_backward", &spconv::indiceMaxPoolBackward)
.op("spconv::nms", &spconv::nonMaxSuppression<float>)
.op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
.op("spconv::pillar_scatter_half",
&spconv::pointPillarScatter<at::Half>);
#include <ATen/ATen.h>
#include <spconv/cublas_gemm.h>
namespace spconv {
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *A, int lda,
const float *B, int ldb, const float *beta, float *C,
int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const __half *alpha, const __half *A, int lda,
const __half *B, int ldb, const __half *beta,
__half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const at::Half *alpha, const at::Half *A, int lda,
const at::Half *B, int ldb, const at::Half *beta,
at::Half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k,
reinterpret_cast<const __half *>(alpha),
reinterpret_cast<const __half *>(A), lda,
reinterpret_cast<const __half *>(B), ldb,
reinterpret_cast<const __half *>(beta),
reinterpret_cast<__half *>(C), ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const double *alpha, const double *A, int lda,
const double *B, int ldb, const double *beta,
double *C, int ldc) {
return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <> inline __half constant_scalar(float data) {
return __float2half(data);
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/fused_conv.cu.h>
#include <spconv/fused_conv.h>
#include <spconv/minkowski.cu.h>
#include <tensorview/torch_utils.h>
namespace spconv {
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = output.scalar_type();
auto input_nPlanes = features.size(1);
auto output_nPlanes = output.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float, at::Half>(dtype, [&](auto I) {
using T = decltype(I);
dConvolution_forward2(stream, features.data_ptr<T>(), output.data_ptr<T>(),
filters.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
indicesOut.data_ptr<int32_t>(), nHot, input_nPlanes,
input_nPlanes, output_nPlanes, output_nPlanes, 1);
});
}
void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = features.scalar_type();
auto input_nPlanes = features.size(1);
auto output_nPlanes = dout.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I);
dConvolution_backward_dW2(
stream, features.data_ptr<T>(), din.data_ptr<T>(), dout.data_ptr<T>(),
filters.data_ptr<T>(), dfilters.data_ptr<T>(),
indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>(), nHot,
input_nPlanes, input_nPlanes, output_nPlanes, output_nPlanes, 1);
});
}
void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = output.scalar_type();
auto in_nchannel = features.size(1);
auto out_nchannel = output.size(1);
int shared_mem_size = -1;
if ((in_nchannel > 16 && out_nchannel > 16 &&
in_nchannel * out_nchannel >= 512) ||
(in_nchannel > 24 && out_nchannel > 24))
shared_mem_size = 32;
else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
shared_mem_size = 24;
else if ((in_nchannel > 8 && out_nchannel > 8) ||
(in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
shared_mem_size = 16;
else
shared_mem_size = 8;
constexpr int MAX_GRID = 65535;
auto stream = at::cuda::getCurrentCUDAStream();
using shmem_sizes_t = tv::mp_list_c<int, 32, 24, 16, 8>;
int num_grid = (nHot + shared_mem_size - 1) / shared_mem_size;
int num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
int step = (nHot + num_div - 1) / num_div;
dim3 threads(shared_mem_size, shared_mem_size);
tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I);
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
constexpr int ShmemSize = decltype(ShSizeValue)::value;
for (int s = 0; s < num_div; s++) {
int remainder = nHot - step * s;
int curr_num_active = remainder < step ? remainder : step;
dim3 grid((out_nchannel + threads.x - 1) / threads.x,
(curr_num_active + threads.y - 1) / threads.y);
matmul<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
features.data_ptr<T>(), in_nchannel, curr_num_active,
filters.data_ptr<T>(), out_nchannel, in_nchannel,
output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
indicesOut.data_ptr<int32_t>());
}
});
});
}
void fused_conv_backward_cuda_minkowski(torch::Tensor features,
torch::Tensor din, torch::Tensor dout,
torch::Tensor filters,
torch::Tensor dfilters,
torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = features.scalar_type();
auto in_nchannel = features.size(1);
auto out_nchannel = dout.size(1);
int shared_mem_size = -1;
if ((in_nchannel > 16 && out_nchannel > 16 &&
in_nchannel * out_nchannel >= 512) ||
(in_nchannel % 32 == 0 && out_nchannel % 32 == 0))
shared_mem_size = 32;
else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
shared_mem_size = 24;
else if ((in_nchannel > 8 && out_nchannel > 8) ||
(in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
shared_mem_size = 16;
else
shared_mem_size = 8;
dim3 threads(shared_mem_size, shared_mem_size);
constexpr int MAX_GRID = 65535;
auto stream = at::cuda::getCurrentCUDAStream();
using shmem_sizes_t = tv::mp_list_c<int, 32, 24, 16, 8>;
int num_grid = (nHot + shared_mem_size - 1) / shared_mem_size;
int num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
int step = (nHot + num_div - 1) / num_div;
tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I);
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
constexpr int ShmemSize = decltype(ShSizeValue)::value;
for (int s = 0; s < num_div; s++) {
int remainder = nHot - step * s;
int curr_num_active = remainder < step ? remainder : step;
dim3 grid((in_nchannel + threads.x - 1) / threads.x,
(curr_num_active + threads.y - 1) / threads.y);
matmul2<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
dout.data_ptr<T>(), out_nchannel, curr_num_active, // A
filters.data_ptr<T>(), out_nchannel,
in_nchannel, // B
features.data_ptr<T>(), in_nchannel, curr_num_active, // D
din.data_ptr<T>(), // C
dfilters.data_ptr<T>(), // E
indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>());
}
});
});
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <tensorview/tensor.h>
#include <torch/script.h>
namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
auto iter = hash.find(index);
if (iter == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
} else {
hashval = iter->second;
}
// indicePairs: [K, 2, L]
indicePairs(0, offset, indiceNum[offset]) = j;
indicePairs(1, offset, indiceNum[offset]++) = hashval;
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
auto iter = hash.find(index);
if (iter == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
} else {
hashval = iter->second;
}
// indicePairs: [K, 2, L]
indicePairs(0, offset, indiceNum[offset]) = j;
indicePairs(1, offset, indiceNum[offset]++) = hashval;
}
}
return numAct;
}
#ifndef TV_WINDOWS
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
hash[index] = j;
}
at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end) {
Index index = 0;
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index oldOffset = 0;
for (int j = begin; j < end; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
auto iter = hash.find(index);
if (iter != hash.end()) {
#pragma omp atomic capture
oldOffset = indiceNum[offset]++;
indicePairs(0, offset, oldOffset) = j;
indicePairs(1, offset, oldOffset) = iter->second;
}
}
}
});
return numActIn;
}
#else
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
hash[index] = j;
}
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
auto iter = hash.find(index);
if (iter != hash.end()) {
indicePairs(0, offset, indiceNum[offset]) = j;
indicePairs(1, offset, indiceNum[offset]++) = iter->second;
}
}
}
return numActIn;
}
#endif
int create_conv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
bool useHash) {
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = decltype(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (transpose)
numActIn = getIndicePairsDeConv<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
di.data(), ou.data());
else
numActIn = getIndicePairsConv<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
di.data(), ou.data());
});
});
return numActIn;
}
int create_submconv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = decltype(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
numActIn = getIndicePairsSubM<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indiceNum),
ks.data(), st.data(), pa.data(), di.data(), ou.data());
});
});
return numActIn;
}
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <boost/mp11.hpp>
#include <chrono>
#include <cuhash/hash_table.h>
#include <limits>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = kernelSize.size();
auto numActIn = indicesIn.size(0);
auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>();
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
tv::DispatchInt<max_kernel_vol_t>()(
kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value;
tv::dispatch_int<0, 1>(int(transpose), [&](auto I) {
constexpr bool UseDeconv = decltype(I)::value;
prepareIndicePairsKernel<Index, NDim, UseDeconv, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st,
pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
});
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr,
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
MaxKernelVolume, ">", attr.numRegs);
#endif
});
});
});
return 1;
}
int create_conv_indice_pair_p2_cuda(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
int numAct = indicePairUnique.size(0) - 1;
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
using IndexGrid = int32_t;
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numAct, 2.0, 4);
unsigned *d_values = nullptr;
cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<unsigned>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(d_values, numAct);
TV_CHECK_CUDA_ERR_V2("arangeKernel failed");
bool res = table.Build(
numAct,
reinterpret_cast<unsigned *>(indicePairUnique.data_ptr<Index>()),
d_values);
cudaFree(d_values);
TV_CHECK_CUDA_ERR_V2("cudaFree failed");
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel<Index, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
TV_CHECK_CUDA_ERR_V2("assignIndiceOutKernel failed");
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
assignIndicePairsHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), numActIn,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), tableSize,
tableData, constants, stash_constants, stash_count);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsHashKernel failed");
} else {
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numAct,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numActIn,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim,
">", attr.numRegs);
cudaFuncAttributes attr2;
checkCudaErrors(cudaFuncGetAttributes(
&attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">",
attr2.numRegs);
#endif
}
if (resetGrid && (!useHash)) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicePairUnique.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), numAct);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
});
return numAct;
}
int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
Index spatialVolume = 1;
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
auto dispatcher = tv::DispatchIntNoexcept<tv::mp_list_c<int, 1, 3, 5>>();
namespace mp11 = boost::mp11;
using kernel2_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_c<int, 1, 3, 5>>;
using kernel3_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_c<int, 1, 3, 5>>;
using kernel3_candidates_final_t =
mp11::mp_push_back<kernel3_candidates_t>;
auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>();
auto dispatcher3 =
tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numActIn, 2.0, 4);
unsigned *d_keyvalues = nullptr;
cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn;
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
prepareSubMHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn), d_keyvalues, d_values,
ou);
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res =
table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
reinterpret_cast<unsigned *>(d_values));
cudaFree(d_keyvalues);
TV_CHECK_CUDA_ERR_V2("cudaFree failed");
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
bool dilation_one = true;
for (int i = 0; i < NDim; ++i) {
dilation_one &= di[i] == 1;
}
auto found = false;
if (dilation_one && (NDim == 2 || NDim == 3)) {
auto indiceNumCpu = indiceNum.cpu();
if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
found = true;
getSubMIndicePairsHashUnrollKernel2<Index, K0, K1>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume, tableSize, tableData,
constants, stash_constants, stash_count);
});
} else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
found = true;
getSubMIndicePairsHashUnrollKernel3<Index, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume, tableSize, tableData,
constants, stash_constants, stash_count);
});
}
}
if (!found) {
tv::DispatchInt<max_kernel_vol_t>()(
kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value;
getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou,
tableSize, tableData, constants, stash_constants,
stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
});
}
} else {
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
// tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
// when dilation all one, we use a simple kernel to calc result
bool dilation_one = true;
for (int i = 0; i < NDim; ++i) {
dilation_one &= di[i] == 1;
}
auto found = false;
if (dilation_one && (NDim == 2 || NDim == 3)) {
auto indiceNumCpu = indiceNum.cpu();
if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
found = true;
getSubMIndicePairsUnrollKernel2<Index, IndexGrid, K0, K1>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume);
});
} else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end());
dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
found = true;
getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume);
});
/*
dispatcher(kernelSize[0], [&](auto K0C) {
dispatcher(kernelSize[1], [&](auto K1C) {
dispatcher(kernelSize[2], [&](auto K2C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
constexpr int K2 = decltype(K2C)::value;
found = true;
getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
});
});
});*/
}
}
if (!found) {
tv::DispatchInt<
max_kernel_vol_t>()(ndim, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value;
getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di,
ou);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
});
}
// tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
}
if (resetGrid && (!useHash)) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicesIn.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn,
spatialVolume);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
});
return numActIn;
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>;
void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int stride = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
auto outFeaturesData = outFeatures.data_ptr<T>();
auto inFeaturesData = inFeatures.data_ptr<T>();
auto indicesInData = indicesIn.data_ptr<Index>();
auto indicesOutData = indicesOut.data_ptr<Index>();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesInData[row] * stride;
idxo = indicesOutData[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
});
});
}
void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int stride = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
auto outFeaturesData = outFeatures.data_ptr<T>();
auto inFeaturesData = inFeatures.data_ptr<T>();
auto doutData = dout.data_ptr<T>();
auto dinData = din.data_ptr<T>();
auto indicesInData = indicesIn.data_ptr<Index>();
auto indicesOutData = indicesOut.data_ptr<Index>();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesInData[row] * stride;
idxo = indicesOutData[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
dinData[idxi + plane] += doutData[idxo + plane];
}
});
});
}
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
T in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
T *din, const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
dout += blockIdx.y * NumTLP;
din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
din[idxi] += dout[idxo];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void
maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
T bufdi[vecloadFactor];
T bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
}
using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>;
void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
auto stream = at::cuda::getCurrentCUDAStream();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indicesIn,
&indicesOut, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
numHotBlock, numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
});
});
}
void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
auto stream = at::cuda::getCurrentCUDAStream();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout,
&din, &indicesIn, &indicesOut,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
stream>>>(outFeatures.data_ptr<T>(),
inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
din.data_ptr<T>(), indicesIn.data_ptr<Index>(),
indicesOut.data_ptr<Index>(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
din.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
dout.data_ptr<T>(), din.data_ptr<T>(),
indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
dout.data_ptr<T>(), din.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
});
});
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <boost/geometry.hpp>
#include <spconv/nms_functor.h>
#include <torch/script.h>
#include <vector>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps) {
auto ndets = boxes.dim(0);
auto suppressed = std::vector<Index>(ndets);
auto area = std::vector<T>(ndets);
for (int i = 0; i < ndets; ++i) {
area[i] =
(boxes(i, 2) - boxes(i, 0) + eps) * (boxes(i, 3) - boxes(i, 1) + eps);
}
int i, j;
T xx1, xx2, w, h, inter, ovr;
int keepNum = 0;
for (int _i = 0; _i < ndets; ++_i) {
i = _i;
if (suppressed[i] == 1)
continue;
keep[keepNum] = i;
keepNum += 1;
for (int _j = _i + 1; _j < ndets; ++_j) {
j = _j;
if (suppressed[j] == 1)
continue;
xx2 = std::min(boxes(i, 2), boxes(j, 2));
xx1 = std::max(boxes(i, 0), boxes(j, 0));
w = xx2 - xx1 + eps;
if (w > 0) {
xx2 = std::min(boxes(i, 3), boxes(j, 3));
xx1 = std::max(boxes(i, 1), boxes(j, 1));
h = xx2 - xx1 + eps;
if (h > 0) {
inter = w * h;
ovr = inter / (area[i] + area[j] - inter);
if (ovr >= threshold)
suppressed[j] = 1;
}
}
}
}
return keepNum;
}
};
template <typename T, typename Index>
struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxCorners,
tv::TensorView<const T> standupIoU, T threshold) {
auto ndets = boxCorners.dim(0);
auto suppressed = std::vector<Index>(ndets);
int i, j;
namespace bg = boost::geometry;
typedef bg::model::point<T, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
T inter_area, union_area, overlap;
int keepNum = 0;
for (int _i = 0; _i < ndets; ++_i) {
i = _i;
if (suppressed[i] == 1)
continue;
keep[keepNum] = i;
keepNum += 1;
for (int _j = _i + 1; _j < ndets; ++_j) {
j = _j;
if (suppressed[j] == 1)
continue;
if (standupIoU(i, j) <= 0.0)
continue;
bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
bg::append(poly, point_t(boxCorners(i, 1, 0), boxCorners(i, 1, 1)));
bg::append(poly, point_t(boxCorners(i, 2, 0), boxCorners(i, 2, 1)));
bg::append(poly, point_t(boxCorners(i, 3, 0), boxCorners(i, 3, 1)));
bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
bg::append(qpoly, point_t(boxCorners(j, 1, 0), boxCorners(j, 1, 1)));
bg::append(qpoly, point_t(boxCorners(j, 2, 0), boxCorners(j, 2, 1)));
bg::append(qpoly, point_t(boxCorners(j, 3, 0), boxCorners(j, 3, 1)));
bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
bg::union_(poly, qpoly, poly_union);
if (!poly_union.empty()) { // ignore invalid box
union_area = bg::area(poly_union.front());
overlap = inter_area / union_area;
if (overlap >= threshold)
suppressed[j] = 1;
poly_union.clear();
}
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return keepNum;
}
};
} // namespace functor
#define DECLARE_CPU_T_INDEX(T, Index) \
template struct functor::NonMaxSupressionFunctor<tv::CPU, T, Index>; \
template struct functor::rotateNonMaxSupressionFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_T_INDEX(float, Index); \
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_T_INDEX
} // namespace spconv
// ------------------------------------------------------------------
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b) {
DType left = max(a[0], b[0]), right = min(a[2], b[2]);
DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
DType interS = width * height;
DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size) {
#pragma unroll
for (int i = 0; i < 5; ++i) {
block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
}
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (int i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
template <typename T, typename Index>
__global__ void pointPillarsScatterKernel(tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors) {
auto numFeatures = features.dim(0);
auto numPoints = features.dim(1);
for (int i : tv::KernelLoopX<int>(numPoints)) {
for (int ifeature : tv::KernelLoopY<int>(numFeatures)) {
canvas(int(coors(0, i)), ifeature, int(coors(2, i)), int(coors(3, i))) =
features(ifeature, i);
}
}
}
namespace functor {
template <typename T, typename Index>
struct PointPillarScatter<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors) {
auto grid = dim3(tv::cuda::DivUp(features.dim(1), 32),
tv::cuda::DivUp(features.dim(0), 32));
pointPillarsScatterKernel<T, Index>
<<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors);
TV_CHECK_CUDA_ERR();
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::PointPillarScatter<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
\ No newline at end of file
#include <ATen/ATen.h>
#include <spconv/point2voxel.cu.h>
//#include <spconv/point2voxel.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
namespace spconv {
void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
std::vector<int64_t> gridShape,
const int ndim) {
auto stream = at::cuda::getCurrentCUDAStream();
auto num_points = points.size(0);
auto num_features = points.size(1);
tv::dispatch_torch<int32_t>(pointIndex.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue);
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
scatterPointToGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<float>(points),
tv::torch2tv<Index>(indexes), tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndex), gs);
TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(
cudaFuncGetAttributes(&attr, scatterPointToGridKernel<Index, NDim>));
tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim, ">",
attr.numRegs);
#endif
});
});
}
void gather_point_from_grid_cuda(torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors,
std::vector<int64_t> gridShape,
const int ndim) {
auto stream = at::cuda::getCurrentCUDAStream();
auto num_voxel = voxels.size(0);
auto num_max_points = pointIndex.size(0) - 1;
auto grid_volume = grids.size(0);
tv::dispatch_torch<int32_t>(
pointIndexUnique.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue);
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
resetPointIndexKernel<Index>
<<<tv::cuda::getBlocks(num_max_points),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(pointIndex), grid_volume);
TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr0;
checkCudaErrors(cudaFuncGetAttributes(
&attr0, resetPointIndexKernel<Index, NDim>));
tv::ssprint("resetPointIndexKernel<", tv::type_s<Index>, NDim, ">",
attr0.numRegs);
#endif
gatherPointFromGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique),
tv::torch2tv<float>(voxels),
tv::torch2tv<Index>(coors), gs);
TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr1;
checkCudaErrors(cudaFuncGetAttributes(
&attr1, gatherPointFromGridKernel<Index, NDim>));
tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim,
">", attr1.numRegs);
#endif
resetGridKernel<Index><<<tv::cuda::getBlocks(num_voxel),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<float>(grids), tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique));
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr2;
checkCudaErrors(
cudaFuncGetAttributes(&attr2, resetGridKernel<Index, NDim>));
tv::ssprint("resetGridKernel<", tv::type_s<Index>, NDim, ">",
attr2.numRegs);
#endif
});
});
}
} // namespace spconv
#include <spconv/point2voxel_ops.h>
//#include <spconv/point2voxel.cu.h>
namespace spconv {
int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
torch::Tensor pointIndex, torch::Tensor grids,
torch::Tensor numPointsPerGrid, torch::Tensor voxels,
torch::Tensor coors, std::vector<int64_t> gridShape,
const int64_t ndim) {
if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently");
}
#ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) {
scatter_point_to_grid_cuda(points, indexes, grids, numPointsPerGrid,
pointIndex, gridShape, ndim);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
auto res = torch::_unique(pointIndex);
auto pointIndexUnique = std::get<0>(res);
auto num_voxel = pointIndexUnique.size(0) - 1;
if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently");
}
#ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) {
gather_point_from_grid_cuda(grids, numPointsPerGrid, pointIndex,
pointIndexUnique, voxels, coors, gridShape,
ndim);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
return num_voxel;
}
} // namespace spconv
#include <spconv/pool_ops.h>
namespace spconv {
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct) {
auto device = features.device().type();
auto kernelVolume = indiceNum.size(0);
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
double totalTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
if (device == torch::kCPU) {
maxpool_fwd_cpu(output, features, indicePairs[0][i], indicePairs[1][i],
nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
maxpool_fwd_cuda(output, features, indicePairs[0][i], indicePairs[1][i],
nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalTime += timer.report() / 1000.0;
}
// std::cout << "maxpool forward time " << totalTime << std::endl;
return output;
}
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
auto device = features.device().type();
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indiceNum.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
if (device == torch::kCPU) {
maxpool_bwd_cpu(outFeatures, features, outGrad, inputGrad,
indicePairs[0][i], indicePairs[1][i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
maxpool_bwd_cuda(outFeatures, features, outGrad, inputGrad,
indicePairs[0][i], indicePairs[1][i], nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
}
return inputGrad;
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>;
void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
int numPlanes = features.size(1);
auto dtype = features.scalar_type();
auto int_dtype = indices.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
Index *indices_data = indices.data_ptr<Index>();
T *buffer_data = buffer.data_ptr<T>();
const T *features_data = features.data_ptr<T>();
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
for (int i = begin; i < end; ++i) {
std::memcpy(buffer_data + i * numPlanes,
features_data + indices_data[i] * numPlanes,
sizeof(T) * numPlanes);
}
});
});
});
}
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size) {
int numPlanes = outFeatures.size(1);
auto dtype = outFeatures.scalar_type();
auto int_dtype = indices.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
Index *indices_data = indices.data_ptr<Index>();
const T *buffer_data = buffer.data_ptr<T>();
T *features_data = outFeatures.data_ptr<T>();
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
const T *buf = buffer.data_ptr<T>();
T *out = outFeatures.data_ptr<T>();
for (int i = begin; i < end; ++i) {
buf = buffer_data + i * numPlanes;
out = features_data + indices_data[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
});
});
});
}
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>;
template <typename T>
using half_vec_t =
std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
template <typename T>
using half_vec_sadd_t =
std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>;
void sparse_gather_cuda(cudaStream_t stream, torch::Tensor buffer,
torch::Tensor features, torch::Tensor indices,
int size) {
if (size <= 0)
return;
int numPlanes = features.size(1);
auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type();
// auto timer = spconv::CudaContextTimer<>();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t = half_vec_t<T>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data_ptr<T>(),
indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, gatherVecKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>,
int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>,
int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
});
});
}
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
auto stream = at::cuda::getCurrentCUDAStream();
return sparse_gather_cuda(stream, buffer, features, indices, size);
}
void sparse_scatter_add_cuda(cudaStream_t stream, torch::Tensor buffer,
torch::Tensor outFeatures, torch::Tensor indices,
int size) {
if (size <= 0)
return;
int numPlanes = outFeatures.size(1);
auto dtype = outFeatures.scalar_type();
auto inds_dtype = indices.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t = half_vec_sadd_t<T>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half.
tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr,
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR();
}
});
});
}
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size) {
auto stream = at::cuda::getCurrentCUDAStream();
return sparse_scatter_add_cuda(stream, buffer, outFeatures, indices, size);
}
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if (size <= 0)
return;
int numPlanes = features.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type();
int inds_stride = indices.size(1);
int feature_stride = buffer.size(1);
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t = half_vec_t<T>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>(
[=, &buffer, &features, &indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
batchGatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR_V2("batchGatherVecBlockKernel");
}
if (size - nHotBlock > 0) {
batchGatherVecKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data_ptr<T>(),
indices.data_ptr<Index>(), size - nHotBlock,
nHotBlock, numPlanes / vecloadFactor,
inds_stride, feature_stride);
TV_CHECK_CUDA_ERR_V2("batchGatherVecKernel");
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
batchGatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
});
});
}
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
torch::Tensor outFeatures,
torch::Tensor indices, int size) {
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if (size <= 0)
return;
int numPlanes = outFeatures.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = outFeatures.scalar_type();
auto inds_dtype = indices.scalar_type();
int inds_stride = indices.size(1);
int feature_stride = buffer.size(1);
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t = half_vec_sadd_t<T>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = 1; // important for half.
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
batchScatterAddBlockKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
batchScatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>(), size - nHotBlock,
nHotBlock, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
batchScatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), size, 0, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
});
});
}
} // namespace spconv
#include <spconv/fused_conv.h>
#include <spconv/spconv_ops.h>
#include <spgemm/gemm_th.h>
#include <tensorview/tensor.h>
namespace spconv {
std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto NDim = kernelSize.size();
// CPU always use hash (tsl::robin_map).
bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
std::string msg = "due to limits of cuda hash, the volume of dense space "
"include batch size ";
msg += "must less than std::numeric_limits<int>::max() = 2e9";
TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
msg);
torch::Tensor indicePairs =
torch::full({2, kernelVolume, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
auto gridSize = batchSize * outputVolume;
if (useHash) {
gridSize = batchSize;
}
bool resetGrid = gridOut.numel() != 0;
if (!resetGrid) {
gridOut = torch::full({gridSize}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
}
gridOut = gridOut.view({batchSize, -1});
int64_t numActOut = -1;
for (int i = 0; i < NDim; ++i) {
if (subM) {
padding[i] = kernelSize[i] / 2;
stride[i] = 1;
}
}
// tv::ssprint("prepare", timer.report() / 1000.0);
if (subM) {
if (indices.device().type() == torch::kCPU) {
numActOut = create_submconv_indice_pair_cpu(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
dilation, outSpatialShape, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
numActOut = create_submconv_indice_pair_cuda(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
dilation, outSpatialShape, transpose, resetGrid, useHash);
if (numActOut == -1) {
auto device = indices.device();
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = create_submconv_indice_pair_cpu(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride,
padding, dilation, outSpatialShape, transpose, false, useHash);
return {indices.to(device), indicePairs.to(device),
indiceNum.to(device)};
}
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// tv::ssprint("subm", timer.report() / 1000.0);
return {indices, indicePairs, indiceNum};
} else {
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
numActOut = create_conv_indice_pair_cpu(
indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
padding, dilation, outSpatialShape, transpose, resetGrid, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
numActOut = create_conv_indice_pair_p1_cuda(
indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = create_conv_indice_pair_p2_cuda(
indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
outSpatialShape, transpose, resetGrid, useHash);
if (numActOut == -1) {
auto device = indices.device();
outInds = outInds.to({torch::kCPU});
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = create_conv_indice_pair_cpu(
indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,
stride, padding, dilation, outSpatialShape, transpose, false,
useHash);
return {outInds.to(device).slice(0, 0, numActOut),
indicePairs.to(device), indiceNum.to(device)};
}
}
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
int indicePairMaxSize = numActOut;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
// get indice pair second max size based on subM symmetric property
indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() +
indicePairMaxOffset);
if (indicePairMaxSize == 0) {
return output;
}
} else {
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
}
torch::Tensor inputBuffer =
torch::empty({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::empty({indicePairMaxSize, numOutPlanes}, options);
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0,
// std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// indicePairNumCpu.data_ptr<int>() + kernelVolume));
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// TODO torch::from_blob is a little slow
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse][i],
nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
// tv::ssprint("final subm gemm time", timer.report() / 1000.0);
return output;
}
template <int Algo>
torch::Tensor indiceConvFused(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("fused only support gpu");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
FusedConvDispatch<Algo>::fwd(output, features, filters[i],
indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return output;
}
template <bool BatchScatter>
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
auto batchScatter = BatchScatter;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indiceNum.size(0);
TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
// auto timer = spconv::CudaContextTimer<>();
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairNumVec =
std::vector<int>(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
auto indicePairMaxSizeIter =
std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
int indicePairMaxSize = *indicePairMaxSizeIter;
std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
indicePairNumVec.end(), std::greater<int>());
int indicePairTop2Size = indicePairNumVec[1];
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto indice_dtype = indicePairs.scalar_type();
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
// we cant use batch conv in subm directly because
// number of indice in the center of filter is much more than other
// filter location.
// so we first use top2 indice num to do batch conv, then
// do native conv (gemm) in center.
int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
int maxKernelVolumePart = kernelVolume;
std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
if (subM) {
maxKernelVolumePart = std::max(indicePairMaxOffset,
int(kernelVolume - indicePairMaxOffset - 1));
part_ranges = {{0, indicePairMaxOffset},
{indicePairMaxOffset + 1, kernelVolume}};
torch::mm_out(output, features, filters[indicePairMaxOffset]);
if (indicePairTop2Size == 0) {
return output;
}
}
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
torch::Tensor inputBuffer =
torch::empty({maxKernelVolumePart, bufferSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::empty({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
for (auto &range : part_ranges) {
int start = range.first;
int end = range.second;
int length = end - start;
int64_t size = length * bufferSize;
auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
auto indicePairs1Part =
tv::torch_slice_first_axis(indicePairs[inverse], start, end);
auto indicePairs2Part =
tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
size);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::bmm_out(outputBufferPart, inputBufferPart, filtersPart);
// totalGEMMTime += timer.report() / 1000.0;
if (batchScatter) {
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_scatter_add_cuda(outputBufferPart, output,
indicePairs2Part, size);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
} else {
for (int i = 0; i < length; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
if (nHot <= 0) {
continue;
}
if (device == torch::kCPU) {
sparse_scatter_add_cpu(outputBufferPart[i], output,
indicePairs2Part[i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(outputBufferPart[i], output,
indicePairs2Part[i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
}
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
return output;
}
std::vector<torch::Tensor>
indiceConvBwNative(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::empty(filterShape, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
int indicePairMaxSize = indicePairNumCpu.data_ptr<int>()[indicePairMaxOffset];
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
// get indice pair second max size based on subM symmetric property
indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() +
indicePairMaxOffset);
if (indicePairMaxSize == 0) {
return {inputGrad, filtersGrad.view(filterShape)};
}
} else {
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
}
torch::Tensor inputBuffer =
torch::empty({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::empty({indicePairMaxSize, numOutPlanes}, options);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
sparse_gather_cpu(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
sparse_gather_cuda(outputBuffer, outGrad, indicePairs[!inverse][i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
sparse_scatter_add_cpu(inputBuffer, inputGrad, indicePairs[inverse][i],
nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(inputBuffer, inputGrad, indicePairs[inverse][i],
nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <int Algo>
std::vector<torch::Tensor>
indiceConvBwFused(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
int indicePairMaxSize = indicePairNumCpu.data_ptr<int>()[indicePairMaxOffset];
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
FusedConvDispatch<Algo>::bwd(features, inputGrad, outGrad, filters[i],
filtersGrad[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <bool BatchScatter>
std::vector<torch::Tensor>
indiceConvBwBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto batchScatter = BatchScatter;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indiceNum.size(0);
TV_ASSERT_INVALID_ARG(kernelVolume > 1, "error");
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairNumVec =
std::vector<int>(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
auto indicePairMaxSizeIter =
std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumVec.begin();
int indicePairMaxSize = *indicePairMaxSizeIter;
std::nth_element(indicePairNumVec.begin(), indicePairNumVec.begin() + 1,
indicePairNumVec.end(), std::greater<int>());
int indicePairTop2Size = indicePairNumVec[1];
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto indice_dtype = indicePairs.scalar_type();
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
int bufferSize = subM ? indicePairTop2Size : indicePairMaxSize;
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
std::vector<std::pair<int, int>> part_ranges = {{0, kernelVolume}};
int maxKernelVolumePart = kernelVolume;
if (subM) {
maxKernelVolumePart = std::max(indicePairMaxOffset,
int(kernelVolume - indicePairMaxOffset - 1));
part_ranges = {{0, indicePairMaxOffset},
{indicePairMaxOffset + 1, kernelVolume}};
auto filtersGradSub = filtersGrad[indicePairMaxOffset];
auto filtersSub = filters[indicePairMaxOffset];
torch::mm_out(filtersGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filtersSub.t());
if (indicePairTop2Size == 0) {
return {inputGrad, filtersGrad.view(filterShape)};
}
}
torch::Tensor inputBuffer =
torch::zeros({maxKernelVolumePart, bufferSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({maxKernelVolumePart, bufferSize, numOutPlanes}, options);
for (auto &range : part_ranges) {
int start = range.first;
int end = range.second;
int length = end - start;
int64_t size = length * bufferSize;
auto inputBufferPart = tv::torch_slice_first_axis(inputBuffer, 0, length);
auto outputBufferPart = tv::torch_slice_first_axis(outputBuffer, 0, length);
auto indicePairs1Part =
tv::torch_slice_first_axis(indicePairs[inverse], start, end);
auto indicePairs2Part =
tv::torch_slice_first_axis(indicePairs[!inverse], start, end);
auto filtersPart = tv::torch_slice_first_axis(filters, start, end);
auto filtersGradPart = tv::torch_slice_first_axis(filtersGrad, start, end);
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_gather_cuda(inputBufferPart, features, indicePairs1Part,
size);
batch_sparse_gather_cuda(outputBufferPart, outGrad, indicePairs2Part,
size);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// filters: KV, I, O, inputBuffer: [KV, buffer, I]
// outputBuffer: [KV, buffer, O]
torch::bmm_out(filtersGradPart, inputBufferPart.permute({0, 2, 1}),
outputBufferPart);
torch::bmm_out(inputBuffer, outputBufferPart,
filtersPart.permute({0, 2, 1}));
if (batchScatter) {
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_scatter_add_cuda(inputBufferPart, inputGrad,
indicePairs1Part, size);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
} else {
for (int i = 0; i < length; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i + start];
if (nHot <= 0) {
continue;
}
if (device == torch::kCPU) {
sparse_scatter_add_cpu(inputBufferPart[i], inputGrad,
indicePairs1Part[i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(inputBufferPart[i], inputGrad,
indicePairs1Part[i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <int Algo> struct ConvDispatch;
template <> struct ConvDispatch<kNative> {
constexpr static auto *fwd = indiceConvNative;
constexpr static auto *bwd = indiceConvBwNative;
};
template <> struct ConvDispatch<kBatch> {
constexpr static auto *fwd = indiceConvBatch<false>;
constexpr static auto *bwd = indiceConvBwBatch<false>;
};
template <> struct ConvDispatch<kBatchGemmGather> {
constexpr static auto *fwd = indiceConvBatch<true>;
constexpr static auto *bwd = indiceConvBwBatch<true>;
};
template <> struct ConvDispatch<kSparseConvNet> {
constexpr static auto *fwd = indiceConvFused<kFSparseConvNet>;
constexpr static auto *bwd = indiceConvBwFused<kFSparseConvNet>;
};
template <> struct ConvDispatch<kMinkowskiEngine> {
constexpr static auto *fwd = indiceConvFused<kFMinkowskiEngine>;
constexpr static auto *bwd = indiceConvBwFused<kFMinkowskiEngine>;
};
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
int64_t algo) {
torch::Tensor res;
tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
constexpr int AlgoValue = decltype(I)::value;
res = ConvDispatch<AlgoValue>::fwd(features, filters, indicePairs,
indiceNum, numActOut, _inverse, _subM);
});
return res;
}
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo) {
std::vector<torch::Tensor> res;
tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
constexpr int AlgoValue = decltype(I)::value;
res = ConvDispatch<AlgoValue>::bwd(features, filters, outGrad, indicePairs,
indiceNum, _inverse, _subM);
});
return res;
}
} // namespace spconv
set(ALL_FILES ${ALL_FILES} gemm.cu)
add_library(spgemm SHARED ${ALL_FILES})
target_include_directories(spgemm PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE} ${CUTLASS_INCLUDE} )
set_property(TARGET spgemm PROPERTY CUDA_STANDARD 14)
set_property(TARGET spgemm PROPERTY CXX_STANDARD 14)
set_target_properties(spgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(spgemm PRIVATE ${ALL_LIBS})
install (TARGETS spgemm DESTINATION lib)
#include <spgemm/gemm.h>
#include <spgemm/gemm_th.h>
namespace spconv {
template <typename T>
using determine_half_t =
std::conditional_t<std::is_same<T, at::Half>::value, cutlass::half_t, T>;
void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
torch::Tensor b) {
TV_ASSERT_RT_ERR(c.dtype() == a.dtype() && c.dtype() == b.dtype(),
"dtype must be same");
TV_ASSERT_RT_ERR(c.is_contiguous() && b.is_contiguous() && a.is_contiguous(),
"error");
auto M = a.size(0);
auto K = a.size(1);
auto N = b.size(1);
TV_ASSERT_RT_ERR(b.size(0) == K && c.size(0) == M && c.size(1) == N, "error");
tv::dispatch_torch<float, at::Half>(c.scalar_type(), [&](auto I) {
using T = decltype(I);
using HalfT = determine_half_t<T>;
auto status = cutlassGemm<HalfT, false, false, false>(
stream, M, N, K, HalfT(1.0), reinterpret_cast<HalfT *>(a.data_ptr<T>()),
a.size(1), reinterpret_cast<HalfT *>(b.data_ptr<T>()), b.size(1),
HalfT(0.0), reinterpret_cast<HalfT *>(c.data_ptr<T>()), c.size(1));
TV_ASSERT_RT_ERR(status == cudaSuccess, "error");
});
}
void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b) {
auto stream = at::cuda::getCurrentCUDAStream();
return cutlass_mm_out(stream, c, a, b);
}
} // namespace spconv
\ No newline at end of file
#define TV_CUDA
#include <cutlass/gemm/device/gemm.h>
#include <spgemm/gemm.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
int main() {
auto M = 100000;
auto N = 128;
auto K = 128;
auto a =
torch::rand({M, K}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
auto b = torch::rand({K, N}, a.options());
auto c = torch::zeros({a.size(0), b.size(1)}, a.options());
auto c2 = torch::zeros({a.size(0), b.size(1)}, a.options());
torch::mm_out(c, a, b);
auto status = spconv::cutlassGemm<float, false, false, false>(
0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
auto err = torch::norm(c2 - c);
tv::ssprint(status, "linalg norm", err);
tv::ssprint((c.view({-1}) == 0).sum());
auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < 10; ++i) {
torch::mm_out(c, a, b);
tv::ssprint("mm", timer.report() / 1000.0);
spconv::cutlassGemm<float, false, false, false>(
0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
tv::ssprint("cutlass_mm", timer.report() / 1000.0);
}
return 0;
}
\ No newline at end of file
if (SPCONV_BuildCUDA)
add_library(spconv_nms STATIC nms.cu)
set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv_nms PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(spconv_nms ${CUDA_CUDART})
install (TARGETS spconv_nms DESTINATION lib)
endif()
add_library(spconv_utils SHARED all.cc)
set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(spconv_utils PROPERTIES SOVERSION 1)
target_include_directories(spconv_utils PRIVATE ${ALL_INCLUDE}
${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}")
if (SPCONV_BuildCUDA)
target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module spconv_nms)
else()
target_link_libraries(spconv_utils pybind11::module)
endif()
install (TARGETS spconv_utils DESTINATION lib)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment