Commit f08268fc authored by Yan Yan's avatar Yan Yan
Browse files

working on batch indice conv (almost finished)

parent bf473de0
......@@ -14,10 +14,11 @@
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <tensorview/kernel_utils.h>
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
const Index *indices, int size,
......@@ -99,12 +100,13 @@ __global__ void gatherVecBlockKernel(T *buffer, const T *features,
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchGatherGenericKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes, int batch_stride,
int numPlanes, int indice_batch_stride,
int feature_batch_stride) {
// size: max indice num * kernel volume
// inds: [volume, num_elems]
int ILPStrideX[NumILP];
Index inds[NumILP];
Index batchIdx[NumILP];
Index inds_elem;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
......@@ -113,16 +115,18 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size) {
batchIdx[ilp] = ix / feature_batch_stride;
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
inds_elem = ix + ILPStrideX[ilp];
inds[ilp] =
indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1)
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
features[inds[ilp] + iy];
features[inds[ilp] * numPlanes + iy];
}
}
}
......@@ -131,9 +135,12 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void batchGatherVecKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int feature_offset,
int numPlanes, int indice_batch_stride,
int feature_batch_stride) {
int ILPStrideX[NumILP];
Index inds[NumILP];
Index inds_elem;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
......@@ -141,16 +148,21 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
if (ix + ILPStrideX[ilp] < size) {
inds_elem = ix + ILPStrideX[ilp] + feature_offset;
inds[ilp] =
indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1)
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
reinterpret_cast<const VecType *>(
features)[inds[ilp] * numPlanes + iy];
}
}
}
......@@ -158,24 +170,34 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void batchGatherVecBlockKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
__global__ void
batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices,
int size, int numPlanes, int indice_batch_stride,
int feature_batch_stride) {
int ILPStrideY[NumILP];
Index inds;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
features += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
Index inds_elem;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
inds_elem = iy + ILPStrideY[ilp];
inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
if (inds != -1) {
reinterpret_cast<VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
reinterpret_cast<const VecType *>(
features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
threadIdx.x];
features)[inds * numPlanes + threadIdx.x];
}
}
}
}
......@@ -240,6 +262,68 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int feature_offset, int numPlanes,
int indice_batch_stride,
int feature_batch_stride) {
int ILPStrideX[NumILP];
Index inds[NumILP];
Index inds_elem;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size) {
inds_elem = ix + ILPStrideX[ilp] + feature_offset;
inds[ilp] =
indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1) {
gpuAtomicAdd(outFeatures + inds[ilp] * numPlanes + iy,
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]);
// outFeatures[inds[ilp] * numPlanes + iy] +=
// buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size, int numPlanes,
int indice_batch_stride, int feature_batch_stride) {
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
outFeatures += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
Index inds, inds_elem;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
inds_elem = iy + ILPStrideY[ilp];
inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
if (inds != -1) {
gpuAtomicAdd(outFeatures + inds * numPlanes + threadIdx.x,
buffer[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x]);
}
}
}
}
} // namespace spconv
#endif
\ No newline at end of file
......@@ -19,6 +19,12 @@
namespace spconv {
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void batch_sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size);
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
......
......@@ -338,6 +338,9 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM);
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
......
......@@ -107,7 +107,7 @@ def indice_conv(features,
num_activate_out,
inverse=False,
subm=False):
return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
return torch.ops.spconv.indice_conv_batch(features, filters, indice_pairs,
indice_pair_num, num_activate_out,
int(inverse), int(subm))
......
......@@ -30,6 +30,7 @@ static auto registry =
.op("spconv::get_indice_pairs_grid_3d",
&spconv::getIndicePairPreGrid<3>)
.op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
.op("spconv::indice_maxpool", &spconv::indiceMaxPool)
......
......@@ -161,4 +161,154 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
});
}
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if (size <= 0)
return;
int numPlanes = features.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type();
int inds_stride = indices.size(1);
int feature_stride = buffer.size(1);
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>(
[=, &buffer, &features, &indices, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
batchGatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
auto indices_offset = (nHotBlock / feature_stride) * inds_stride + nHotBlock % feature_stride;
batchGatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data_ptr<T>(),
indices.data_ptr<Index>() + indices_offset,
size - nHotBlock, nHotBlock, numPlanes / vecloadFactor,
inds_stride, feature_stride);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
batchGatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
});
});
}
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
torch::Tensor outFeatures,
torch::Tensor indices, int size) {
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if (size <= 0)
return;
int numPlanes = outFeatures.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = outFeatures.scalar_type();
auto inds_dtype = indices.scalar_type();
int inds_stride = indices.size(1);
int feature_stride = buffer.size(1);
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = 1; // important for half.
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
batchScatterAddBlockKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
// int indices_offset = (nHotBlock / feature_stride) * inds_stride + nHotBlock % feature_stride;
batchScatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>(),
size - nHotBlock, nHotBlock, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
batchScatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), size, 0, numPlanes, inds_stride,
feature_stride);
TV_CHECK_CUDA_ERR();
}
});
});
}
} // namespace spconv
\ No newline at end of file
......@@ -211,6 +211,67 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
return output;
}
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indiceNum.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
auto size = kernelVolume * indicePairMaxSize;
if (device == torch::kCPU) {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_gather_cuda(inputBuffer, features, indicePairs[inverse], size);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
torch::bmm_out(outputBuffer, inputBuffer, filters);
if (device == torch::kCPU) {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
batch_sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse],
size);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return output;
}
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
......
......@@ -720,6 +720,6 @@ def main_subm():
if __name__ == '__main__':
main_subm()
main()
# unittest.main()
# TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment