Unverified commit b820b0be authored by Wenwei Zhang, committed by GitHub

[Fix]: fix compilation error in pytorch 1.7 (#393)

* fix compilation error in pytorch 1.7

* add pt1.7 build

* Update build.yml
parent dabf0a26
build.yml:

@@ -38,7 +38,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.6, 3.7]
-        torch: [1.5.0+cu101, 1.6.0+cu101]
+        torch: [1.5.0+cu101, 1.6.0+cu101, 1.7.0+cu101]
         include:
           - torch: 1.5.0+cu101
             torchvision: 0.6.0+cu101
@@ -48,6 +48,10 @@ jobs:
             mmcv: 1.6.0+cu101
             torchvision: 0.7.0+cu101
             cuda_arch: "7.0"
+          - torch: 1.7.0+cu101
+            mmcv: 1.7.0+cu101
+            torchvision: 0.8.1+cu101
+            cuda_arch: "7.0"
     steps:
       - uses: actions/checkout@v2
...
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/types.h>
 #include <ATen/cuda/CUDAApplyUtils.cuh>

 typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
@@ -114,17 +115,16 @@ __global__ void coors_map_init_kernel(const int64_t *coors_id,
 }

 template <typename T, typename T_int>
-__global__ void
-feats_reduce_kernel(const T *feats, const T_int *coors, int32_t *coors_map,
-                    int32_t *reduce_count, // shall be 0 at initialization
-                    T *reduced_feats,      // shall be 0 at initialization
-                    T_int *out_coors, const int num_input, const int num_feats,
-                    const int NDim, const reduce_t reduce_type) {
+__global__ void feats_reduce_kernel(
+    const T *feats, const T_int *coors, int32_t *coors_map,
+    int32_t *reduce_count,  // shall be 0 at initialization
+    T *reduced_feats,       // shall be 0 at initialization
+    T_int *out_coors, const int num_input, const int num_feats, const int NDim,
+    const reduce_t reduce_type) {
   for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
        x += gridDim.x * blockDim.x) {
     int32_t reduce_to = coors_map[x];
-    if (reduce_to == -1)
-      continue;
+    if (reduce_to == -1) continue;

     const T_int *coors_offset = coors + x * NDim;
     T_int *out_coors_offset = out_coors + reduce_to * NDim;
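
The for loop above is a grid-stride loop: each thread starts at its global index and advances by the total number of threads in the grid, so a grid capped at a fixed block count still covers every input point. A minimal standalone sketch of the pattern (hypothetical kernel, not part of this commit):

#include <cuda_runtime.h>

// Grid-stride loop as in feats_reduce_kernel above: thread x handles
// elements x, x + stride, x + 2*stride, ..., where stride is the total
// number of threads launched across the whole grid.
__global__ void fill_kernel(float *out, int n, float value) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < n;
       x += gridDim.x * blockDim.x) {
    out[x] = value;
  }
}
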
@@ -207,13 +207,13 @@ __global__ void max_reduce_traceback_scatter_idx_kernel(
 }

 template <typename T>
-__global__ void
-max_reduce_scatter_grad_kernel(T *grad_feats, const T *grad_reduced_feats,
-                               const int32_t *reduce_from,
-                               const int num_reduced, const int num_feats) {
+__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
+                                               const T *grad_reduced_feats,
+                                               const int32_t *reduce_from,
+                                               const int num_reduced,
+                                               const int num_feats) {
   for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
        x += gridDim.x * blockDim.x) {
     const int reduced_offset = x * num_feats;
     const int32_t *scatter_to_offset = reduce_from + reduced_offset;
     const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
@@ -227,9 +227,8 @@ max_reduce_scatter_grad_kernel(T *grad_feats, const T *grad_reduced_feats,
 namespace voxelization {

-std::vector<at::Tensor>
-dynamic_point_to_voxel_forward_gpu(const at::Tensor &feats,
-                                   const at::Tensor &coors,
-                                   const reduce_t reduce_type) {
+std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
+    const at::Tensor &feats, const at::Tensor &coors,
+    const reduce_t reduce_type) {
   CHECK_INPUT(feats);
   CHECK_INPUT(coors);
@@ -239,11 +238,10 @@ dynamic_point_to_voxel_forward_gpu(const at::Tensor &feats,
   const int num_feats = feats.size(1);

   auto coors_id = at::empty({num_input}, coors.options().dtype(torch::kInt64));
-  auto coor_space_dim = coors.max_values(0) + 1;
+  auto coor_space_dim = std::get<0>(coors.max(0)) + 1;
   auto coors_map_sorted =
       at::empty({num_input}, coors.options().dtype(torch::kInt32));
-  auto coors_map =
-      at::empty({num_input}, coors.options().dtype(torch::kInt32));
+  auto coors_map = at::empty({num_input}, coors.options().dtype(torch::kInt32));
   auto num_coors = at::zeros({1}, coors.options().dtype(torch::kInt32));

   AT_DISPATCH_INTEGRAL_TYPES(
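
This hunk is the actual compilation fix: Tensor::max_values was deprecated and then removed from the C++ API in PyTorch 1.7, while Tensor::max(dim), which returns a (values, indices) tuple, compiles on both old and new versions. A minimal sketch of the two spellings (standalone example, not part of this file):

#include <torch/torch.h>
#include <tuple>

int main() {
  // (N, NDim) integer coordinates, as in the voxelization code above.
  auto coors = torch::tensor({0, 1, 2, 3, 1, 0}, torch::dtype(torch::kInt32))
                   .reshape({2, 3});

  // PyTorch <= 1.6 only; no longer compiles on 1.7:
  // auto coor_space_dim = coors.max_values(0) + 1;

  // Portable replacement: max(dim) returns (values, indices).
  auto coor_space_dim = std::get<0>(coors.max(0)) + 1;  // -> [4, 2, 3]
  return 0;
}
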
@@ -276,8 +274,7 @@ dynamic_point_to_voxel_forward_gpu(const at::Tensor &feats,
   const int num_coors_cpu =
       coors_map_sorted[-1].cpu().data_ptr<int32_t>()[0] + 1;
   auto out_coors = at::empty({num_coors_cpu, NDim}, coors.options());
-  auto reduced_feats =
-      at::empty({num_coors_cpu, num_feats}, feats.options());
+  auto reduced_feats = at::empty({num_coors_cpu, num_feats}, feats.options());
   auto reduce_count =
       at::zeros({num_coors_cpu}, coors.options().dtype(torch::kInt32));
@@ -313,10 +310,12 @@ dynamic_point_to_voxel_forward_gpu(const at::Tensor &feats,
   return {reduced_feats, out_coors, coors_map, reduce_count};
 }

-void dynamic_point_to_voxel_backward_gpu(
-    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
-    const at::Tensor &feats, const at::Tensor &reduced_feats,
-    const at::Tensor &coors_map, const at::Tensor &reduce_count,
+void dynamic_point_to_voxel_backward_gpu(at::Tensor &grad_feats,
+                                         const at::Tensor &grad_reduced_feats,
+                                         const at::Tensor &feats,
+                                         const at::Tensor &reduced_feats,
+                                         const at::Tensor &coors_map,
+                                         const at::Tensor &reduce_count,
                                          const reduce_t reduce_type) {
   CHECK_INPUT(grad_feats);
   CHECK_INPUT(grad_reduced_feats);
@@ -336,9 +335,8 @@ void dynamic_point_to_voxel_backward_gpu(
   AT_DISPATCH_FLOATING_TYPES(
       grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
       ([&] {
-        dim3 blocks
-        (std::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
-                  maxGridDim));
+        dim3 blocks(std::min(
+            at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
         dim3 threads(threadsPerBlock);
         add_reduce_traceback_grad_kernel<<<blocks, threads>>>(
             grad_feats.data_ptr<scalar_t>(),
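
This and the two launches below size the grid the same way: enough blocks for one thread per element, capped at maxGridDim, with the kernels' grid-stride loops (sketched earlier) covering anything beyond the cap; at::cuda::ATenCeilDiv(a, b) is plain ceiling division. A host-side sketch of the computation — the threadsPerBlock and maxGridDim values here are assumptions, the real constants are defined earlier in this file outside the visible hunks:

#include <algorithm>

// Capped grid sizing: ceil(num_items / threadsPerBlock) blocks, but never
// more than maxGridDim; leftover elements are handled by grid-stride loops.
int compute_blocks(int num_items) {
  const int threadsPerBlock = 512;  // assumed; defined elsewhere in the file
  const int maxGridDim = 50000;     // assumed; defined elsewhere in the file
  return std::min((num_items + threadsPerBlock - 1) / threadsPerBlock,
                  maxGridDim);
}
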
@@ -353,9 +351,8 @@ void dynamic_point_to_voxel_backward_gpu(
   AT_DISPATCH_FLOATING_TYPES(
       grad_reduced_feats.scalar_type(),
       "max_reduce_traceback_scatter_idx_kernel", ([&] {
-        dim3 blocks
-        (std::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
-                  maxGridDim));
+        dim3 blocks(std::min(
+            at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
         dim3 threads(threadsPerBlock);
         max_reduce_traceback_scatter_idx_kernel<<<blocks, threads>>>(
             feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
@@ -367,9 +364,8 @@ void dynamic_point_to_voxel_backward_gpu(
   AT_DISPATCH_FLOATING_TYPES(
       grad_reduced_feats.scalar_type(),
       "max_reduce_traceback_scatter_idx_kernel", ([&] {
-        dim3 blocks(
-            std::min(at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock),
-                     maxGridDim));
+        dim3 blocks(std::min(
+            at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
         dim3 threads(threadsPerBlock);
         max_reduce_scatter_grad_kernel<<<blocks, threads>>>(
             grad_feats.data_ptr<scalar_t>(),
...