// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>

#include <chrono>
#include <limits>
#include <type_traits>

namespace spconv {
namespace functor {
// GPU specialization of the first pass of convolution index-pair generation.
//
// Launches prepareDeConvIndicePairsKernel when `transpose` is true and
// prepareIndicePairsKernel otherwise. Both kernels receive the same argument
// list and are enqueued on the stream returned by `d.getStream()` with no
// dynamic shared memory. The grid size is derived from the number of rows in
// `indicesIn` via tv::launch::getBlocks, and the block size is
// tv::launch::CUDA_NUM_THREADS.
//
// Returns 0 without launching anything when `indicesIn.dim(0)` is 0, and 1
// after a launch has been issued and checked with TV_CHECK_CUDA_ERR().
//
// NOTE(review): the template argument 4096 is forwarded to the kernels; its
// meaning is defined alongside the kernels and is not visible in this file.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose) {
    const auto numActIn = indicesIn.dim(0);
    // Nothing to process: skip the launch entirely instead of building a
    // launch configuration from a zero element count.
    if (numActIn == 0) return 0;

    // Both branches share the same launch geometry and stream.
    const auto numBlocks = tv::launch::getBlocks(numActIn);
    const auto stream = d.getStream();

    // auto timer = spconv::CudaContextTimer<>();
    if (transpose) {
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<numBlocks, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
              indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
              indicePairUnique, kernelSize, stride, padding, dilation,
              outSpatialShape);
    } else {
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<numBlocks, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
              indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
              indicePairUnique, kernelSize, stride, padding, dilation,
              outSpatialShape);
    }
    TV_CHECK_CUDA_ERR();
    // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
    return 1;
  }
};

// GPU specialization of the second pass of convolution index-pair generation.
//
// Enqueues, in order, on the stream returned by `d.getStream()`:
//   1. assignGridAndIndiceOutKernel, sized from `numAct`;
//   2. assignIndicePairsKernel, sized from the row count of `indicesIn`;
//   3. resetGridKernel, sized from `numAct`, only when `resetGrid` is true.
// Each launch is followed by TV_CHECK_CUDA_ERR().
//
// `numAct` is `indicePairUnique.dim(0) - 1`.
// NOTE(review): presumably the last entry of `indicePairUnique` is a sentinel
// that does not correspond to an output location; confirm against the caller.
//
// Returns 0 without launching anything when `indicesIn.dim(0)` is 0,
// otherwise `numAct`. The `transpose` argument is not read by this
// specialization.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    const Index batchSize = gridsOut.dim(0);
    const auto numActIn = indicesIn.dim(0);
    // Nothing to process: skip every launch.
    if (numActIn == 0) return 0;

    const Index numAct = indicePairUnique.dim(0) - 1;
    // NOTE(review): numAct is not checked before it is used to size a launch.
    // If `indicePairUnique` can hold one element or fewer here, the grid size
    // would be derived from a non-positive count; confirm callers rule this
    // out.
    const auto stream = d.getStream();
    const auto actBlocks = tv::launch::getBlocks(numAct);

    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<actBlocks, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
            indicesOut, gridsOut, numAct, indicePairs, indicePairUnique,
            outSpatialShape, batchSize);
    TV_CHECK_CUDA_ERR();

    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           stream>>>(indicesOut, gridsOut, numActIn, indicePairs,
                     indicePairUnique, outSpatialShape);
    TV_CHECK_CUDA_ERR();

    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<actBlocks, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
              indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_CUDA_ERR();
    }
    return numAct;
  }
};

// GPU specialization that builds index pairs for the "SubM" variant.
//
// Enqueues, in order, on the stream returned by `d.getStream()`:
//   1. prepareSubMGridKernel(indicesIn, gridsOut, outSpatialShape);
//   2. getSubMIndicePairsKernel, which additionally receives `indicePairs`,
//      `indiceNum` and the kernel geometry vectors;
//   3. resetGridSubMKernel, only when `resetGrid` is true.
// Every launch is sized from the row count of `indicesIn` and is followed by
// TV_CHECK_CUDA_ERR().
//
// Returns the row count of `indicesIn` (0 when it is empty, in which case
// nothing is launched). The `transpose` argument is not read by this
// specialization.
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    auto inputCount = indicesIn.dim(0);
    if (inputCount == 0) {
      return 0;
    }

    // All three kernels use the same grid size, block size and stream.
    auto gridSize = tv::launch::getBlocks(inputCount);
    auto stream = d.getStream();

    // auto timer = spconv::CudaContextTimer<>();
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<gridSize, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
            indicesIn, gridsOut, outSpatialShape);
    TV_CHECK_CUDA_ERR();

    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<gridSize, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
            indicesIn, gridsOut, indicePairs, indiceNum, kernelSize, stride,
            padding, dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;

    if (!resetGrid) {
      return inputCount;
    }
    resetGridSubMKernel<Index, IndexGrid, NDim>
        <<<gridSize, tv::launch::CUDA_NUM_THREADS, 0, stream>>>(
            indicesIn.data(), gridsOut, outSpatialShape, inputCount);
    TV_CHECK_CUDA_ERR();
    return inputCount;
  }
};
}  // namespace functor

// Explicitly instantiates each GPU index-pair functor for one combination of
// index type and dimension count, always using `int` as the grid index type.
// The expansion does not end in a semicolon; the invocation provides it.
#define DECLARE_GPU_SPECS_INDEX_NDIM(IndexT, NumDim)                         \
  template struct functor::CreateConvIndicePairFunctor<tv::GPU, IndexT, int, \
                                                       NumDim>;              \
  template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, IndexT,    \
                                                         int, NumDim>;       \
  template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, IndexT,    \
                                                         int, NumDim>;       \
  template struct functor::CreateSubMIndicePairFunctor<tv::GPU, IndexT, int, \
                                                       NumDim>

// Instantiates the functors above for dimension counts 1 through 4 with the
// given index type. As above, the trailing semicolon comes from the caller.
#define DECLARE_GPU_INDEX(IndexT)          \
  DECLARE_GPU_SPECS_INDEX_NDIM(IndexT, 1); \
  DECLARE_GPU_SPECS_INDEX_NDIM(IndexT, 2); \
  DECLARE_GPU_SPECS_INDEX_NDIM(IndexT, 3); \
  DECLARE_GPU_SPECS_INDEX_NDIM(IndexT, 4)

// Only `int` indices are instantiated for the GPU in this file.
DECLARE_GPU_INDEX(int);

// The helper macros are local to this translation unit.
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
}  // namespace spconv