"official/vision/serving/export_base.py" did not exist on "84feedee2b6d0933cd6c3d7e420c369fe17c6cda"
blockwise_4d_tensor_op.hip.hpp 10.8 KB
Newer Older
1
#pragma once
2
#include "ConstantTensorDescriptor.hip.hpp"
3

Chao Liu's avatar
Chao Liu committed
4
template <unsigned BlockSize, class Float, class DstDesc, class F>
5
__device__ void
Chao Liu's avatar
Chao Liu committed
6
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
7
{
Chao Liu's avatar
Chao Liu committed
8
9
10
11
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
12

13
14
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
15
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
16

17
18
19
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
20
21
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
22
23
24
    }
#endif

Chao Liu's avatar
Chao Liu committed
25
26
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
27
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
47
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
74
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
75
76
77
        }
    }
}
Chao Liu's avatar
Chao Liu committed
78

Chao Liu's avatar
Chao Liu committed
79
// Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3]
80
81
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
Chao Liu's avatar
Chao Liu committed
82
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
83
          class Float,
84
85
          class SrcDesc,
          class DstDesc,
Chao Liu's avatar
Chao Liu committed
86
87
          class SrcOpLengths,
          class DstFromSrcReorder,
Chao Liu's avatar
Chao Liu committed
88
          class F>
Chao Liu's avatar
Chao Liu committed
89
90
__device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
    SrcDesc,
Chao Liu's avatar
Chao Liu committed
91
    const Float* __restrict__ p_src,
Chao Liu's avatar
Chao Liu committed
92
93
94
95
96
    DstDesc,
    Float* __restrict__ p_dst,
    SrcOpLengths,
    DstFromSrcReorder,
    F f)
Chao Liu's avatar
Chao Liu committed
97
{
Chao Liu's avatar
Chao Liu committed
98
99
100
101
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
102

Chao Liu's avatar
Chao Liu committed
103
104
105
106
    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
    constexpr unsigned IR2 = DstFromSrcReorder{}.Get(I2);
    constexpr unsigned IR3 = DstFromSrcReorder{}.Get(I3);
Chao Liu's avatar
Chao Liu committed
107

108
109
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
Chao Liu's avatar
Chao Liu committed
110
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
Chao Liu's avatar
Chao Liu committed
111

112
    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
Chao Liu's avatar
Chao Liu committed
113
114
115
116
117

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

118
        unsigned did[4];
Chao Liu's avatar
Chao Liu committed
119

120
        did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
121

122
        is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
123

124
        did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
125

126
        is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
127

128
        did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
129

130
        is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
131

132
        did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
133

134
        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
Chao Liu's avatar
Chao Liu committed
135

Chao Liu's avatar
Chao Liu committed
136
        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
137
138

        f(p_src[aindex], p_dst[bindex]);
Chao Liu's avatar
Chao Liu committed
139
140
    }

141
    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
Chao Liu's avatar
Chao Liu committed
142
143
144
145
146

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

147
        if(is < ref_desc.GetElementSize())
Chao Liu's avatar
Chao Liu committed
148
        {
149
150
151
            unsigned did[4];

            did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
152

153
            is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
154

155
            did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
156

157
            is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
158

159
            did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
160

161
            is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
162

163
            did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
164

165
            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
166

Chao Liu's avatar
Chao Liu committed
167
            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
168

169
            f(p_src[aindex], p_dst[bindex]);
170
171
172
173
        }
    }
}

Chao Liu's avatar
Chao Liu committed
174
175
template <unsigned BlockSize, class Float, class DstDesc>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
176
{
Chao Liu's avatar
Chao Liu committed
177
    auto f_set_zero = [](Float& v) { v = Float(0); };
Chao Liu's avatar
Chao Liu committed
178

Chao Liu's avatar
Chao Liu committed
179
    blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
180
}
181

Chao Liu's avatar
Chao Liu committed
182
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
183
          class Float,
184
185
          class SrcDesc,
          class DstDesc,
Chao Liu's avatar
Chao Liu committed
186
187
188
189
          class SrcOpLengths,
          class DstFromSrcReorder>
__device__ void
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
Chao Liu's avatar
Chao Liu committed
190
                                                     const Float* __restrict__ p_src,
Chao Liu's avatar
Chao Liu committed
191
192
193
194
                                                     DstDesc,
                                                     Float* __restrict__ p_dst,
                                                     SrcOpLengths,
                                                     DstFromSrcReorder)
195
{
Chao Liu's avatar
Chao Liu committed
196
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };
197

Chao Liu's avatar
Chao Liu committed
198
199
    blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
200
201
}

Chao Liu's avatar
Chao Liu committed
202
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
203
struct Blockwise4dTensorCopy1
Chao Liu's avatar
Chao Liu committed
204
{
Chao Liu's avatar
Chao Liu committed
205
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
Chao Liu's avatar
Chao Liu committed
206
207
    {
        constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
208

Chao Liu's avatar
Chao Liu committed
209
210
211
212
213
        blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
    }
};

214
215
216
217
218
219
template <unsigned BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class DstOpLengths,
          class GlobalLowerPads>
220
struct BlockwiseChwnTensorCopyPadded
221
{
Chao Liu's avatar
Chao Liu committed
222
    __device__ void Run(const Float* __restrict__ p_src,
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
                        unsigned c_block_data_begin,
                        unsigned ho_block_data_begin,
                        unsigned wo_block_data_begin,
                        unsigned n_block_data_begin,
                        Float* __restrict__ p_dst,
                        unsigned h_block_pad_low,
                        unsigned w_block_pad_low,
                        unsigned h_block_pad_up,
                        unsigned w_block_pad_up) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};
        constexpr auto ref_desc = make_ConstantTensorDescriptor(DstOpLengths{});

        constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
        constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);

        constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;

Chao Liu's avatar
Chao Liu committed
247
        const Float* p_src_tmp =
248
249
250
251
            p_src + src_desc.Get1dIndex(c_block_data_begin,
                                        (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
                                        (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
                                        n_block_data_begin);
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338

#if 0
        if(get_thread_local_1d_id() == 0)
        {
            print_ConstantTensorDescriptor(src_desc, "src_desc: ");
            print_ConstantTensorDescriptor(dst_desc, "dst_desc: ");
            print_ConstantTensorDescriptor(ref_desc, "ref_desc: ");

            printf("%u %u, \t"
                   "h_global_pad_low %u w_global_pad_low %u \t"
                   "h_block_pad_low %u w_block_pad_low %u h_block_pad_up %u  w_block_pad_up %u \t"
                   "\n",
                   get_block_1d_id(),
                   get_thread_local_1d_id(),
                   h_global_pad_low,
                   w_global_pad_low,
                   h_block_pad_low,
                   w_block_pad_low,
                   h_block_pad_up,
                   w_block_pad_up);
        }
#endif

        for(unsigned iloop = 0; iloop < NLoop; ++iloop)
        {
            unsigned is = threadIdx.x + iloop * BlockSize;

            unsigned did[4];

            did[0] = is / ref_desc.GetStride(I0);

            is -= did[0] * ref_desc.GetStride(I0);

            did[1] = is / ref_desc.GetStride(I1);

            is -= did[1] * ref_desc.GetStride(I1);

            did[2] = is / ref_desc.GetStride(I2);

            is -= did[2] * ref_desc.GetStride(I2);

            did[3] = is / ref_desc.GetStride(I3);

            const unsigned bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]);

            p_dst[bindex] =
                (did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
                 did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
                    ? Float(0)
                    : p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])];
        }

        constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);

        if(has_tail)
        {
            unsigned is = threadIdx.x + NLoop * BlockSize;

            if(is < ref_desc.GetElementSize())
            {
                unsigned did[4];

                did[0] = is / ref_desc.GetStride(I0);

                is -= did[0] * ref_desc.GetStride(I0);

                did[1] = is / ref_desc.GetStride(I1);

                is -= did[1] * ref_desc.GetStride(I1);

                did[2] = is / ref_desc.GetStride(I2);

                is -= did[2] * ref_desc.GetStride(I2);

                did[3] = is / ref_desc.GetStride(I3);

                const unsigned bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]);

                p_dst[bindex] =
                    (did[1] < h_block_pad_low ||
                     did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
                     did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
                        ? Float(0)
                        : p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])];
            }
        }
    }
Chao Liu's avatar
Chao Liu committed
339
};