blockwise_2d_tensor_op.cuh 14 KB
Newer Older
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "ConstantTensorDescriptor.cuh"
3

Chao Liu's avatar
Chao Liu committed
4
template <unsigned BlockSize, class Float, class DstDesc, class F>
5
__device__ void
Chao Liu's avatar
Chao Liu committed
6
blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
7
{
Chao Liu's avatar
Chao Liu committed
8
9
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
Chao Liu's avatar
Chao Liu committed
10

11
12
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
13
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
14

15
16
17
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
18
19
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
20
21
22
    }
#endif

Chao Liu's avatar
Chao Liu committed
23
24
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
25
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
26
27
28
29
30
31
32
33
34
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

Chao Liu's avatar
Chao Liu committed
35
        const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
Chao Liu's avatar
Chao Liu committed
36

Chao Liu's avatar
Chao Liu committed
37
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

Chao Liu's avatar
Chao Liu committed
54
            const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
Chao Liu's avatar
Chao Liu committed
55

Chao Liu's avatar
Chao Liu committed
56
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
57
58
59
        }
    }
}
Chao Liu's avatar
Chao Liu committed
60

Chao Liu's avatar
Chao Liu committed
61
// Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3]
62
63
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
Chao Liu's avatar
Chao Liu committed
64
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
65
          class Float,
66
67
          class SrcDesc,
          class DstDesc,
Chao Liu's avatar
Chao Liu committed
68
69
          class SrcOpLengths,
          class DstFromSrcReorder,
Chao Liu's avatar
Chao Liu committed
70
          class F>
Chao Liu's avatar
Chao Liu committed
71
__device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
Chao Liu's avatar
Chao Liu committed
72
73
74
75
76
77
78
    SrcDesc,
    Float* const __restrict__ p_src,
    DstDesc,
    Float* __restrict__ p_dst,
    SrcOpLengths,
    DstFromSrcReorder,
    F f)
Chao Liu's avatar
Chao Liu committed
79
{
Chao Liu's avatar
Chao Liu committed
80
81
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
Chao Liu's avatar
Chao Liu committed
82

Chao Liu's avatar
Chao Liu committed
83
84
    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
Chao Liu's avatar
Chao Liu committed
85

86
87
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
Chao Liu's avatar
Chao Liu committed
88
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
Chao Liu's avatar
Chao Liu committed
89

90
    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
Chao Liu's avatar
Chao Liu committed
91
92
93
94
95

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

Chao Liu's avatar
Chao Liu committed
96
        unsigned did[2];
Chao Liu's avatar
Chao Liu committed
97

98
        did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
99

100
        is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
101

102
        did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
103

Chao Liu's avatar
Chao Liu committed
104
        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1]);
Chao Liu's avatar
Chao Liu committed
105

Chao Liu's avatar
Chao Liu committed
106
        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
107
108

        f(p_src[aindex], p_dst[bindex]);
Chao Liu's avatar
Chao Liu committed
109
110
    }

111
    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
Chao Liu's avatar
Chao Liu committed
112
113
114
115
116

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

117
        if(is < ref_desc.GetElementSize())
Chao Liu's avatar
Chao Liu committed
118
        {
Chao Liu's avatar
Chao Liu committed
119
            unsigned did[2];
120
121

            did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
122

123
            is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
124

125
            did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
126

Chao Liu's avatar
Chao Liu committed
127
            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1]);
128

Chao Liu's avatar
Chao Liu committed
129
            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
130

131
            f(p_src[aindex], p_dst[bindex]);
132
133
134
135
        }
    }
}

Chao Liu's avatar
Chao Liu committed
136
template <unsigned BlockSize, class Float, class DstDesc>
Chao Liu's avatar
Chao Liu committed
137
__device__ void blockwise_2d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
138
{
Chao Liu's avatar
Chao Liu committed
139
    auto f_set_zero = [](Float& v) { v = Float(0); };
Chao Liu's avatar
Chao Liu committed
140

Chao Liu's avatar
Chao Liu committed
141
    blockwise_2d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
142
}
143

Chao Liu's avatar
Chao Liu committed
144
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
145
          class Float,
146
147
          class SrcDesc,
          class DstDesc,
Chao Liu's avatar
Chao Liu committed
148
149
150
          class SrcOpLengths,
          class DstFromSrcReorder>
__device__ void
Chao Liu's avatar
Chao Liu committed
151
blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
Chao Liu's avatar
Chao Liu committed
152
153
154
155
156
                                                     Float* const __restrict__ p_src,
                                                     DstDesc,
                                                     Float* __restrict__ p_dst,
                                                     SrcOpLengths,
                                                     DstFromSrcReorder)
157
{
Chao Liu's avatar
Chao Liu committed
158
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };
159

Chao Liu's avatar
Chao Liu committed
160
    blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
Chao Liu's avatar
Chao Liu committed
161
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
162
163
}

Chao Liu's avatar
Chao Liu committed
164
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
165
struct blockwise_2d_tensor_copy_1
Chao Liu's avatar
Chao Liu committed
166
{
167
168
169
    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        constexpr auto dst_from_src_reorder = Sequence<0, 1>{};
170

171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
        blockwise_2d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
    }
};

template <unsigned BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          unsigned ThreadPerDim0,
          unsigned ThreadPerDim1>
struct blockwise_2d_tensor_copy_2
{
    unsigned mThreadId0;
    unsigned mThreadId1;

    __device__ blockwise_2d_tensor_copy_2()
    {
Chao Liu's avatar
Chao Liu committed
190
191
        static_assert(is_same<Float, float>::value, "wrong! type is not float!\n");

192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
        mThreadId0 = get_thread_local_1d_id() / ThreadPerDim1;
        mThreadId1 = get_thread_local_1d_id() - mThreadId0 * ThreadPerDim1;
    }

    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        if(get_thread_local_1d_id() >= ThreadPerDim0 * ThreadPerDim1)
            return;

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};

        constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
        constexpr unsigned L1 = SrcOpLengths{}.Get(I1);

        constexpr unsigned Dim0Loop = L0 / ThreadPerDim0;
        constexpr bool d0_has_tail  = (L0 > ThreadPerDim0 * Dim0Loop);

        constexpr unsigned Dim1V4Loop = L1 / (ThreadPerDim1 * 4);
        constexpr unsigned Dim1V2Loop =
            (L1 - Dim1V4Loop * (ThreadPerDim1 * 4)) / (ThreadPerDim1 * 2);
        constexpr unsigned Dim1V1Loop =
            (L1 - Dim1V4Loop * (ThreadPerDim1 * 4) - Dim1V2Loop * (ThreadPerDim1 * 2)) /
            ThreadPerDim1;
        constexpr bool d1_has_tail =
            (L1 > ThreadPerDim1 * (4 * Dim1V4Loop + 2 * Dim1V2Loop + Dim1V1Loop));

        for(unsigned d0loop = 0; d0loop < Dim0Loop; ++d0loop)
        {
            unsigned did0 = d0loop * ThreadPerDim0 + mThreadId0;

            // v4
            for(unsigned d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop)
            {
                unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
Chao Liu's avatar
Chao Liu committed
230
231
232
#if 1
                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
233

Chao Liu's avatar
Chao Liu committed
234
235
236
237
                *(reinterpret_cast<float4*>(p_dst + dindex)) =
                    *(reinterpret_cast<float4*>(p_src + sindex));

#else
238
239
240
241
242
243
244
                for(unsigned i = 0; i < 4; ++i)
                {
                    const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i);

                    p_dst[dindex] = p_src[sindex];
                }
Chao Liu's avatar
Chao Liu committed
245
#endif
246
247
248
249
250
251
252
253
            }

            // v2
            for(unsigned d1v2loop = 0; d1v2loop < Dim1V2Loop; ++d1v2loop)
            {
                unsigned did1 =
                    Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1;

Chao Liu's avatar
Chao Liu committed
254
255
256
257
258
259
260
261
#if 1
                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

                *(reinterpret_cast<float2*>(p_dst + dindex)) =
                    *(reinterpret_cast<float2*>(p_src + sindex));

#else
262
263
264
265
266
267
268
                for(unsigned i = 0; i < 2; ++i)
                {
                    const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i);

                    p_dst[dindex] = p_src[sindex];
                }
Chao Liu's avatar
Chao Liu committed
269
#endif
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
            }

            // v1
            for(unsigned d1v1loop = 0; d1v1loop < Dim1V1Loop; ++d1v1loop)
            {
                unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
                                d1v1loop * ThreadPerDim1 + mThreadId1;

                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

                p_dst[dindex] = p_src[sindex];
            }

            // dim-1 tail
            if(d1_has_tail)
            {
                unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
                                Dim1V1Loop * ThreadPerDim1 + mThreadId1;

                if(did1 < L1)
                {
                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

                    p_dst[dindex] = p_src[sindex];
                }
            }
        }

        // dim-0 tail
        if(d0_has_tail)
        {
            unsigned did0 = Dim0Loop * ThreadPerDim0 + mThreadId0;

            if(did0 < L0)
            {

                // v4
                for(unsigned d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop)
                {
                    unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;

Chao Liu's avatar
Chao Liu committed
313
314
315
316
317
318
319
320
#if 1
                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

                    *(reinterpret_cast<float4*>(p_dst + dindex)) =
                        *(reinterpret_cast<float4*>(p_src + sindex));

#else
321
322
323
324
325
326
327
                    for(unsigned i = 0; i < 4; ++i)
                    {
                        const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
                        const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i);

                        p_dst[dindex] = p_src[sindex];
                    }
Chao Liu's avatar
Chao Liu committed
328
#endif
329
330
331
332
333
334
335
336
                }

                // v2
                for(unsigned d1v2loop = 0; d1v2loop < Dim1V2Loop; ++d1v2loop)
                {
                    unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 +
                                    2 * mThreadId1;

Chao Liu's avatar
Chao Liu committed
337
338
339
340
341
342
343
344
#if 1
                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

                    *(reinterpret_cast<float2*>(p_dst + dindex)) =
                        *(reinterpret_cast<float2*>(p_src + sindex));

#else
345
346
347
348
349
350
351
                    for(unsigned i = 0; i < 2; ++i)
                    {
                        const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
                        const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i);

                        p_dst[dindex] = p_src[sindex];
                    }
Chao Liu's avatar
Chao Liu committed
352
#endif
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
                }

                // v1
                for(unsigned d1v1loop = 0; d1v1loop < Dim1V1Loop; ++d1v1loop)
                {
                    unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 +
                                    Dim1V2Loop * 2 * ThreadPerDim1 + d1v1loop * ThreadPerDim1 +
                                    mThreadId1;

                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

                    p_dst[dindex] = p_src[sindex];
                }

                // tail
                if(d1_has_tail)
                {
                    unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 +
                                    Dim1V2Loop * 2 * ThreadPerDim1 + Dim1V1Loop * ThreadPerDim1 +
                                    mThreadId1;

                    if(did1 < L1)
                    {
                        const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                        const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

                        p_dst[dindex] = p_src[sindex];
                    }
                }
            }
        }
    }
};
Chao Liu's avatar
Chao Liu committed
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433

template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
struct blockwise_2d_tensor_copy_dummy_1
{
    unsigned mBegin;

    __device__ blockwise_2d_tensor_copy_dummy_1()
    {
        constexpr unsigned n_total =
            make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();

        constexpr unsigned n_per_thread = n_total / BlockSize;

        mBegin = n_per_thread * get_thread_local_1d_id();
    }

    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        constexpr unsigned n_total =
            make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();

        constexpr unsigned n_per_thread = n_total / BlockSize;

        for(unsigned i = 0; i < n_per_thread; ++i)
        {
            p_dst[mBegin + i] = p_src[mBegin + i];
        }
    }
};

template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
struct blockwise_2d_tensor_copy_dummy_2
{
    __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        constexpr unsigned n_total =
            make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace();

        constexpr unsigned n_per_thread = n_total / BlockSize;

        for(unsigned i = 0; i < n_per_thread; ++i)
        {
            unsigned index = get_thread_local_1d_id() + BlockSize * i;
            p_dst[index]   = p_src[index];
        }
    }
};