blockwise_direct_convolution.hip.hpp 6.01 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#pragma once
2
3
4
#include "ConstantTensorDescriptor.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.hip.hpp"
Chao Liu's avatar
Chao Liu committed
5

Chao Liu's avatar
Chao Liu committed
6
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
7
          class Float,
Chao Liu's avatar
Chao Liu committed
8
9
10
11
12
          class InBlockDesc,
          class WeiBlockDesc,
          class OutBlockDesc,
          unsigned OutTileSizeH,
          unsigned OutTileSizeW,
Chao Liu's avatar
Chao Liu committed
13
14
          unsigned NPerThread,
          unsigned KPerThread,
Chao Liu's avatar
Chao Liu committed
15
          unsigned CPerThread>
Chao Liu's avatar
Chao Liu committed
16
__device__ void blockwise_direct_convolution(InBlockDesc,
Chao Liu's avatar
Chao Liu committed
17
                                             Float* const __restrict__ p_in_block,
Chao Liu's avatar
Chao Liu committed
18
                                             WeiBlockDesc,
Chao Liu's avatar
Chao Liu committed
19
                                             Float* const __restrict__ p_wei_block,
Chao Liu's avatar
Chao Liu committed
20
                                             OutBlockDesc,
Chao Liu's avatar
Chao Liu committed
21
                                             Float* __restrict__ p_out_block)
Chao Liu's avatar
Chao Liu committed
22
{
Chao Liu's avatar
Chao Liu committed
23
24
25
26
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
27
28
29
30
31
32
33
34
35
36
37

    constexpr auto in_block_desc  = InBlockDesc{};
    constexpr auto wei_block_desc = WeiBlockDesc{};
    constexpr auto out_block_desc = OutBlockDesc{};

    constexpr unsigned S = wei_block_desc.GetLength(I2);
    constexpr unsigned R = wei_block_desc.GetLength(I3);

    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;

Chao Liu's avatar
Chao Liu committed
38
39
40
41
42
43
44
45
    // divide thread work
    constexpr unsigned NThreadWork = (out_block_desc.GetLength(I0) + NPerThread - 1) / NPerThread;
    constexpr unsigned KThreadWork = (out_block_desc.GetLength(I1) + KPerThread - 1) / KPerThread;
    constexpr unsigned YThreadWork =
        (out_block_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
    constexpr unsigned XThreadWork =
        (out_block_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;

Chao Liu's avatar
Chao Liu committed
46
47
48
49
50
51
52
53
54
#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(in_block_desc);
        print_ConstantTensorDescriptor(wei_block_desc);
        print_ConstantTensorDescriptor(out_block_desc);
    }
#endif

Chao Liu's avatar
Chao Liu committed
55
56
    constexpr auto in_thread_desc =
        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
Chao Liu's avatar
Chao Liu committed
57

Chao Liu's avatar
Chao Liu committed
58
59
    constexpr auto wei_thread_desc =
        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
Chao Liu's avatar
Chao Liu committed
60

Chao Liu's avatar
Chao Liu committed
61
    constexpr auto out_thread_desc =
Chao Liu's avatar
Chao Liu committed
62
        get_convolution_output_default_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
Chao Liu's avatar
Chao Liu committed
63

Chao Liu's avatar
Chao Liu committed
64
65
    constexpr auto in_thread_block_desc =
        make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
66

Chao Liu's avatar
Chao Liu committed
67
68
    constexpr auto wei_thread_block_desc =
        make_ConstantTensorDescriptor(wei_thread_desc.GetLengths(), wei_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
69

Chao Liu's avatar
Chao Liu committed
70
71
    constexpr auto out_thread_block_desc =
        make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
72
73
74

    const unsigned thread_id = threadIdx.x;

Chao Liu's avatar
Chao Liu committed
75
76
    for(unsigned thread_work_id = thread_id;
        thread_work_id < NThreadWork * KThreadWork * YThreadWork * XThreadWork;
Chao Liu's avatar
Chao Liu committed
77
78
79
        thread_work_id += BlockSize)
    {
        unsigned itmp             = thread_work_id;
Chao Liu's avatar
Chao Liu committed
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
        unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
        itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
        unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
        itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
        unsigned y_thread_work_id = itmp / XThreadWork;
        unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;

        unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
        unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
        unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
        unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;

        unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
        unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding

Chao Liu's avatar
Chao Liu committed
95
        Float p_out_thread[out_thread_desc.GetElementSpace()];
Chao Liu's avatar
Chao Liu committed
96

Chao Liu's avatar
Chao Liu committed
97
        threadwise_4d_tensor_copy(out_block_desc,
98
99
100
101
                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                          k_thread_data_begin,
                                                                          ho_thread_data_begin,
                                                                          wo_thread_data_begin),
Chao Liu's avatar
Chao Liu committed
102
                                  out_thread_desc,
Chao Liu's avatar
Chao Liu committed
103
                                  p_out_thread,
Chao Liu's avatar
Chao Liu committed
104
                                  out_thread_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
105
106
107

        for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
            c_thread_data_begin += CPerThread)
Chao Liu's avatar
Chao Liu committed
108
        {
Chao Liu's avatar
Chao Liu committed
109
110
111
            // threadwise convolution
            threadwise_direct_convolution_2(
                in_thread_block_desc,
112
113
114
115
                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data_begin,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
Chao Liu's avatar
Chao Liu committed
116
117
118
                wei_thread_block_desc,
                p_wei_block +
                    wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
Chao Liu's avatar
Chao Liu committed
119
120
                out_thread_desc,
                p_out_thread);
Chao Liu's avatar
Chao Liu committed
121
        }
Chao Liu's avatar
Chao Liu committed
122
123
124
125

        // copy output into LDS
        threadwise_4d_tensor_copy(out_thread_desc,
                                  p_out_thread,
Chao Liu's avatar
Chao Liu committed
126
                                  out_block_desc,
127
128
129
130
                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                          k_thread_data_begin,
                                                                          ho_thread_data_begin,
                                                                          wo_thread_data_begin),
Chao Liu's avatar
Chao Liu committed
131
                                  out_thread_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
132
133
    }
}