gridwise_direct_convolution_2.cuh 9.46 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "ConstantTensorDescriptor.cuh"
Chao Liu's avatar
Chao Liu committed
3
#include "blockwise_4d_tensor_op.cuh"
Chao Liu's avatar
rename  
Chao Liu committed
4
#include "blockwise_direct_convolution.cuh"
Chao Liu's avatar
Chao Liu committed
5
#include "threadwise_4d_tensor_op.cuh"
Chao Liu's avatar
rename  
Chao Liu committed
6
#include "threadwise_direct_convolution.cuh"
Chao Liu's avatar
Chao Liu committed
7

Chao Liu's avatar
Chao Liu committed
8
template <class Float,
Chao Liu's avatar
Chao Liu committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned OutTileSizeH,
          unsigned OutTileSizeW,
          unsigned NPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned YPerBlock,
          unsigned XPerBlock,
          unsigned NPerThread,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned BlockSize,
          unsigned GridSize>
Chao Liu's avatar
Chao Liu committed
24
__global__ void gridwise_direct_convolution_2(InGlobalDesc,
Chao Liu's avatar
Chao Liu committed
25
                                              Float* const __restrict__ p_in_global,
Chao Liu's avatar
Chao Liu committed
26
                                              WeiGlobalDesc,
Chao Liu's avatar
Chao Liu committed
27
                                              Float* const __restrict__ p_wei_global,
Chao Liu's avatar
Chao Liu committed
28
                                              OutGlobalDesc,
Chao Liu's avatar
Chao Liu committed
29
                                              Float* __restrict__ p_out_global)
Chao Liu's avatar
Chao Liu committed
30
{
Chao Liu's avatar
Chao Liu committed
31
32
33
34
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
35
36
37
38
39
40
41
42
43
44
45
46
47
48

    constexpr auto in_global_desc  = InGlobalDesc{};
    constexpr auto wei_global_desc = WeiGlobalDesc{};
    constexpr auto out_global_desc = OutGlobalDesc{};

    constexpr unsigned S = wei_global_desc.GetLength(I2);
    constexpr unsigned R = wei_global_desc.GetLength(I3);

    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;

    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;

Chao Liu's avatar
Chao Liu committed
49
50
    constexpr auto in_block_desc =
        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
Chao Liu's avatar
Chao Liu committed
51
52

    constexpr auto wei_block_desc =
Chao Liu's avatar
Chao Liu committed
53
        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
Chao Liu's avatar
Chao Liu committed
54
55
56
57
58

    // shared mem
    constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();

Chao Liu's avatar
Chao Liu committed
59
60
    __shared__ Float p_in_block[in_block_size];
    __shared__ Float p_wei_block[wei_block_size];
Chao Liu's avatar
Chao Liu committed
61
62
63
64
65

    // threadwise tensors
    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;

Chao Liu's avatar
Chao Liu committed
66
67
    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
68

Chao Liu's avatar
Chao Liu committed
69
70
    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
Chao Liu's avatar
Chao Liu committed
71

Chao Liu's avatar
Chao Liu committed
72
73
    constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
        in_thread_block_desc, wei_thread_block_desc);
Chao Liu's avatar
Chao Liu committed
74
75

    // register
Chao Liu's avatar
Chao Liu committed
76
    Float p_out_thread[out_thread_desc.GetElementSpace()];
Chao Liu's avatar
Chao Liu committed
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

    // divide block work
    constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
    constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

    const unsigned block_id = blockIdx.x;

    unsigned itmp                  = block_id;
    const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
    const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
    const unsigned y_block_work_id = itmp / XBlockWork;
    const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;

Chao Liu's avatar
Chao Liu committed
94
95
96
97
    const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
    const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
    const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
Chao Liu's avatar
Chao Liu committed
98

Chao Liu's avatar
Chao Liu committed
99
100
    const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
    const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
Chao Liu's avatar
Chao Liu committed
101

Chao Liu's avatar
Chao Liu committed
102
103
    const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
    const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
Chao Liu's avatar
Chao Liu committed
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

    // divide thread work
    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
    constexpr unsigned YThreadWork = YPerBlock;
    constexpr unsigned XThreadWork = XPerBlock;

    const unsigned thread_id = threadIdx.x;

    itmp                            = thread_id;
    const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
    itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
    const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
    itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
    const unsigned y_thread_work_id = itmp / XThreadWork;
    const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;

Chao Liu's avatar
Chao Liu committed
121
122
123
124
    const unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
    const unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
    const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
    const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
Chao Liu's avatar
Chao Liu committed
125

Chao Liu's avatar
Chao Liu committed
126
127
    const unsigned hi_thread_data_begin = ho_thread_data_begin;
    const unsigned wi_thread_data_begin = wo_thread_data_begin;
Chao Liu's avatar
Chao Liu committed
128
129
130
131
132
133
134
135
136
137

#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution:  in_global_desc: ");
        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
    }

    printf("threadIdx.x %u \t"
Chao Liu's avatar
Chao Liu committed
138
139
           "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
           "wo_thread_data_begin %u\n",
Chao Liu's avatar
Chao Liu committed
140
           threadIdx.x,
Chao Liu's avatar
Chao Liu committed
141
142
143
144
           n_thread_data_begin,
           k_thread_data_begin,
           ho_thread_data_begin,
           wo_thread_data_begin);
Chao Liu's avatar
Chao Liu committed
145
146
147
#endif

    // set threadwise output tensor to 0
Chao Liu's avatar
Chao Liu committed
148
    threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
Chao Liu's avatar
Chao Liu committed
149

Chao Liu's avatar
Chao Liu committed
150
151
    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
        c_block_data_begin += CPerBlock, __syncthreads())
Chao Liu's avatar
Chao Liu committed
152
153
    {
        // copy input tensor to LDS
Chao Liu's avatar
Chao Liu committed
154
        blockwise_4d_tensor_copy<BlockSize>(in_global_desc,
Chao Liu's avatar
Chao Liu committed
155
                                            p_in_global +
Chao Liu's avatar
Chao Liu committed
156
157
158
159
                                                in_global_desc.Get1dIndex(n_block_data_begin,
                                                                          c_block_data_begin,
                                                                          hi_block_data_begin,
                                                                          wi_block_data_begin),
Chao Liu's avatar
Chao Liu committed
160
                                            in_block_desc,
Chao Liu's avatar
Chao Liu committed
161
                                            p_in_block,
Chao Liu's avatar
Chao Liu committed
162
                                            in_block_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
163
164

        // copy weight tensor to LDS
Chao Liu's avatar
Chao Liu committed
165
        blockwise_4d_tensor_copy<BlockSize>(
Chao Liu's avatar
Chao Liu committed
166
            wei_global_desc,
Chao Liu's avatar
Chao Liu committed
167
            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
Chao Liu's avatar
Chao Liu committed
168
            wei_block_desc,
Chao Liu's avatar
Chao Liu committed
169
            p_wei_block,
Chao Liu's avatar
Chao Liu committed
170
            wei_block_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
171
172
173

        __syncthreads();

174
        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
Chao Liu's avatar
Chao Liu committed
175
176
        {
            // threadwise convolution
Chao Liu's avatar
Chao Liu committed
177
#if 0
Chao Liu's avatar
Chao Liu committed
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
            threadwise_direct_convolution_2(
                in_thread_block_desc,
                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
                wei_thread_block_desc,
                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                out_thread_desc,
                p_out_thread);
#elif 1
            threadwise_direct_convolution_3(
                in_thread_block_desc,
                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                      c_thread_data,
                                                      hi_thread_data_begin,
                                                      wi_thread_data_begin),
                wei_thread_block_desc,
                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                out_thread_desc,
                p_out_thread);
#endif
Chao Liu's avatar
Chao Liu committed
200
201
202
203
        }
    }

    // copy output tensor from register to global mem
Chao Liu's avatar
Chao Liu committed
204
    threadwise_4d_tensor_copy(
Chao Liu's avatar
Chao Liu committed
205
206
        out_thread_desc,
        p_out_thread,
Chao Liu's avatar
Chao Liu committed
207
        out_global_desc,
Chao Liu's avatar
Chao Liu committed
208
209
210
        p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                                  k_block_data_begin + k_thread_data_begin,
                                                  ho_block_data_begin + ho_thread_data_begin,
Chao Liu's avatar
Chao Liu committed
211
                                                  wo_block_data_begin + wo_thread_data_begin),
Chao Liu's avatar
Chao Liu committed
212
        out_thread_desc.GetLengths());
213
}