threadwise_direct_convolution.cuh 9.37 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
3
#pragma once
#include "constant_tensor_descriptor.cuh"

Chao Liu's avatar
Chao Liu committed
4
// optimized for scenario if p_in, p_wei, p_out are in register
Chao Liu's avatar
Chao Liu committed
5
template <class Float, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
6
__device__ void threadwise_direct_convolution_1(InDesc,
Chao Liu's avatar
Chao Liu committed
7
                                                Float* const __restrict__ p_in,
Chao Liu's avatar
Chao Liu committed
8
                                                WeiDesc,
Chao Liu's avatar
Chao Liu committed
9
                                                Float* const __restrict__ p_wei,
Chao Liu's avatar
Chao Liu committed
10
                                                OutDesc,
Chao Liu's avatar
Chao Liu committed
11
                                                Float* __restrict__ p_out)
Chao Liu's avatar
Chao Liu committed
12
{
Chao Liu's avatar
Chao Liu committed
13
14
15
16
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
17
18
19
20
21
22

    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

#if 0
Chao Liu's avatar
Chao Liu committed
23
    if(blockIdx.x == 0 && threadIdx.x == 0)
Chao Liu's avatar
Chao Liu committed
24
    {
Chao Liu's avatar
Chao Liu committed
25
26
27
        print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: in_desc: ");
        print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: wei_desc: ");
        print_ConstantTensorDescriptor(out_desc, "threadwise_direct_convolution: out_desc: ");
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
    }
#endif

    for(unsigned n = 0; n < out_desc.GetLength(I0); ++n)
    {
        for(unsigned k = 0; k < out_desc.GetLength(I1); ++k)
        {
            for(unsigned ho = 0; ho < out_desc.GetLength(I2); ++ho)
            {
                for(unsigned wo = 0; wo < out_desc.GetLength(I3); ++wo)
                {
                    for(unsigned c = 0; c < wei_desc.GetLength(I1); ++c)
                    {
                        for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
                        {
                            for(unsigned r = 0; r < wei_desc.GetLength(I3); ++r)
                            {
                                const unsigned hi = ho + s;
                                const unsigned wi = wo + r;

48
                                const unsigned in_index = in_desc.Get1dIndex(n, c, hi, wi);
Chao Liu's avatar
Chao Liu committed
49

50
                                const unsigned wei_index = wei_desc.Get1dIndex(k, c, s, r);
Chao Liu's avatar
Chao Liu committed
51

52
                                const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo);
Chao Liu's avatar
Chao Liu committed
53
54
55
56

                                p_out[out_index] += p_wei[wei_index] * p_in[in_index];

#if 0
Chao Liu's avatar
Chao Liu committed
57
                                //   if(threadIdx.x == 0)
Chao Liu's avatar
Chao Liu committed
58
                                {
Chao Liu's avatar
Chao Liu committed
59
                                    printf("threadwise_direct_convolution: \t"
Chao Liu's avatar
Chao Liu committed
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
                                           "threadIdx.x %u\t"
                                           "out_index %u, p_out[out_index] %f, \t"
                                           "wei_index %u, p_wei[wei_index] %f, \t"
                                           "in_index %u, p_in[in_index] %f\n",
                                           threadIdx.x,
                                           out_index,
                                           p_out[out_index],
                                           wei_index,
                                           p_wei[wei_index],
                                           in_index,
                                           p_in[in_index]);
                                }
#endif
                            }
                        }
                    }
                }
            }
        }
    }
}
Chao Liu's avatar
Chao Liu committed
81

Chao Liu's avatar
Chao Liu committed
82
83
// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
// Copy in and wei into register before doing convolution
Chao Liu's avatar
Chao Liu committed
84
template <class Float, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
85
__device__ void threadwise_direct_convolution_2(InDesc,
Chao Liu's avatar
Chao Liu committed
86
                                                Float* const __restrict__ p_in,
Chao Liu's avatar
Chao Liu committed
87
                                                WeiDesc,
Chao Liu's avatar
Chao Liu committed
88
                                                Float* const __restrict__ p_wei,
Chao Liu's avatar
Chao Liu committed
89
                                                OutDesc,
Chao Liu's avatar
Chao Liu committed
90
                                                Float* __restrict__ p_out)
Chao Liu's avatar
Chao Liu committed
91
92
93
94
95
96
97
98
99
{
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

    constexpr auto in_reg_desc  = make_ConstantTensorDescriptor(in_desc.GetLengths());
    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());

    // register
Chao Liu's avatar
Chao Liu committed
100
101
    Float p_in_reg[in_reg_desc.GetElementSpace()];
    Float p_wei_reg[wei_reg_desc.GetElementSpace()];
Chao Liu's avatar
Chao Liu committed
102
103
104
105
106
107
108
109
110
111
112
113

    // copy input tensor into register
    threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);

    // copy input tensor into register
    threadwise_4d_tensor_copy(wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc);

    // do convolution
    threadwise_direct_convolution_1(
        in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
}

Chao Liu's avatar
Chao Liu committed
114
115
116
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
// load 1x1 weight into register, and do 1x1 convolution in register.
Chao Liu's avatar
Chao Liu committed
117
template <class Float, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
118
__device__ void threadwise_direct_convolution_3(InDesc,
Chao Liu's avatar
Chao Liu committed
119
                                                Float* const __restrict__ p_in,
Chao Liu's avatar
Chao Liu committed
120
                                                WeiDesc,
Chao Liu's avatar
Chao Liu committed
121
                                                Float* const __restrict__ p_wei,
Chao Liu's avatar
Chao Liu committed
122
                                                OutDesc,
Chao Liu's avatar
Chao Liu committed
123
                                                Float* __restrict__ p_out)
Chao Liu's avatar
Chao Liu committed
124
125
126
127
128
129
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

Chao Liu's avatar
Chao Liu committed
130
131
132
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};
Chao Liu's avatar
Chao Liu committed
133

Chao Liu's avatar
Chao Liu committed
134
135
136
137
    constexpr auto in_reg_desc = make_ConstantTensorDescriptor(Sequence<in_desc.GetLength(I0),
                                                                        in_desc.GetLength(I1),
                                                                        out_desc.GetLength(I2),
                                                                        out_desc.GetLength(I3)>{});
Chao Liu's avatar
Chao Liu committed
138

Chao Liu's avatar
Chao Liu committed
139
140
    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
        Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
Chao Liu's avatar
Chao Liu committed
141

Chao Liu's avatar
Chao Liu committed
142
143
    Float p_in_reg[in_reg_desc.GetElementSpace()];
    Float p_wei_reg[wei_reg_desc.GetElementSpace()];
Chao Liu's avatar
Chao Liu committed
144
145
146
147

    constexpr unsigned in_w_new_read = 1;

    constexpr auto in_desc_reg_new_read =
Chao Liu's avatar
Chao Liu committed
148
149
150
        make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
                                               in_reg_desc.GetLength(I1),
                                               in_reg_desc.GetLength(I2),
Chao Liu's avatar
Chao Liu committed
151
152
                                               in_w_new_read>{});

Chao Liu's avatar
Chao Liu committed
153
#if 0
Chao Liu's avatar
Chao Liu committed
154
    // loop over vertical direction
Chao Liu's avatar
Chao Liu committed
155
    for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
Chao Liu's avatar
Chao Liu committed
156
157
    {
        // read first input
Chao Liu's avatar
Chao Liu committed
158
159
160
        threadwise_4d_tensor_copy(in_desc,
                                  p_in + in_desc.Get1dIndex(0, 0, s, 0),
                                  in_reg_desc,
Chao Liu's avatar
Chao Liu committed
161
                                  p_in_reg,
Chao Liu's avatar
Chao Liu committed
162
                                  in_reg_desc);
Chao Liu's avatar
Chao Liu committed
163
164

        // read first 1x1 weight
Chao Liu's avatar
Chao Liu committed
165
166
167
        threadwise_4d_tensor_copy(wei_desc,
                                  p_wei + wei_desc.Get1dIndex(0, 0, s, 0),
                                  wei_reg_desc,
Chao Liu's avatar
Chao Liu committed
168
                                  p_wei_reg,
Chao Liu's avatar
Chao Liu committed
169
                                  wei_reg_desc);
Chao Liu's avatar
Chao Liu committed
170
171
172

        // do first 1x1 conv
        threadwise_direct_convolution_1(
Chao Liu's avatar
Chao Liu committed
173
            in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
Chao Liu's avatar
Chao Liu committed
174
175

        // loop over horizontal direction
Chao Liu's avatar
Chao Liu committed
176
        for(unsigned r = 1; r < wei_desc.GetLength(I3); ++r)
Chao Liu's avatar
Chao Liu committed
177
178
        {
            // read new weight
Chao Liu's avatar
Chao Liu committed
179
180
181
            threadwise_4d_tensor_copy(wei_desc,
                                      p_wei + wei_desc.Get1dIndex(0, 0, s, r),
                                      wei_reg_desc,
Chao Liu's avatar
Chao Liu committed
182
                                      p_wei_reg,
Chao Liu's avatar
Chao Liu committed
183
                                      wei_reg_desc);
Chao Liu's avatar
Chao Liu committed
184
185

            // shift old input to the left
Chao Liu's avatar
Chao Liu committed
186
            threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number<in_w_new_read>{});
Chao Liu's avatar
Chao Liu committed
187
188
189

            // read new input
            threadwise_4d_tensor_copy(
Chao Liu's avatar
Chao Liu committed
190
191
192
                in_desc,
                p_in + in_desc.Get1dIndex(0, 0, s, r + in_reg_desc.GetLength(I3) - 1),
                in_reg_desc,
Chao Liu's avatar
Chao Liu committed
193
                p_in_reg +
Chao Liu's avatar
Chao Liu committed
194
                    in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
Chao Liu's avatar
Chao Liu committed
195
196
197
198
                in_desc_reg_new_read);

            // do 1x1 conv
            threadwise_direct_convolution_1(
Chao Liu's avatar
Chao Liu committed
199
                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
Chao Liu's avatar
Chao Liu committed
200
        }
Chao Liu's avatar
Chao Liu committed
201
    }
Chao Liu's avatar
Chao Liu committed
202
#elif 1
Chao Liu's avatar
Chao Liu committed
203
204
205
    // loop over vertical direction
    for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
    {
Chao Liu's avatar
Chao Liu committed
206
        // loop over horizontal direction
Chao Liu's avatar
Chao Liu committed
207
        for(unsigned r = 0; r < wei_desc.GetLength(I3); ++r)
Chao Liu's avatar
Chao Liu committed
208
209
        {
            // read new weight
Chao Liu's avatar
Chao Liu committed
210
211
212
            threadwise_4d_tensor_copy(wei_desc,
                                      p_wei + wei_desc.Get1dIndex(0, 0, s, r),
                                      wei_reg_desc,
Chao Liu's avatar
Chao Liu committed
213
                                      p_wei_reg,
Chao Liu's avatar
Chao Liu committed
214
                                      wei_reg_desc);
Chao Liu's avatar
Chao Liu committed
215
216

            // read new input
Chao Liu's avatar
Chao Liu committed
217
218
            threadwise_4d_tensor_copy(
                in_desc, p_in + in_desc.Get1dIndex(0, 0, s, r), in_reg_desc, p_in_reg, in_reg_desc);
Chao Liu's avatar
Chao Liu committed
219
220
221

            // do 1x1 conv
            threadwise_direct_convolution_1(
Chao Liu's avatar
Chao Liu committed
222
                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
Chao Liu's avatar
Chao Liu committed
223
224
        }
    }
Chao Liu's avatar
Chao Liu committed
225
226
#endif
}