"...resnet50_tensorflow.git" did not exist on "fcff6f6593bf9a9b0cd190fe04cc552d2fcb14c6"
threadwise_direct_convolution.hip.hpp 9.3 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#pragma once
2
#include "ConstantTensorDescriptor.hip.hpp"
Chao Liu's avatar
Chao Liu committed
3
#include "threadwise_tensor_slice_op.hip.hpp"
Chao Liu's avatar
Chao Liu committed
4

Chao Liu's avatar
Chao Liu committed
5
// optimized for scenario if p_in, p_wei, p_out are in register
6
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
7
__device__ void threadwise_direct_convolution_1(InDesc,
8
                                                TInWei* const __restrict__ p_in,
Chao Liu's avatar
Chao Liu committed
9
                                                WeiDesc,
10
                                                TInWei* const __restrict__ p_wei,
Chao Liu's avatar
Chao Liu committed
11
                                                OutDesc,
12
                                                TOut* __restrict__ p_out)
Chao Liu's avatar
Chao Liu committed
13
{
Chao Liu's avatar
Chao Liu committed
14
15
16
17
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
18
19
20
21
22
23

    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

#if 0
24
    if(blockIdx.x == 0 && get_thread_local_1d_id() == 0)
Chao Liu's avatar
Chao Liu committed
25
    {
Chao Liu's avatar
Chao Liu committed
26
27
28
        print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: in_desc: ");
        print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: wei_desc: ");
        print_ConstantTensorDescriptor(out_desc, "threadwise_direct_convolution: out_desc: ");
Chao Liu's avatar
Chao Liu committed
29
30
31
    }
#endif

Chao Liu's avatar
Chao Liu committed
32
    for(index_t n = 0; n < out_desc.GetLength(I0); ++n)
Chao Liu's avatar
Chao Liu committed
33
    {
Chao Liu's avatar
Chao Liu committed
34
        for(index_t k = 0; k < out_desc.GetLength(I1); ++k)
Chao Liu's avatar
Chao Liu committed
35
        {
Chao Liu's avatar
Chao Liu committed
36
            for(index_t ho = 0; ho < out_desc.GetLength(I2); ++ho)
Chao Liu's avatar
Chao Liu committed
37
            {
Chao Liu's avatar
Chao Liu committed
38
                for(index_t wo = 0; wo < out_desc.GetLength(I3); ++wo)
Chao Liu's avatar
Chao Liu committed
39
                {
Chao Liu's avatar
Chao Liu committed
40
                    for(index_t c = 0; c < wei_desc.GetLength(I1); ++c)
Chao Liu's avatar
Chao Liu committed
41
                    {
Chao Liu's avatar
Chao Liu committed
42
                        for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
Chao Liu's avatar
Chao Liu committed
43
                        {
Chao Liu's avatar
Chao Liu committed
44
                            for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
Chao Liu's avatar
Chao Liu committed
45
                            {
Chao Liu's avatar
Chao Liu committed
46
47
                                const index_t hi = ho + y;
                                const index_t wi = wo + x;
Chao Liu's avatar
Chao Liu committed
48

49
50
                                const index_t in_index =
                                    in_desc.GetOffsetFromMultiIndex(n, c, hi, wi);
Chao Liu's avatar
Chao Liu committed
51

52
53
                                const index_t wei_index =
                                    wei_desc.GetOffsetFromMultiIndex(k, c, y, x);
Chao Liu's avatar
Chao Liu committed
54

55
56
                                const index_t out_index =
                                    out_desc.GetOffsetFromMultiIndex(n, k, ho, wo);
Chao Liu's avatar
Chao Liu committed
57

58
59
                                fused_multiply_accumulate(
                                    p_out[out_index], p_wei[wei_index], p_in[in_index]);
Chao Liu's avatar
Chao Liu committed
60
61
62
63
64
65
66
67
                            }
                        }
                    }
                }
            }
        }
    }
}
Chao Liu's avatar
Chao Liu committed
68

Chao Liu's avatar
Chao Liu committed
69
70
// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
// Copy in and wei into register before doing convolution
71
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
72
__device__ void threadwise_direct_convolution_2(InDesc,
73
                                                TInWei* const __restrict__ p_in,
Chao Liu's avatar
Chao Liu committed
74
                                                WeiDesc,
75
                                                TInWei* const __restrict__ p_wei,
Chao Liu's avatar
Chao Liu committed
76
                                                OutDesc,
77
                                                TOut* __restrict__ p_out)
Chao Liu's avatar
Chao Liu committed
78
79
80
81
82
{
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};

Chao Liu's avatar
Chao Liu committed
83
84
85
86
    constexpr auto in_reg_desc =
        make_ConstantTensorDescriptor_default_rank_packed(in_desc.GetLengths());
    constexpr auto wei_reg_desc =
        make_ConstantTensorDescriptor_default_rank_packed(wei_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
87
88

    // register
89
90
    TInWei p_in_reg[in_reg_desc.GetElementSpace()];
    TInWei p_wei_reg[wei_reg_desc.GetElementSpace()];
Chao Liu's avatar
Chao Liu committed
91
92

    // copy input tensor into register
Chao Liu's avatar
Chao Liu committed
93
    threadwise_tensor_slice_copy(
Chao Liu's avatar
Chao Liu committed
94
        in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths(), Number<1>{});
Chao Liu's avatar
Chao Liu committed
95
96

    // copy input tensor into register
Chao Liu's avatar
Chao Liu committed
97
    threadwise_tensor_slice_copy(
Chao Liu's avatar
Chao Liu committed
98
        wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc.GetLengths(), Number<1>{});
Chao Liu's avatar
Chao Liu committed
99
100
101
102
103
104

    // do convolution
    threadwise_direct_convolution_1(
        in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
}

Chao Liu's avatar
Chao Liu committed
105
106
107
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
// load 1x1 weight into register, and do 1x1 convolution in register.
108
template <class Data, class InDesc, class WeiDesc, class OutDesc>
Chao Liu's avatar
Chao Liu committed
109
__device__ void threadwise_direct_convolution_3(InDesc,
110
                                                Data* const __restrict__ p_in,
Chao Liu's avatar
Chao Liu committed
111
                                                WeiDesc,
112
                                                Data* const __restrict__ p_wei,
Chao Liu's avatar
Chao Liu committed
113
                                                OutDesc,
114
                                                Data* __restrict__ p_out)
Chao Liu's avatar
Chao Liu committed
115
116
117
118
119
120
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

Chao Liu's avatar
Chao Liu committed
121
122
123
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
    constexpr auto out_desc = OutDesc{};
Chao Liu's avatar
Chao Liu committed
124

Chao Liu's avatar
Chao Liu committed
125
126
127
128
    constexpr auto in_reg_desc = make_ConstantTensorDescriptor(Sequence<in_desc.GetLength(I0),
                                                                        in_desc.GetLength(I1),
                                                                        out_desc.GetLength(I2),
                                                                        out_desc.GetLength(I3)>{});
Chao Liu's avatar
Chao Liu committed
129

Chao Liu's avatar
Chao Liu committed
130
131
    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
        Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
Chao Liu's avatar
Chao Liu committed
132

133
134
    Data p_in_reg[in_reg_desc.GetElementSpace()];
    Data p_wei_reg[wei_reg_desc.GetElementSpace()];
Chao Liu's avatar
Chao Liu committed
135

Chao Liu's avatar
Chao Liu committed
136
    constexpr index_t in_w_new_read = 1;
Chao Liu's avatar
Chao Liu committed
137
138

    constexpr auto in_desc_reg_new_read =
Chao Liu's avatar
Chao Liu committed
139
140
141
        make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
                                               in_reg_desc.GetLength(I1),
                                               in_reg_desc.GetLength(I2),
Chao Liu's avatar
Chao Liu committed
142
143
                                               in_w_new_read>{});

Chao Liu's avatar
Chao Liu committed
144
#if 0
Chao Liu's avatar
Chao Liu committed
145
    // this verison reused old input data in register, and read new data from LDS
Chao Liu's avatar
Chao Liu committed
146
    // loop over vertical direction
Chao Liu's avatar
Chao Liu committed
147
    for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
Chao Liu's avatar
Chao Liu committed
148
149
    {
        // read first input
Chao Liu's avatar
Chao Liu committed
150
        threadwise_4d_tensor_copy(in_desc,
151
                                  p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
Chao Liu's avatar
Chao Liu committed
152
                                  in_reg_desc,
Chao Liu's avatar
Chao Liu committed
153
                                  p_in_reg,
Chao Liu's avatar
Chao Liu committed
154
                                  in_reg_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
155
156

        // read first 1x1 weight
Chao Liu's avatar
Chao Liu committed
157
        threadwise_4d_tensor_copy(wei_desc,
158
                                  p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
Chao Liu's avatar
Chao Liu committed
159
                                  wei_reg_desc,
Chao Liu's avatar
Chao Liu committed
160
                                  p_wei_reg,
Chao Liu's avatar
Chao Liu committed
161
                                  wei_reg_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
162
163
164

        // do first 1x1 conv
        threadwise_direct_convolution_1(
Chao Liu's avatar
Chao Liu committed
165
            in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
Chao Liu's avatar
Chao Liu committed
166
167

        // loop over horizontal direction
Chao Liu's avatar
Chao Liu committed
168
        for(index_t x = 1; x < wei_desc.GetLength(I3); ++x)
Chao Liu's avatar
Chao Liu committed
169
170
        {
            // read new weight
Chao Liu's avatar
Chao Liu committed
171
            threadwise_4d_tensor_copy(wei_desc,
172
                                      p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
Chao Liu's avatar
Chao Liu committed
173
                                      wei_reg_desc,
Chao Liu's avatar
Chao Liu committed
174
                                      p_wei_reg,
Chao Liu's avatar
Chao Liu committed
175
                                      wei_reg_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
176
177

            // shift old input to the left
Chao Liu's avatar
Chao Liu committed
178
            threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number<in_w_new_read>{});
Chao Liu's avatar
Chao Liu committed
179
180
181

            // read new input
            threadwise_4d_tensor_copy(
Chao Liu's avatar
Chao Liu committed
182
                in_desc,
183
                p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
Chao Liu's avatar
Chao Liu committed
184
                in_reg_desc,
Chao Liu's avatar
Chao Liu committed
185
                p_in_reg +
186
                    in_reg_desc.GetOffsetFromMultiIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
Chao Liu's avatar
Chao Liu committed
187
                in_desc_reg_new_read.GetLengths());
Chao Liu's avatar
Chao Liu committed
188
189
190

            // do 1x1 conv
            threadwise_direct_convolution_1(
Chao Liu's avatar
Chao Liu committed
191
                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
Chao Liu's avatar
Chao Liu committed
192
        }
Chao Liu's avatar
Chao Liu committed
193
    }
Chao Liu's avatar
Chao Liu committed
194
#elif 1
Chao Liu's avatar
Chao Liu committed
195
    // this version read all input from LDS when filter moves
Chao Liu's avatar
Chao Liu committed
196
    // loop over vertical direction
Chao Liu's avatar
Chao Liu committed
197
    for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
Chao Liu's avatar
Chao Liu committed
198
    {
Chao Liu's avatar
Chao Liu committed
199
        // loop over horizontal direction
Chao Liu's avatar
Chao Liu committed
200
        for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
Chao Liu's avatar
Chao Liu committed
201
202
        {
            // read new weight
Chao Liu's avatar
Chao Liu committed
203
            threadwise_4d_tensor_copy(wei_desc,
204
                                      p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
Chao Liu's avatar
Chao Liu committed
205
                                      wei_reg_desc,
Chao Liu's avatar
Chao Liu committed
206
                                      p_wei_reg,
Chao Liu's avatar
Chao Liu committed
207
                                      wei_reg_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
208
209

            // read new input
Chao Liu's avatar
Chao Liu committed
210
            threadwise_4d_tensor_copy(in_desc,
211
                                      p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x),
Chao Liu's avatar
Chao Liu committed
212
213
214
                                      in_reg_desc,
                                      p_in_reg,
                                      in_reg_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
215
216
217

            // do 1x1 conv
            threadwise_direct_convolution_1(
Chao Liu's avatar
Chao Liu committed
218
                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
Chao Liu's avatar
Chao Liu committed
219
220
        }
    }
Chao Liu's avatar
Chao Liu committed
221
#endif
Chao Liu's avatar
Chao Liu committed
222
}