blockwise_tensor_op.cuh 7.09 KB
Newer Older
1
2
3
#pragma once
#include "constant_tensor_descriptor.cuh"

Chao Liu's avatar
Chao Liu committed
4
template <class TFloat, class DstDesc, class F, unsigned BlockSize>
5
6
__device__ void
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
7
{
Chao Liu's avatar
Chao Liu committed
8
9
10
11
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
12

13
14
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
15
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
16

17
18
19
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
20
21
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
22
23
24
    }
#endif

Chao Liu's avatar
Chao Liu committed
25
26
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
27
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
47
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
74
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
75
76
77
        }
    }
}
Chao Liu's avatar
Chao Liu committed
78

79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          class RefDesc,
          class Reorder,
          class F,
          unsigned BlockSize>
__device__ void
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
                                                       TFloat* const __restrict__ p_src,
                                                       DstDesc,
                                                       TFloat* __restrict__ p_dst,
                                                       RefDesc,
                                                       Reorder,
                                                       F f)
Chao Liu's avatar
Chao Liu committed
94
{
Chao Liu's avatar
Chao Liu committed
95
96
97
98
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
99

100
101
102
103
    constexpr unsigned IT0 = Reorder{}.Get(I0);
    constexpr unsigned IT1 = Reorder{}.Get(I1);
    constexpr unsigned IT2 = Reorder{}.Get(I2);
    constexpr unsigned IT3 = Reorder{}.Get(I3);
Chao Liu's avatar
Chao Liu committed
104

105
106
107
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = RefDesc{};
Chao Liu's avatar
Chao Liu committed
108

109
    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
Chao Liu's avatar
Chao Liu committed
110
111
112
113
114

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

115
        unsigned did[4];
Chao Liu's avatar
Chao Liu committed
116

117
        did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
118

119
        is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
120

121
        did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
122

123
        is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
124

125
        did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
126

127
        is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
128

129
        did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
130

131
        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
Chao Liu's avatar
Chao Liu committed
132

133
134
135
        const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);

        f(p_src[aindex], p_dst[bindex]);
Chao Liu's avatar
Chao Liu committed
136
137
    }

138
    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
Chao Liu's avatar
Chao Liu committed
139
140
141
142
143

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

144
        if(is < ref_desc.GetElementSize())
Chao Liu's avatar
Chao Liu committed
145
        {
146
147
148
            unsigned did[4];

            did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
149

150
            is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
151

152
            did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
153

154
            is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
155

156
            did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
157

158
            is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
159

160
            did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
161

162
            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
163

164
            const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
165

166
            f(p_src[aindex], p_dst[bindex]);
167
168
169
170
        }
    }
}

Chao Liu's avatar
Chao Liu committed
171
172
template <class TFloat, class DstDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
173
{
Chao Liu's avatar
Chao Liu committed
174
    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
Chao Liu's avatar
Chao Liu committed
175

176
    blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
Chao Liu's avatar
Chao Liu committed
177
        DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
178
}
179

180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
template <class TFloat,
          class SrcDesc,
          class DstDesc,
          class RefDesc,
          class Reorder,
          unsigned BlockSize>
__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
                                                 TFloat* const __restrict__ p_src,
                                                 DstDesc,
                                                 TFloat* __restrict__ p_dst,
                                                 RefDesc,
                                                 Reorder)
{
    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };

    blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
                                                           SrcDesc,
                                                           DstDesc,
                                                           RefDesc,
                                                           Reorder,
                                                           decltype(f_copy),
                                                           BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
}

Chao Liu's avatar
Chao Liu committed
205
206
207
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_copy(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
Chao Liu's avatar
Chao Liu committed
208
{
209
210
211
212
213
214
215
216
217
218
    constexpr auto reorder = Sequence<0, 1, 2, 3>{};

    blockwise_4d_tensor_copy_reorder<TFloat,
                                     SrcDesc,
                                     DstDesc,
                                     RefDesc,
                                     decltype(reorder),
                                     BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
}
Chao Liu's avatar
Chao Liu committed
219

220
221
222
223
224
template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_im2col(
    ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col)
{
    // do nothing
Chao Liu's avatar
Chao Liu committed
225
}