blockwise_tensor_op.cuh 6.64 KB
Newer Older
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "ConstantTensorDescriptor.cuh"
3

Chao Liu's avatar
Chao Liu committed
4
template <unsigned BlockSize, class Float, class DstDesc, class F>
5
__device__ void
Chao Liu's avatar
Chao Liu committed
6
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
7
{
Chao Liu's avatar
Chao Liu committed
8
9
10
11
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
12

13
14
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
15
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
16

17
18
19
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
20
21
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
22
23
24
    }
#endif

Chao Liu's avatar
Chao Liu committed
25
26
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
27
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
47
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
74
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
75
76
77
        }
    }
}
Chao Liu's avatar
Chao Liu committed
78

Chao Liu's avatar
Chao Liu committed
79
// Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3]
80
81
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
Chao Liu's avatar
Chao Liu committed
82
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
83
          class Float,
84
85
          class SrcDesc,
          class DstDesc,
Chao Liu's avatar
Chao Liu committed
86
87
          class SrcOpLengths,
          class DstFromSrcReorder,
Chao Liu's avatar
Chao Liu committed
88
          class F>
Chao Liu's avatar
Chao Liu committed
89
90
91
92
93
94
95
96
__device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
    SrcDesc,
    Float* const __restrict__ p_src,
    DstDesc,
    Float* __restrict__ p_dst,
    SrcOpLengths,
    DstFromSrcReorder,
    F f)
Chao Liu's avatar
Chao Liu committed
97
{
Chao Liu's avatar
Chao Liu committed
98
99
100
101
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
102

Chao Liu's avatar
Chao Liu committed
103
104
105
106
    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
    constexpr unsigned IR2 = DstFromSrcReorder{}.Get(I2);
    constexpr unsigned IR3 = DstFromSrcReorder{}.Get(I3);
Chao Liu's avatar
Chao Liu committed
107

108
109
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
Chao Liu's avatar
Chao Liu committed
110
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
Chao Liu's avatar
Chao Liu committed
111

112
    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
Chao Liu's avatar
Chao Liu committed
113
114
115
116
117

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

118
        unsigned did[4];
Chao Liu's avatar
Chao Liu committed
119

120
        did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
121

122
        is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
123

124
        did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
125

126
        is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
127

128
        did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
129

130
        is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
131

132
        did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
133

134
        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
Chao Liu's avatar
Chao Liu committed
135

Chao Liu's avatar
Chao Liu committed
136
        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
137

Chao Liu's avatar
Chao Liu committed
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#if 1
        printf("did %u %u %u %u, did_IR %u %u %u %u, index %u %u\n",
               did[0],
               did[1],
               did[2],
               did[3],
               did[IR0],
               did[IR1],
               did[IR2],
               did[IR3],
               aindex,
               bindex);
#endif

152
        f(p_src[aindex], p_dst[bindex]);
Chao Liu's avatar
Chao Liu committed
153
154
    }

155
    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
Chao Liu's avatar
Chao Liu committed
156
157
158
159
160

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

161
        if(is < ref_desc.GetElementSize())
Chao Liu's avatar
Chao Liu committed
162
        {
163
164
165
            unsigned did[4];

            did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
166

167
            is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
168

169
            did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
170

171
            is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
172

173
            did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
174

175
            is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
176

177
            did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
178

179
            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
180

Chao Liu's avatar
Chao Liu committed
181
            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
182

183
            f(p_src[aindex], p_dst[bindex]);
184
185
186
187
        }
    }
}

Chao Liu's avatar
Chao Liu committed
188
189
template <unsigned BlockSize, class Float, class DstDesc>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
190
{
Chao Liu's avatar
Chao Liu committed
191
    auto f_set_zero = [](Float& v) { v = Float(0); };
Chao Liu's avatar
Chao Liu committed
192

Chao Liu's avatar
Chao Liu committed
193
    blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
194
}
195

Chao Liu's avatar
Chao Liu committed
196
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
197
          class Float,
198
199
          class SrcDesc,
          class DstDesc,
Chao Liu's avatar
Chao Liu committed
200
201
202
203
204
205
206
207
208
          class SrcOpLengths,
          class DstFromSrcReorder>
__device__ void
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
                                                     Float* const __restrict__ p_src,
                                                     DstDesc,
                                                     Float* __restrict__ p_dst,
                                                     SrcOpLengths,
                                                     DstFromSrcReorder)
209
{
Chao Liu's avatar
Chao Liu committed
210
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };
211

Chao Liu's avatar
Chao Liu committed
212
213
    blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
214
215
}

Chao Liu's avatar
Chao Liu committed
216
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
Chao Liu's avatar
Chao Liu committed
217
__device__ void blockwise_4d_tensor_copy(
Chao Liu's avatar
Chao Liu committed
218
    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
Chao Liu's avatar
Chao Liu committed
219
{
Chao Liu's avatar
Chao Liu committed
220
    constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
221

Chao Liu's avatar
Chao Liu committed
222
223
    blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
224
}