blockwise_tensor_op.cuh 6.1 KB
Newer Older
1
#pragma once
Chao Liu's avatar
Chao Liu committed
2
#include "ConstantTensorDescriptor.cuh"
3

Chao Liu's avatar
Chao Liu committed
4
template <unsigned BlockSize, class Float, class DstDesc, class F>
5
__device__ void
Chao Liu's avatar
Chao Liu committed
6
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
7
{
Chao Liu's avatar
Chao Liu committed
8
9
10
11
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
12

13
14
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
15
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
16

17
18
19
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
20
21
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
22
23
24
    }
#endif

Chao Liu's avatar
Chao Liu committed
25
26
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
27
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
47
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
74
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
75
76
77
        }
    }
}
Chao Liu's avatar
Chao Liu committed
78

Chao Liu's avatar
Chao Liu committed
79
// Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3]
80
81
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
Chao Liu's avatar
Chao Liu committed
82
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
83
          class Float,
84
85
86
87
          class SrcDesc,
          class DstDesc,
          class RefDesc,
          class Reorder,
Chao Liu's avatar
Chao Liu committed
88
          class F>
89
90
__device__ void
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
Chao Liu's avatar
Chao Liu committed
91
                                                       Float* const __restrict__ p_src,
92
                                                       DstDesc,
Chao Liu's avatar
Chao Liu committed
93
                                                       Float* __restrict__ p_dst,
94
95
96
                                                       RefDesc,
                                                       Reorder,
                                                       F f)
Chao Liu's avatar
Chao Liu committed
97
{
Chao Liu's avatar
Chao Liu committed
98
99
100
101
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
102

103
104
105
106
    constexpr unsigned IT0 = Reorder{}.Get(I0);
    constexpr unsigned IT1 = Reorder{}.Get(I1);
    constexpr unsigned IT2 = Reorder{}.Get(I2);
    constexpr unsigned IT3 = Reorder{}.Get(I3);
Chao Liu's avatar
Chao Liu committed
107

108
109
110
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = RefDesc{};
Chao Liu's avatar
Chao Liu committed
111

112
    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
Chao Liu's avatar
Chao Liu committed
113
114
115
116
117

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

118
        unsigned did[4];
Chao Liu's avatar
Chao Liu committed
119

120
        did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
121

122
        is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
123

124
        did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
125

126
        is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
127

128
        did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
129

130
        is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
131

132
        did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
133

134
        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
Chao Liu's avatar
Chao Liu committed
135

136
137
138
        const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);

        f(p_src[aindex], p_dst[bindex]);
Chao Liu's avatar
Chao Liu committed
139
140
    }

141
    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
Chao Liu's avatar
Chao Liu committed
142
143
144
145
146

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

147
        if(is < ref_desc.GetElementSize())
Chao Liu's avatar
Chao Liu committed
148
        {
149
150
151
            unsigned did[4];

            did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
152

153
            is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
154

155
            did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
156

157
            is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
158

159
            did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
160

161
            is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
162

163
            did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
164

165
            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
166

167
            const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
168

169
            f(p_src[aindex], p_dst[bindex]);
170
171
172
173
        }
    }
}

Chao Liu's avatar
Chao Liu committed
174
175
template <unsigned BlockSize, class Float, class DstDesc>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
176
{
Chao Liu's avatar
Chao Liu committed
177
    auto f_set_zero = [](Float& v) { v = Float(0); };
Chao Liu's avatar
Chao Liu committed
178

Chao Liu's avatar
Chao Liu committed
179
    blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
180
}
181

Chao Liu's avatar
Chao Liu committed
182
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
183
          class Float,
184
185
186
          class SrcDesc,
          class DstDesc,
          class RefDesc,
Chao Liu's avatar
Chao Liu committed
187
          class Reorder>
Chao Liu's avatar
Chao Liu committed
188
189
__device__ void blockwise_4d_tensor_copy_reorder(
    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
190
{
Chao Liu's avatar
Chao Liu committed
191
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };
192

Chao Liu's avatar
Chao Liu committed
193
    blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
194
195
196
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
}

Chao Liu's avatar
Chao Liu committed
197
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class RefDesc>
Chao Liu's avatar
Chao Liu committed
198
__device__ void blockwise_4d_tensor_copy(
Chao Liu's avatar
Chao Liu committed
199
    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
Chao Liu's avatar
Chao Liu committed
200
{
201
202
    constexpr auto reorder = Sequence<0, 1, 2, 3>{};

Chao Liu's avatar
Chao Liu committed
203
    blockwise_4d_tensor_copy_reorder<BlockSize>(
204
205
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
}