blockwise_tensor_op.cuh 6.02 KB
Newer Older
1
2
3
#pragma once
#include "constant_tensor_descriptor.cuh"

Chao Liu's avatar
Chao Liu committed
4
template <unsigned BlockSize, class Float, class DstDesc, class F>
5
__device__ void
Chao Liu's avatar
Chao Liu committed
6
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
7
{
Chao Liu's avatar
Chao Liu committed
8
9
10
11
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
12

13
14
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
15
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
16

17
18
19
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
20
21
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
22
23
24
    }
#endif

Chao Liu's avatar
Chao Liu committed
25
26
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
27
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
47
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
74
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
75
76
77
        }
    }
}
Chao Liu's avatar
Chao Liu committed
78

79
80
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
Chao Liu's avatar
Chao Liu committed
81
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
82
          class Float,
83
84
85
86
          class SrcDesc,
          class DstDesc,
          class RefDesc,
          class Reorder,
Chao Liu's avatar
Chao Liu committed
87
          class F>
88
89
__device__ void
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
Chao Liu's avatar
Chao Liu committed
90
                                                       Float* const __restrict__ p_src,
91
                                                       DstDesc,
Chao Liu's avatar
Chao Liu committed
92
                                                       Float* __restrict__ p_dst,
93
94
95
                                                       RefDesc,
                                                       Reorder,
                                                       F f)
Chao Liu's avatar
Chao Liu committed
96
{
Chao Liu's avatar
Chao Liu committed
97
98
99
100
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
101

102
103
104
105
    constexpr unsigned IT0 = Reorder{}.Get(I0);
    constexpr unsigned IT1 = Reorder{}.Get(I1);
    constexpr unsigned IT2 = Reorder{}.Get(I2);
    constexpr unsigned IT3 = Reorder{}.Get(I3);
Chao Liu's avatar
Chao Liu committed
106

107
108
109
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = RefDesc{};
Chao Liu's avatar
Chao Liu committed
110

111
    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
Chao Liu's avatar
Chao Liu committed
112
113
114
115
116

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

117
        unsigned did[4];
Chao Liu's avatar
Chao Liu committed
118

119
        did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
120

121
        is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
122

123
        did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
124

125
        is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
126

127
        did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
128

129
        is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
130

131
        did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
132

133
        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
Chao Liu's avatar
Chao Liu committed
134

135
136
137
        const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);

        f(p_src[aindex], p_dst[bindex]);
Chao Liu's avatar
Chao Liu committed
138
139
    }

140
    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
Chao Liu's avatar
Chao Liu committed
141
142
143
144
145

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

146
        if(is < ref_desc.GetElementSize())
Chao Liu's avatar
Chao Liu committed
147
        {
148
149
150
            unsigned did[4];

            did[0] = is / ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
151

152
            is -= did[0] * ref_desc.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
153

154
            did[1] = is / ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
155

156
            is -= did[1] * ref_desc.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
157

158
            did[2] = is / ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
159

160
            is -= did[2] * ref_desc.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
161

162
            did[3] = is / ref_desc.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
163

164
            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
165

166
            const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
167

168
            f(p_src[aindex], p_dst[bindex]);
169
170
171
172
        }
    }
}

Chao Liu's avatar
Chao Liu committed
173
174
template <unsigned BlockSize, class Float, class DstDesc>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
175
{
Chao Liu's avatar
Chao Liu committed
176
    auto f_set_zero = [](Float& v) { v = Float(0); };
Chao Liu's avatar
Chao Liu committed
177

Chao Liu's avatar
Chao Liu committed
178
    blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
179
}
180

Chao Liu's avatar
Chao Liu committed
181
template <unsigned BlockSize,
Chao Liu's avatar
Chao Liu committed
182
          class Float,
183
184
185
          class SrcDesc,
          class DstDesc,
          class RefDesc,
Chao Liu's avatar
Chao Liu committed
186
          class Reorder>
Chao Liu's avatar
Chao Liu committed
187
188
__device__ void blockwise_4d_tensor_copy_reorder(
    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder)
189
{
Chao Liu's avatar
Chao Liu committed
190
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };
191

Chao Liu's avatar
Chao Liu committed
192
    blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
193
194
195
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
}

Chao Liu's avatar
Chao Liu committed
196
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class RefDesc>
Chao Liu's avatar
Chao Liu committed
197
__device__ void blockwise_4d_tensor_copy(
Chao Liu's avatar
Chao Liu committed
198
    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc)
Chao Liu's avatar
Chao Liu committed
199
{
200
201
    constexpr auto reorder = Sequence<0, 1, 2, 3>{};

Chao Liu's avatar
Chao Liu committed
202
    blockwise_4d_tensor_copy_reorder<BlockSize>(
203
204
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
}