blockwise_tensor_op.cuh 5.83 KB
Newer Older
1
2
3
#pragma once
#include "constant_tensor_descriptor.cuh"

Chao Liu's avatar
Chao Liu committed
4
5
template <class TFloat, class DstDesc, class F, unsigned BlockSize>
__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
6
{
Chao Liu's avatar
Chao Liu committed
7
8
9
10
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
11

12
13
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
14
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
15

16
17
18
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
19
20
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
21
22
23
    }
#endif

Chao Liu's avatar
Chao Liu committed
24
25
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
26
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
46
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
73
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
74
75
76
        }
    }
}
Chao Liu's avatar
Chao Liu committed
77

Chao Liu's avatar
Chao Liu committed
78
79
80
template <class TFloat, class SrcDesc, class DstDesc, class F, unsigned BlockSize>
__device__ void blockwise_4d_tensor_pointwise_op_binary(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
81
{
Chao Liu's avatar
Chao Liu committed
82
83
84
85
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
86

Chao Liu's avatar
Chao Liu committed
87
    constexpr auto src_desc = SrcDesc{};
Chao Liu's avatar
Chao Liu committed
88
89
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
90
91
92
    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);

    constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
93
94
95
96

#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
97
98
        print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: ");
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: ");
Chao Liu's avatar
Chao Liu committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
    }
#endif

    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

Chao Liu's avatar
Chao Liu committed
122
123
        const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
124
125
        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
126
        f(p_src[sindex], p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

Chao Liu's avatar
Chao Liu committed
151
            const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
152

Chao Liu's avatar
Chao Liu committed
153
            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
154

Chao Liu's avatar
Chao Liu committed
155
            f(p_src[sindex], p_dst[dindex]);
156
157
158
159
        }
    }
}

Chao Liu's avatar
Chao Liu committed
160
161
template <class TFloat, class DstDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
162
{
Chao Liu's avatar
Chao Liu committed
163
    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
Chao Liu's avatar
Chao Liu committed
164

Chao Liu's avatar
Chao Liu committed
165
166
    blockwise_4d_tensor_pointwise_op_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
        DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
167
}
168

Chao Liu's avatar
Chao Liu committed
169
170
171
172
173
template <class TFloat, class SrcDesc, class DstDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_copy(SrcDesc,
                                         TFloat* const __restrict__ p_src,
                                         DstDesc,
                                         TFloat* __restrict__ p_dst)
Chao Liu's avatar
Chao Liu committed
174
{
Chao Liu's avatar
Chao Liu committed
175
    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
Chao Liu's avatar
Chao Liu committed
176

Chao Liu's avatar
Chao Liu committed
177
178
    blockwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_copy), BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy);
Chao Liu's avatar
Chao Liu committed
179
}
180

Chao Liu's avatar
Chao Liu committed
181
182
183
184
185
template <class TFloat, class SrcDesc, class DstDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_accumulate(SrcDesc,
                                               TFloat* const __restrict__ p_src,
                                               DstDesc,
                                               TFloat* __restrict__ p_dst)
Chao Liu's avatar
Chao Liu committed
186
{
Chao Liu's avatar
Chao Liu committed
187
    auto f_accum = [](const TFloat& src, TFloat& dst) { dst += src; };
188

Chao Liu's avatar
Chao Liu committed
189
190
191
    blockwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_accum), BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, f_accum);
}