"references/depth/vscode:/vscode.git/clone" did not exist on "c35d3855ccbfa6a36e6ae6337a1f2c721c1f1e78"
blockwise_tensor_op.cuh 5.42 KB
Newer Older
1
2
3
#pragma once
#include "constant_tensor_descriptor.cuh"

Chao Liu's avatar
Chao Liu committed
4
5
template <class TFloat, class DstDesc, class F, unsigned BlockSize>
__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
Chao Liu's avatar
Chao Liu committed
6
{
Chao Liu's avatar
Chao Liu committed
7
8
9
10
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
11

12
13
    constexpr auto dst_desc = DstDesc{};

Chao Liu's avatar
Chao Liu committed
14
    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
Chao Liu's avatar
Chao Liu committed
15

16
17
18
#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
19
20
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
21
22
23
    }
#endif

Chao Liu's avatar
Chao Liu committed
24
25
    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

Chao Liu's avatar
faster  
Chao Liu committed
26
    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
Chao Liu's avatar
Chao Liu committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        is -= did1 * desc.GetStride(I1);

        const unsigned did2 = is / desc.GetStride(I2);

        is -= did2 * desc.GetStride(I2);

        const unsigned did3 = is / desc.GetStride(I3);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
46
        f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            is -= did1 * desc.GetStride(I1);

            const unsigned did2 = is / desc.GetStride(I2);

            is -= did2 * desc.GetStride(I2);

            const unsigned did3 = is / desc.GetStride(I3);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);

Chao Liu's avatar
Chao Liu committed
73
            f(p_dst[dindex]);
Chao Liu's avatar
Chao Liu committed
74
75
76
        }
    }
}
Chao Liu's avatar
Chao Liu committed
77

Chao Liu's avatar
Chao Liu committed
78
template <class TFloat, class DescA, class DescB, class DescRef, class F, unsigned BlockSize>
Chao Liu's avatar
Chao Liu committed
79
__device__ void blockwise_4d_tensor_pointwise_op_binary(
Chao Liu's avatar
Chao Liu committed
80
    DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f)
Chao Liu's avatar
Chao Liu committed
81
{
Chao Liu's avatar
Chao Liu committed
82
83
84
85
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
Chao Liu's avatar
Chao Liu committed
86

Chao Liu's avatar
Chao Liu committed
87
88
89
    constexpr auto desc_a   = DescA{};
    constexpr auto desc_b   = DescB{};
    constexpr auto desc_ref = DescRef{};
Chao Liu's avatar
Chao Liu committed
90
91
92
93

#if 0
    if(threadIdx.x == 0)
    {
Chao Liu's avatar
Chao Liu committed
94
95
96
        print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: ");
        print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: ");
        print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: ");
Chao Liu's avatar
Chao Liu committed
97
98
99
    }
#endif

Chao Liu's avatar
Chao Liu committed
100
    constexpr unsigned NLoop = desc_ref.GetElementSize() / BlockSize;
Chao Liu's avatar
Chao Liu committed
101
102
103
104
105

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

Chao Liu's avatar
Chao Liu committed
106
        const unsigned did0 = is / desc_ref.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
107

Chao Liu's avatar
Chao Liu committed
108
        is -= did0 * desc_ref.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
109

Chao Liu's avatar
Chao Liu committed
110
        const unsigned did1 = is / desc_ref.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
111

Chao Liu's avatar
Chao Liu committed
112
        is -= did1 * desc_ref.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
113

Chao Liu's avatar
Chao Liu committed
114
        const unsigned did2 = is / desc_ref.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
115

Chao Liu's avatar
Chao Liu committed
116
        is -= did2 * desc_ref.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
117

Chao Liu's avatar
Chao Liu committed
118
        const unsigned did3 = is / desc_ref.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
119

Chao Liu's avatar
Chao Liu committed
120
        const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
Chao Liu's avatar
Chao Liu committed
121

Chao Liu's avatar
Chao Liu committed
122
        const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
Chao Liu's avatar
Chao Liu committed
123

Chao Liu's avatar
Chao Liu committed
124
        f(p_a[aindex], p_b[bindex]);
Chao Liu's avatar
Chao Liu committed
125
126
    }

Chao Liu's avatar
Chao Liu committed
127
    constexpr bool has_tail = (desc_ref.GetElementSize() > NLoop * BlockSize);
Chao Liu's avatar
Chao Liu committed
128
129
130
131
132

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

Chao Liu's avatar
Chao Liu committed
133
        if(is < desc_ref.GetElementSize())
Chao Liu's avatar
Chao Liu committed
134
        {
Chao Liu's avatar
Chao Liu committed
135
            const unsigned did0 = is / desc_ref.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
136

Chao Liu's avatar
Chao Liu committed
137
            is -= did0 * desc_ref.GetStride(I0);
Chao Liu's avatar
Chao Liu committed
138

Chao Liu's avatar
Chao Liu committed
139
            const unsigned did1 = is / desc_ref.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
140

Chao Liu's avatar
Chao Liu committed
141
            is -= did1 * desc_ref.GetStride(I1);
Chao Liu's avatar
Chao Liu committed
142

Chao Liu's avatar
Chao Liu committed
143
            const unsigned did2 = is / desc_ref.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
144

Chao Liu's avatar
Chao Liu committed
145
            is -= did2 * desc_ref.GetStride(I2);
Chao Liu's avatar
Chao Liu committed
146

Chao Liu's avatar
Chao Liu committed
147
            const unsigned did3 = is / desc_ref.GetStride(I3);
Chao Liu's avatar
Chao Liu committed
148

Chao Liu's avatar
Chao Liu committed
149
            const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
150

Chao Liu's avatar
Chao Liu committed
151
            const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
152

Chao Liu's avatar
Chao Liu committed
153
            f(p_a[aindex], p_b[bindex]);
154
155
156
157
        }
    }
}

Chao Liu's avatar
Chao Liu committed
158
159
template <class TFloat, class DstDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
160
{
Chao Liu's avatar
Chao Liu committed
161
    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
Chao Liu's avatar
Chao Liu committed
162

Chao Liu's avatar
Chao Liu committed
163
164
    blockwise_4d_tensor_pointwise_op_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
        DstDesc{}, p_dst, f_set_zero);
Chao Liu's avatar
Chao Liu committed
165
}
166

Chao Liu's avatar
Chao Liu committed
167
168
169
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
__device__ void blockwise_4d_tensor_copy(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
Chao Liu's avatar
Chao Liu committed
170
{
Chao Liu's avatar
Chao Liu committed
171
    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
Chao Liu's avatar
Chao Liu committed
172

Chao Liu's avatar
Chao Liu committed
173
174
175
176
177
178
179
    blockwise_4d_tensor_pointwise_op_binary<TFloat,
                                            SrcDesc,
                                            DstDesc,
                                            RefDesc,
                                            decltype(f_copy),
                                            BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy);
Chao Liu's avatar
Chao Liu committed
180
}