threadwise_gemm.hip.hpp 7.08 KB
Newer Older
1
2
#pragma once

Chao Liu's avatar
Chao Liu committed
3
template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
Chao Liu's avatar
Chao Liu committed
4
5
6
7
8
__device__ void threadwise_matrix_copy(SrcMatrix,
                                       const Float* __restrict__ p_src,
                                       DstMatrix,
                                       Float* __restrict__ p_dst,
                                       Sequence<NRow, NCol>)
9
{
Chao Liu's avatar
Chao Liu committed
10
11
    constexpr auto src_mtx = SrcMatrix{};
    constexpr auto dst_mtx = DstMatrix{};
12

Chao Liu's avatar
Chao Liu committed
13
    for(index_t i = 0; i < NRow; ++i)
14
    {
Chao Liu's avatar
Chao Liu committed
15
        for(index_t j = 0; j < NCol; ++j)
16
        {
Chao Liu's avatar
Chao Liu committed
17
18
            const index_t src_index = src_mtx.Get1dIndex(i, j);
            const index_t dst_index = dst_mtx.Get1dIndex(i, j);
19
20
21
22

            p_dst[dst_index] = p_src[src_index];
        }
    }
Chao Liu's avatar
Chao Liu committed
23
24
25
26
27
28
29
30
}

template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
__device__ void threadwise_matrix_copy_v2(SrcMatrix,
                                          const Float* __restrict__ p_src,
                                          DstMatrix,
                                          Float* __restrict__ p_dst,
                                          Sequence<NRow, NCol>,
Chao Liu's avatar
Chao Liu committed
31
                                          const float* const p_lds_begin)
Chao Liu's avatar
Chao Liu committed
32
33
34
35
{
    constexpr auto src_mtx = SrcMatrix{};
    constexpr auto dst_mtx = DstMatrix{};

Chao Liu's avatar
Chao Liu committed
36
#if 0
Chao Liu's avatar
Chao Liu committed
37
38
39
40
41
42
43
44
45
46
47
48
49
50
    for(index_t i = 0; i < NRow; ++i)
    {
        for(index_t j = 0; j < NCol; ++j)
        {
            const index_t src_index = src_mtx.Get1dIndex(i, j);
            const index_t dst_index = dst_mtx.Get1dIndex(i, j);

#if 0
            p_dst[dst_index] = p_src[src_index];
#else
            asm volatile("\n \
                        ds_read_b32 %0, %1 \n \
                        "
                         : "=v"(p_dst[dst_index])
Chao Liu's avatar
Chao Liu committed
51
                         : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
Chao Liu's avatar
Chao Liu committed
52
53
54
#endif
        }
    }
Chao Liu's avatar
Chao Liu committed
55
#elif 1
Chao Liu's avatar
Chao Liu committed
56
57
58
59
60
61
62
63
    static_assert(NCol == 4, "only for NCol == 4");

    for(index_t i = 0; i < NRow; ++i)
    {
        const index_t src_index = src_mtx.Get1dIndex(i, 0);
        const index_t dst_index = dst_mtx.Get1dIndex(i, 0);

#if 1
Chao Liu's avatar
Chao Liu committed
64
65
        using vector_t          = typename vector_type<Float, 4>::MemoryType;

Chao Liu's avatar
Chao Liu committed
66
67
        *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
            *(reinterpret_cast<const vector_t*>(p_src + src_index));
Chao Liu's avatar
Chao Liu committed
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#elif 0
        // ds_read_b32
        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index + 1])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 1) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index + 2])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index + 3])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 3) - p_lds_begin))));
Chao Liu's avatar
Chao Liu committed
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#elif 0
        // ds_read2_b32
        using vector_t = typename vector_type<Float, 2>::MemoryType;

        asm volatile(
            "\n \
                    ds_read2_b32 %0, %1 offset1:1\n \
                    "
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read2_b32 %0, %1 offset1:1\n \
                    "
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index + 2)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
Chao Liu's avatar
Chao Liu committed
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#elif 0
        // ds_read_b64
        using vector_t = typename vector_type<Float, 2>::MemoryType;

        asm volatile(
            "\n \
                    ds_read_b64 %0, %1 \n \
                    "
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b64 %0, %1 \n \
                    "
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index + 2)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
#elif 0
        // ds_read_b128
        using vector_t = typename vector_type<Float, 4>::MemoryType;

Chao Liu's avatar
Chao Liu committed
135
136
137
        asm volatile(
            "\n \
                    ds_read_b128 %0, %1 \n \
Chao Liu's avatar
Chao Liu committed
138
                    "
Chao Liu's avatar
Chao Liu committed
139
140
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
Chao Liu's avatar
Chao Liu committed
141
142
143
#endif
    }
#endif
144
145
146
147
148
149
150
151
152
153
154
155
156
}

template <class MatrixA,
          class MatrixB,
          class MatrixC,
          bool TransA,
          bool TransB,
          bool TransC,
          class FloatA,
          class FloatB,
          class FloatC,
          class Accumulator>
__device__ void threadwise_gemm(MatrixA,
Chao Liu's avatar
Chao Liu committed
157
                                integral_constant<bool, TransA>,
Chao Liu's avatar
Chao Liu committed
158
                                const FloatA* __restrict__ p_a_thread,
159
                                MatrixB,
Chao Liu's avatar
Chao Liu committed
160
                                integral_constant<bool, TransB>,
Chao Liu's avatar
Chao Liu committed
161
                                const FloatB* __restrict__ p_b_thread,
162
                                MatrixC,
Chao Liu's avatar
Chao Liu committed
163
                                integral_constant<bool, TransC>,
Chao Liu's avatar
Chao Liu committed
164
                                FloatC* __restrict__ p_c_thread,
165
166
167
168
                                Accumulator f_accum)
{
    if(TransA && (!TransB) && (!TransC))
    {
Chao Liu's avatar
Chao Liu committed
169
170
171
        constexpr auto a_mtx = MatrixA{};
        constexpr auto b_mtx = MatrixB{};
        constexpr auto c_mtx = MatrixC{};
172

Chao Liu's avatar
Chao Liu committed
173
174
175
        constexpr index_t M = c_mtx.NRow();
        constexpr index_t N = c_mtx.NCol();
        constexpr index_t K = a_mtx.NRow(); // A is transposed
176

Chao Liu's avatar
Chao Liu committed
177
        for(index_t k = 0; k < K; ++k)
178
        {
Chao Liu's avatar
Chao Liu committed
179
            for(index_t i = 0; i < M; ++i)
180
            {
Chao Liu's avatar
Chao Liu committed
181
                for(index_t j = 0; j < N; ++j)
182
                {
Chao Liu's avatar
Chao Liu committed
183
184
185
                    const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                    const index_t bindex = b_mtx.Get1dIndex(k, j);
                    const index_t cindex = c_mtx.Get1dIndex(i, j);
186

Chao Liu's avatar
Chao Liu committed
187
#if 0
188
                    f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
Chao Liu's avatar
Chao Liu committed
189
190
191
192
193
194
195
196
197
#elif 1
                    asm volatile("\n \
                                v_mac_f32 %0, %1, %2 \n \
                                "
                                 : "=v"(p_c_thread[cindex])
                                 : "v"(p_a_thread[aindex]),
                                   "v"(p_b_thread[bindex]),
                                   "0"(p_c_thread[cindex]));
#endif
198
199
200
201
202
203
204
205
206
207
                }
            }
        }
    }
    else
    {
        // not implemented
        assert(false);
    }
}