threadwise_gemm.hip.hpp 6.42 KB
Newer Older
1
2
#pragma once

Chao Liu's avatar
Chao Liu committed
3
template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
Chao Liu's avatar
Chao Liu committed
4
5
6
7
8
__device__ void threadwise_matrix_copy(SrcMatrix,
                                       const Float* __restrict__ p_src,
                                       DstMatrix,
                                       Float* __restrict__ p_dst,
                                       Sequence<NRow, NCol>)
9
{
Chao Liu's avatar
Chao Liu committed
10
11
    constexpr auto src_mtx = SrcMatrix{};
    constexpr auto dst_mtx = DstMatrix{};
12

Chao Liu's avatar
Chao Liu committed
13
    for(index_t i = 0; i < NRow; ++i)
14
    {
Chao Liu's avatar
Chao Liu committed
15
        for(index_t j = 0; j < NCol; ++j)
16
        {
Chao Liu's avatar
Chao Liu committed
17
18
            const index_t src_index = src_mtx.Get1dIndex(i, j);
            const index_t dst_index = dst_mtx.Get1dIndex(i, j);
19
20
21
22

            p_dst[dst_index] = p_src[src_index];
        }
    }
Chao Liu's avatar
Chao Liu committed
23
24
25
26
27
28
29
30
}

template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
__device__ void threadwise_matrix_copy_v2(SrcMatrix,
                                          const Float* __restrict__ p_src,
                                          DstMatrix,
                                          Float* __restrict__ p_dst,
                                          Sequence<NRow, NCol>,
Chao Liu's avatar
Chao Liu committed
31
                                          const float* const p_lds_begin)
Chao Liu's avatar
Chao Liu committed
32
33
34
35
{
    constexpr auto src_mtx = SrcMatrix{};
    constexpr auto dst_mtx = DstMatrix{};

Chao Liu's avatar
Chao Liu committed
36
#if 0
Chao Liu's avatar
Chao Liu committed
37
38
39
40
41
42
43
44
45
46
47
48
49
50
    for(index_t i = 0; i < NRow; ++i)
    {
        for(index_t j = 0; j < NCol; ++j)
        {
            const index_t src_index = src_mtx.Get1dIndex(i, j);
            const index_t dst_index = dst_mtx.Get1dIndex(i, j);

#if 0
            p_dst[dst_index] = p_src[src_index];
#else
            asm volatile("\n \
                        ds_read_b32 %0, %1 \n \
                        "
                         : "=v"(p_dst[dst_index])
Chao Liu's avatar
Chao Liu committed
51
                         : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
Chao Liu's avatar
Chao Liu committed
52
53
54
#endif
        }
    }
Chao Liu's avatar
Chao Liu committed
55
#elif 1
Chao Liu's avatar
Chao Liu committed
56
57
58
59
60
61
62
63
    static_assert(NCol == 4, "only for NCol == 4");

    for(index_t i = 0; i < NRow; ++i)
    {
        const index_t src_index = src_mtx.Get1dIndex(i, 0);
        const index_t dst_index = dst_mtx.Get1dIndex(i, 0);

#if 1
Chao Liu's avatar
Chao Liu committed
64
65
        using vector_t          = typename vector_type<Float, 4>::MemoryType;

Chao Liu's avatar
Chao Liu committed
66
67
        *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
            *(reinterpret_cast<const vector_t*>(p_src + src_index));
Chao Liu's avatar
Chao Liu committed
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#elif 0
        // ds_read_b32
        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index + 1])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 1) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index + 2])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b32 %0, %1 \n \
                    "
            : "=v"(p_dst[dst_index + 3])
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 3) - p_lds_begin))));
#elif 0
        // ds_read_b64
        using vector_t = typename vector_type<Float, 2>::MemoryType;

        asm volatile(
            "\n \
                    ds_read_b64 %0, %1 \n \
                    "
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));

        asm volatile(
            "\n \
                    ds_read_b64 %0, %1 \n \
                    "
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index + 2)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
#elif 0
        // ds_read_b128
        using vector_t = typename vector_type<Float, 4>::MemoryType;

Chao Liu's avatar
Chao Liu committed
118
119
120
        asm volatile(
            "\n \
                    ds_read_b128 %0, %1 \n \
Chao Liu's avatar
Chao Liu committed
121
                    "
Chao Liu's avatar
Chao Liu committed
122
123
            : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
Chao Liu's avatar
Chao Liu committed
124
125
126
#endif
    }
#endif
127
128
129
130
131
132
133
134
135
136
137
138
139
}

template <class MatrixA,
          class MatrixB,
          class MatrixC,
          bool TransA,
          bool TransB,
          bool TransC,
          class FloatA,
          class FloatB,
          class FloatC,
          class Accumulator>
__device__ void threadwise_gemm(MatrixA,
Chao Liu's avatar
Chao Liu committed
140
                                integral_constant<bool, TransA>,
Chao Liu's avatar
Chao Liu committed
141
                                const FloatA* __restrict__ p_a_thread,
142
                                MatrixB,
Chao Liu's avatar
Chao Liu committed
143
                                integral_constant<bool, TransB>,
Chao Liu's avatar
Chao Liu committed
144
                                const FloatB* __restrict__ p_b_thread,
145
                                MatrixC,
Chao Liu's avatar
Chao Liu committed
146
                                integral_constant<bool, TransC>,
Chao Liu's avatar
Chao Liu committed
147
                                FloatC* __restrict__ p_c_thread,
148
149
150
151
                                Accumulator f_accum)
{
    if(TransA && (!TransB) && (!TransC))
    {
Chao Liu's avatar
Chao Liu committed
152
153
154
        constexpr auto a_mtx = MatrixA{};
        constexpr auto b_mtx = MatrixB{};
        constexpr auto c_mtx = MatrixC{};
155

Chao Liu's avatar
Chao Liu committed
156
157
158
        constexpr index_t M = c_mtx.NRow();
        constexpr index_t N = c_mtx.NCol();
        constexpr index_t K = a_mtx.NRow(); // A is transposed
159

Chao Liu's avatar
Chao Liu committed
160
        for(index_t k = 0; k < K; ++k)
161
        {
Chao Liu's avatar
Chao Liu committed
162
            for(index_t i = 0; i < M; ++i)
163
            {
Chao Liu's avatar
Chao Liu committed
164
                for(index_t j = 0; j < N; ++j)
165
                {
Chao Liu's avatar
Chao Liu committed
166
167
168
                    const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                    const index_t bindex = b_mtx.Get1dIndex(k, j);
                    const index_t cindex = c_mtx.Get1dIndex(i, j);
169

Chao Liu's avatar
Chao Liu committed
170
#if 0
171
                    f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
Chao Liu's avatar
Chao Liu committed
172
173
174
175
176
177
178
179
180
#elif 1
                    asm volatile("\n \
                                v_mac_f32 %0, %1, %2 \n \
                                "
                                 : "=v"(p_c_thread[cindex])
                                 : "v"(p_a_thread[aindex]),
                                   "v"(p_b_thread[bindex]),
                                   "0"(p_c_thread[cindex]));
#endif
181
182
183
184
185
186
187
188
189
190
                }
            }
        }
    }
    else
    {
        // not implemented
        assert(false);
    }
}