"include/ck/utility/sequence.hpp" did not exist on "917d7a2b1da257f4ae3a2e2525adb6e70f89078f"
threadwise_gemm.hip.hpp 4.66 KB
Newer Older
1
2
#pragma once

Chao Liu's avatar
Chao Liu committed
3
template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
Chao Liu's avatar
Chao Liu committed
4
5
6
7
8
__device__ void threadwise_matrix_copy(SrcMatrix,
                                       const Float* __restrict__ p_src,
                                       DstMatrix,
                                       Float* __restrict__ p_dst,
                                       Sequence<NRow, NCol>)
9
{
Chao Liu's avatar
Chao Liu committed
10
11
    constexpr auto src_mtx = SrcMatrix{};
    constexpr auto dst_mtx = DstMatrix{};
12

Chao Liu's avatar
Chao Liu committed
13
    for(index_t i = 0; i < NRow; ++i)
14
    {
Chao Liu's avatar
Chao Liu committed
15
        for(index_t j = 0; j < NCol; ++j)
16
        {
Chao Liu's avatar
Chao Liu committed
17
18
            const index_t src_index = src_mtx.Get1dIndex(i, j);
            const index_t dst_index = dst_mtx.Get1dIndex(i, j);
19
20
21
22

            p_dst[dst_index] = p_src[src_index];
        }
    }
Chao Liu's avatar
Chao Liu committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
}

template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
__device__ void threadwise_matrix_copy_v2(SrcMatrix,
                                          const Float* __restrict__ p_src,
                                          DstMatrix,
                                          Float* __restrict__ p_dst,
                                          Sequence<NRow, NCol>,
                                          const float* p_lds_begin)
{
    constexpr auto src_mtx = SrcMatrix{};
    constexpr auto dst_mtx = DstMatrix{};

#if 1
    for(index_t i = 0; i < NRow; ++i)
    {
        for(index_t j = 0; j < NCol; ++j)
        {
            const index_t src_index = src_mtx.Get1dIndex(i, j);
            const index_t dst_index = dst_mtx.Get1dIndex(i, j);

#if 0
            p_dst[dst_index] = p_src[src_index];
#else
            asm volatile("\n \
                        ds_read_b32 %0, %1 \n \
                        "
                         : "=v"(p_dst[dst_index])
                         : "v"((uint32_t)((uintptr_t)((p_src + src_index) - p_lds_begin))));
#endif
        }
    }
#elif 0
Chao Liu's avatar
Chao Liu committed
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
    static_assert(NCol == 4, "only for NCol == 4");

    using vector_t = typename vector_type<Float, 4>::MemoryType;

    for(index_t i = 0; i < NRow; ++i)
    {
        const index_t src_index = src_mtx.Get1dIndex(i, 0);
        const index_t dst_index = dst_mtx.Get1dIndex(i, 0);

#if 1
        *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
            *(reinterpret_cast<const vector_t*>(p_src + src_index));
#elif 1
        asm volatile("\n \
                    ds_read_b128 %0, %1, offset:0 \n \
                    "
Chao Liu's avatar
Chao Liu committed
72
73
                     : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
                     : "v"((uint32_t)((uintptr_t)(p_src + src_index - p_lds_begin))));
Chao Liu's avatar
Chao Liu committed
74
75
76
#endif
    }
#endif
77
78
79
80
81
82
83
84
85
86
87
88
89
}

template <class MatrixA,
          class MatrixB,
          class MatrixC,
          bool TransA,
          bool TransB,
          bool TransC,
          class FloatA,
          class FloatB,
          class FloatC,
          class Accumulator>
__device__ void threadwise_gemm(MatrixA,
Chao Liu's avatar
Chao Liu committed
90
                                integral_constant<bool, TransA>,
Chao Liu's avatar
Chao Liu committed
91
                                const FloatA* __restrict__ p_a_thread,
92
                                MatrixB,
Chao Liu's avatar
Chao Liu committed
93
                                integral_constant<bool, TransB>,
Chao Liu's avatar
Chao Liu committed
94
                                const FloatB* __restrict__ p_b_thread,
95
                                MatrixC,
Chao Liu's avatar
Chao Liu committed
96
                                integral_constant<bool, TransC>,
Chao Liu's avatar
Chao Liu committed
97
                                FloatC* __restrict__ p_c_thread,
98
99
100
101
                                Accumulator f_accum)
{
    if(TransA && (!TransB) && (!TransC))
    {
Chao Liu's avatar
Chao Liu committed
102
103
104
        constexpr auto a_mtx = MatrixA{};
        constexpr auto b_mtx = MatrixB{};
        constexpr auto c_mtx = MatrixC{};
105

Chao Liu's avatar
Chao Liu committed
106
107
108
        constexpr index_t M = c_mtx.NRow();
        constexpr index_t N = c_mtx.NCol();
        constexpr index_t K = a_mtx.NRow(); // A is transposed
109

Chao Liu's avatar
Chao Liu committed
110
        for(index_t k = 0; k < K; ++k)
111
        {
Chao Liu's avatar
Chao Liu committed
112
            for(index_t i = 0; i < M; ++i)
113
            {
Chao Liu's avatar
Chao Liu committed
114
                for(index_t j = 0; j < N; ++j)
115
                {
Chao Liu's avatar
Chao Liu committed
116
117
118
                    const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                    const index_t bindex = b_mtx.Get1dIndex(k, j);
                    const index_t cindex = c_mtx.Get1dIndex(i, j);
119

Chao Liu's avatar
Chao Liu committed
120
#if 0
121
                    f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
Chao Liu's avatar
Chao Liu committed
122
123
124
125
126
127
128
129
130
#elif 1
                    asm volatile("\n \
                                v_mac_f32 %0, %1, %2 \n \
                                "
                                 : "=v"(p_c_thread[cindex])
                                 : "v"(p_a_thread[aindex]),
                                   "v"(p_b_thread[bindex]),
                                   "0"(p_c_thread[cindex]));
#endif
131
132
133
134
135
136
137
138
139
140
                }
            }
        }
    }
    else
    {
        // not implemented
        assert(false);
    }
}