"docs/en_US/TrainingService/PaiYarnMode.md" did not exist on "bc0e55a00bbbc825f27d851ccc58a749d18b4fd9"
gemm_impl.cpp 9.29 KB
Newer Older
Paul Fultz II's avatar
Paul Fultz II committed
1
#include <rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

// Translate a migraphx shape element type into the matching rocBLAS real
// datatype. Element types with no rocBLAS GEMM equivalent raise MIGRAPHX_THROW.
// Note: intentionally no `default:` label, so the compiler can warn when a new
// shape::type_t value is added without a mapping here.
rocblas_datatype get_type(shape::type_t type)
{
    switch(type)
    {
    case shape::double_type: return rocblas_datatype_f64_r;
    case shape::float_type: return rocblas_datatype_f32_r;
    case shape::half_type: return rocblas_datatype_f16_r;
    case shape::int8_type: return rocblas_datatype_i8_r;
    case shape::uint8_type: return rocblas_datatype_u8_r;
    case shape::int32_type: return rocblas_datatype_i32_r;
    case shape::uint32_type: return rocblas_datatype_u32_r;
    // The remaining element types cannot be used with rocBLAS GEMM; fall
    // through to the common throw below.
    case shape::tuple_type:
    case shape::bool_type:
    case shape::uint16_type:
    case shape::int16_type:
    case shape::int64_type:
    case shape::uint64_type: break;
    }

    MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
}

// Call the rocBLAS entry point `f` with the arguments supplied. When the
// callee expects more parameters than were passed, the call is padded with
// two trailing nullptr arguments; otherwise the arguments are forwarded
// unchanged. This keeps one call site working across rocBLAS signatures that
// differ in trailing optional parameters.
template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs)
{
    constexpr bool arity_matches = sizeof...(Ts) == sizeof...(Us);
    if constexpr(arity_matches)
    {
        return f(xs...);
    }
    else
    {
        return f(xs..., nullptr, nullptr);
    }
}

template <class T>
Shucai Xiao's avatar
Shucai Xiao committed
40
41
42
43
44
void gemm_impl(context& ctx,
               const shape& output_shape,
               const std::vector<argument>& args,
               T alpha,
               T beta,
Khalique Ahmed's avatar
Khalique Ahmed committed
45
46
               bool int8_x4_format,
               bool compute_fp32)
Shucai Xiao's avatar
Shucai Xiao committed
47
{
48
49
50
51
52
53
54
    bool transa     = args[0].get_shape().transposed();
    bool transb     = args[1].get_shape().transposed();
    auto n_dim      = output_shape.lens().size();
    auto dim_1      = n_dim - 1;
    auto dim_0      = n_dim - 2;
    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
55
    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
56

57
    bool is_3inputs = (args.size() == 4);
58
59
60
61
62
63
64
    if(!is_3inputs)
    {
        beta = 0;
    }
    rocblas_datatype arg_type = get_type(args[0].get_shape().type());
    auto output_type          = arg_type;
    if(output_type == rocblas_datatype_i8_r)
Shucai Xiao's avatar
Shucai Xiao committed
65
    {
66
        output_type = rocblas_datatype_i32_r;
Shucai Xiao's avatar
Shucai Xiao committed
67
    }
68
    auto compute_type = output_type;
Khalique Ahmed's avatar
Khalique Ahmed committed
69
70
71
72
73
    if(compute_fp32)
    {
        if(arg_type == rocblas_datatype_f16_r)
            compute_type = rocblas_datatype_f32_r;
    }
Shucai Xiao's avatar
Shucai Xiao committed
74

Shucai Xiao's avatar
Shucai Xiao committed
75
76
77
78
79
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
    rocblas_gemm_flags flag =
        int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
#else
    (void)int8_x4_format;
80
    int flag = 0;
Shucai Xiao's avatar
Shucai Xiao committed
81
82
#endif

Shucai Xiao's avatar
Shucai Xiao committed
83
84
85
    auto a_lens = args[0].get_shape().lens();
    auto b_lens = args[1].get_shape().lens();
    output_shape.visit_type([&](auto as) {
Khalique Ahmed's avatar
Khalique Ahmed committed
86
87
        auto alpha_r = as(alpha);
        auto beta_r  = as(beta);
Khalique Ahmed's avatar
Khalique Ahmed committed
88

Shucai Xiao's avatar
Shucai Xiao committed
89
90
91
92
        auto out_lens   = output_shape.lens();
        rocblas_int m   = out_lens[dim_0];
        rocblas_int n   = out_lens[dim_1];
        rocblas_int k   = args[0].get_shape().lens()[dim_1];
Shucai Xiao's avatar
Shucai Xiao committed
93
        auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
Shucai Xiao's avatar
Shucai Xiao committed
94
        if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
95
96
97
        {
            MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
        }
Shucai Xiao's avatar
Shucai Xiao committed
98

Shucai Xiao's avatar
Shucai Xiao committed
99
100
101
102
        auto num_matrices = std::accumulate(
            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
        if(num_matrices == 1)
        {
Shucai Xiao's avatar
Shucai Xiao committed
103
            // the rocblas_gemm API handles inputs and output matrices as
Shucai Xiao's avatar
Shucai Xiao committed
104
105
106
            // column-major format. When doing a C = A * B, we actually do
            // C^T = (B^T) * (A^T). That is the reason we input args[1] as
            // A and args[0] as B in calling the rocblas_gemm.
Khalique Ahmed's avatar
Khalique Ahmed committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

            if(compute_fp32)
                rocblas_invoke(&rocblas_gemm_ex,
                            ctx.get_stream().get_rocblas(),
                            transb ? rocblas_operation_transpose : rocblas_operation_none,
                            transa ? rocblas_operation_transpose : rocblas_operation_none,
                            n,
                            m,
                            k,
                            &alpha,
                            to_pointer(args.at(1)),
                            arg_type,
                            ldb,
                            to_pointer(args.at(0)),
                            arg_type,
                            lda,
                            &beta,
                            to_pointer(args[2]),
                            output_type,
                            ldc,
                            is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                            output_type,
                            ldc,
                            compute_type,
                            rocblas_gemm_algo_standard,
                            0,
                            flag);
            else
                rocblas_invoke(&rocblas_gemm_ex,
                            ctx.get_stream().get_rocblas(),
                            transb ? rocblas_operation_transpose : rocblas_operation_none,
                            transa ? rocblas_operation_transpose : rocblas_operation_none,
                            n,
                            m,
                            k,
                            &alpha_r,
                            to_pointer(args.at(1)),
                            arg_type,
                            ldb,
                            to_pointer(args.at(0)),
                            arg_type,
                            lda,
                            &beta_r,
                            to_pointer(args[2]),
                            output_type,
                            ldc,
                            is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                            output_type,
                            ldc,
                            compute_type,
                            rocblas_gemm_algo_standard,
                            0,
                            flag);
Shucai Xiao's avatar
Shucai Xiao committed
160
161
162
        }
        else
        {
Khalique Ahmed's avatar
Khalique Ahmed committed
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
            if(compute_fp32)
                rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                            ctx.get_stream().get_rocblas(),
                            transb ? rocblas_operation_transpose : rocblas_operation_none,
                            transa ? rocblas_operation_transpose : rocblas_operation_none,
                            n,
                            m,
                            k,
                            &alpha,
                            to_pointer(args.at(1)),
                            arg_type,
                            ldb,
                            k * n,
                            to_pointer(args.at(0)),
                            arg_type,
                            lda,
                            m * k,
                            &beta,
                            to_pointer(args[2]),
                            output_type,
                            ldc,
                            m * n,
                            is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                            output_type,
                            ldc,
                            m * n,
                            num_matrices,
                            compute_type,
                            rocblas_gemm_algo_standard,
                            0,
                            flag);
            else
                rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                            ctx.get_stream().get_rocblas(),
                            transb ? rocblas_operation_transpose : rocblas_operation_none,
                            transa ? rocblas_operation_transpose : rocblas_operation_none,
                            n,
                            m,
                            k,
                            &alpha_r,
                            to_pointer(args.at(1)),
                            arg_type,
                            ldb,
                            k * n,
                            to_pointer(args.at(0)),
                            arg_type,
                            lda,
                            m * k,
                            &beta_r,
                            to_pointer(args[2]),
                            output_type,
                            ldc,
                            m * n,
                            is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                            output_type,
                            ldc,
                            m * n,
                            num_matrices,
                            compute_type,
                            rocblas_gemm_algo_standard,
                            0,
                            flag);
Shucai Xiao's avatar
Shucai Xiao committed
225
226
        }
    });
227
}
// GEMM entry point with float scaling factors (floating-point element types).
// All work is delegated to the templated gemm_impl; see gemm_impl for the
// meaning and constraints of the arguments.
void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          float alpha,
          float beta,
          bool int8_x4_format,
          bool compute_fp32)
{
    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}

// GEMM entry point with int32_t scaling factors (integer element types, e.g.
// int8 inputs accumulated into int32). All work is delegated to the templated
// gemm_impl; see gemm_impl for the meaning and constraints of the arguments.
void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          int32_t alpha,
          int32_t beta,
          bool int8_x4_format,
          bool compute_fp32)
{
    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx