gemm_impl.cpp 6.75 KB
Newer Older
Paul Fultz II's avatar
Paul Fultz II committed
1
#include <rocblas.h>
2
#include <migraphx/gpu/gemm_impl.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
3
4
5
6
7

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

8
/// Map a MIGraphX element type onto the corresponding rocBLAS datatype enum.
///
/// @param type  element type of a shape
/// @return      the rocBLAS datatype for double, float, half, int8, uint8,
///              int32 and uint32 element types
/// @throws      via MIGRAPHX_THROW for every other element type
rocblas_datatype get_type(shape::type_t type)
{
    // The switch deliberately has no default label: every enumerator handled
    // here is spelled out, and anything that does not return falls through to
    // the single throw at the bottom.
    switch(type)
    {
    // Element types with no mapping in this function.
    case shape::tuple_type:
    case shape::bool_type:
    case shape::uint16_type:
    case shape::int16_type:
    case shape::int64_type:
    case shape::uint64_type: break;

    // Floating-point types.
    case shape::double_type: return rocblas_datatype_f64_r;
    case shape::float_type: return rocblas_datatype_f32_r;
    case shape::half_type: return rocblas_datatype_f16_r;

    // Integer types.
    case shape::int8_type: return rocblas_datatype_i8_r;
    case shape::uint8_type: return rocblas_datatype_u8_r;
    case shape::int32_type: return rocblas_datatype_i32_r;
    case shape::uint32_type: return rocblas_datatype_u32_r;
    }

    MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
}

30
31
32
33
34
35
36
37
38
/// Call `f` with the arguments `xs`, padding the call when the arities differ.
///
/// When `f` declares exactly as many parameters as arguments were supplied,
/// the arguments are forwarded unchanged. Otherwise two trailing `nullptr`
/// arguments are appended to the call.
/// NOTE(review): the padding presumably targets rocBLAS entry points whose
/// signature carries two extra trailing pointer parameters - confirm against
/// the rocBLAS headers in use.
///
/// @param f   function pointer to invoke
/// @param xs  arguments passed (by value) to `f`
/// @return    whatever `f` returns
template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs)
{
    if constexpr(sizeof...(Ts) != sizeof...(Us))
    {
        // Parameter count differs from the argument count: pad with two nulls.
        return f(xs..., nullptr, nullptr);
    }
    else
    {
        return f(xs...);
    }
}

39
template <class T>
Shucai Xiao's avatar
Shucai Xiao committed
40
41
42
43
44
void gemm_impl(context& ctx,
               const shape& output_shape,
               const std::vector<argument>& args,
               T alpha,
               T beta,
Khalique Ahmed's avatar
Khalique Ahmed committed
45
46
               bool int8_x4_format,
               bool compute_fp32)
Shucai Xiao's avatar
Shucai Xiao committed
47
{
48
49
50
51
52
53
54
    bool transa     = args[0].get_shape().transposed();
    bool transb     = args[1].get_shape().transposed();
    auto n_dim      = output_shape.lens().size();
    auto dim_1      = n_dim - 1;
    auto dim_0      = n_dim - 2;
    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
55
    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
56

57
    bool is_3inputs = (args.size() == 4);
58
59
60
61
62
63
64
    if(!is_3inputs)
    {
        beta = 0;
    }
    rocblas_datatype arg_type = get_type(args[0].get_shape().type());
    auto output_type          = arg_type;
    if(output_type == rocblas_datatype_i8_r)
Shucai Xiao's avatar
Shucai Xiao committed
65
    {
66
        output_type = rocblas_datatype_i32_r;
Shucai Xiao's avatar
Shucai Xiao committed
67
    }
68
    auto compute_type = output_type;
Khalique Ahmed's avatar
Khalique Ahmed committed
69
70
71
72
73
    if(compute_fp32)
    {
        if(arg_type == rocblas_datatype_f16_r)
            compute_type = rocblas_datatype_f32_r;
    }
Shucai Xiao's avatar
Shucai Xiao committed
74

Shucai Xiao's avatar
Shucai Xiao committed
75
76
77
78
79
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
    rocblas_gemm_flags flag =
        int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
#else
    (void)int8_x4_format;
80
    int flag = 0;
Shucai Xiao's avatar
Shucai Xiao committed
81
82
#endif

Shucai Xiao's avatar
Shucai Xiao committed
83
84
85
    auto a_lens = args[0].get_shape().lens();
    auto b_lens = args[1].get_shape().lens();
    output_shape.visit_type([&](auto as) {
Khalique Ahmed's avatar
Khalique Ahmed committed
86
87
88
        auto alpha_r = compute_fp32 ? alpha : as(alpha);
        auto beta_r  = compute_fp32 ? beta : as(beta);

Shucai Xiao's avatar
Shucai Xiao committed
89
90
91
92
        auto out_lens   = output_shape.lens();
        rocblas_int m   = out_lens[dim_0];
        rocblas_int n   = out_lens[dim_1];
        rocblas_int k   = args[0].get_shape().lens()[dim_1];
Shucai Xiao's avatar
Shucai Xiao committed
93
        auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
Shucai Xiao's avatar
Shucai Xiao committed
94
        if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
95
96
97
        {
            MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
        }
Shucai Xiao's avatar
Shucai Xiao committed
98

Shucai Xiao's avatar
Shucai Xiao committed
99
100
101
102
        auto num_matrices = std::accumulate(
            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
        if(num_matrices == 1)
        {
Shucai Xiao's avatar
Shucai Xiao committed
103
            // the rocblas_gemm API handles inputs and output matrices as
Shucai Xiao's avatar
Shucai Xiao committed
104
105
106
            // column-major format. When doing a C = A * B, we actually do
            // C^T = (B^T) * (A^T). That is the reason we input args[1] as
            // A and args[0] as B in calling the rocblas_gemm.
107
            rocblas_invoke(&rocblas_gemm_ex,
Khalique Ahmed's avatar
Khalique Ahmed committed
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
                           &alpha_r,
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
                           &beta_r,
                           to_pointer(args[2]),
                           output_type,
                           ldc,
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
132
133
134
        }
        else
        {
135
            rocblas_invoke(&rocblas_gemm_strided_batched_ex,
Khalique Ahmed's avatar
Khalique Ahmed committed
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
                           &alpha_r,
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
                           k * n,
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
                           m * k,
                           &beta_r,
                           to_pointer(args[2]),
                           output_type,
                           ldc,
                           m * n,
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
                           m * n,
                           num_matrices,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
165
166
        }
    });
167
}
Shucai Xiao's avatar
Shucai Xiao committed
168

169
170
171
172
/// GEMM entry point taking single-precision alpha/beta scalars.
///
/// Forwards every argument unchanged to gemm_impl instantiated for float.
void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          float alpha,
          float beta,
          bool int8_x4_format,
          bool compute_fp32)
{
    gemm_impl<float>(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}

void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          int32_t alpha,
Shucai Xiao's avatar
Shucai Xiao committed
184
          int32_t beta,
Khalique Ahmed's avatar
Khalique Ahmed committed
185
186
          bool int8_x4_format,
          bool compute_fp32)
187
{
Khalique Ahmed's avatar
Khalique Ahmed committed
188
    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
Shucai Xiao's avatar
Shucai Xiao committed
189
190
191
192
193
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx