gemm_impl.cpp 6.67 KB
Newer Older
Paul Fultz II's avatar
Paul Fultz II committed
1
#include <rocblas.h>
2
#include <migraphx/gpu/gemm_impl.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
3
4
5
6
7

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

8
// Translate a MIGraphX element type into the corresponding rocBLAS datatype enumerator.
//
// Returns the rocblas_datatype for double, float, half, int8, uint8, int32 and uint32.
// Every other shape::type_t value raises via MIGRAPHX_THROW.
rocblas_datatype get_type(shape::type_t type)
{
    switch(type)
    {
    // Element types that have a rocBLAS counterpart.
    case shape::double_type: return rocblas_datatype_f64_r;
    case shape::float_type: return rocblas_datatype_f32_r;
    case shape::half_type: return rocblas_datatype_f16_r;
    case shape::int8_type: return rocblas_datatype_i8_r;
    case shape::uint8_type: return rocblas_datatype_u8_r;
    case shape::int32_type: return rocblas_datatype_i32_r;
    case shape::uint32_type: return rocblas_datatype_u32_r;
    // Element types with no mapping: leave the switch and raise below.
    // They stay listed explicitly (no `default`) so the switch names every enumerator.
    case shape::tuple_type:
    case shape::bool_type:
    case shape::uint16_type:
    case shape::int16_type:
    case shape::int64_type:
    case shape::uint64_type: break;
    }

    // Reached for the unsupported types above and for any value outside the enum.
    MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
}

30
31
32
33
34
35
36
37
38
// Call the function pointer `f` with the arguments `xs`.
//
// If the number of supplied arguments equals the number of parameters of `f`, they are
// forwarded unchanged. Otherwise two trailing `nullptr` arguments are appended, so the
// same call site works with a signature that takes two extra pointer parameters.
template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs)
{
    constexpr bool same_arity = (sizeof...(Us) == sizeof...(Ts));
    if constexpr(not same_arity)
    {
        return f(xs..., nullptr, nullptr);
    }
    else
    {
        return f(xs...);
    }
}

39
template <class T>
Shucai Xiao's avatar
Shucai Xiao committed
40
41
42
43
44
void gemm_impl(context& ctx,
               const shape& output_shape,
               const std::vector<argument>& args,
               T alpha,
               T beta,
Khalique Ahmed's avatar
Khalique Ahmed committed
45
46
               bool int8_x4_format,
               bool compute_fp32)
Shucai Xiao's avatar
Shucai Xiao committed
47
{
48
49
50
51
52
53
54
    bool transa     = args[0].get_shape().transposed();
    bool transb     = args[1].get_shape().transposed();
    auto n_dim      = output_shape.lens().size();
    auto dim_1      = n_dim - 1;
    auto dim_0      = n_dim - 2;
    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
55
    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
56

57
    bool is_3inputs = (args.size() == 4);
58
59
60
61
62
63
64
    if(!is_3inputs)
    {
        beta = 0;
    }
    rocblas_datatype arg_type = get_type(args[0].get_shape().type());
    auto output_type          = arg_type;
    if(output_type == rocblas_datatype_i8_r)
Shucai Xiao's avatar
Shucai Xiao committed
65
    {
66
        output_type = rocblas_datatype_i32_r;
Shucai Xiao's avatar
Shucai Xiao committed
67
    }
68
    auto compute_type = output_type;
Khalique Ahmed's avatar
Khalique Ahmed committed
69
70
71
72
73
    if(compute_fp32)
    {
        if(arg_type == rocblas_datatype_f16_r)
            compute_type = rocblas_datatype_f32_r;
    }
Shucai Xiao's avatar
Shucai Xiao committed
74

Shucai Xiao's avatar
Shucai Xiao committed
75
76
77
78
79
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
    rocblas_gemm_flags flag =
        int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
#else
    (void)int8_x4_format;
80
    int flag = 0;
Shucai Xiao's avatar
Shucai Xiao committed
81
82
#endif

Shucai Xiao's avatar
Shucai Xiao committed
83
84
85
    auto a_lens = args[0].get_shape().lens();
    auto b_lens = args[1].get_shape().lens();
    output_shape.visit_type([&](auto as) {
Khalique Ahmed's avatar
Khalique Ahmed committed
86
87
        auto alpha_r = as(alpha);
        auto beta_r  = as(beta);
Khalique Ahmed's avatar
Khalique Ahmed committed
88

89
        // use void pointer to select different data type if using fp32 mode
90
91
        void* alpha_v = &alpha_r;
        void* beta_v  = &beta_r;
92
93
94
95

        if(compute_fp32)
        {
            alpha_v = &alpha;
Khalique Ahmed's avatar
Khalique Ahmed committed
96
            beta_v  = &beta;
97
98
        }

Shucai Xiao's avatar
Shucai Xiao committed
99
100
101
102
        auto out_lens   = output_shape.lens();
        rocblas_int m   = out_lens[dim_0];
        rocblas_int n   = out_lens[dim_1];
        rocblas_int k   = args[0].get_shape().lens()[dim_1];
Shucai Xiao's avatar
Shucai Xiao committed
103
        auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
Shucai Xiao's avatar
Shucai Xiao committed
104
        if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
105
106
107
        {
            MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
        }
Shucai Xiao's avatar
Shucai Xiao committed
108

Shucai Xiao's avatar
Shucai Xiao committed
109
110
111
112
        auto num_matrices = std::accumulate(
            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
        if(num_matrices == 1)
        {
113
            rocblas_invoke(&rocblas_gemm_ex,
Khalique Ahmed's avatar
Khalique Ahmed committed
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
                           alpha_v,
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
                           beta_v,
                           to_pointer(args[2]),
                           output_type,
                           ldc,
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
138
139
140
        }
        else
        {
141
            rocblas_invoke(&rocblas_gemm_strided_batched_ex,
Khalique Ahmed's avatar
Khalique Ahmed committed
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
                           alpha_v,
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
                           k * n,
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
                           m * k,
                           beta_v,
                           to_pointer(args[2]),
                           output_type,
                           ldc,
                           m * n,
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
                           m * n,
                           num_matrices,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
171
172
        }
    });
173
}
Shucai Xiao's avatar
Shucai Xiao committed
174

175
176
177
178
// GEMM entry point taking single-precision scaling factors.
// Forwards every argument unchanged to gemm_impl instantiated with T = float.
void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          float alpha,
          float beta,
          bool int8_x4_format,
          bool compute_fp32)
{
    gemm_impl<float>(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}

void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          int32_t alpha,
Shucai Xiao's avatar
Shucai Xiao committed
190
          int32_t beta,
Khalique Ahmed's avatar
Khalique Ahmed committed
191
192
          bool int8_x4_format,
          bool compute_fp32)
193
{
Khalique Ahmed's avatar
Khalique Ahmed committed
194
    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
Shucai Xiao's avatar
Shucai Xiao committed
195
196
197
198
199
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx