#include <rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

// Translates a MIGraphX shape element type into the matching rocBLAS datatype
// enumerator.
//
// Throws (via MIGRAPHX_THROW) for every type that has no mapping below.
rocblas_datatype get_type(shape::type_t type)
{
    // Unsupported types are listed explicitly instead of through a `default:`
    // label; they leave the switch and reach the single throw below.
    switch(type)
    {
    case shape::double_type: return rocblas_datatype_f64_r;
    case shape::float_type: return rocblas_datatype_f32_r;
    case shape::half_type: return rocblas_datatype_f16_r;
    case shape::int8_type: return rocblas_datatype_i8_r;
    case shape::uint8_type: return rocblas_datatype_u8_r;
    case shape::int32_type: return rocblas_datatype_i32_r;
    case shape::uint32_type: return rocblas_datatype_u32_r;
    case shape::tuple_type:
    case shape::bool_type:
    case shape::uint16_type:
    case shape::int16_type:
    case shape::int64_type:
    case shape::uint64_type: break;
    }

    // Reached for the unsupported enumerators above and for any value that
    // matches no case label at all.
    MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
}

// Calls `f` with the supplied arguments `xs`.
//
// When the number of supplied arguments equals the number of parameters of
// `f`, the arguments are passed through unchanged. Otherwise two trailing
// `nullptr` arguments are appended to the call.
// NOTE(review): presumably this accommodates rocBLAS entry points whose
// signatures carry two extra trailing pointer parameters - confirm.
template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs)
{
    if constexpr(sizeof...(Us) != sizeof...(Ts))
    {
        // Arity mismatch: pad the call with two null pointers.
        return f(xs..., nullptr, nullptr);
    }
    else
    {
        return f(xs...);
    }
}

template <class T>
Shucai Xiao's avatar
Shucai Xiao committed
40
41
42
43
44
void gemm_impl(context& ctx,
               const shape& output_shape,
               const std::vector<argument>& args,
               T alpha,
               T beta,
Khalique Ahmed's avatar
Khalique Ahmed committed
45
46
               bool int8_x4_format,
               bool compute_fp32)
Shucai Xiao's avatar
Shucai Xiao committed
47
{
48
49
50
51
52
53
54
    bool transa     = args[0].get_shape().transposed();
    bool transb     = args[1].get_shape().transposed();
    auto n_dim      = output_shape.lens().size();
    auto dim_1      = n_dim - 1;
    auto dim_0      = n_dim - 2;
    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
55
    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
56

57
    bool is_3inputs = (args.size() == 4);
58
59
60
61
62
63
64
    if(!is_3inputs)
    {
        beta = 0;
    }
    rocblas_datatype arg_type = get_type(args[0].get_shape().type());
    auto output_type          = arg_type;
    if(output_type == rocblas_datatype_i8_r)
Shucai Xiao's avatar
Shucai Xiao committed
65
    {
66
        output_type = rocblas_datatype_i32_r;
Shucai Xiao's avatar
Shucai Xiao committed
67
    }
68
    auto compute_type = output_type;
Khalique Ahmed's avatar
Khalique Ahmed committed
69
70
71
72
73
    if(compute_fp32)
    {
        if(arg_type == rocblas_datatype_f16_r)
            compute_type = rocblas_datatype_f32_r;
    }
Shucai Xiao's avatar
Shucai Xiao committed
74

Shucai Xiao's avatar
Shucai Xiao committed
75
76
77
78
79
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
    rocblas_gemm_flags flag =
        int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
#else
    (void)int8_x4_format;
80
    int flag = 0;
Shucai Xiao's avatar
Shucai Xiao committed
81
82
#endif

Shucai Xiao's avatar
Shucai Xiao committed
83
84
85
    auto a_lens = args[0].get_shape().lens();
    auto b_lens = args[1].get_shape().lens();
    output_shape.visit_type([&](auto as) {
Khalique Ahmed's avatar
Khalique Ahmed committed
86

Khalique Ahmed's avatar
Khalique Ahmed committed
87
88
        auto alpha_r = as(alpha);
        auto beta_r  = as(beta);
Khalique Ahmed's avatar
Khalique Ahmed committed
89

90
91
92
93
94
95
96
        // use void pointer to select different data type if using fp32 mode
        void* alpha_v{&alpha_r};
        void* beta_v{&beta_r};

        if(compute_fp32)
        {
            alpha_v = &alpha;
Khalique Ahmed's avatar
Khalique Ahmed committed
97
            beta_v  = &beta;
98
99
        }

Shucai Xiao's avatar
Shucai Xiao committed
100
101
102
103
        auto out_lens   = output_shape.lens();
        rocblas_int m   = out_lens[dim_0];
        rocblas_int n   = out_lens[dim_1];
        rocblas_int k   = args[0].get_shape().lens()[dim_1];
Shucai Xiao's avatar
Shucai Xiao committed
104
        auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
Shucai Xiao's avatar
Shucai Xiao committed
105
        if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
106
107
108
        {
            MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
        }
Shucai Xiao's avatar
Shucai Xiao committed
109

Shucai Xiao's avatar
Shucai Xiao committed
110
111
112
113
        auto num_matrices = std::accumulate(
            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
        if(num_matrices == 1)
        {
114
            rocblas_invoke(&rocblas_gemm_ex,
Khalique Ahmed's avatar
Khalique Ahmed committed
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
                           alpha_v,
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
                           beta_v,
                           to_pointer(args[2]),
                           output_type,
                           ldc,
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
139
140
141
        }
        else
        {
142
            rocblas_invoke(&rocblas_gemm_strided_batched_ex,
Khalique Ahmed's avatar
Khalique Ahmed committed
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
                           alpha_v,
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
                           k * n,
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
                           m * k,
                           beta_v,
                           to_pointer(args[2]),
                           output_type,
                           ldc,
                           m * n,
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
                           m * n,
                           num_matrices,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
172
173
        }
    });
174
}

// GEMM overload taking float alpha/beta scalars.
//
// Forwards every argument unchanged to gemm_impl, which is thereby
// instantiated with T = float.
void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          float alpha,
          float beta,
          bool int8_x4_format,
          bool compute_fp32)
{
    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}

void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          int32_t alpha,
Shucai Xiao's avatar
Shucai Xiao committed
191
          int32_t beta,
Khalique Ahmed's avatar
Khalique Ahmed committed
192
193
          bool int8_x4_format,
          bool compute_fp32)
194
{
Khalique Ahmed's avatar
Khalique Ahmed committed
195
    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
Shucai Xiao's avatar
Shucai Xiao committed
196
197
198
199
200
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx