gemm_impl.cpp 9.19 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
Paul Fultz II's avatar
Paul Fultz II committed
24
#include <rocblas.h>
25
#include <migraphx/gpu/gemm_impl.hpp>
26
#include <migraphx/reduce_dims.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
27
28
29
30
31

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

32
// Map a migraphx shape element type to the matching real-valued rocBLAS
// datatype. Types rocBLAS gemm cannot consume fall through to a throw.
rocblas_datatype get_type(shape::type_t type)
{
    switch(type)
    {
    case shape::double_type: return rocblas_datatype_f64_r;
    case shape::float_type: return rocblas_datatype_f32_r;
    case shape::half_type: return rocblas_datatype_f16_r;
    case shape::int8_type: return rocblas_datatype_i8_r;
    case shape::uint8_type: return rocblas_datatype_u8_r;
    case shape::int32_type: return rocblas_datatype_i32_r;
    case shape::uint32_type: return rocblas_datatype_u32_r;
    case shape::tuple_type:
    case shape::bool_type:
    case shape::uint16_type:
    case shape::int16_type:
    case shape::int64_type:
    case shape::uint64_type: break; // unsupported by rocBLAS gemm
    }
    MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
}

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
void blas_shape(const shape& s)
{
    if(s.lens().size() < 2)
        return;
    if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; }))
        MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1");
    if(s.lens().size() < 3)
        return;
    shape batch_shape{s.type(),
                      {s.lens().begin(), s.lens().end() - 2},
                      {s.strides().begin(), s.strides().end() - 2}};
    auto batch_shapes = reduce_dims({batch_shape});
    if(batch_shapes.front().lens().size() != 1)
        MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible");
}

70
71
72
73
74
75
76
77
78
// Call a rocBLAS entry point with the supplied arguments. If the function
// expects more parameters than were given, pad the call with two trailing
// nullptr arguments (this absorbs signature differences between rocBLAS
// versions that append extra pointer parameters).
template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs)
{
    constexpr bool exact_arity = sizeof...(Ts) == sizeof...(Us);
    if constexpr(exact_arity)
        return f(xs...);
    else
        return f(xs..., nullptr, nullptr);
}

79
80
81
82
83
84
85
86
87
88
89
90
// A shape counts as transposed for gemm purposes only when it is marked
// transposed AND its innermost stride is not unit (a transpose that still
// ends with a unit stride can be fed to rocBLAS as-is).
static bool is_transposed(const shape& s)
{
    return s.transposed() and s.strides().back() != 1;
}

// Stride between consecutive matrices of a batched argument, i.e. the stride
// of the third-from-last dimension. Callers must only use this on arguments
// with at least 3 dimensions.
static rocblas_int get_batch_stride(const argument& a)
{
    const auto& strides = a.get_shape().strides();
    return strides[strides.size() - 3];
}

91
template <class T>
Shucai Xiao's avatar
Shucai Xiao committed
92
93
94
95
96
void gemm_impl(context& ctx,
               const shape& output_shape,
               const std::vector<argument>& args,
               T alpha,
               T beta,
97
98
               bool int8_x4_format,
               bool compute_fp32)
Shucai Xiao's avatar
Shucai Xiao committed
99
{
100
101
    bool transa     = is_transposed(args[0].get_shape());
    bool transb     = is_transposed(args[1].get_shape());
102
103
104
105
106
    auto n_dim      = output_shape.lens().size();
    auto dim_1      = n_dim - 1;
    auto dim_0      = n_dim - 2;
    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
107
    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
108

109
    bool is_3inputs = (args.size() == 4);
110
111
112
113
114
115
116
    if(!is_3inputs)
    {
        beta = 0;
    }
    rocblas_datatype arg_type = get_type(args[0].get_shape().type());
    auto output_type          = arg_type;
    if(output_type == rocblas_datatype_i8_r)
Shucai Xiao's avatar
Shucai Xiao committed
117
    {
118
        output_type = rocblas_datatype_i32_r;
Shucai Xiao's avatar
Shucai Xiao committed
119
    }
120
    auto compute_type = output_type;
121
122
123
124
125
    if(compute_fp32)
    {
        if(arg_type == rocblas_datatype_f16_r)
            compute_type = rocblas_datatype_f32_r;
    }
Shucai Xiao's avatar
Shucai Xiao committed
126

Shucai Xiao's avatar
Shucai Xiao committed
127
128
129
130
131
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
    rocblas_gemm_flags flag =
        int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
#else
    (void)int8_x4_format;
132
    int flag = 0;
Shucai Xiao's avatar
Shucai Xiao committed
133
134
#endif

Shucai Xiao's avatar
Shucai Xiao committed
135
136
137
    auto a_lens = args[0].get_shape().lens();
    auto b_lens = args[1].get_shape().lens();
    output_shape.visit_type([&](auto as) {
138
139
140
141
142
143
144
145
146
147
148
149
150
        auto alpha_r = as(alpha);
        auto beta_r  = as(beta);

        // use void pointer to select different data type if using fp32 mode
        void* alpha_v = &alpha_r;
        void* beta_v  = &beta_r;

        if(compute_fp32)
        {
            alpha_v = &alpha;
            beta_v  = &beta;
        }

Shucai Xiao's avatar
Shucai Xiao committed
151
152
153
154
        auto out_lens   = output_shape.lens();
        rocblas_int m   = out_lens[dim_0];
        rocblas_int n   = out_lens[dim_1];
        rocblas_int k   = args[0].get_shape().lens()[dim_1];
Shucai Xiao's avatar
Shucai Xiao committed
155
        auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
Shucai Xiao's avatar
Shucai Xiao committed
156
        if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
157
158
159
        {
            MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
        }
Shucai Xiao's avatar
Shucai Xiao committed
160

Shucai Xiao's avatar
Shucai Xiao committed
161
162
163
164
        auto num_matrices = std::accumulate(
            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
        if(num_matrices == 1)
        {
Shucai Xiao's avatar
Shucai Xiao committed
165
            // the rocblas_gemm API handles inputs and output matrices as
Shucai Xiao's avatar
Shucai Xiao committed
166
167
168
            // column-major format. When doing a C = A * B, we actually do
            // C^T = (B^T) * (A^T). That is the reason we input args[1] as
            // A and args[0] as B in calling the rocblas_gemm.
169
170
171
172
173
174
175
            rocblas_invoke(&rocblas_gemm_ex,
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
176
                           alpha_v,
177
178
179
180
181
182
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
183
                           beta_v,
184
185
186
187
188
189
190
191
192
                           to_pointer(args[2]),
                           output_type,
                           ldc,
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
Shucai Xiao's avatar
Shucai Xiao committed
193
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
194
195
196
        }
        else
        {
197
198
199
            auto a_stride = get_batch_stride(args[0]);
            auto b_stride = get_batch_stride(args[1]);
            auto c_stride = get_batch_stride(args[2]);
200
201
202
203
204
205
206
            rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                           ctx.get_stream().get_rocblas(),
                           transb ? rocblas_operation_transpose : rocblas_operation_none,
                           transa ? rocblas_operation_transpose : rocblas_operation_none,
                           n,
                           m,
                           k,
207
                           alpha_v,
208
209
210
                           to_pointer(args.at(1)),
                           arg_type,
                           ldb,
211
                           b_stride,
212
213
214
                           to_pointer(args.at(0)),
                           arg_type,
                           lda,
215
                           a_stride,
216
                           beta_v,
217
218
219
                           to_pointer(args[2]),
                           output_type,
                           ldc,
220
                           c_stride,
221
222
223
                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                           output_type,
                           ldc,
224
                           c_stride,
225
226
227
228
                           num_matrices,
                           compute_type,
                           rocblas_gemm_algo_standard,
                           0,
Shucai Xiao's avatar
Shucai Xiao committed
229
                           flag);
Shucai Xiao's avatar
Shucai Xiao committed
230
231
        }
    });
232
}
Shucai Xiao's avatar
Shucai Xiao committed
233

234
235
236
237
// Public float entry point: forwards directly to the templated
// implementation with T = float.
void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          float alpha,
          float beta,
          bool int8_x4_format,
          bool compute_fp32)
{
    gemm_impl<float>(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
}

void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          int32_t alpha,
Shucai Xiao's avatar
Shucai Xiao committed
249
          int32_t beta,
250
251
          bool int8_x4_format,
          bool compute_fp32)
252
{
253
    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
Shucai Xiao's avatar
Shucai Xiao committed
254
255
256
257
258
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx