ck_gemm.cpp 13 KB
Newer Older
Paul's avatar
Paul committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

#include <migraphx/gpu/compiler.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/gpu/context.hpp>

#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/env.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/module.hpp>
#include <migraphx/file_buffer.hpp>

#include "ck/include/device_gemm_multiple_d.hpp"
Paul's avatar
Paul committed
42

Paul's avatar
Paul committed
43
44
45
46
47
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

namespace gpu {

Paul's avatar
Paul committed
48
49
using namespace migraphx::gpu::gen; // NOLINT

Paul's avatar
Paul committed
50
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
Paul's avatar
Paul committed
51
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING);
Paul's avatar
Paul committed
52
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING_VALUE);
Paul's avatar
Paul committed
53
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
Paul's avatar
Paul committed
54

Paul's avatar
Paul committed
55
56
57
58
// Source template for the generated gemm kernel. The ${...} placeholders are
// substituted with interpolate_string() before the source is compiled:
//   ${include}          - CK header providing the selected device gemm
//   ${preamble}         - optional generated code placed inside namespace migraphx
//   ${kernel}           - name of the emitted __global__ function
//   ${params} / ${args} - kernel parameter list and the matching argument list
//   ${solution}         - CK template instantiation string of the chosen solution
//   ${blocks_per_batch} - grid size reported by the chosen solution
// NOLINTNEXTLINE
static const char* const ck_gemm_kernel = R"__migraphx__(
#include <args.hpp>
#include <migraphx/kernels/ck_gemm.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <migraphx/kernels/${include}>

namespace migraphx {

${preamble}

extern "C" {

__global__ void ${kernel}(${params})
{
    transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) {
        ck_gemm<${solution}, ${blocks_per_batch}>(xs...);
    });
}

}

} // namespace migraphx

)__migraphx__";

Paul's avatar
Paul committed
81
82
/// Returns true when the innermost (last) stride of `s` is anything other than 1.
static bool transposed_matrix(const shape& s)
{
    const auto innermost_stride = s.strides().back();
    return innermost_stride != 1;
}

Paul's avatar
Format  
Paul committed
83
/// Wraps `f` so that `action` is invoked immediately before every call.
///
/// @param f      Callable to decorate; receives the perfectly-forwarded arguments.
/// @param action Nullary callable executed before each invocation of `f`.
/// @return A callable that runs `action()` and then returns whatever `f` returns
///         (nothing when `f` returns void).
template <class F, class Action>
auto action_decorate(F f, Action action)
{
    // Move the by-value parameters into the closure instead of copying them again.
    return [f = std::move(f), action = std::move(action)](auto&&... xs) -> decltype(auto) {
        action();
        return f(std::forward<decltype(xs)>(xs)...);
    };
}

Paul's avatar
Paul committed
92
93
94
using tuning_entry = std::pair<std::vector<shape>, size_t>;
static std::vector<tuning_entry> read_tuning(const std::string& s)
{
Paul's avatar
Format  
Paul committed
95
    if(not fs::exists(s))
Paul's avatar
Paul committed
96
97
98
99
        return {};
    return from_value<std::vector<tuning_entry>>(from_json_string(read_string(s)));
}

Paul's avatar
Paul committed
100
101
static float matrix_distance(const shape& x, const shape& y)
{
Paul's avatar
Format  
Paul committed
102
    if(x.type() != y.type())
Paul's avatar
Paul committed
103
        return std::numeric_limits<float>::max();
Paul's avatar
Format  
Paul committed
104
    if(transposed_matrix(x) != transposed_matrix(y))
Paul's avatar
Paul committed
105
        return std::numeric_limits<float>::max();
Paul's avatar
Format  
Paul committed
106
107
108
109
110
111
    auto sum_squared = std::inner_product(x.lens().rbegin(),
                                          x.lens().rbegin() + 2,
                                          y.lens().rbegin(),
                                          0,
                                          std::plus<>{},
                                          [](auto a, auto b) { return (a - b) * (a - b); });
Paul's avatar
Paul committed
112
113
114
    return std::sqrt(sum_squared);
}

Paul's avatar
Paul committed
115
116
117
static std::size_t get_tuning_for(const std::vector<shape>& inputs)
{
    static auto tuning = read_tuning(string_value_of(MIGRAPHX_CK_TUNING{}, ""));
Paul's avatar
Format  
Paul committed
118
    if(tuning.empty())
119
120
121
122
123
124
    {
        std::cout << "*********** Warning: No CK tuning! for config:" << std::endl;
        std::cout << "  " << inputs[0] << std::endl;
        std::cout << "  " << inputs[1] << std::endl;
        std::cout << "  " << inputs[2] << std::endl;
    }
Paul's avatar
Format  
Paul committed
125
    auto it = std::find_if(
Paul's avatar
Format  
Paul committed
126
        tuning.begin(), tuning.end(), [&](const auto& p) { return p.first == inputs; });
Paul's avatar
Format  
Paul committed
127
128
    if(it == tuning.end())
    {
Paul's avatar
Paul committed
129
        std::cout << "*********** Warning: CK tuning missing for config!" << std::endl;
130
131
132
        std::cout << "  " << inputs[0] << std::endl;
        std::cout << "  " << inputs[1] << std::endl;
        std::cout << "  " << inputs[2] << std::endl;
Paul's avatar
Paul committed
133
134
        std::vector<std::pair<float, std::size_t>> w;
        std::transform(tuning.begin(), tuning.end(), std::back_inserter(w), [&](const auto& p) {
Paul's avatar
Format  
Paul committed
135
            if(inputs.size() < 3 or p.first.size() < 3)
Paul's avatar
Paul committed
136
                MIGRAPHX_THROW("Invalid CK config");
Paul's avatar
Format  
Paul committed
137
138
139
140
141
142
143
            auto avg_distance = std::inner_product(
                p.first.begin(),
                p.first.begin() + 3,
                inputs.begin(),
                0.0f,
                std::plus<>{},
                [](const auto& x, const auto& y) { return matrix_distance(x, y) / 3.0f; });
Paul's avatar
Paul committed
144
145
146
147
            return std::make_pair(avg_distance, p.second);
        });
        std::sort(w.begin(), w.end());
        std::size_t default_value = 4;
Paul's avatar
Format  
Paul committed
148
        if(not w.empty())
Paul's avatar
Paul committed
149
            default_value = w.front().second;
Paul's avatar
Paul committed
150
151
152
        auto tuning_val = value_of(MIGRAPHX_CK_TUNING_VALUE{}, default_value);
        std::cout << "*********** Warning: CK try tuning: " << tuning_val << std::endl;
        return tuning_val;
Paul's avatar
Paul committed
153
    }
Paul's avatar
Paul committed
154
155
156
    return it->second;
}

Paul's avatar
Paul committed
157
158
struct ck_gemm_compiler : compiler<ck_gemm_compiler>
{
Paul's avatar
Paul committed
159
160
    static std::string get_layout(const shape& s)
    {
Paul's avatar
Paul committed
161
        return transposed_matrix(s) ? "ck::tensor_layout::gemm::ColumnMajor"
Paul's avatar
Format  
Paul committed
162
                                    : "ck::tensor_layout::gemm::RowMajor";
Paul's avatar
Paul committed
163
164
165
    }

    static std::string get_type(const shape& s)
Paul's avatar
Paul committed
166
    {
Paul's avatar
Format  
Paul committed
167
        if(s.type() == shape::half_type)
Paul's avatar
Paul committed
168
169
170
            return "ck::half_t";
        return shape::cpp_type(s.type());
    }
Paul's avatar
Paul committed
171

Paul's avatar
Format  
Paul committed
172
    template <class Iterator, class F>
Paul's avatar
Paul committed
173
174
175
176
177
178
179
    static std::string ck_tuple(Iterator start, Iterator last, F f)
    {
        std::vector<std::string> s;
        std::transform(start, last, std::back_inserter(s), f);
        return "ck::Tuple<" + join_strings(s, ",") + ">";
    }

Paul's avatar
Paul committed
180
181
    static std::vector<shape> adjust_inputs(std::vector<shape> inputs, bool& swap_inputs)
    {
Paul's avatar
Format  
Paul committed
182
        swap_inputs  = false;
Paul's avatar
Paul committed
183
        auto c_shape = inputs.back();
Paul's avatar
Format  
Paul committed
184
        if(not transposed_matrix(c_shape))
Paul's avatar
Paul committed
185
186
187
188
189
190
191
192
193
194
195
            return inputs;
        std::vector<int64_t> perm(c_shape.lens().size());
        std::iota(perm.begin(), perm.end(), 0);
        std::swap(perm[perm.size() - 1], perm[perm.size() - 2]);
        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](shape s) {
            return reorder_shape(s, perm);
        });
        swap_inputs = true;
        return inputs;
    }

196
197
    static std::size_t get_batch_count(const shape& s)
    {
Alan Turner's avatar
Alan Turner committed
198
199
        return std::accumulate(
            s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies<std::size_t>());
200
201
202
203
204
    }

    static void fold_batch_dims(shape& s)
    {
        auto lens = s.lens();
Alan Turner's avatar
Alan Turner committed
205
        if(lens.size() <= 2)
206
207
            return;
        auto batch_count = get_batch_count(s);
Alan Turner's avatar
Alan Turner committed
208
209
210
        auto m1          = lens.at(lens.size() - 2);
        auto m2          = lens.at(lens.size() - 1);
        if(transposed_matrix(s))
211
212
213
214
215
216
217
218
            s = shape{s.type(), {m1, m2 * batch_count}};
        else
            s = shape{s.type(), {m1 * batch_count, m2}};
    }

    static void remove_batch_dims(shape& s)
    {
        auto lens = s.lens();
Alan Turner's avatar
Alan Turner committed
219
        if(lens.size() <= 2)
220
221
222
            return;
        auto m1 = lens.at(lens.size() - 2);
        auto m2 = lens.at(lens.size() - 1);
Alan Turner's avatar
Alan Turner committed
223
        s       = shape{s.type(), {m1, m2}};
224
225
    }

226
    std::vector<std::string> names() const { return {"ck_gemm", "gpu::ck_gemm", "ck_gemm_int8", "gpu::ck_gemm_int8"}; }
Paul's avatar
Paul committed
227
228
229

    operation compile_op(context& /* ctx */, const std::vector<shape>& inputs, const value& v) const
    {
Alan Turner's avatar
Alan Turner committed
230
231
232
        auto a_shape      = inputs[0];
        auto b_shape      = inputs[1];
        auto c_shape      = inputs.back();
233
        auto tuning_value = get_tuning_for({a_shape, b_shape, c_shape});
Paul's avatar
Paul committed
234

Alan Turner's avatar
Alan Turner committed
235
236
        auto rank           = a_shape.lens().size();
        auto b_strides      = b_shape.strides();
237
238
        bool can_fold_batch = rank >= 3 and b_strides[rank - 3] == 0;

Alan Turner's avatar
Alan Turner committed
239
240
241
242
243
        auto batch_count = get_batch_count(c_shape);
        auto m           = c_shape.lens()[rank - 2];
        m                = can_fold_batch ? m * batch_count : m;
        auto n           = c_shape.lens().back();
        auto k           = a_shape.lens().back();
244

Alan Turner's avatar
Alan Turner committed
245
246
247
248
249
250
        const bool transA = transposed_matrix(a_shape);
        const bool transB = transposed_matrix(b_shape);
        const bool transE = transposed_matrix(c_shape);
        const auto a_type = get_type(a_shape);
        const auto b_type = get_type(b_shape);
        const auto e_type = get_type(c_shape);
Alan Turner's avatar
Alan Turner committed
251
        std::vector<bool> ds_layout;
Alan Turner's avatar
Alan Turner committed
252
253
254
255
        std::transform(inputs.begin() + 2,
                       inputs.end() - 1,
                       std::back_inserter(ds_layout),
                       [](const auto& i) { return transposed_matrix(i); });
Alan Turner's avatar
Alan Turner committed
256
        std::vector<std::string> ds_type;
Alan Turner's avatar
Alan Turner committed
257
258
259
260
        std::transform(inputs.begin() + 2,
                       inputs.end() - 1,
                       std::back_inserter(ds_type),
                       [](const auto& i) { return get_type(i); });
261

Alan Turner's avatar
Alan Turner committed
262
263
        std::string ck_passthrough = "ck_passthrough";
        std::string cde_op         = ck_passthrough;
Paul's avatar
Paul committed
264
        assert(inputs.size() < 4 or v.contains("post"));
Paul's avatar
Format  
Paul committed
265
        if(v.contains("post"))
Paul's avatar
Paul committed
266
        {
267
            cde_op = v.at("post").to<std::string>();
268
        }
Alan Turner's avatar
Cleanup  
Alan Turner committed
269
        
270
            
Paul's avatar
Paul committed
271

Alan Turner's avatar
Alan Turner committed
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
        auto problem = ck::tensor_operation::device::device_gemm_multiple_d::Problem{
            static_cast<ck::index_t>(m),
            static_cast<ck::index_t>(n),
            static_cast<ck::index_t>(k),
            transA,
            transB,
            transE,
            ds_layout,
            a_type,
            b_type,
            e_type,
            ds_type,
            ck_passthrough,
            ck_passthrough,
            cde_op};
Alan Turner's avatar
Alan Turner committed
287
288
289
290

        const auto include_header   = problem.GetIncludeHeader();
        const auto ck_headers       = problem.GetHeaders();
        const auto solutions        = problem.GetSolutions("gfx90a");
Alan Turner's avatar
Alan Turner committed
291
        const auto solution         = solutions.at(tuning_value);
Alan Turner's avatar
Alan Turner committed
292
293
294
        const auto template_str     = solution.template_str;
        const auto blocks_per_batch = solution.grid_size;
        const auto block_size       = solution.block_size;
Paul's avatar
Paul committed
295

Paul's avatar
Paul committed
296
        hip_compile_options options;
Alan Turner's avatar
Alan Turner committed
297
        options.embedded_headers = ck_headers;
Alan Turner's avatar
Alan Turner committed
298
        auto grid_size = can_fold_batch ? blocks_per_batch : batch_count * blocks_per_batch;
Paul's avatar
Paul committed
299
        options.set_launch_params(v, grid_size * block_size, block_size);
Paul's avatar
Paul committed
300
        options.inputs         = inputs;
Paul's avatar
Paul committed
301
        options.output         = c_shape;
Paul's avatar
Paul committed
302
        options.kernel_name    = v.get("kernel", "ck_gemm_kernel");
Paul's avatar
Paul committed
303
        options.virtual_inputs = inputs;
Alan Turner's avatar
Alan Turner committed
304
        if(can_fold_batch)
305
306
307
308
309
310
311
        {
            auto vinputs = inputs;
            fold_batch_dims(vinputs[0]);
            remove_batch_dims(vinputs[1]);
            std::for_each(vinputs.begin() + 2, vinputs.end(), fold_batch_dims);
            options.virtual_inputs = vinputs;
        }
Paul's avatar
Paul committed
312

Paul's avatar
Paul committed
313
        if(v.get("check", false) or enabled(MIGRAPHX_CK_DEBUG{}))
Paul's avatar
Paul committed
314
315
            options.params += " -DMIGRAPHX_CK_CHECK=1";

Paul's avatar
Format  
Paul committed
316
        auto src = interpolate_string(ck_gemm_kernel,
Alan Turner's avatar
Alan Turner committed
317
                                      {{"solution", template_str},
Alan Turner's avatar
Alan Turner committed
318
                                       {"include", include_header},
Paul's avatar
Format  
Paul committed
319
320
                                       {"params", enum_params(inputs.size(), "void * private_p")},
                                       {"args", enum_params(inputs.size(), "private_p")},
Paul's avatar
Paul committed
321
                                       {"blocks_per_batch", to_string(blocks_per_batch)},
Paul's avatar
Format  
Paul committed
322
323
                                       {"preamble", v.get("preamble", std::string{})},
                                       {"kernel", options.kernel_name}});
Alan Turner's avatar
Cleanup  
Alan Turner committed
324

Paul's avatar
Paul committed
325
326
327
328
329
        return compile_hip_code_object(src, options);
    }

    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
Paul's avatar
Format  
Paul committed
330
331
        auto v      = op.to_value();
        v["kernel"] = "ck_gemm_kernel";
Paul's avatar
Paul committed
332
333
334
        if(not ins->module_inputs().empty())
        {
            auto* pm      = ins->module_inputs().front();
Paul's avatar
Format  
Paul committed
335
336
337
            v["preamble"] = generate_pointwise(*pm, "post_ck_gemm_function") +
                            "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm, post_ck_gemm_function);";
            v["post"]   = "ck_function_adaptor<post_ck_gemm>";
Paul's avatar
Paul committed
338
            v["kernel"] = "ck_gemm_" + generate_name_from_ops(*pm) + "_kernel";
Paul's avatar
Format  
Paul committed
339
        }
Paul's avatar
Paul committed
340

Paul's avatar
Paul committed
341
        auto shapes = to_shapes(ins->inputs());
Paul's avatar
Paul committed
342
        return action_decorate(replace(compile_op(ctx, shapes, v)), [=] {
Paul's avatar
Format  
Paul committed
343
            if(enabled(MIGRAPHX_LOG_CK_GEMM{}))
Paul's avatar
Paul committed
344
            {
345
                std::vector<shape> gemm_shapes{shapes[0], shapes[1], shapes.back().with_type(shapes[0].type())};
Paul's avatar
Paul committed
346
347
                std::cout << "ck_gemm: " << to_json_string(to_value(gemm_shapes)) << std::endl;
            }
Paul's avatar
Paul committed
348
        });
Paul's avatar
Paul committed
349
350
351
352
353
354
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx