lowering.cpp 15.4 KB
Newer Older
1
2
3
/*
 * The MIT License (MIT)
 *
Ted Themistokleous's avatar
Ted Themistokleous committed
4
 * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
Shucai Xiao's avatar
Shucai Xiao committed
24
#include <iterator>
25
26
27
28
29
#include <utility>
#include <functional>
#include <algorithm>
#include <map>

Paul's avatar
Paul committed
30
31
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
32
#include <migraphx/make_op.hpp>
33
34
#include <migraphx/instruction_ref.hpp>
#include <migraphx/stringutils.hpp>
35
36
37
#include <migraphx/pass_manager.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
38
39

#include <migraphx/op/dot.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
40
#include <migraphx/op/if_op.hpp>
41
42
#include <migraphx/op/reshape.hpp>
#include <migraphx/op/quant_dot.hpp>
Ted Themistokleous's avatar
Ted Themistokleous committed
43
#include <migraphx/op/reshape_lazy.hpp>
44

Paul's avatar
Paul committed
45
#include <migraphx/gpu/context.hpp>
46
#include <migraphx/gpu/lowering.hpp>
47
#include <migraphx/gpu/device_name.hpp>
Paul's avatar
Paul committed
48
#include <migraphx/gpu/gemm.hpp>
49
50
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/rocblas.hpp>
51
#include <migraphx/gpu/compiler.hpp>
Paul's avatar
Paul committed
52

Paul's avatar
Paul committed
53
namespace migraphx {
Paul's avatar
Paul committed
54
inline namespace MIGRAPHX_INLINE_NS {
Paul's avatar
Paul committed
55
namespace gpu {
Paul's avatar
Paul committed
56
57
58

// Applies GPU lowering to one module: walks the instructions and replaces
// each supported operator with its gpu::/hip:: implementation, inserting
// output-buffer allocations and (when offload_copy is enabled) host<->device
// copies at the module boundary. Handlers are registered per op name in
// apply_map by init().
struct miopen_apply
{
    module* mod              = nullptr; // module being lowered (non-owning)
    module_pass_manager* mpm = nullptr; // pass manager driving this pass (non-owning)
    const lowering* pass     = nullptr; // owning pass; supplies ctx and offload_copy (non-owning)
    // op name -> handler that rewrites the matched instruction into its GPU form
    std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
    // NOTE(review): `last` is not referenced anywhere in this file — candidate for removal.
    instruction_ref last{};
    bool offload_copy   = false; // insert host<->device copies (root module only; see init())
    bool int8_x4_format = true;  // rocBLAS int8 packing format, read from the context in init()
    bool compute_fp32   = false; // fp32 gemm accumulation, set from get_compute_fp32_flag()

    // Returns the GPU context held by the lowering pass; asserts both links are set.
    context& get_context() const
    {
        assert(pass != nullptr);
        assert(pass->ctx != nullptr);
        return *pass->ctx;
    }

    // Debug-only invariant: lowering must not change an instruction's shape.
    // The (void) casts silence unused-variable warnings when NDEBUG removes assert.
    void check_shape(shape x, instruction_ref i)
    {
        assert(x == i->get_shape());
        (void)x;
        (void)i;
    }

    // Reads configuration from the context/pass, then registers every lowering
    // handler in apply_map keyed by the operator name it handles.
    void init()
    {
        assert(mod != nullptr);
        assert(pass != nullptr);

        auto& ctx      = get_context();
        int8_x4_format = get_int8_x4_format(ctx);
        compute_fp32   = get_compute_fp32_flag();
        // Boundary copies only apply to the root module; submodules always
        // run with offload_copy disabled.
        offload_copy   = (mod == mpm->get_root_module()) ? pass->offload_copy : false;

        add_generic_op("contiguous");
        // Ops whose fields are copied into the corresponding gpu:: operator
        add_extend_op("argmax");
        add_extend_op("argmin");
        add_extend_op("logsoftmax");
        add_extend_op("lrn");
        add_extend_op("multinomial");
        add_extend_op("nonzero");
        add_extend_op("pooling");
        add_extend_op("prefix_scan_sum");
        add_extend_op("reverse");
        add_extend_op("rnn_var_sl_last_output");
        add_extend_op("rnn_var_sl_shift_output");
        add_extend_op("rnn_var_sl_shift_sequence");
        add_extend_op("scatter_none");
        add_extend_op("topk");

        add_convolution_op("convolution");
        add_convolution_op("convolution_backwards");
        add_convolution_op("quant_convolution");
        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");
        // Control-flow and special-case lowerings
        add_if_op();
        add_loop_op();
        add_neg_op();
        add_nms_op();
        add_select_module_op();
        add_reshape_lazy_op();
    }

    // When offload_copy is on, rewrites the module boundary:
    //  - every used @param is followed by a hip::copy_to_gpu into a fresh
    //    device allocation, and all uses of the param are redirected to it;
    //  - every @return input (or the trailing instruction, for legacy
    //    programs with no @return) is copied back via hip::copy_from_gpu.
    void copy_params() const
    {
        if(not offload_copy)
            return;

        for(auto ins : iterator_for(*mod))
        {
            if(ins->name() != "@param")
                continue;

            // parameter no outputs, no need to insert copy to gpu
            if(ins->outputs().empty())
                continue;

            // Insert the allocation + copy immediately after the param so
            // every later use can be redirected to the device copy.
            auto pos = std::next(ins);
            auto a   = insert_allocation(pos, ins->get_shape());
            auto c   = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
            mod->replace_instruction(ins, c);
        }

        // return instruction
        auto ret = std::prev(mod->end());
        if(ret->name() == "@return")
        {
            const auto& inputs = ret->inputs();

            // each input of ret need to be copied from gpu to host, and replace
            // output with copy output
            for(const auto& in : inputs)
            {
                auto p_output = mod->insert_instruction(ret, make_op("hip::copy_from_gpu"), in);
                instruction::replace_argument(ret, in, p_output);
            }
        }
        // else branch to handle legacy program without the return instruction
        else
        {
            mod->add_instruction(make_op("hip::copy_from_gpu"), ret);
        }
    }

    // Main driver: registers handlers, then lowers each instruction by the
    // first matching strategy — apply_map handler, precompile op (when a
    // compiler is registered for the name), or a custom op carrying a
    // "target" attribute. Finishes by inserting the boundary copies.
    void apply()
    {
        init();
        for(auto it = mod->begin(); it != mod->end(); it++)
        {
            auto s     = it->get_shape();
            auto attrs = it->get_operator().attributes();
            if(apply_map.count(it->name()) > 0)
            {
                check_shape(s, apply_map.at(it->name())(it));
            }
            else if(has_compiler_for(it->name()))
            {
                check_shape(s, insert_precompile_op(it));
            }
            else if(attrs.contains("target"))
            {
                check_shape(s, insert_custom_op(it, attrs));
            }
        }
        copy_params();
    }

    // Lowers a custom op that declares a "target" attribute. For target
    // "cpu": copy all inputs to host (the last input is treated as the
    // device output buffer), sync so the copies complete, run the op on
    // host, then copy the result back into the device output buffer.
    // Any other target is left untouched.
    instruction_ref insert_custom_op(instruction_ref ins, const value& attrs) const
    {
        const auto& custom_op = ins->get_operator();
        if(attrs.at("target") == "cpu")
        {
            auto s = ins->get_shape();
            std::vector<instruction_ref> cpu_inputs;
            auto inputs = ins->inputs();
            auto output = inputs.back();
            std::transform(
                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
                });
            // sync_stream over all host copies; its result stands in for the
            // first input so the host op is ordered after the copies finish
            cpu_inputs.front() =
                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
            auto cpu_out = mod->insert_instruction(ins, custom_op, cpu_inputs);
            auto gpu_out =
                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
            return mod->replace_instruction(ins, gpu_out);
        }
        return ins;
    }

    // Wraps an instruction in gpu::precompile_op so the gpu compiler
    // infrastructure can compile it later; the output allocation is appended
    // as the last argument (output-buffer-last convention).
    instruction_ref insert_precompile_op(instruction_ref ins) const
    {
        auto output                       = insert_allocation(ins, ins->get_shape());
        std::vector<instruction_ref> refs = ins->inputs();
        refs.push_back(output);

        return mod->replace_instruction(
            ins,
            make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
            refs,
            ins->module_inputs());
    }

    // Inserts an "allocate" instruction of shape s immediately before ins.
    instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
    {
        return mod->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
    }

    // Registers a handler lowering dot/quant_dot to a rocBLAS gemm.
    // Expects exactly two inputs; the output allocation is appended last.
    // NOTE(review): the literal 1 and 0 presumably select alpha=1/beta=0 —
    // confirm against the rocblas_gemm constructor.
    template <typename Op>
    void add_gemm_op(const std::string& name)
    {
        apply_map.emplace(name, [=](instruction_ref ins) {
            std::vector<instruction_ref> refs = ins->inputs();
            assert(refs.size() == 2);
            auto output = insert_allocation(ins, ins->get_shape());
            refs.push_back(output);
            return mod->replace_instruction(
                ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format, compute_fp32}, refs);
        });
    }

    // Registers a handler lowering a convolution variant to gpu::miopen_op
    // wrapping the matching gpu:: convolution; arguments are
    // (input, weights, output buffer).
    void add_convolution_op(const std::string& name)
    {
        apply_map.emplace(name, [=](instruction_ref ins) {
            operation conv = make_op(
                "gpu::" + name,
                {{"op", ins->get_operator().to_value()}, {"int8_x4_format", int8_x4_format}});
            auto output = insert_allocation(ins, ins->get_shape());

            return mod->replace_instruction(ins,
                                            make_op("gpu::miopen_op", {{"op", to_value(conv)}}),
                                            ins->inputs().at(0),
                                            ins->inputs().at(1),
                                            output);
        });
    }

    // add_generic_op constructs the gpu:: operator with no fields (default
    // constructed), whereas add_extend_op copies the original operator's
    // fields over via op.to_value().
    void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); }

    void add_generic_op(const std::string& op_name, const std::string& gpu_name)
    {
        apply_map.emplace(op_name, [=](instruction_ref ins) {
            auto output                       = insert_allocation(ins, ins->get_shape());
            std::vector<instruction_ref> refs = ins->inputs();
            refs.push_back(output);

            return mod->replace_instruction(ins, make_op(gpu_name), refs);
        });
    }

    void add_extend_op(const std::string& name) { add_extend_op(name, "gpu::" + name); }

    // Lowers op_name to gpu_name, copying the original operator's fields and
    // appending the output allocation as the last argument.
    void add_extend_op(const std::string& op_name, const std::string& gpu_name)
    {
        apply_map.emplace(op_name, [=](instruction_ref ins) {
            auto&& op                         = ins->get_operator();
            auto output                       = insert_allocation(ins, ins->get_shape());
            std::vector<instruction_ref> refs = ins->inputs();
            refs.push_back(output);

            return mod->replace_instruction(ins, make_op(gpu_name, op.to_value()), refs);
        });
    }

    // use 0 - input to represent neg: build a zero literal of the same shape
    // and emit gpu::sub(zero, input) into the output buffer
    void add_neg_op()
    {
        apply_map.emplace("neg", [=](instruction_ref ins) {
            auto s = ins->get_shape();
            std::vector<float> zeros(s.elements(), 0.0f);
            auto l0     = mod->add_literal(literal(s, zeros));
            auto output = insert_allocation(ins, s);
            return mod->replace_instruction(
                ins, make_op("gpu::sub"), l0, ins->inputs().front(), output);
        });
    }

    // add input and output argument for the if operator: the condition
    // (first input) is copied from the device and synced, presumably so it
    // can be read on the host when selecting the branch — confirm.
    void add_if_op()
    {
        apply_map.emplace("if", [=](instruction_ref ins) {
            std::vector<instruction_ref> inputs = ins->inputs();
            auto cpu_cond =
                mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.front());
            auto sync_cond = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_cond);
            inputs.front() = sync_cond;

            return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
        });
    }

    // replace the loop operator with gpu_loop operator: max_iter and the
    // initial condition are copied to host and synced; one device allocation
    // is appended per original input, then the condition output buffer and
    // the final output buffer are appended last.
    void add_loop_op()
    {
        apply_map.emplace("loop", [=](instruction_ref ins) {
            std::vector<instruction_ref> inputs = ins->inputs();
            // copy max_iter from gpu to cpu
            auto cpu_max_iter =
                mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0));
            auto cpu_cond =
                mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1));
            auto synced_max_iter =
                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond);
            inputs.at(0)     = synced_max_iter;
            inputs.at(1)     = cpu_cond;
            auto copy_inputs = inputs;
            // one device allocation per input, appended after the inputs
            std::transform(copy_inputs.begin(),
                           copy_inputs.end(),
                           std::back_inserter(inputs),
                           [&](auto in) { return insert_allocation(ins, in->get_shape()); });

            auto mod_args = ins->module_inputs();
            auto output   = insert_allocation(ins, ins->get_shape());

            // condition output buffer sized from the submodule's first output shape
            const auto* sub_mod = mod_args.front();
            auto cond_out       = insert_allocation(ins, sub_mod->get_output_shapes().front());

            // add cond and mod outputs to the argument list
            inputs.push_back(cond_out);
            inputs.push_back(output);

            return mod->replace_instruction(
                ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
        });
    }

    // Lowers nonmaxsuppression by running the reference op on the host:
    // same copy-to-host / sync / run / copy-back pattern as insert_custom_op.
    void add_nms_op()
    {
        apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
            auto s      = ins->get_shape();
            auto output = insert_allocation(ins, s);
            std::vector<instruction_ref> cpu_inputs;
            auto inputs = ins->inputs();
            std::transform(
                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
                });
            cpu_inputs.front() =
                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
            auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
            auto gpu_out =
                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
            return mod->replace_instruction(ins, gpu_out);
        });
    }

    /**
     * Adds dynamic allocation for submodule output parameter: the output
     * buffer is appended to the select_module arguments.
     */
    void add_select_module_op()
    {
        apply_map.emplace("select_module", [=](instruction_ref ins) {
            auto s                              = ins->get_shape();
            auto output                         = insert_allocation(ins, s);
            std::vector<instruction_ref> inputs = ins->inputs();
            inputs.push_back(output);
            return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
        });
    }

    /**
     *  Adds reshape lazy to reshape ops that can be aliased instead of copied.
     *  `gpu::contiguous` are added before and after the reshape; these contiguous
     *  instructions can be removed by the eliminate_contiguous pass.
     *
     *  NOTE(review): the pre-reshape allocation is sized from
     *  std::prev(ins)->get_shape() — the lexically previous instruction, not
     *  necessarily ins' input; verify these coincide here. Also the "dims"
     *  value is wrapped in an extra level of braces — confirm this yields the
     *  intended value and not a nested list.
     */
    void add_reshape_lazy_op()
    {
        apply_map.emplace("reshape", [=](instruction_ref ins) {
            std::vector<instruction_ref> before_contiguous_args = ins->inputs();
            auto before_alloc = insert_allocation(ins, std::prev(ins)->get_shape());
            before_contiguous_args.push_back(before_alloc);
            auto before_contig =
                mod->insert_instruction(ins, make_op("gpu::contiguous"), {before_contiguous_args});

            auto new_lazy_reshape = mod->insert_instruction(
                ins,
                make_op("reshape_lazy", {{"dims", {ins->get_operator().to_value().at("dims")}}}),
                before_contig);

            std::vector<instruction_ref> after_contiguous_args = {new_lazy_reshape};
            auto after_alloc = insert_allocation(new_lazy_reshape, new_lazy_reshape->get_shape());
            after_contiguous_args.push_back(after_alloc);
            return mod->replace_instruction(ins, make_op("gpu::contiguous"), after_contiguous_args);
        });
    }
};

408
409
410
411
// Pass entry point: construct a miopen_apply bound to the pass manager's
// module and this pass, then run the lowering over that module.
void lowering::apply(module_pass_manager& mpm) const
{
    miopen_apply applier{&mpm.get_module(), &mpm, this};
    applier.apply();
}
Shucai Xiao's avatar
Shucai Xiao committed
412

Paul's avatar
Paul committed
413
} // namespace gpu
Paul's avatar
Paul committed
414
} // namespace MIGRAPHX_INLINE_NS
Paul's avatar
Paul committed
415
} // namespace migraphx