/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>
#include <map>
#include <unordered_map>

#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>

#include <migraphx/op/dot.hpp>
#include <migraphx/op/if_op.hpp>
#include <migraphx/op/reshape.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/reshape_lazy.hpp>

#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/gemm.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/compiler.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

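// Rewrites a module for GPU execution: supported operators are replaced by their gpu::/hip::
// counterparts, with explicit output allocations appended to their argument lists.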
struct miopen_apply
{
    module* mod              = nullptr;
    module_pass_manager* mpm = nullptr;
    const lowering* pass     = nullptr;
    std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
    instruction_ref last{};
    bool offload_copy   = false;
    bool int8_x4_format = true;
    bool compute_fp32   = false;

    context& get_context() const
    {
        assert(pass != nullptr);
        assert(pass->ctx != nullptr);
        return *pass->ctx;
    }

    void check_shape(shape x, instruction_ref i)
    {
        assert(x == i->get_shape());
        (void)x;
        (void)i;
    }

    void init()
    {
        assert(mod != nullptr);
        assert(pass != nullptr);

        auto& ctx      = get_context();
        int8_x4_format = get_int8_x4_format(ctx);
        compute_fp32   = get_compute_fp32_flag();
        offload_copy   = (mod == mpm->get_root_module()) ? pass->offload_copy : false;

        add_generic_op("contiguous");
        add_generic_op("reshape_lazy");

        add_extend_op("argmax");
        add_extend_op("argmin");
        add_extend_op("logsoftmax");
        add_extend_op("lrn");
        add_extend_op("multinomial");
        add_extend_op("nonzero");
        add_extend_op("pooling");
        add_extend_op("prefix_scan_sum");
        add_extend_op("reverse");
        add_extend_op("rnn_var_sl_last_output");
        add_extend_op("rnn_var_sl_shift_output");
        add_extend_op("rnn_var_sl_shift_sequence");
        add_extend_op("scatter_none");
        add_extend_op("topk");

        add_convolution_op("convolution");
        add_convolution_op("deconvolution");
        add_convolution_op("quant_convolution");

        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");

        add_if_op();
        add_loop_op();
        add_neg_op();
        add_nms_op();
        add_select_module_op();
        add_reshape_lazy_op();
    }

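    // When offload_copy is enabled, make the host/device boundary explicit: each used @param
    // gets a hip::copy_to_gpu inserted right after it, and every value flowing into @return
    // (or the final instruction of a legacy program) gets a hip::copy_from_gpu.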
    void copy_params() const
    {
        if(not offload_copy)
            return;

        for(auto ins : iterator_for(*mod))
        {
            if(ins->name() != "@param")
                continue;

            // parameter has no outputs, so there is no need to insert a copy to the gpu
            if(ins->outputs().empty())
                continue;

            auto pos = std::next(ins);
            auto a   = insert_allocation(pos, ins->get_shape());
            auto c   = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
            mod->replace_instruction(ins, c);
        }

        // return instruction
        auto ret = std::prev(mod->end());
        if(ret->name() == "@return")
        {
            const auto& inputs = ret->inputs();

            // each input of ret needs to be copied from the gpu to the host, and the return's
            // argument replaced with the copy's output
            for(const auto& in : inputs)
            {
                auto p_output = mod->insert_instruction(ret, make_op("hip::copy_from_gpu"), in);
                instruction::replace_argument(ret, in, p_output);
            }
        }
        // else branch to handle legacy programs without the return instruction
        else
        {
            mod->add_instruction(make_op("hip::copy_from_gpu"), ret);
        }
    }

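    // Single pass over the module: ops registered in apply_map are lowered directly, ops with a
    // registered compiler are wrapped as gpu::precompile_op, and ops carrying a "target"
    // attribute are handled as custom ops.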
    void apply()
    {
        init();
        for(auto it = mod->begin(); it != mod->end(); it++)
        {
            auto s     = it->get_shape();
            auto attrs = it->get_operator().attributes();
            if(apply_map.count(it->name()) > 0)
            {
                check_shape(s, apply_map.at(it->name())(it));
            }
            else if(has_compiler_for(it->name()))
            {
                check_shape(s, insert_precompile_op(it));
            }
            else if(attrs.contains("target"))
            {
                check_shape(s, insert_custom_op(it, attrs));
            }
        }
        copy_params();
    }

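    // Custom ops with {"target": "cpu"} run on the host: their inputs are copied back from the
    // gpu and synchronized, the op is executed on those host tensors, and the result is copied
    // into the gpu buffer passed as the op's last input.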
    instruction_ref insert_custom_op(instruction_ref ins, const value& attrs) const
    {
        const auto& custom_op = ins->get_operator();
        if(attrs.at("target") == "cpu")
        {
            auto s = ins->get_shape();
            std::vector<instruction_ref> cpu_inputs;
            auto inputs = ins->inputs();
            auto output = inputs.back();
            std::transform(
                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
                });
            cpu_inputs.front() =
                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
            auto cpu_out = mod->insert_instruction(ins, custom_op, cpu_inputs);
            auto gpu_out =
                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
            return mod->replace_instruction(ins, gpu_out);
        }
        return ins;
    }

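    // Wraps the op in gpu::precompile_op so the kernel can be generated by a later compile
    // pass; the output allocation is appended to the argument list.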
    instruction_ref insert_precompile_op(instruction_ref ins) const
    {
        auto output                       = insert_allocation(ins, ins->get_shape());
        std::vector<instruction_ref> refs = ins->inputs();
        refs.push_back(output);

        return mod->replace_instruction(
            ins,
            make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
            refs,
            ins->module_inputs());
    }

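    // Inserts an allocate instruction for shape s immediately before ins.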
    instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
    {
        return mod->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
    }

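    // dot and quant_dot are lowered to rocblas_gemm; the output allocation becomes the last
    // argument, and the int8_x4 / compute_fp32 flags captured in init() are forwarded.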
    template <typename Op>
    void add_gemm_op(const std::string& name)
    {
        apply_map.emplace(name, [=](instruction_ref ins) {
            std::vector<instruction_ref> refs = ins->inputs();
            assert(refs.size() == 2);
            auto output = insert_allocation(ins, ins->get_shape());
            refs.push_back(output);
            return mod->replace_instruction(
                ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format, compute_fp32}, refs);
        });
    }

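    // Convolution variants are lowered to their gpu:: counterpart wrapped in a gpu::miopen_op,
    // taking the data, the weights, and the output allocation as arguments.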
    void add_convolution_op(const std::string& name)
    {
        apply_map.emplace(name, [=](instruction_ref ins) {
            operation conv = make_op(
                "gpu::" + name,
                {{"op", ins->get_operator().to_value()}, {"int8_x4_format", int8_x4_format}});
            auto output = insert_allocation(ins, ins->get_shape());

            return mod->replace_instruction(ins,
                                            make_op("gpu::miopen_op", {{"op", to_value(conv)}}),
                                            ins->inputs().at(0),
                                            ins->inputs().at(1),
                                            output);
        });
    }

    // add_generic_op constructs the gpu operator with no fields (it is default constructed),
    // whereas add_extend_op copies the fields over from the original operator.
    void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); }

    void add_generic_op(const std::string& op_name, const std::string& gpu_name)
    {
        apply_map.emplace(op_name, [=](instruction_ref ins) {
            auto output                       = insert_allocation(ins, ins->get_shape());
            std::vector<instruction_ref> refs = ins->inputs();
            refs.push_back(output);

            return mod->replace_instruction(ins, make_op(gpu_name), refs);
        });
    }

    void add_extend_op(const std::string& name) { add_extend_op(name, "gpu::" + name); }

    void add_extend_op(const std::string& op_name, const std::string& gpu_name)
    {
        apply_map.emplace(op_name, [=](instruction_ref ins) {
            auto&& op                         = ins->get_operator();
            auto output                       = insert_allocation(ins, ins->get_shape());
            std::vector<instruction_ref> refs = ins->inputs();
            refs.push_back(output);

            return mod->replace_instruction(ins, make_op(gpu_name, op.to_value()), refs);
        });
    }

    // lower neg as (0 - input), using gpu::sub against a zero-filled literal
    void add_neg_op()
    {
        apply_map.emplace("neg", [=](instruction_ref ins) {
            auto s = ins->get_shape();
            std::vector<float> zeros(s.elements(), 0.0f);
            auto l0     = mod->add_literal(literal(s, zeros));
            auto output = insert_allocation(ins, s);
            return mod->replace_instruction(
                ins, make_op("gpu::sub"), l0, ins->inputs().front(), output);
        });
    }

    // the condition of an if is read on the host, so copy it from the gpu and synchronize the
    // stream before dispatching the operator
    void add_if_op()
    {
        apply_map.emplace("if", [=](instruction_ref ins) {
            std::vector<instruction_ref> inputs = ins->inputs();
            auto cpu_cond =
                mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.front());
            auto sync_cond = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_cond);
            inputs.front() = sync_cond;

            return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
        });
    }

    // replace the loop operator with the gpu::loop operator
    void add_loop_op()
    {
        apply_map.emplace("loop", [=](instruction_ref ins) {
            std::vector<instruction_ref> inputs = ins->inputs();
            // copy max_iter from gpu to cpu
            auto cpu_max_iter =
                mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0));
            auto cpu_cond =
                mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1));
            auto synced_max_iter =
                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond);
            inputs.at(0)     = synced_max_iter;
            inputs.at(1)     = cpu_cond;
            auto copy_inputs = inputs;
            std::transform(copy_inputs.begin(),
                           copy_inputs.end(),
                           std::back_inserter(inputs),
                           [&](auto in) { return insert_allocation(ins, in->get_shape()); });

            auto mod_args = ins->module_inputs();
            auto output   = insert_allocation(ins, ins->get_shape());

            const auto* sub_mod = mod_args.front();
            auto cond_out       = insert_allocation(ins, sub_mod->get_output_shapes().front());

            // add cond and mod outputs to the argument list
            inputs.push_back(cond_out);
            inputs.push_back(output);

            return mod->replace_instruction(
                ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
        });
    }

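    // nonmaxsuppression is executed on the host: inputs are copied from the gpu and
    // synchronized, the op is run on those host tensors, and the result is copied into the gpu
    // output allocation.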
    void add_nms_op()
    {
        apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
            auto s      = ins->get_shape();
            auto output = insert_allocation(ins, s);
            std::vector<instruction_ref> cpu_inputs;
            auto inputs = ins->inputs();
            std::transform(
                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
                });
            cpu_inputs.front() =
                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
            auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
            auto gpu_out =
                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
            return mod->replace_instruction(ins, gpu_out);
        });
    }

    /**
     * Adds dynamic allocation for the submodule output parameter.
     */
    void add_select_module_op()
    {
        apply_map.emplace("select_module", [=](instruction_ref ins) {
            auto s                              = ins->get_shape();
            auto output                         = insert_allocation(ins, s);
            std::vector<instruction_ref> inputs = ins->inputs();
            inputs.push_back(output);
            return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
        });
    }

    /**
     * Adds reshape_lazy for reshape ops that can be aliased instead of copied.
     */
    void add_reshape_lazy_op()
    {
        apply_map.emplace("reshape", [=](instruction_ref ins) {
            try
            {
                // attempt a lazy reshape so the output aliases the input; computing the output
                // shape throws if the input cannot be aliased
                return mod->replace_instruction(
                    ins,
                    make_op("reshape_lazy", ins->get_operator().to_value()),
                    ins->inputs(),
                    ins->module_inputs());
            }
            catch(...)
            {
                // cannot alias, so allocate an output buffer and make the result contiguous
                auto output = insert_allocation(ins, ins->get_shape());
                return mod->insert_instruction(
                    std::next(ins), make_op("gpu::contiguous"), ins, output);
            }
        });
    }
};

void lowering::apply(module_pass_manager& mpm) const
{
    miopen_apply{&mpm.get_module(), &mpm, this}.apply();
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx