quantization.cpp
#include <migraphx/quantization.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/capture.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/multibroadcast.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <utility>
#include <iomanip>
#include <fstream>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

instruction_ref insert_quant_ins(program& prog,
                                 instruction_ref& ins,
                                 shape::type_t type,
                                 std::unordered_map<instruction_ref, instruction_ref>& map_ins,
                                 float scale = 1.0f,
                                 float shift = 0.0f)
{
    if(map_ins.count(ins) > 0)
    {
        return map_ins[ins];
    }

    if(ins->name() == "undefined")
    {
        return ins;
    }

    if(scale < 0.0f)
    {
        MIGRAPHX_THROW("INSERT_QUANT_INS: scale less than 0");
    }

    assert(ins->get_shape().type() == shape::float_type ||
           ins->get_shape().type() == shape::double_type ||
           ins->get_shape().type() == shape::int32_type);
    instruction_ref quant_ins{};
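    // op::convert with a scale and shift applies v_out = v_in * scale + shift
    // before casting to the target type (see the int8 scheme described below);
    // the convert is inserted right after the producer so later consumers can
    // reuse it through map_ins.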
    quant_ins    = prog.insert_instruction(std::next(ins), op::convert{type, scale, shift}, ins);
    map_ins[ins] = quant_ins;

    return quant_ins;
}

// This function converts any instruction specified in the input list
// from double or float to float16 by inserting a convert operator.
// The conversion could overflow, but that is very rare in deep
// learning, so we simply truncate the input to get the fp16 value.
void quantize(program& prog, const std::vector<std::string>& ins_names)
{
    std::unordered_map<instruction_ref, instruction_ref> map_fp16;
    for(auto ins : iterator_for(prog))
    {
        // "all" indicates that every instruction is converted
        if((not contains(ins_names, "all")) and (not contains(ins_names, ins->name())))
        {
            continue;
        }

        shape::type_t orig_type = ins->get_shape().type();
        // process all inputs; if an input is fp32 or fp64, convert it
        // to fp16 by adding a convert operator.
        auto inputs = ins->inputs();
        std::vector<instruction_ref> converted_inputs;
        for(auto input : inputs)
        {
            auto s = input->get_shape();
            if(s.type() == shape::float_type || s.type() == shape::double_type)
            {
                // if the input is a convert operator, use its input
                // as the current input
                instruction_ref input_fp16{};
                if(input->name() == "convert")
                {
                    input_fp16 = input->inputs().front();
                }
                else
                {
                    input_fp16 = insert_quant_ins(prog, input, shape::half_type, map_fp16);
                }
                converted_inputs.push_back(input_fp16);
            }
            else
            {
                converted_inputs.push_back(input);
            }
        }

        // no change for the input, go to the next instruction
        if(inputs == converted_inputs)
        {
            continue;
        }

        auto op        = ins->get_operator();
        auto ins_shape = compute_shape(op, converted_inputs);
        if(ins_shape.type() != orig_type)
        {
            // check for the dead-code case (no outputs) to avoid an assert
            bool output_empty = ins->outputs().empty();
            auto ins_orig_type =
                prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
            if(!output_empty)
            {
                prog.replace_instruction(ins, ins_orig_type);
            }
        }

        prog.replace_instruction(ins, op, converted_inputs);
    }
}

void quantize(program& prog) { quantize(prog, {"all"}); }

static std::vector<std::pair<float, float>> int8_quant_params;
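// int8_quant_params is filled in during calibration: capture_arguments resizes
// it to one pair per captured argument, and calc_quant_params records the scale
// for each argument when the instrumented program is evaluated.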

// function to compute the scale for each convert operator to convert to int8
void calc_quant_params(std::size_t ins_index, std::vector<migraphx::argument> args)
{
    std::pair<float, float> param_pair{1.0f, 0.0f};

    // scale and shift are needed only for the int8 type, and we do not
    // consider shift, so set it to 0
    std::vector<float> vec_val;
    args.front().visit([&](auto output) { vec_val.assign(output.begin(), output.end()); });
    auto max_val     = *std::max_element(vec_val.begin(), vec_val.end());
    auto min_val     = *std::min_element(vec_val.begin(), vec_val.end());
    auto max_abs     = std::max(std::fabs(max_val), std::fabs(min_val));
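    // symmetric quantization: map the largest absolute value seen to 127 (the
    // int8 maximum), so the scale is 127 / max_abs and no shift is required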
    param_pair.first = 127.0f / max_abs;

    int8_quant_params[ins_index] = param_pair;
}

// int8 quantization is different from fp16 since int8 can only handle values
// in the range -128 ~ 127. To convert a float or double to int8, we need a
// scale and a shift, and the conversion is done as v_int8 = fp * scale + shift.
// To simplify the changes, we consider shift as 0.0f for now.
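// For example (hypothetical values): if the largest absolute value observed for
// a tensor is 6.35, the scale is 127 / 6.35 = 20, so a value of 3.2 becomes
// static_cast<int8_t>(3.2 * 20) = 64.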
void quantize_int8(program& prog,
                   const std::vector<std::string>& ins_names,
                   const std::vector<std::pair<float, float>>& quant_params)
{
    for(size_t i = 0; i < quant_params.size(); i++)
    {
        auto param = quant_params.at(i);
        std::cout << "index = " << i << ", scale = " << param.first << "\t" << param.second
                  << std::endl;
    }
    std::cout << std::endl;

    // For now, we only support the int8 quantization of gemm and convolution
    std::vector<std::string> op_names = {"dot", "convolution"};
    if(!std::all_of(ins_names.begin(), ins_names.end(), [&](auto name) {
           return (std::find(op_names.begin(), op_names.end(), name) != op_names.end());
       }))
    {
        MIGRAPHX_THROW("QUANTIZE_INT8: only support DOT and CONVOLUTION operation");
    }

    std::size_t quant_param_index = 0;
    std::unordered_map<instruction_ref, instruction_ref> map_quant_ins;
    std::unordered_map<instruction_ref, std::size_t> map_index;
    for(auto ins : iterator_for(prog))
    {
        if(not contains(ins_names, ins->name()))
        {
            continue;
        }

        shape::type_t orig_type = ins->get_shape().type();

        // for the dot operator, there could be 2 or 3 input arguments
        // if the 3rd argument is available, convert it to an int32.
        std::vector<instruction_ref> converted_inputs;

        // process all inputs; if an input is fp32 or fp64, convert it
        // to an int8 type by adding a convert operator and replace
        // the operator with the corresponding int8 version
        auto inputs = ins->inputs();
        std::vector<std::pair<float, float>> ins_quant_params;
        for(auto input : inputs)
        {
            // calculate the index of each instruction to be quantized
            if(map_index.count(input) == 0)
            {
                map_index[input] = quant_param_index++;
            }
            auto param = quant_params[map_index[input]];
            ins_quant_params.push_back(param);

            // In general, the target_type is int8, but for the dot
            // operation, if it has 3 inputs, then the last one should
            // be converted to int32_type
            shape::type_t quant_type = shape::int8_type;
            if(ins->name() == "dot" and inputs.size() == 3 and input == inputs.back())
            {
                quant_type = shape::int32_type;
            }

            auto s = input->get_shape();
            if((s.type() == shape::float_type || s.type() == shape::double_type ||
                s.type() == shape::int32_type) &&
               s.type() != quant_type)
            {
                // if the input is a convert operator, use its input
                // as the current input
                instruction_ref quant_input{};
                if(input->name() == "convert")
                {
                    auto tmp_ins = input->inputs().front();
                    if(tmp_ins->get_shape().type() == quant_type)
                    {
                        quant_input = input->inputs().front();
                    }
                    else
                    {
                        quant_input = insert_quant_ins(
                            prog, input, quant_type, map_quant_ins, param.first, param.second);
                    }
                }
                else
                {
                    quant_input = insert_quant_ins(
                        prog, input, quant_type, map_quant_ins, param.first, param.second);
                }
                converted_inputs.push_back(quant_input);
            }
            else
            {
                converted_inputs.push_back(input);
            }
        }

        // no change for the input, go to the next instruction
        if(inputs == converted_inputs)
        {
            continue;
        }

        // When converting from other types to int8_type, there are parameters
        // used as scale and shift (0.0f), which will generate results different
        // from the original ones. To adjust the output to be correct (approximately
        // equal), we need an additional calculation for the adjustment.
        if(ins->name() == "dot")
        {
            auto dot_op = any_cast<op::dot>(ins->get_operator());
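            // The quantized inputs are scaled copies of the originals
            // (A_q ~ scale_a * A, B_q ~ scale_b * B), so A_q * B_q carries an
            // extra factor of scale_a * scale_b; dividing alpha by that product
            // restores the original magnitude of alpha * A * B.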
            float new_alpha =
                dot_op.alpha / (ins_quant_params[0].first * ins_quant_params[1].first);
            float new_beta = dot_op.beta;
            // We need an additional check on the new_alpha value. If
            // abs(new_alpha) >= 50 (a temporary threshold chosen here), we can
            // convert it to an integer and use it as the alpha of the quant_dot
            float threshold = 50.0f;
            if(fabs(new_alpha) >= threshold && fabs(new_beta) >= threshold)
            {
                int32_t quant_alpha = static_cast<int32_t>(new_alpha);
                int32_t quant_beta  = static_cast<int32_t>(new_beta);
                shape quant_shape   = compute_shape(op::quant_dot{1, 0}, converted_inputs);
                if(quant_shape.type() == orig_type)
                {
                    prog.replace_instruction(
                        ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
                }
                else
                {
                    auto quant_dot = prog.insert_instruction(
                        ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
                    prog.replace_instruction(ins, op::convert{orig_type}, quant_dot);
                }
            }
            // either alpha or beta cannot be quantized because the relative
            // rounding error would be too big
            else
            {
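                // quant_dot{1, 0} produces the raw int32 product of the quantized
                // inputs; the convert below rescales it by new_alpha to recover
                // alpha * A * B, and beta * C is handled by a separate mul and add.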
                auto q_dot = prog.insert_instruction(ins, op::quant_dot{1, 0}, converted_inputs);
                if(inputs.size() == 3 and dot_op.beta != 0.0f)
                {
                    auto alpha_ab = prog.insert_instruction(
                        ins, op::convert{orig_type, new_alpha, 0.0f}, q_dot);
                    auto c_shape = q_dot->get_shape();
                    std::vector<float> vec_beta(c_shape.elements(), dot_op.beta);
                    auto l_beta =
                        prog.add_literal(literal({shape::float_type, c_shape.lens()}, vec_beta));
                    instruction_ref beta_c{};
                    if(orig_type != shape::float_type)
                    {
                        auto fp32_c = prog.insert_instruction(
                            ins, op::convert{shape::float_type}, inputs.back());
                        auto fp32_beta_c = prog.insert_instruction(ins, op::mul{}, l_beta, fp32_c);
                        beta_c = prog.insert_instruction(ins, op::convert{orig_type}, fp32_beta_c);
                    }
                    else
                    {
                        beta_c = prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
                    }
                    prog.replace_instruction(ins, op::add{}, alpha_ab, beta_c);
                }
                else
                {
                    prog.replace_instruction(ins, op::convert{orig_type, new_alpha, 0.0f}, q_dot);
                }
            }
        }
        else if(ins->name() == "convolution")
        {
            // Current MIOpen convolution does not support alpha and beta,
            // so we need a separate multiply to adjust the output
            auto conv_op       = any_cast<op::convolution>(ins->get_operator());
            auto padding       = conv_op.padding;
            auto stride        = conv_op.stride;
            auto dilation      = conv_op.dilation;
            auto padding_mode  = conv_op.padding_mode;
            auto group         = conv_op.group;
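            // the int32 quant_convolution output carries an extra factor of
            // scale_a * scale_b from the quantized inputs, so scale it back by
            // the reciprocal when converting to the original type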
            auto adjust_factor = 1.0f / (ins_quant_params[0].first * ins_quant_params[1].first);

            auto quant_conv = prog.insert_instruction(
                ins,
                op::quant_convolution{padding, stride, dilation, padding_mode, group},
                converted_inputs);
            prog.replace_instruction(ins, op::convert{orig_type, adjust_factor, 0.0f}, quant_conv);
        }
        else
        {
            MIGRAPHX_THROW("QUANTIZE_INT8: does not support operator" + ins->name());
        }
    }

    if(quant_param_index != quant_params.size())
    {
        MIGRAPHX_THROW("QUANTIZE_INT8: number of scales does not match");
    }
}

void quantize_int8(program& prog, const std::vector<std::string>& ins_names)
{
    quantize_int8(prog, ins_names, int8_quant_params);
}

void quantize_int8(program& prog)
{
    std::vector<std::string> ins_names = {"dot", "convolution"};
    quantize_int8(prog, ins_names, int8_quant_params);
}

// For each input argument of the specified instructions, we insert a
// capture operator to compute the scale and shift
void capture_arguments(program& prog,
                       const std::vector<std::string>& ins_names,
                       std::function<void(std::size_t, std::vector<argument>)> func)
{
    size_t num_quant_params = 0;
    // the int8 quantization only support dot and convolution
    std::vector<std::string> op_names = {"dot", "convolution", "quant_dot", "quant_convolution"};
    if(!std::all_of(ins_names.begin(), ins_names.end(), [&](auto name) {
           return std::find(op_names.begin(), op_names.end(), name) != op_names.end();
       }))
    {
        MIGRAPHX_THROW("CAPTURE_ARGUMENTS: input operator is not supported");
    }

    std::unordered_map<instruction_ref, instruction_ref> ins_map;
    for(auto ins : iterator_for(prog))
    {
        if(not contains(ins_names, ins->name()))
        {
            continue;
        }

        auto inputs = ins->inputs();
        std::vector<instruction_ref> new_args;
        for(auto input : inputs)
        {
            instruction_ref new_ins{};
            if(ins_map.count(input) > 0)
            {
                new_ins = ins_map[input];
            }
            else
            {
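                // wrap the input in a capture operator: when the program is
                // evaluated, the capture calls func with the argument so the
                // scale for this index can be recorded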
                new_ins = prog.insert_instruction(
                    std::next(input), op::capture{num_quant_params++, func}, input);
                ins_map[input] = new_ins;
            }
            new_args.push_back(new_ins);
        }
        instruction::replace(ins, ins->get_operator(), ins->get_shape(), new_args);
    }

    // set one pair of parameters for each argument
    int8_quant_params.resize(num_quant_params, std::make_pair(-1.0f, -1.0f));
}

void capture_arguments(program& prog, const std::vector<std::string>& ins_names)
{
    capture_arguments(prog, ins_names, calc_quant_params);
}

void capture_arguments(program& prog)
{
    std::vector<std::string> ins_names = {"dot", "convolution"};
    capture_arguments(prog, ins_names);
}

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx