merge changes from int8_quantize

0b2fb46a · Shucai Xiao · 2fcc8c09 · 4d1f18a8 · 0b2fb46a · 0b2fb46a
Commit 0b2fb46a authored Jun 05, 2019 by Shucai Xiao
15 changed files
--- a/src/include/migraphx/generate.hpp
+++ b/src/include/migraphx/generate.hpp
@@ -82,6 +82,8 @@ std::vector<T> generate_tensor_data(const migraphx::shape& s, unsigned long seed
 {
    std::vector<T> result(s.elements());
    std::generate(result.begin(), result.end(), xorshf96_generator<T>{seed});
+    // divide a value to avoid integer overflow
+    std::transform(result.begin(), result.end(), result.begin(), [](auto i) { return i / 32; });
    // std::generate(result.begin(), result.end(), [&]{ return seed % 7; });
    // std::generate(result.begin(), result.end(), []{ return 1; });
    return result;

--- a/src/include/migraphx/op/quant_convolution.hpp
+++ b/src/include/migraphx/op/quant_convolution.hpp
@@ -50,7 +50,7 @@ struct quant_convolution
        {
            MIGRAPHX_THROW("QUANT_CONVOLUTION: only accept input and weights of type int8_t");
        }
-        t = shape::float_type;
+        t = shape::int32_type;

        if(padding_mode == default_)
        {

--- a/src/include/migraphx/quantization.hpp
+++ b/src/include/migraphx/quantization.hpp
@@ -22,6 +22,10 @@ void capture_arguments(program& prog,
                       std::function<void(std::size_t, std::vector<argument>)> func);
 void capture_arguments(program& prog, const std::vector<std::string>& ins_names);

+void quantize_int8(program& prog,
+                   const std::vector<std::string>& ins_names,
+                   const std::vector<std::pair<float, float>>& quant_params);
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx


--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -85,6 +85,9 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
            offset += (element_size - (offset % element_size));
        conflict_queue.pop();
    }
+    // when int8 type is used, the offset could be any number
+    // if not 4-byte aligned, miopen int8 convolution can crash
+    offset         = (offset + 3) / 4 * 4;
    segment.offset = offset;
    MIGRAPHX_DEBUG(segment.dump());
    required_bytes = std::max(required_bytes, offset + segment.size);

--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -14,6 +14,8 @@
 #include <migraphx/stringutils.hpp>
 #include <migraphx/ranges.hpp>
 #include <utility>
+#include <iomanip>
+#include <fstream>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -35,6 +37,11 @@ instruction_ref insert_quant_ins(program& prog,
        return ins;
    }

+    if(scale < 0.0f)
+    {
+        MIGRAPHX_THROW("INSERT_QUANT_INS: scale less than 0");
+    }
+
    assert(ins->get_shape().type() == shape::float_type ||
           ins->get_shape().type() == shape::double_type ||
           ins->get_shape().type() == shape::int32_type);
@@ -135,6 +142,315 @@ void calc_quant_params(std::size_t ins_index, std::vector<migraphx::argument> ar
    int8_quant_params[ins_index] = param_pair;
 };

+// int8 quantization is different from fp16 since int8 can only handle value
+// -128 ~ 127. To convert the float or double to int8, we need a scale and
+// a shift, then the convert can be done as v_int8 = fp * scale + shift.
+// To simplify the changes, we consider shift as 0.0f for now.
+void quantize_int8(program& prog,
+                   const std::vector<std::string>& ins_names,
+                   const std::vector<std::pair<float, float>>& quant_params)
+{
+    for(size_t i = 0; i < quant_params.size(); i++)
+    {
+        auto param = quant_params.at(i);
+        std::cout << "index = " << i << ", scale = " << param.first << "\t" << param.second
+                  << std::endl;
+    }
+    std::cout << std::endl;
+
+    // For now, we only support the int8 quantization of gemm and convolution
+    std::vector<std::string> op_names = {"dot", "convolution"};
+    if(!std::all_of(ins_names.begin(), ins_names.end(), [&](auto name) {
+           return (std::find(op_names.begin(), op_names.end(), name) != op_names.end());
+       }))
+    {
+        MIGRAPHX_THROW("QUANTIZE_INT8: only support DOT and CONVOLUTION operation");
+    }
+
+    std::size_t quant_param_index = 0;
+    std::unordered_map<instruction_ref, instruction_ref> map_quant_ins;
+    std::unordered_map<instruction_ref, std::size_t> map_index;
+    for(auto ins : iterator_for(prog))
+    {
+        if(not contains(ins_names, ins->name()))
+        {
+            continue;
+        }
+
+        shape::type_t orig_type = ins->get_shape().type();
+
+        // for the dot operator, there could be 2 or 3 input arguments
+        // if the 3rd argument is available, convert it to an int32.
+        std::vector<instruction_ref> converted_inputs;
+
+        // process all inputs, if input is a fp32 or fp64, convert it
+        // to a int8 type by adding a convert operator and replace
+        // the operator with the corresponding int8 version
+        auto inputs = ins->inputs();
+        std::vector<std::pair<float, float>> ins_quant_params;
+        for(auto input : inputs)
+        {
+            // calculate the index of each instruction to be quantized
+            if(map_index.count(input) == 0)
+            {
+                map_index[input] = quant_param_index++;
+            }
+            auto param = quant_params[map_index[input]];
+            ins_quant_params.push_back(param);
+
+            // In general, the target_type is int8, but for the dot
+            // operation, if it has 3 inputs, then the last one should
+            // be converted to int32_type
+            shape::type_t quant_type = shape::int8_type;
+            if(ins->name() == "dot" and inputs.size() == 3 and input == inputs.back())
+            {
+                quant_type = shape::int32_type;
+            }
+
+            auto s = input->get_shape();
+            if((s.type() == shape::float_type || s.type() == shape::double_type ||
+                s.type() == shape::int32_type) &&
+               s.type() != quant_type)
+            {
+                // if the input is a convert operator, uses its input
+                // as its current input
+                instruction_ref quant_input{};
+                if(input->name() == "convert")
+                {
+                    auto tmp_ins = input->inputs().front();
+                    if(tmp_ins->get_shape().type() == quant_type)
+                    {
+                        quant_input = input->inputs().front();
+                    }
+                    else
+                    {
+                        quant_input = insert_quant_ins(
+                            prog, input, quant_type, map_quant_ins, param.first, param.second);
+                    }
+                }
+                else
+                {
+                    quant_input = insert_quant_ins(
+                        prog, input, quant_type, map_quant_ins, param.first, param.second);
+                }
+                converted_inputs.push_back(quant_input);
+            }
+            else
+            {
+                converted_inputs.push_back(input);
+            }
+        }
+
+        // no change for the input, go to the next instruction
+        if(inputs == converted_inputs)
+        {
+            continue;
+        }
+
+        // When converting from other types to int8_type, there are parameters
+        // used as scale and shift(.0f), which will generate results diffrent from
+        // the original results. To adjust the output to be "correct(approximatly
+        // equal)", we need additional calculation for the adjustment
+        if(ins->name() == "dot")
+        {
+            auto dot_op = any_cast<op::dot>(ins->get_operator());
+            float new_alpha =
+                dot_op.alpha / (ins_quant_params[0].first * ins_quant_params[1].first);
+            float new_beta = dot_op.beta;
+            // We need additional checking about the quant_alpha value. If
+            // abs(quant_alpha) > 50 (some tmp value set here), we can convert
+            // it to an integer as the new_alpha in the quant_dot
+            float threshold = 50.0f;
+            if(fabs(new_alpha) >= threshold && fabs(new_beta) >= threshold)
+            {
+                int32_t quant_alpha = static_cast<int32_t>(new_alpha);
+                int32_t quant_beta  = static_cast<int32_t>(new_beta);
+                shape quant_shape   = compute_shape(op::quant_dot{1, 0}, converted_inputs);
+                if(quant_shape.type() == orig_type)
+                {
+                    prog.replace_instruction(
+                        ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
+                }
+                else
+                {
+                    auto quant_dot = prog.insert_instruction(
+                        ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
+                    prog.replace_instruction(ins, op::convert{orig_type}, quant_dot);
+                }
+            }
+            // only alpha can be quantized, quantization of beta will cause
+            // big error, so we have to manually do the multiplication and
+            // addition
+            else if(fabs(new_alpha) >= threshold)
+            {
+                // truncate to the nearest integer
+                new_alpha           = new_alpha > 0.0 ? new_alpha + 0.5 : new_alpha - 0.5;
+                int32_t quant_alpha = static_cast<int32_t>(new_alpha);
+                int32_t quant_beta  = 0;
+                if(orig_type == shape::int32_type)
+                {
+                    if(inputs.size() == 2 or dot_op.beta == 0.0f)
+                    {
+                        prog.replace_instruction(
+                            ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
+                    }
+                    // if there are 3 inputs, we need to consider the third argument
+                    else
+                    {
+                        auto q_dot = prog.insert_instruction(
+                            ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
+                        std::vector<float> vec_beta(q_dot->get_shape().elements(), dot_op.beta);
+                        auto l_beta = prog.add_literal(literal{orig_type, vec_beta});
+                        auto beta_c =
+                            prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
+                        prog.replace_instruction(ins, op::add{}, q_dot, beta_c);
+                    }
+                }
+                else
+                {
+                    if(inputs.size() == 2 or dot_op.beta == 0.0f)
+                    {
+                        auto q_dot = prog.insert_instruction(
+                            ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
+                        prog.replace_instruction(ins, op::convert{orig_type}, q_dot);
+                    }
+                    // if there are 3 inputs, we need to consider the third argument
+                    else
+                    {
+                        auto q_dot = prog.insert_instruction(
+                            ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
+                        auto oq_dot = prog.insert_instruction(ins, op::convert{orig_type}, q_dot);
+                        std::vector<float> vec_beta(q_dot->get_shape().elements(), dot_op.beta);
+                        auto l_beta = prog.add_literal(literal{oq_dot->get_shape(), vec_beta});
+                        auto beta_c =
+                            prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
+                        prog.replace_instruction(ins, op::add{}, oq_dot, beta_c);
+                    }
+                }
+            }
+            else
+            {
+                auto q_dot = prog.insert_instruction(ins, op::quant_dot{1, 0}, converted_inputs);
+                std::vector<float> vec_alpha(q_dot->get_shape().elements(), new_alpha);
+                if(orig_type == shape::int32_type)
+                {
+                    auto l_alpha = prog.add_literal(literal(ins->get_shape(), vec_alpha));
+                    if(converted_inputs.size() == 2 or dot_op.beta == 0.0f)
+                    {
+                        prog.replace_instruction(ins, op::mul{}, l_alpha, q_dot);
+                    }
+                    // case of 3 arguments
+                    else
+                    {
+                        std::vector<float> vec_beta(ins->get_shape().elements(), new_beta);
+                        auto l_beta   = prog.add_literal(literal(ins->get_shape(), vec_beta));
+                        auto alpha_ab = prog.insert_instruction(ins, op::mul{}, l_alpha, q_dot);
+                        auto beta_c =
+                            prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
+                        prog.replace_instruction(ins, op::add{}, alpha_ab, beta_c);
+                    }
+                }
+                else
+                {
+                    auto oq_dot  = prog.insert_instruction(ins, op::convert{orig_type}, q_dot);
+                    auto l_alpha = prog.add_literal(literal(ins->get_shape(), vec_alpha));
+                    if(converted_inputs.size() == 2 or dot_op.beta == 0.0f)
+                    {
+                        prog.replace_instruction(ins, op::mul{}, l_alpha, oq_dot);
+                    }
+                    // case of 3 arguments
+                    else
+                    {
+                        std::vector<float> vec_beta(ins->get_shape().elements(), new_beta);
+                        auto l_beta   = prog.add_literal(literal(ins->get_shape(), vec_beta));
+                        auto alpha_ab = prog.insert_instruction(ins, op::mul{}, l_alpha, oq_dot);
+                        auto beta_c =
+                            prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
+                        prog.replace_instruction(ins, op::add{}, alpha_ab, beta_c);
+                        // auto gemm_res = prog.insert_instruction(ins, op::add{}, alpha_ab,
+                        // beta_c); prog.replace_instruction(ins, op::capture{0, print_gemm_res},
+                        // gemm_res);
+                    }
+                }
+            }
+        }
+        else if(ins->name() == "convolution")
+        {
+            // Current MIOpen convolution does not support alpha and beta,
+            // so we need a separate multiply to adjust the output
+            auto conv_op       = any_cast<op::convolution>(ins->get_operator());
+            auto padding       = conv_op.padding;
+            auto stride        = conv_op.stride;
+            auto dilation      = conv_op.dilation;
+            auto padding_mode  = conv_op.padding_mode;
+            auto group         = conv_op.group;
+            auto adjust_factor = 1.0 / (ins_quant_params[0].first * ins_quant_params[1].first);
+
+            shape quant_shape =
+                compute_shape(op::quant_convolution{padding, stride, dilation, padding_mode, group},
+                              converted_inputs);
+            std::vector<float> vec_factor(quant_shape.elements(), adjust_factor);
+            auto fl = prog.add_literal(literal{{orig_type, quant_shape.lens()}, vec_factor});
+            if(quant_shape.type() == orig_type)
+            {
+                if(adjust_factor == 1.0f)
+                {
+                    prog.replace_instruction(
+                        ins,
+                        op::quant_convolution{padding, stride, dilation, padding_mode, group},
+                        converted_inputs);
+                }
+                else
+                {
+                    auto quant_conv = prog.insert_instruction(
+                        ins,
+                        op::quant_convolution{padding, stride, dilation, padding_mode, group},
+                        converted_inputs);
+                    prog.replace_instruction(ins, op::mul{}, quant_conv, fl);
+                    // auto q_conv = prog.insert_instruction(ins, op::mul{}, quant_conv, fl);
+                    // prog.replace_instruction(ins, op::capture{10000, print_conv_res}, q_conv);
+                }
+            }
+            else
+            {
+                auto quant_conv = prog.insert_instruction(
+                    ins,
+                    op::quant_convolution{padding, stride, dilation, padding_mode, group},
+                    converted_inputs);
+                if(adjust_factor == 1.0f)
+                {
+                    prog.replace_instruction(ins, op::convert{orig_type}, quant_conv);
+                }
+                else
+                {
+                    auto oq_conv = prog.insert_instruction(ins, op::convert{orig_type}, quant_conv);
+                    prog.replace_instruction(ins, op::mul{}, oq_conv, fl);
+                }
+            }
+        }
+        else
+        {
+            MIGRAPHX_THROW("QUANTIZE_INT8: does not support operator" + ins->name());
+        }
+    }
+
+    if(quant_param_index != quant_params.size())
+    {
+        MIGRAPHX_THROW("QUANTIZE_INT8: number of scales does not match");
+    }
+}
+
+void quantize_int8(program& prog, const std::vector<std::string>& ins_names)
+{
+    quantize_int8(prog, ins_names, int8_quant_params);
+}
+
+void quantize_int8(program& prog)
+{
+    std::vector<std::string> ins_names = {"dot", "convolution"};
+    quantize_int8(prog, ins_names, int8_quant_params);
+}
+
 // For the input of each input argument, we need to insert a
 // capture operator to compute the scale and shift
 void capture_arguments(program& prog,
@@ -188,5 +504,7 @@ void capture_arguments(program& prog, const std::vector<std::string>& ins_names)
    capture_arguments(prog, ins_names, calc_quant_params);
 }

+
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -219,40 +219,39 @@ struct cpu_quant_convolution
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
-        result.visit([&](auto output) {
-            visit_all(args[0], args[1])([&](auto input, auto weights) {
-                auto in   = input.get_shape().lens();
-                auto in_h = in[2];
-                auto in_w = in[3];
-
-                auto wei   = weights.get_shape().lens();
-                auto wei_n = wei[0];
-                auto wei_c = wei[1];
-                auto wei_h = wei[2];
-                auto wei_w = wei[3];
-
-                par_dfor(output_shape.lens()[0],
-                         output_shape.lens()[1],
-                         output_shape.lens()[2],
-                         output_shape.lens()[3])(
-                    [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
-                        const int start_x  = i * op.stride[0] - op.padding[0];
-                        const int start_y  = j * op.stride[1] - op.padding[1];
-                        const int group_id = w / (wei_n / op.group);
-
-                        float acc = 0;
-                        dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
-                            const int in_x  = start_x + x;
-                            const int in_y  = start_y + y;
-                            const int in_ch = group_id * wei_c + k;
-                            if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
-                            {
-                                acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
-                            }
-                        });
-                        output(o, w, i, j) = acc;
+        auto output = result.get<int32_t>();
+        visit_all(args[0], args[1])([&](auto input, auto weights) {
+            auto in   = input.get_shape().lens();
+            auto in_h = in[2];
+            auto in_w = in[3];
+
+            auto wei   = weights.get_shape().lens();
+            auto wei_n = wei[0];
+            auto wei_c = wei[1];
+            auto wei_h = wei[2];
+            auto wei_w = wei[3];
+
+            par_dfor(output_shape.lens()[0],
+                     output_shape.lens()[1],
+                     output_shape.lens()[2],
+                     output_shape.lens()[3])(
+                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
+                    const auto start_x  = i * op.stride[0] - op.padding[0];
+                    const auto start_y  = j * op.stride[1] - op.padding[1];
+                    const auto group_id = w / (wei_n / op.group);
+
+                    int32_t acc = 0;
+                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
+                        const auto in_x  = start_x + x;
+                        const auto in_y  = start_y + y;
+                        const auto in_ch = group_id * wei_c + k;
+                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
+                        {
+                            acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
+                        }
                    });
-            });
+                    output(o, w, i, j) = acc;
+                });
        });

        return result;

--- a/src/targets/gpu/device/pack.cpp
+++ b/src/targets/gpu/device/pack.cpp
@@ -13,18 +13,18 @@ namespace device {

 void pack_a(hipStream_t stream, const argument& result, const argument& arg)
 {
-    auto output_shape  = result.get_shape();
-    auto out_lens      = output_shape.lens();
+    auto comp_shape    = arg.get_shape();
+    auto out_lens      = comp_shape.lens();
    auto dim_0         = out_lens.size() - 2;
    auto dim_1         = out_lens.size() - 1;
-    std::size_t lda    = output_shape.strides()[dim_0];
+    std::size_t lda    = comp_shape.strides()[dim_0];
    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = output_shape.elements();
+        std::size_t nelements = comp_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
-            hip_tensor_descriptor<out_dim> desc(output_shape);
+            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);
@@ -40,7 +40,7 @@ void pack_a(hipStream_t stream, const argument& result, const argument& arg)

 void pack_b(hipStream_t stream, const argument& result, const argument& arg)
 {
-    auto trans_shape = result.get_shape();
+    auto trans_shape = arg.get_shape();
    auto out_lens    = trans_shape.lens();
    auto dim_0       = trans_shape.lens().size() - 2;
    auto dim_1       = trans_shape.lens().size() - 1;
@@ -48,14 +48,14 @@ void pack_b(hipStream_t stream, const argument& result, const argument& arg)

    auto wrap_lens = out_lens;
    std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
-    shape output_shape{trans_shape.type(), wrap_lens};
+    shape comp_shape{trans_shape.type(), wrap_lens};
    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = output_shape.elements();
+        std::size_t nelements = comp_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
-            hip_tensor_descriptor<out_dim> desc(output_shape);
+            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);

--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -33,7 +33,10 @@ struct miopen_quant_convolution
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }

    private:
    shape pack_int8_shape(shape& s);

--- a/src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp
@@ -13,8 +13,6 @@ struct context;
 struct miopen_quant_gemm
 {
    op::quant_dot op;
-    mutable argument arg_a{};
-    mutable argument arg_b{};

    template <class Self, class F>
    static auto reflect(Self& self, F f)

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -99,7 +99,6 @@ struct miopen_apply
        add_generic_op<hip_min>("min");

        add_extend_op<miopen_gemm, op::dot>("dot");
-        add_extend_op<miopen_quant_gemm, op::quant_dot>("quant_dot");
        add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
        add_extend_op<hip_concat, op::concat>("concat");
        add_extend_op<miopen_softmax, op::softmax>("softmax");
@@ -112,6 +111,7 @@ struct miopen_apply
        add_lrn_op();
        add_convolution_op();
        add_quant_convolution_op();
+        add_quant_dot_op();
        add_pooling_op();
        add_batch_norm_inference_op();
    }
@@ -167,10 +167,31 @@ struct miopen_apply
            auto ws   = conv.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));

            auto workspace = insert_allocation(ins, ws, "workspace");
-            auto output    = insert_allocation(ins, ins->get_shape());
+
+            // add a temp float output to store the miopen convolution output
+            shape tmp_output_shape{shape::float_type, ins->get_shape().lens()};
+            auto tmp_output = insert_allocation(ins, tmp_output_shape, "tmp_out");
+            auto output     = insert_allocation(ins, ins->get_shape());

            return prog->replace_instruction(
-                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
+                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, tmp_output, output);
+        });
+    }
+
+    void add_quant_dot_op()
+    {
+        apply_map.emplace("quant_dot", [=](instruction_ref ins) {
+            auto&& op      = any_cast<op::quant_dot>(ins->get_operator());
+            auto inputs    = ins->inputs();
+            auto in_shapes = to_shapes(inputs);
+            auto pack_a    = insert_allocation(ins, in_shapes[0], "pack_a");
+            auto pack_b    = insert_allocation(ins, in_shapes[1], "pack_b");
+            auto output    = insert_allocation(ins, ins->get_shape());
+            inputs.push_back(pack_a);
+            inputs.push_back(pack_b);
+            inputs.push_back(output);
+
+            return prog->replace_instruction(ins, miopen_quant_gemm{op}, inputs);
        });
    }


--- a/src/targets/gpu/quant_convolution.cpp
+++ b/src/targets/gpu/quant_convolution.cpp
 #include <migraphx/gpu/quant_convolution.hpp>
+#include <migraphx/gpu/device/convert.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/generate.hpp>

@@ -8,7 +9,7 @@ namespace gpu {

 shape miopen_quant_convolution::compute_shape(const std::vector<shape>& inputs) const
 {
-    check_shapes{inputs, *this}.has(4).standard();
+    check_shapes{inputs, *this}.has(5).standard();
    return op.compute_shape({inputs.at(0), inputs.at(1)});
 }
 argument miopen_quant_convolution::compute(context& ctx,
@@ -19,7 +20,8 @@ argument miopen_quant_convolution::compute(context& ctx,
    auto x_desc_vec4 = make_tensor(args[0].get_shape(), true);
    auto w_desc      = make_tensor(args[1].get_shape());
    auto w_desc_vec4 = make_tensor(args[1].get_shape(), true);
-    auto y_desc      = make_tensor(output_shape);
+    shape tmp_output_shape{shape::float_type, output_shape.lens()};
+    auto y_desc = make_tensor(tmp_output_shape);

    float alpha = 1;
    float beta  = 0;
@@ -67,7 +69,11 @@ argument miopen_quant_convolution::compute(context& ctx,
    {
        MIGRAPHX_THROW("QUANT_CONVOLUTION: run convolution forward failed");
    }
-    return args[3];
+
+    // Add a conversion from float to int32_t
+    device::convert(ctx.get_stream().get(), args[4], args[3], 1.0f, 0.0f, shape::int32_type);
+
+    return args[4];
 }

 shape miopen_quant_convolution::compile(context& ctx,
@@ -77,7 +83,8 @@ shape miopen_quant_convolution::compile(context& ctx,
    shape workspace_shape{};
    auto x_desc = make_tensor(inputs[0], true);
    auto w_desc = make_tensor(inputs[1], true);
-    auto y_desc = make_tensor(output_shape);
+    shape tmp_output_shape{shape::float_type, output_shape.lens()};
+    auto y_desc = make_tensor(tmp_output_shape);

    std::size_t workspace_size = 0;
    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
@@ -90,7 +97,7 @@ shape miopen_quant_convolution::compile(context& ctx,

    arg_vec4_x     = to_gpu(generate_argument(pack_int8_shape(inputs[0])));
    arg_vec4_w     = to_gpu(generate_argument(pack_int8_shape(inputs[1])));
-    auto y         = allocate_gpu(output_shape);
+    auto y         = allocate_gpu(tmp_output_shape);
    auto workspace = allocate_gpu(workspace_shape);

    int algo_count = 1;

--- a/src/targets/gpu/quant_gemm.cpp
+++ b/src/targets/gpu/quant_gemm.cpp
@@ -54,10 +54,11 @@ rb_type<T>* to_rocblas_type(T* x)

 shape miopen_quant_gemm::compute_shape(const std::vector<shape>& inputs) const
 {
-    std::vector<shape> input_shapes(inputs);
-    input_shapes.pop_back();
-    check_shapes{input_shapes}.not_broadcasted();
-    return op.compute_shape(input_shapes);
+    std::vector<shape> in_shapes(inputs);
+    in_shapes.erase(in_shapes.begin() + in_shapes.size() - 3, in_shapes.end());
+    check_shapes{in_shapes}.not_broadcasted();
+
+    return op.compute_shape(in_shapes);
 }

 argument miopen_quant_gemm::compute(context& ctx,
@@ -69,31 +70,24 @@ argument miopen_quant_gemm::compute(context& ctx,
    auto n_dim      = output_shape.lens().size();
    auto dim_1      = n_dim - 1;
    auto dim_0      = n_dim - 2;
+    auto arg_num    = args.size();
    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
-    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
+    rocblas_int ldc = args[arg_num - 1].get_shape().strides()[dim_0];

    if(!transb)
    {
-        if(arg_b.empty())
-        {
-            arg_b = allocate_gpu(args[1].get_shape());
-        }
-        device::pack_a(ctx.get_stream().get(), arg_b, args[1]);
+        device::pack_a(ctx.get_stream().get(), args[arg_num - 2], args[1]);
    }

    // need to pack A in this scenario, use the algorithm to pack B in the
    // comment of the API
    if(transa)
    {
-        if(arg_a.empty())
-        {
-            arg_a = allocate_gpu(args.at(0).get_shape());
-        }
-        device::pack_b(ctx.get_stream().get(), arg_a, args[0]);
+        device::pack_b(ctx.get_stream().get(), args[arg_num - 3], args[0]);
    }

-    bool is_3inputs = (args.size() == 4);
+    bool is_3inputs = (arg_num == 6);
    int32_t beta    = 0;
    if(is_3inputs)
    {
@@ -127,17 +121,18 @@ argument miopen_quant_gemm::compute(context& ctx,
                                    m,
                                    k,
                                    &alpha_r,
-                                    (!transb) ? to_pointer(arg_b) : to_pointer(args.at(1)),
+                                    (!transb) ? to_pointer(args[arg_num - 2])
+                                              : to_pointer(args.at(1)),
                                    rocblas_datatype_i8_r,
                                    ldb,
-                                    transa ? to_pointer(arg_a) : to_pointer(args.at(0)),
+                                    transa ? to_pointer(args[arg_num - 3]) : to_pointer(args.at(0)),
                                    rocblas_datatype_i8_r,
                                    lda,
                                    &beta_r,
                                    to_pointer(args[2]),
                                    rocblas_datatype_i32_r,
                                    ldc,
-                                    is_3inputs ? to_pointer(args.at(3)) : to_pointer(args[2]),
+                                    to_pointer(args[arg_num - 1]),
                                    rocblas_datatype_i32_r,
                                    ldc,
                                    rocblas_datatype_i32_r,
@@ -157,11 +152,11 @@ argument miopen_quant_gemm::compute(context& ctx,
                m,
                k,
                &alpha_r,
-                (!transb) ? to_pointer(arg_b) : to_pointer(args.at(1)),
+                (!transb) ? to_pointer(args[arg_num - 2]) : to_pointer(args.at(1)),
                rocblas_datatype_i8_r,
                ldb,
                k * n,
-                transa ? to_pointer(arg_a) : to_pointer(args.at(0)),
+                transa ? to_pointer(args[arg_num - 3]) : to_pointer(args.at(0)),
                rocblas_datatype_i8_r,
                lda,
                m * k,
@@ -170,7 +165,7 @@ argument miopen_quant_gemm::compute(context& ctx,
                rocblas_datatype_i32_r,
                ldc,
                m * n,
-                is_3inputs ? to_pointer(args.at(3)) : to_pointer(args[2]),
+                to_pointer(args[arg_num - 1]),
                rocblas_datatype_i32_r,
                ldc,
                m * n,
@@ -184,7 +179,7 @@ argument miopen_quant_gemm::compute(context& ctx,
        }
    });

-    return is_3inputs ? args.at(3) : args[2];
+    return args[arg_num - 1];
 }

 } // namespace gpu

--- a/test/cpu_ops_test.cpp
+++ b/test/cpu_ops_test.cpp
@@ -1317,24 +1317,24 @@ TEST_CASE(quant_conv2d_test)
    p.compile(migraphx::cpu::target{});
    auto result = p.eval({});

-    std::vector<float> s = {10197,
-                            10548,
-                            11601,
-                            11952,
-                            25506,
-                            26586,
-                            29826,
-                            30906,
-                            27045,
-                            27396,
-                            28449,
-                            28800,
-                            77346,
-                            78426,
-                            81666,
-                            82746};
-
-    std::vector<float> results_vector;
+    std::vector<int32_t> s = {10197,
+                              10548,
+                              11601,
+                              11952,
+                              25506,
+                              26586,
+                              29826,
+                              30906,
+                              27045,
+                              27396,
+                              28449,
+                              28800,
+                              77346,
+                              78426,
+                              81666,
+                              82746};
+
+    std::vector<int32_t> results_vector;
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
    EXPECT(migraphx::verify_range(results_vector, s));
 }
@@ -1357,14 +1357,14 @@ TEST_CASE(quant_conv2d_test_default_mode)
    p.compile(migraphx::cpu::target{});
    auto result = p.eval({});

-    std::vector<float> s = {
+    std::vector<int32_t> s = {
        10197, 10548, 6939,  3420,  11601, 11952, 7839,  3852,  7383,  7590,  4953,  2421,  3480,
        3570,  2316,  1125,  25506, 26586, 17874, 9009,  29826, 30906, 20718, 10413, 20505, 21198,
        14187, 7119,  10527, 10860, 7257,  3636,  27045, 27396, 17739, 8604,  28449, 28800, 18639,
        9036,  17319, 17526, 11289, 5445,  7800,  7890,  5052,  2421,  77346, 78426, 52002, 25857,
        81666, 82746, 54846, 27261, 53769, 54462, 36075, 17919, 26511, 26844, 17769, 8820};

-    std::vector<float> results_vector;
+    std::vector<int32_t> results_vector;
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
    EXPECT(migraphx::verify_range(results_vector, s));
 }
@@ -1387,24 +1387,24 @@ TEST_CASE(quant_conv2d_test_valid_mode)
    p.compile(migraphx::cpu::target{});
    auto result = p.eval({});

-    std::vector<float> s = {10197,
-                            10548,
-                            11601,
-                            11952,
-                            25506,
-                            26586,
-                            29826,
-                            30906,
-                            27045,
-                            27396,
-                            28449,
-                            28800,
-                            77346,
-                            78426,
-                            81666,
-                            82746};
-
-    std::vector<float> results_vector;
+    std::vector<int32_t> s = {10197,
+                              10548,
+                              11601,
+                              11952,
+                              25506,
+                              26586,
+                              29826,
+                              30906,
+                              27045,
+                              27396,
+                              28449,
+                              28800,
+                              77346,
+                              78426,
+                              81666,
+                              82746};
+
+    std::vector<int32_t> results_vector;
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
    EXPECT(migraphx::verify_range(results_vector, s));
 }
@@ -1422,15 +1422,15 @@ TEST_CASE(quant_conv2d_padding_test)
    auto cl = p.add_literal(migraphx::literal{c_shape, c});
    p.add_instruction(migraphx::op::quant_convolution{{{1, 1}}, {{1, 1}}}, al, cl);
    p.compile(migraphx::cpu::target{});
-    auto result          = p.eval({});
-    std::vector<float> s = {
+    auto result            = p.eval({});
+    std::vector<int32_t> s = {
        4521,  6753,  7014,  4635,  6858,  10197, 10548, 6939,  7830,  11601, 11952, 7839,  5007,
        7383,  7590,  4953,  10515, 15987, 16734, 11277, 16821, 25506, 26586, 17874, 19737, 29826,
        30906, 20718, 13593, 20505, 21198, 14187, 13161, 19281, 19542, 12699, 18522, 27045, 27396,
        17739, 19494, 28449, 28800, 18639, 11919, 17319, 17526, 11289, 34707, 51843, 52590, 34893,
        51813, 77346, 78426, 52002, 54729, 81666, 82746, 54846, 36057, 53769, 54462, 36075};

-    std::vector<float> results_vector;
+    std::vector<int32_t> results_vector;
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
    EXPECT(migraphx::verify_range(results_vector, s));
 }
@@ -1450,23 +1450,23 @@ TEST_CASE(quant_conv2d_padding_stride_test)
    p.compile(migraphx::cpu::target{});
    auto result = p.eval({});

-    std::vector<float> s = {4521,
-                            7014,
-                            7830,
-                            11952,
-                            10515,
-                            16734,
-                            19737,
-                            30906,
-                            13161,
-                            19542,
-                            19494,
-                            28800,
-                            34707,
-                            52590,
-                            54729,
-                            82746};
-    std::vector<float> results_vector;
+    std::vector<int32_t> s = {4521,
+                              7014,
+                              7830,
+                              11952,
+                              10515,
+                              16734,
+                              19737,
+                              30906,
+                              13161,
+                              19542,
+                              19494,
+                              28800,
+                              34707,
+                              52590,
+                              54729,
+                              82746};
+    std::vector<int32_t> results_vector;
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
    EXPECT(migraphx::verify_range(results_vector, s));
 }

--- a/test/gpu/miopen.cpp
+++ b/test/gpu/miopen.cpp
@@ -82,10 +82,6 @@ auto get_hash(const T& x)
    return std::hash<T>{}(x);
 }

-// add an overload function for int type
-// to avoid overflow in test examples
-inline auto get_hash(const int& x) { return std::hash<int>{}(x) / 64; }
-
 void compile_check(migraphx::program& p, const migraphx::target& t)
 {
    auto name = t.name();

--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -78,24 +78,24 @@ TEST_CASE(convolution_shape)

 TEST_CASE(quant_convolution_shape)
 {
-    migraphx::shape output{migraphx::shape::float_type, {4, 4, 1, 1}};
+    migraphx::shape output{migraphx::shape::int32_type, {4, 4, 1, 1}};
    migraphx::shape input{migraphx::shape::int8_type, {4, 3, 3, 3}};
    migraphx::shape weights{migraphx::shape::int8_type, {4, 3, 3, 3}};
    expect_shape(output, migraphx::op::quant_convolution{}, input, weights);
    throws_shape(migraphx::op::quant_convolution{}, input);

-    migraphx::shape input2{migraphx::shape::float_type, {3, 3}};
+    migraphx::shape input2{migraphx::shape::int32_type, {3, 3}};
    migraphx::shape weights2{migraphx::shape::float_type, {3, 3}};
    throws_shape(migraphx::op::quant_convolution{}, input2, weights2);
    throws_shape(migraphx::op::quant_convolution{}, input2, weights);

-    migraphx::shape input3{migraphx::shape::float_type, {4, 3, 3, 3}};
+    migraphx::shape input3{migraphx::shape::int32_type, {4, 3, 3, 3}};
    migraphx::shape weight3{migraphx::shape::float_type, {4, 3, 3, 3}};
    throws_shape(migraphx::op::quant_convolution{}, input3, weights);
    throws_shape(migraphx::op::quant_convolution{}, input, weight3);
    throws_shape(migraphx::op::quant_convolution{}, input3, weight3);

-    migraphx::shape output_same_mode{migraphx::shape::float_type, {4, 4, 3, 3}};
+    migraphx::shape output_same_mode{migraphx::shape::int32_type, {4, 4, 3, 3}};
    expect_shape(output_same_mode,
                 migraphx::op::quant_convolution{{{0, 0}}, {{1, 1}}, {{1, 1}}, migraphx::op::same},
                 input,