Commit cb555646 authored by Shucai Xiao

merge changes from int8_quantize

parents 12ccb601 4a10535c
@@ -36,7 +36,7 @@ struct argument : raw_data<argument>
     }
     /// Provides a raw pointer to the data
-    std::function<char*()> data;
+    std::function<char*()> data = nullptr;
     /// Whether data is available
     bool empty() const { return not data; }
...
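A default-constructed std::function is already empty, so the explicit `= nullptr` above mainly documents intent: `empty()` reports true until a data provider is assigned. A standalone sketch of that behavior (plain C++, not the MIGraphX type itself):

#include <cassert>
#include <functional>

int main()
{
    std::function<char*()> data = nullptr; // same state as default construction
    assert(not data);                      // "empty" in the sense of argument::empty()
    char storage[4] = {};
    data = [&] { return static_cast<char*>(storage); };
    assert(data);                          // now it provides a raw pointer
    return 0;
}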
@@ -28,23 +28,31 @@ struct binary : op_name<Derived>
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
-            if(input1.get_shape().packed() and input2.get_shape().packed())
-            {
+        auto s1 = args[0].get_shape();
+        auto s2 = args[1].get_shape();
+        if(s1 == s2 and s1.packed())
+        {
+            shape std_shape{s1.type(), s1.lens()};
+            argument std_result{std_shape, result.data()};
+            argument std_arg0{std_shape, args[0].data()};
+            argument std_arg1{std_shape, args[1].data()};
+            visit_all(std_result, std_arg0, std_arg1)([&](auto output, auto input1, auto input2) {
                 std::transform(input1.begin(),
                                input1.end(),
                                input2.begin(),
                                output.begin(),
                                static_cast<const Derived&>(*this).apply());
-            }
-            else
-            {
+            });
+        }
+        else
+        {
+            visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
                 shape_for_each(output.get_shape(), [&](const auto& idx) {
                     output(idx.begin(), idx.end()) = static_cast<const Derived&>(*this).apply()(
                         input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
                 });
-            }
-        });
+            });
+        }
         return result;
     }
...
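The rewrite above hoists the layout check out of the visit: when both inputs share one packed shape, the op can view all three buffers through a standard shape and do a single linear std::transform, with no per-element index math. A standalone sketch of the same split, assuming plain contiguous buffers in place of MIGraphX arguments:

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> a = {1, 2, 3, 4};
    std::vector<float> b = {10, 20, 30, 40};
    std::vector<float> out(a.size());
    bool same_packed_layout = true; // stands in for "s1 == s2 and s1.packed()"
    if(same_packed_layout)
    {
        // fast path: one linear pass over the raw data
        std::transform(a.begin(), a.end(), b.begin(), out.begin(),
                       [](float x, float y) { return x + y; });
    }
    // (otherwise an implementation falls back to per-index addressing,
    // as shape_for_each does above)
    for(float v : out)
        std::printf("%g ", v); // 11 22 33 44
    std::printf("\n");
    return 0;
}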
@@ -23,7 +23,7 @@ struct capture
     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
-        return pack(f(self.ins_index, "instruction_index"));
+        return pack(f(self.ins_index, "ins_index"));
     }
     std::string name() const { return "capture"; }
...
@@ -20,11 +20,14 @@ namespace op {
 struct convert : unary<convert>
 {
     shape::type_t target_type = shape::half_type;
+    float scale = 1.0f;
+    float shift = 0.0f;
     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
-        return pack(f(self.target_type, "target_type"));
+        return pack(
+            f(self.target_type, "target_type"), f(self.scale, "scale"), f(self.shift, "shift"));
     }
     shape compute_shape(std::vector<shape> inputs) const
@@ -35,10 +38,22 @@ struct convert : unary<convert>
     auto apply() const
     {
-        return [](auto x) { return x; };
+        return [&](auto x) {
+            float res = scale * x + shift;
+            if(target_type == shape::int8_type)
+            {
+                int factor = (res > 0) ? 1 : -1;
+                res = res + factor * 0.5f;
+                res = res > 127.0 ? 127.0 : res;
+                res = res < -128.0 ? -128.0 : res;
+            }
+            return res;
+        };
     }
     convert(shape::type_t t) : target_type{t} {}
+    convert(shape::type_t t, float sle, float sft) : target_type{t}, scale{sle}, shift{sft} {}
     convert() {}
 };
...
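For an int8 target, the new apply() computes scale * x + shift, rounds half away from zero (the `factor * 0.5f` term), and saturates to [-128, 127]. A standalone sketch of that arithmetic:

#include <cstdint>
#include <cstdio>

int8_t to_int8(float x, float scale, float shift)
{
    float res  = scale * x + shift;
    int factor = (res > 0) ? 1 : -1; // round half away from zero
    res        = res + factor * 0.5f;
    res        = res > 127.0f ? 127.0f : res;
    res        = res < -128.0f ? -128.0f : res;
    return static_cast<int8_t>(res);
}

int main()
{
    // scale = 63.5 maps the range [-2, 2] onto roughly [-127, 127]
    std::printf("%d %d %d\n",
                to_int8(2.0f, 63.5f, 0.0f),   // 127
                to_int8(-2.0f, 63.5f, 0.0f),  // -127
                to_int8(10.0f, 63.5f, 0.0f)); // saturates at 127
    return 0;
}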
#ifndef MIGRAPHX_GUARD_OPERATORS_QUANT_CONVOLUTION_HPP
#define MIGRAPHX_GUARD_OPERATORS_QUANT_CONVOLUTION_HPP
#include <array>
#include <migraphx/op/common.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct quant_convolution
{
std::array<std::size_t, 2> padding = {{0, 0}};
std::array<std::size_t, 2> stride = {{1, 1}};
std::array<std::size_t, 2> dilation = {{1, 1}};
padding_mode_t padding_mode = default_;
int group = 1;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.padding, "padding"),
f(self.stride, "stride"),
f(self.dilation, "dilation"),
f(self.padding_mode, "padding_mode"),
f(self.group, "group"));
}
std::string name() const { return "quant_convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2).same_type().same_ndims().only_dims(4);
const shape& input = inputs.at(0);
const shape& weights = inputs.at(1);
auto t = input.type();
// all inputs must be of int8_type, and the output is float_type
if(t != shape::int8_type)
{
MIGRAPHX_THROW("QUANT_CONVOLUTION: only accepts input and weights of type int8_t");
}
}
t = shape::float_type;
if(padding_mode == default_)
{
return {t,
{
input.lens()[0],
weights.lens()[0],
std::size_t(std::max<std::ptrdiff_t>(
1,
(input.lens()[2] - (1 + dilation[0] * (weights.lens()[2] - 1)) +
2 * padding[0]) /
stride[0] +
1)),
std::size_t(std::max<std::ptrdiff_t>(
1,
(input.lens()[3] - (1 + dilation[1] * (weights.lens()[3] - 1)) +
2 * padding[1]) /
stride[1] +
1)),
}};
}
else if(padding_mode == same)
{
return {t,
{input.lens()[0],
weights.lens()[0],
static_cast<std::size_t>(
std::ceil(static_cast<double>(input.lens()[2]) / stride[0])),
static_cast<std::size_t>(
std::ceil(static_cast<double>(input.lens()[3]) / stride[1]))}};
}
else if(padding_mode == valid)
{
return {
t,
{input.lens()[0],
weights.lens()[0],
static_cast<std::size_t>(std::ceil(
static_cast<double>(input.lens()[2] - weights.lens()[2] + 1) / stride[0])),
static_cast<std::size_t>(std::ceil(
static_cast<double>(input.lens()[3] - weights.lens()[3] + 1) / stride[1]))}};
}
else
{
MIGRAPHX_THROW("QUANT_CONVOLUTION: invalid padding mode");
}
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
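For the default_ padding mode, the branch above is the usual convolution size formula out = (in - effective_kernel + 2 * pad) / stride + 1, with effective_kernel = 1 + dilation * (k - 1), clamped to at least 1. A standalone check with illustrative values:

#include <algorithm>
#include <cstdio>

int main()
{
    long in = 224, k = 3, pad = 1, stride = 2, dilation = 1;
    long eff_k = 1 + dilation * (k - 1); // dilated kernel extent
    long out   = std::max<long>(1, (in - eff_k + 2 * pad) / stride + 1);
    std::printf("out = %ld\n", out); // 112 for these values
    return 0;
}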
#ifndef MIGRAPHX_GUARD_OPERATORS_QUANT_DOT_HPP
#define MIGRAPHX_GUARD_OPERATORS_QUANT_DOT_HPP
#include <array>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct quant_dot
{
int32_t alpha = 1;
int32_t beta = 1;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(as_number(self.alpha), "alpha"), f(as_number(self.beta), "beta"));
}
std::string name() const { return "quant_dot"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type();
const shape& a = inputs.at(0);
const shape& b = inputs.at(1);
auto t = a.type();
if(t != shape::int8_type)
{
MIGRAPHX_THROW("QUANT_DOT: only support data type int8_t");
}
if(!std::all_of(inputs.begin(), inputs.end(), [](auto s) { return s.lens().size() >= 2; }))
{
MIGRAPHX_THROW("QUANT_DOT: dot only accept 2 or more dims operands");
}
// only handle the case that the batch size of a and b are the same
if(!std::equal(
a.lens().rbegin() + 2, a.lens().rend(), b.lens().rbegin() + 2, b.lens().rend()))
{
MIGRAPHX_THROW("QUANT_DOT: batch size of A and B mismatch: {" +
to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) + "}");
}
std::size_t dim_0 = a.lens().size() - 2;
std::size_t dim_1 = a.lens().size() - 1;
if(a.lens()[dim_1] != b.lens()[dim_0])
{
MIGRAPHX_THROW("QUANT_DOT: inner dimensions do not match: {" +
to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) + "}");
}
// k must be a multiple of 4
if((a.lens()[dim_1] % 4) != 0)
{
MIGRAPHX_THROW("QUANT_DOT: size of A {" + to_string_range(a.lens()) + "} and B {" +
to_string_range(b.lens()) + "} must be a multiple of 4 for int8 type");
}
auto out_lens = a.lens();
out_lens[dim_1] = b.lens()[dim_1];
if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
{
MIGRAPHX_THROW("QUANT_DOT: dimension mismatch, operand C: {" +
to_string_range(inputs.at(2).lens()) +
"}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
}
if(inputs.size() == 3 && inputs.at(2).type() != shape::int32_type)
{
MIGRAPHX_THROW("QUANT_DOT: operand C type must be int32");
}
return {shape::int32_type, out_lens};
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
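compute_shape returns int32_type (and requires any C operand to already be int32) because an int8 dot product overflows int8 almost immediately; the accumulator must be wider. A standalone sketch:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int8_t> a = {127, 127, 127, 127}; // k = 4, a multiple of 4
    std::vector<int8_t> b = {127, 127, 127, 127};
    int32_t acc = 0; // int8 accumulation would overflow on the first product
    for(std::size_t i = 0; i < a.size(); ++i)
        acc += int32_t{a[i]} * int32_t{b[i]};
    std::printf("%d\n", acc); // 64516, far outside [-128, 127]
    return 0;
}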
@@ -27,26 +27,34 @@ struct unary : op_name<Derived>
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        result.visit([&](auto output) {
-            args[0].visit([&](auto input) {
-                if(input.get_shape().packed())
-                {
+        auto in_shape = args[0].get_shape();
+        if(in_shape.packed())
+        {
+            shape std_in_shape{in_shape.type(), in_shape.lens()};
+            shape std_out_shape{output_shape.type(), output_shape.lens()};
+            argument arg_in{std_in_shape, args[0].data()};
+            argument arg_out{std_out_shape, result.data()};
+            arg_out.visit([&](auto output) {
+                arg_in.visit([&](auto input) {
                     std::transform(input.begin(),
                                    input.end(),
                                    output.begin(),
                                    static_cast<const Derived&>(*this).apply());
-                    return result;
-                }
-                shape_for_each(output.get_shape(), [&](const auto& idx) {
-                    output(idx.begin(), idx.end()) =
-                        static_cast<const Derived&>(*this).apply()(input(idx.begin(), idx.end()));
-                });
-                return result;
-            });
-        });
+                });
+            });
+        }
+        else
+        {
+            result.visit([&](auto output) {
+                args[0].visit([&](auto input) {
+                    shape_for_each(output.get_shape(), [&](const auto& idx) {
+                        output(idx.begin(), idx.end()) = static_cast<const Derived&>(*this).apply()(
+                            input(idx.begin(), idx.end()));
+                    });
+                });
+            });
+        }
         return result;
     }
...
@@ -43,6 +43,8 @@
 #include <migraphx/op/outline.hpp>
 #include <migraphx/op/pad.hpp>
 #include <migraphx/op/pooling.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/op/relu.hpp>
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/rnn.hpp>
...
@@ -21,6 +21,9 @@ void capture_arguments(program& prog,
                        const std::vector<std::string>& ins_names,
                        std::size_t& num_quant_params,
                        std::function<void(std::size_t, std::vector<argument> args)> func);
+void quantize_int8(program& prog,
+                   const std::vector<std::string>& ins_names,
+                   std::vector<std::pair<float, float>>& int8_quant_params);
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
...
@@ -186,6 +186,12 @@ PYBIND11_MODULE(migraphx, m)
         migraphx::quantize(p, ins_names);
     });
     m.def("quantize", [](migraphx::program& p) { migraphx::quantize(p, {"all"}); });
+    m.def("quantize_int8",
+          [](migraphx::program& p,
+             std::vector<std::string>& ins_names,
+             std::vector<std::pair<float, float>>& quant_params) {
+              migraphx::quantize_int8(p, ins_names, quant_params);
+          });
 #ifdef HAVE_GPU
     m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
...
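From C++, the new overload is called the same way the binding wraps it. A hedged usage sketch (quantize_example is illustrative, and assumes a program `p` built elsewhere; one (scale, shift) pair is consumed per quantized input, in instruction order):

#include <migraphx/program.hpp>
#include <migraphx/quantization.hpp>
#include <string>
#include <utility>
#include <vector>

void quantize_example(migraphx::program& p)
{
    std::vector<std::string> ins_names = {"dot", "convolution"};
    std::vector<std::pair<float, float>> params = {{63.5f, 0.0f}, {63.5f, 0.0f}};
    migraphx::quantize_int8(p, ins_names, params);
}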
@@ -2,12 +2,15 @@
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/op/convert.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/mul.hpp>
 #include <migraphx/op/add.hpp>
+#include <migraphx/op/quant_dot.hpp>
+#include <migraphx/op/capture.hpp>
 #include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/multibroadcast.hpp>
-#include <migraphx/op/capture.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/ranges.hpp>
 #include <utility>
@@ -113,6 +116,305 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
void quantize(program& prog) { quantize(prog, {"all"}); }
// int8 quantization is different from fp16 since int8 can only represent
// values in the range -128 ~ 127. To convert a float or double value to int8,
// we need a scale and a shift; the conversion is then v_int8 = fp * scale + shift.
// To simplify the changes, we treat the shift as 0.0f for now.
void quantize_int8(program& prog,
const std::vector<std::string>& ins_names,
std::vector<std::pair<float, float>>& int8_quant_params)
{
// // For debugging
// auto print_gemm_res = [&](std::size_t ins_index, std::vector<migraphx::argument> args) {
// // scale and shift are needed only for the int8 type, and we do not
// // consider shift, so it is set to 0
// std::vector<float> vec_val;
// args.front().visit([&](auto output) { vec_val.assign(output.begin(), output.end()); });
// std::cout << "quant_gemm = " << std::endl;
// for (size_t i = 0; i < 20; i++)
// {
// std::cout << vec_val[i] << "\t";
// }
// std::cout << std::endl;
// };
// // For debugging
// auto print_conv_res = [&](std::size_t ins_index, std::vector<migraphx::argument> args) {
// // scale and shift are needed only for the int8 type, and we do not
// // consider shift, so it is set to 0
// std::vector<float> vec_val;
// args.front().visit([&](auto output) { vec_val.assign(output.begin(), output.end()); });
// std::cout << "quant_conv = " << std::endl;
// for (size_t i = 0; i < 20; i++)
// {
// std::cout << vec_val[i] << "\t";
// }
// std::cout << std::endl;
// };
// For now, we only support the int8 quantization of gemm and convolution
std::vector<std::string> op_names = {"dot", "convolution"};
if(!std::all_of(ins_names.begin(), ins_names.end(), [&](auto name) {
return (std::find(op_names.begin(), op_names.end(), name) != op_names.end());
}))
{
MIGRAPHX_THROW("QUANTIZE_INT8: only support DOT and CONVOLUTION operation");
}
std::size_t quant_param_index = 0;
std::unordered_map<instruction_ref, instruction_ref> map_quant_ins;
for(auto ins : iterator_for(prog))
{
if(not contains(ins_names, ins->name()))
{
continue;
}
shape::type_t orig_type = ins->get_shape().type();
// for the dot operator, there could be 2 or 3 input arguments
// if the 3rd argument is available, convert it to an int32.
std::vector<instruction_ref> converted_inputs;
// process all inputs: if an input is fp32 or fp64, convert it
// to int8 by adding a convert operator, then replace
// the operator with the corresponding int8 version
auto inputs = ins->inputs();
std::vector<std::pair<float, float>> ins_quant_params;
for(auto input : inputs)
{
// In general, the target_type is int8, but for the dot
// operation, if it has 3 inputs, then the last one should
// be converted to int32_type
shape::type_t quant_type = shape::int8_type;
auto param = int8_quant_params[quant_param_index++];
ins_quant_params.push_back(param);
if(ins->name() == "dot" and inputs.size() == 3 and input == inputs.back())
{
quant_type = shape::int32_type;
}
auto s = input->get_shape();
if((s.type() == shape::float_type || s.type() == shape::double_type ||
s.type() == shape::int32_type) &&
s.type() != quant_type)
{
// if the input is itself a convert operator whose input already
// has the target type, reuse that input directly
instruction_ref quant_input{};
if(input->name() == "convert")
{
auto tmp_ins = input->inputs().front();
if(tmp_ins->get_shape().type() == quant_type)
{
quant_input = input->inputs().front();
}
else
{
quant_input = insert_quant_ins(
prog, input, quant_type, map_quant_ins, param.first, param.second);
}
}
else
{
quant_input = insert_quant_ins(
prog, input, quant_type, map_quant_ins, param.first, param.second);
}
converted_inputs.push_back(quant_input);
}
else
{
converted_inputs.push_back(input);
}
}
// no change for the input, go to the next instruction
if(inputs == converted_inputs)
{
continue;
}
// When converting from other types to int8_type, the scale and
// shift(.0f) parameters generate results different from
// the original ones. To adjust the output to be "correct (approximately
// equal)", we need additional calculation for the adjustment
if(ins->name() == "dot")
{
auto dot_op = any_cast<op::dot>(ins->get_operator());
float new_alpha =
dot_op.alpha / (ins_quant_params[0].first * ins_quant_params[1].first);
float new_beta = dot_op.beta;
// We need an additional check on the new_alpha value. If
// abs(new_alpha) >= 50 (a temporary threshold), we can round
// it to an integer and use it as the alpha of the quant_dot
float threshold = 50.0f;
if(fabs(new_alpha) >= threshold && fabs(new_beta) >= threshold)
{
int32_t quant_alpha = static_cast<int32_t>(new_alpha);
int32_t quant_beta = static_cast<int32_t>(new_beta);
shape quant_shape = compute_shape(op::quant_dot{1, 0}, converted_inputs);
if(quant_shape.type() == orig_type)
{
prog.replace_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
}
else
{
auto quant_dot = prog.insert_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
prog.replace_instruction(ins, op::convert{orig_type}, quant_dot);
}
}
// only alpha can be quantized; quantizing beta would cause
// a large error, so we have to do the multiplication and
// addition manually
else if(fabs(new_alpha) >= threshold)
{
int32_t quant_alpha = static_cast<int32_t>(new_alpha);
int32_t quant_beta = 0;
if(orig_type == shape::int32_type)
{
if(inputs.size() == 2 or dot_op.beta == 0.0f)
{
prog.replace_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
}
// if there are 3 inputs, we need to consider the third argument
else
{
auto q_dot = prog.insert_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
std::vector<float> vec_beta(q_dot->get_shape().elements(), dot_op.beta);
auto l_beta = prog.add_literal(literal{orig_type, vec_beta});
auto beta_c =
prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
prog.replace_instruction(ins, op::add{}, q_dot, beta_c);
}
}
else
{
if(inputs.size() == 2 or dot_op.beta == 0.0f)
{
auto q_dot = prog.insert_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
prog.replace_instruction(ins, op::convert{orig_type}, q_dot);
}
// if there are 3 inputs, we need to consider the third argument
else
{
auto q_dot = prog.insert_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
auto oq_dot = prog.insert_instruction(ins, op::convert{orig_type}, q_dot);
std::vector<float> vec_beta(q_dot->get_shape().elements(), dot_op.beta);
auto l_beta = prog.add_literal(literal{oq_dot->get_shape(), vec_beta});
auto beta_c =
prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
prog.replace_instruction(ins, op::add{}, q_dot, beta_c);
}
}
}
else
{
auto q_dot = prog.insert_instruction(ins, op::quant_dot{1, 0}, converted_inputs);
std::vector<float> vec_alpha(q_dot->get_shape().elements(), new_alpha);
if(orig_type == shape::int32_type)
{
auto l_alpha = prog.add_literal(literal(ins->get_shape(), vec_alpha));
if(converted_inputs.size() == 2 or dot_op.beta == 0.0f)
{
prog.replace_instruction(ins, op::mul{}, l_alpha, q_dot);
}
// case of 3 arguments
else
{
std::vector<float> vec_beta(ins->get_shape().elements(), new_beta);
auto l_beta = prog.add_literal(literal(ins->get_shape(), vec_beta));
auto alpha_ab = prog.insert_instruction(ins, op::mul{}, l_alpha, q_dot);
auto beta_c =
prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
prog.replace_instruction(ins, op::add{}, alpha_ab, beta_c);
}
}
else
{
auto oq_dot = prog.insert_instruction(ins, op::convert{orig_type}, q_dot);
auto l_alpha = prog.add_literal(literal(ins->get_shape(), vec_alpha));
if(converted_inputs.size() == 2 or dot_op.beta == 0.0f)
{
prog.replace_instruction(ins, op::mul{}, l_alpha, oq_dot);
}
// case of 3 arguments
else
{
std::vector<float> vec_beta(ins->get_shape().elements(), new_beta);
auto l_beta = prog.add_literal(literal(ins->get_shape(), vec_beta));
auto alpha_ab = prog.insert_instruction(ins, op::mul{}, l_alpha, oq_dot);
auto beta_c =
prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
prog.replace_instruction(ins, op::add{}, alpha_ab, beta_c);
}
}
}
}
else if(ins->name() == "convolution")
{
// Current MIOpen convolution does not support alpha and beta,
// so we need a separate multiply to adjust the output
auto conv_op = any_cast<op::convolution>(ins->get_operator());
auto padding = conv_op.padding;
auto stride = conv_op.stride;
auto dilation = conv_op.dilation;
auto padding_mode = conv_op.padding_mode;
auto group = conv_op.group;
auto adjust_factor = 1.0 / (ins_quant_params[0].first * ins_quant_params[1].first);
shape quant_shape =
compute_shape(op::quant_convolution{padding, stride, dilation, padding_mode, group},
converted_inputs);
std::vector<float> vec_factor(quant_shape.elements(), adjust_factor);
auto fl = prog.add_literal(literal{{orig_type, quant_shape.lens()}, vec_factor});
if(quant_shape.type() == orig_type)
{
if(adjust_factor == 1.0f)
{
prog.replace_instruction(
ins,
op::quant_convolution{padding, stride, dilation, padding_mode, group},
converted_inputs);
}
else
{
auto quant_conv = prog.insert_instruction(
ins,
op::quant_convolution{padding, stride, dilation, padding_mode, group},
converted_inputs);
prog.replace_instruction(ins, op::mul{}, quant_conv, fl);
}
}
else
{
auto quant_conv = prog.insert_instruction(
ins,
op::quant_convolution{padding, stride, dilation, padding_mode, group},
converted_inputs);
if(adjust_factor == 1.0f)
{
prog.replace_instruction(ins, op::convert{orig_type}, quant_conv);
}
else
{
auto oq_conv = prog.insert_instruction(ins, op::convert{orig_type}, quant_conv);
prog.replace_instruction(ins, op::mul{}, oq_conv, fl);
}
}
}
else
{
MIGRAPHX_THROW("INT8_QUANTIZE: does not support operator" + ins->name());
}
}
}
// For each input argument, we need to insert a
// capture operator to compute the scale and shift
void capture_arguments(program& prog,
...
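The pass above consumes one (scale, shift) pair per quantized input from int8_quant_params; the capture pass is what gathers the data needed to derive them. One common derivation, shown here purely as an assumption rather than anything this commit implements, is scale = 127 / max|x| with the shift left at 0:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

std::pair<float, float> make_quant_param(const std::vector<float>& vals)
{
    float max_abs = 0.0f;
    for(float v : vals)
        max_abs = std::max(max_abs, std::fabs(v));
    // guard against an all-zero tensor; shift stays 0.0f as noted above
    float scale = (max_abs == 0.0f) ? 1.0f : 127.0f / max_abs;
    return {scale, 0.0f};
}

int main()
{
    auto p = make_quant_param({-0.5f, 0.25f, 2.0f});
    std::printf("scale = %g, shift = %g\n", p.first, p.second); // scale = 63.5
    return 0;
}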
@@ -44,13 +44,9 @@ struct is_fast_gemm_type<float> : std::true_type
 {
 };
-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::true_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::true_type)
 {
     visit_mat(amat, [&](const auto& a) {
         visit_mat(bmat, [&](const auto& b) {
@@ -66,13 +62,9 @@ void migemm_impl(tensor_view<T> cmat,
     });
 }
-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::false_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::false_type)
 {
     std::size_t n_dims = cmat.get_shape().lens().size();
     std::size_t dim_0 = n_dims - 2;
@@ -95,9 +87,8 @@ void migemm_impl(tensor_view<T> cmat,
     });
 }
-template <class T>
-void migemm_impl(
-    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
+template <class T, class F>
+void migemm_impl(tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta)
 {
     auto lens = amat.get_shape().lens();
     bool batch_mul =
@@ -113,13 +104,29 @@ void migemm_impl(
     }
 }
-void migemm(
-    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
+template <class F>
+void migemm_tpl(
+    const argument& c_arg, const argument& a_arg, const argument& b_arg, F alpha, F beta)
 {
     visit_all(c_arg, a_arg, b_arg)(
         [&](auto cmat, auto amat, auto bmat) { migemm_impl(cmat, amat, bmat, alpha, beta); });
 }
+void migemm(
+    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
+{
+    migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
+}
+void migemm(const argument& c_arg,
+            const argument& a_arg,
+            const argument& b_arg,
+            int32_t alpha,
+            int32_t beta)
+{
+    migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
+}
 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
@@ -10,6 +10,11 @@ namespace cpu {
 void migemm(
     const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta);
+void migemm(const argument& c_arg,
+            const argument& a_arg,
+            const argument& b_arg,
+            int32_t alpha,
+            int32_t beta);
 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
...
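The scalar type now follows the data: op::dot passes float alpha/beta, op::quant_dot passes int32_t, and both overloads forward to the same templated implementation. A standalone sketch of that overload-plus-template pattern (gemm_demo and gemm_tpl_demo are illustrative names only):

#include <cstdint>
#include <cstdio>

template <class F>
void gemm_tpl_demo(F alpha, F beta) // stands in for migemm_tpl
{
    std::printf("alpha=%g beta=%g\n", double(alpha), double(beta));
}

void gemm_demo(float alpha, float beta) { gemm_tpl_demo(alpha, beta); }     // op::dot path
void gemm_demo(int32_t alpha, int32_t beta) { gemm_tpl_demo(alpha, beta); } // op::quant_dot path

int main()
{
    gemm_demo(1.0f, 0.5f);
    gemm_demo(int32_t{2}, int32_t{0});
    return 0;
}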
@@ -204,6 +204,61 @@ struct cpu_convolution
     }
 };
struct cpu_quant_convolution
{
op::quant_convolution op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::quant_convolution"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
visit_all(args[0], args[1])([&](auto input, auto weights) {
auto in = input.get_shape().lens();
auto in_h = in[2];
auto in_w = in[3];
auto wei = weights.get_shape().lens();
auto wei_n = wei[0];
auto wei_c = wei[1];
auto wei_h = wei[2];
auto wei_w = wei[3];
par_dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
const int group_id = w / (wei_n / op.group);
float acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
const int in_ch = group_id * wei_c + k;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
}
});
output(o, w, i, j) = acc;
});
});
});
return result;
}
};
struct cpu_im2col struct cpu_im2col
{ {
op::im2col op; op::im2col op;
@@ -421,7 +476,7 @@ struct cpu_gemm
     {
         argument result{output_shape};
         // 3 inputs, it is alpha * A * B + beta * C, then
-        // A and B are matrics, and C is broadcastable to A * B
+        // A and B are matrices, and C is of the same shape as A * B
         if(args.size() == 3)
         {
             // no need to consider the value of args[2]
@@ -448,6 +503,73 @@ struct cpu_gemm
     }
 };
struct cpu_quant_gemm
{
op::quant_dot op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::quant_dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.size() == 3)
{
auto c_shape = inputs.at(2);
check_shapes{{c_shape}}.not_broadcasted();
}
return op.compute_shape(inputs);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// 3 inputs, it is alpha * A * B + beta * C, then
// A and B are matrices, and C is of the same shape as A * B
// first, convert the args[0] and args[1] from int8_t to int32_t
argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
arg_0.visit([&](auto output) {
args.at(0).visit(
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
arg_1.visit([&](auto output) {
args.at(1).visit(
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
if(args.size() == 3)
{
// no need to consider the value of args[2]
if(op.beta == 0)
{
result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
}
else
{
visit_all(result, args[2])([&](auto output, auto input) {
std::copy(input.begin(), input.end(), output.begin());
});
}
migemm(result, arg_0, arg_1, op.alpha, op.beta);
return result;
}
// 2 input arguments
int32_t beta = 0;
migemm(result, arg_0, arg_1, op.alpha, beta);
return result;
}
};
struct leaky_relu_op struct leaky_relu_op
{ {
op::leaky_relu op; op::leaky_relu op;
@@ -652,15 +774,17 @@ struct cpu_apply
     {
         apply_map["batch_norm_inference"] =
             extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
         apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
         apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
+        apply_map["quant_dot"]   = extend_op<cpu_quant_gemm, op::quant_dot>();
+        apply_map["quant_convolution"] = extend_op<cpu_quant_convolution, op::quant_convolution>();
         apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
         apply_map["im2col"]     = extend_op<cpu_im2col, op::im2col>();
         apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
         apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
         apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
         apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
         apply_map["softmax"]    = simple_op<softmax2d>();
     }
     void apply()
...
@@ -33,6 +33,7 @@ add_library(migraphx_device
     device/pad.cpp
     device/gather.cpp
     device/sub.cpp
+    device/pack.cpp
     device/clip.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
@@ -48,8 +49,10 @@ add_library(migraphx_gpu
     target.cpp
     lowering.cpp
     gemm.cpp
+    quant_gemm.cpp
     pooling.cpp
     convolution.cpp
+    quant_convolution.cpp
     softmax.cpp
     logsoftmax.cpp
     contiguous.cpp
@@ -65,6 +68,7 @@ add_library(migraphx_gpu
     elu.cpp
     pad.cpp
     gather.cpp
+    convert.cpp
     lrn.cpp
     schedule_model.cpp
     adjust_allocation.cpp
...
#include <migraphx/gpu/convert.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/convert.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_convert::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
check_shapes{inputs}.packed();
return op.compute_shape(inputs);
}
argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::convert(ctx.get_stream().get(), args[1], args[0], op.scale, op.shift, op.target_type);
return args[1];
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -6,14 +6,31 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void convert(hipStream_t stream, const argument& result, const argument& arg)
+void convert(hipStream_t stream,
+             const argument& result,
+             const argument& arg,
+             float scale,
+             float shift,
+             shape::type_t target_type)
 {
     result.visit([&](auto output) {
         arg.visit([&](auto input) {
             const auto* input_ptr = device_cast(input.data());
             auto* output_ptr = device_cast(output.data());
-            gs_launch(stream,
-                      result.get_shape().elements())([=](auto i) { output_ptr[i] = input_ptr[i]; });
+            if(target_type == shape::int8_type)
+            {
+                gs_launch(stream, result.get_shape().elements())([=](auto i) {
+                    float res = input_ptr[i] * scale + shift;
+                    int factor = (res > 0) ? 1 : -1;
+                    output_ptr[i] =
+                        std::min<int8_t>(std::max<float>(-128, res + factor * 0.5), 127);
+                });
+            }
+            else
+            {
+                gs_launch(stream, result.get_shape().elements())(
+                    [=](auto i) { output_ptr[i] = input_ptr[i] * scale + shift; });
+            }
         });
     });
 }
...
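gs_launch applies the given lambda once per element index on the stream. A CPU-side stand-in for the same pattern, assuming gs_launch behaves as a parallel for over element indices (launch_all is an illustrative name, not a MIGraphX API):

#include <cstddef>
#include <cstdio>
#include <vector>

// CPU-side stand-in for gs_launch: apply f to every element index
// (the GPU version parallelizes this loop across the grid).
template <class F>
void launch_all(std::size_t nelements, F f)
{
    for(std::size_t i = 0; i < nelements; ++i)
        f(i);
}

int main()
{
    std::vector<float> in = {1, 2, 3}, out(3);
    float scale = 2.0f, shift = 0.5f;
    launch_all(out.size(), [&](std::size_t i) { out[i] = in[i] * scale + shift; });
    for(float v : out)
        std::printf("%g ", v); // 2.5 4.5 6.5
    std::printf("\n");
    return 0;
}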
@@ -67,13 +67,16 @@ struct hip_tensor_descriptor
 {
     hip_index<NDim> result{};
     size_t tidx = idx;
     for(size_t is = 0; is < NDim; is++)
     {
         result[is] = tidx / strides[is];
         tidx = tidx % strides[is];
     }
     return result;
 }
 __device__ __host__ size_t linear(hip_index<NDim> s) const
 {
     size_t idx = 0;
...
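multi() decodes a linear element index into per-dimension coordinates by dividing by each (row-major) stride in turn; linear() is the inverse dot product with the strides. A standalone sketch for a 2x3 tensor:

#include <array>
#include <cstddef>
#include <cstdio>

int main()
{
    std::array<std::size_t, 2> strides = {3, 1}; // row-major 2x3 tensor
    std::size_t idx = 4;                         // element (1, 1)
    std::array<std::size_t, 2> result{};
    std::size_t tidx = idx;
    for(std::size_t is = 0; is < 2; is++)
    {
        result[is] = tidx / strides[is];
        tidx       = tidx % strides[is];
    }
    std::printf("(%zu, %zu)\n", result[0], result[1]); // (1, 1)
    // linear() inverts this: 1 * 3 + 1 * 1 == 4
    return 0;
}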
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/pack.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
auto output_shape = result.get_shape();
auto out_lens = output_shape.lens();
auto dim_0 = out_lens.size() - 2;
auto dim_1 = out_lens.size() - 1;
std::size_t lda = output_shape.strides()[dim_0];
std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
visit_all(result, arg)([&](auto output, auto input) {
std::size_t nelements = output_shape.elements();
auto* out_ptr = device_cast(output.data());
auto* in_ptr = device_cast(input.data());
visit_tensor_size(out_lens.size(), [&](auto out_dim) {
hip_tensor_descriptor<out_dim> desc(output_shape);
gs_launch(stream, nelements)([=](auto ii) {
const size_t nb = 4;
auto idx = desc.multi(ii);
std::size_t i_m = idx[dim_1];
std::size_t i_k = idx[dim_0];
std::size_t offset = ii / m_size * m_size;
out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
in_ptr[i_m + i_k * lda + offset];
});
});
});
}
void pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
auto trans_shape = result.get_shape();
auto out_lens = trans_shape.lens();
auto dim_0 = trans_shape.lens().size() - 2;
auto dim_1 = trans_shape.lens().size() - 1;
std::size_t ldb = trans_shape.strides()[dim_1];
auto wrap_lens = out_lens;
std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
shape output_shape{trans_shape.type(), wrap_lens};
std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
visit_all(result, arg)([&](auto output, auto input) {
std::size_t nelements = output_shape.elements();
auto* out_ptr = device_cast(output.data());
auto* in_ptr = device_cast(input.data());
visit_tensor_size(out_lens.size(), [&](auto out_dim) {
hip_tensor_descriptor<out_dim> desc(output_shape);
gs_launch(stream, nelements)([=](auto ii) {
const size_t nb = 4;
auto idx = desc.multi(ii);
std::size_t i_n = idx[dim_1];
std::size_t i_k = idx[dim_0];
std::size_t offset = ii / m_size * m_size;
out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
in_ptr[i_n + i_k * ldb + offset];
});
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
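pack_a rewrites an int8 matrix into the nb = 4 blocked layout used by the int8 GEMM kernels: for a fixed column position, four consecutive k-values become contiguous. A standalone sketch of the same index mapping on a tiny k x m matrix (row-major, lda = m; the per-batch offset term is omitted):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const std::size_t nb = 4, k = 4, m = 2, lda = m;
    std::vector<int8_t> in(k * m), out(k * m);
    for(std::size_t i = 0; i < in.size(); ++i)
        in[i] = static_cast<int8_t>(i); // element (i_k, i_m) lives at in[i_m + i_k * lda]
    for(std::size_t i_k = 0; i_k < k; ++i_k)
        for(std::size_t i_m = 0; i_m < m; ++i_m)
            out[i_k % nb + (i_m + (i_k / nb) * lda) * nb] = in[i_m + i_k * lda];
    for(int8_t v : out)
        std::printf("%d ", v); // 0 2 4 6 1 3 5 7
    std::printf("\n");
    return 0;
}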