Merge branch 'develop' into conv_fusion_fix

50174953 · mvermeulen · GitHub · 8409c08d · 3ec62e53 · 50174953
Unverified Commit 50174953 authored Aug 19, 2019 by mvermeulen Committed by GitHub Aug 19, 2019
20 changed files
--- a/src/include/migraphx/argument.hpp
+++ b/src/include/migraphx/argument.hpp
@@ -36,7 +36,7 @@ struct argument : raw_data<argument>
    }

    /// Provides a raw pointer to the data
-    std::function<char*()> data;
+    std::function<char*()> data = nullptr;

    /// Whether data is available
    bool empty() const { return not data; }

--- a/src/include/migraphx/op/binary.hpp
+++ b/src/include/migraphx/op/binary.hpp
@@ -30,23 +30,29 @@ struct binary : op_name<Derived>
        argument result{output_shape};
        auto s1 = args[0].get_shape();
        auto s2 = args[1].get_shape();
-        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
-            if(s1 == s2 and input1.get_shape().packed() and input2.get_shape().packed())
-            {
+        if(s1 == s2 and s1.packed())
+        {
+            shape std_shape{s1.type(), s1.lens()};
+            argument std_result{std_shape, result.data()};
+            argument std_arg0{std_shape, args[0].data()};
+            argument std_arg1{std_shape, args[1].data()};
+            visit_all(std_result, std_arg0, std_arg1)([&](auto output, auto input1, auto input2) {
                std::transform(input1.begin(),
                               input1.end(),
                               input2.begin(),
                               output.begin(),
                               static_cast<const Derived&>(*this).apply());
-            }
-            else
-            {
+            });
+        }
+        else
+        {
+            visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
                shape_for_each(output.get_shape(), [&](const auto& idx) {
                    output(idx.begin(), idx.end()) = static_cast<const Derived&>(*this).apply()(
                        input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
                });
-            }
-        });
+            });
+        }

        return result;
    }

--- a/src/include/migraphx/op/capture.hpp
+++ b/src/include/migraphx/op/capture.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_CAPTURE_HPP
+#define MIGRAPHX_GUARD_OPERATORS_CAPTURE_HPP
+
+#include <array>
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/streamutils.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/config.hpp>
+#include <cmath>
+#include <utility>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+
+struct capture
+{
+    std::size_t ins_index;
+    std::function<void(std::size_t ins_index, std::vector<argument>)> f{};
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.ins_index, "ins_index"));
+    }
+
+    std::string name() const { return "capture"; }
+
+    shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
+
+    argument compute(const shape&, std::vector<argument> args) const
+    {
+        if(f)
+        {
+            f(ins_index, args);
+        }
+        else
+        {
+            MIGRAPHX_THROW("CAPTURE: callback function is not callable!");
+        }
+
+        return args.front();
+    }
+};
+
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/op/quant_convolution.hpp
+++ b/src/include/migraphx/op/quant_convolution.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_QUANT_CONVOLUTION_HPP
+#define MIGRAPHX_GUARD_OPERATORS_QUANT_CONVOLUTION_HPP
+
+#include <array>
+#include <migraphx/op/common.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/streamutils.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/config.hpp>
+#include <cmath>
+#include <utility>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+
+struct quant_convolution
+{
+    std::array<std::size_t, 2> padding  = {{0, 0}};
+    std::array<std::size_t, 2> stride   = {{1, 1}};
+    std::array<std::size_t, 2> dilation = {{1, 1}};
+
+    padding_mode_t padding_mode = default_;
+    int group                   = 1;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.padding, "padding"),
+                    f(self.stride, "stride"),
+                    f(self.dilation, "dilation"),
+                    f(self.padding_mode, "padding_mode"),
+                    f(self.group, "group"));
+    }
+
+    std::string name() const { return "quant_convolution"; }
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(2).same_type().same_ndims().only_dims(4);
+
+        const shape& input   = inputs.at(0);
+        const shape& weights = inputs.at(1);
+        auto t               = input.type();
+
+        // all input type must be int8_type and output is float_type
+        if(t != shape::int8_type)
+        {
+            MIGRAPHX_THROW("QUANT_CONVOLUTION: only accept input and weights of type int8_t");
+        }
+        t = shape::int32_type;
+
+        return {t,
+                {
+                    input.lens()[0],
+                    weights.lens()[0],
+                    std::size_t(std::max<std::ptrdiff_t>(
+                        1,
+                        (input.lens()[2] - (1 + dilation[0] * (weights.lens()[2] - 1)) +
+                         2 * padding[0]) /
+                                stride[0] +
+                            1)),
+                    std::size_t(std::max<std::ptrdiff_t>(
+                        1,
+                        (input.lens()[3] - (1 + dilation[1] * (weights.lens()[3] - 1)) +
+                         2 * padding[1]) /
+                                stride[1] +
+                            1)),
+                }};
+    }
+};
+
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/op/quant_dot.hpp
+++ b/src/include/migraphx/op/quant_dot.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_QUANT_DOT_HPP
+#define MIGRAPHX_GUARD_OPERATORS_QUANT_DOT_HPP
+
+#include <array>
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/streamutils.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/config.hpp>
+#include <cmath>
+#include <utility>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+
+struct quant_dot
+{
+    int32_t alpha = 1;
+    int32_t beta  = 1;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(as_number(self.alpha), "alpha"), f(as_number(self.beta), "beta"));
+    }
+
+    std::string name() const { return "quant_dot"; }
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type();
+        const shape& a = inputs.at(0);
+        const shape& b = inputs.at(1);
+        auto t         = a.type();
+        if(t != shape::int8_type)
+        {
+            MIGRAPHX_THROW("QUANT_DOT: only support data type int8_t");
+        }
+
+        if(!std::all_of(inputs.begin(), inputs.end(), [](auto s) { return s.lens().size() >= 2; }))
+        {
+            MIGRAPHX_THROW("QUANT_DOT: dot only accept 2 or more dims operands");
+        }
+
+        // only handle the case that the batch size of a and b are the same
+        if(!std::equal(
+               a.lens().rbegin() + 2, a.lens().rend(), b.lens().rbegin() + 2, b.lens().rend()))
+        {
+            MIGRAPHX_THROW("QUANT_DOT: batch size of A and B mismatch: {" +
+                           to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) + "}");
+        }
+
+        std::size_t dim_0 = a.lens().size() - 2;
+        std::size_t dim_1 = a.lens().size() - 1;
+        if(a.lens()[dim_1] != b.lens()[dim_0])
+        {
+            MIGRAPHX_THROW("QUANT_DOT: inner dimensions do not match: {" +
+                           to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) + "}");
+        }
+
+        // k be multiple of 4
+        if((a.lens()[dim_1] % 4) != 0)
+        {
+            MIGRAPHX_THROW("QUANT_DOT: size of A {" + to_string_range(a.lens()) + "} and B {" +
+                           to_string_range(b.lens()) + "} must be multiple of 4 for int8 type");
+        }
+
+        auto out_lens   = a.lens();
+        out_lens[dim_1] = b.lens()[dim_1];
+        if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
+        {
+            MIGRAPHX_THROW("QUANT_DOT: dimension mismatch, operand C: {" +
+                           to_string_range(inputs.at(2).lens()) +
+                           "}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
+        }
+
+        if(inputs.size() == 3 && inputs.at(2).type() != shape::int32_type)
+        {
+            MIGRAPHX_THROW("QUANT_DOT: operand C type must be int32");
+        }
+
+        return {shape::int32_type, out_lens};
+    }
+};
+
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/op/unary.hpp
+++ b/src/include/migraphx/op/unary.hpp
@@ -27,26 +27,34 @@ struct unary : op_name<Derived>
    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
-        result.visit([&](auto output) {
-            args[0].visit([&](auto input) {
-                if(input.get_shape().packed())
-                {
+        auto in_shape = args[0].get_shape();
+        if(in_shape.packed())
+        {
+            shape std_in_shape{in_shape.type(), in_shape.lens()};
+            shape std_out_shape{output_shape.type(), output_shape.lens()};
+            argument arg_in{std_in_shape, args[0].data()};
+            argument arg_out{std_out_shape, result.data()};
+            arg_out.visit([&](auto output) {
+                arg_in.visit([&](auto input) {
                    std::transform(input.begin(),
                                   input.end(),
                                   output.begin(),
                                   static_cast<const Derived&>(*this).apply());

-                    return result;
-                }
-
-                shape_for_each(output.get_shape(), [&](const auto& idx) {
-                    output(idx.begin(), idx.end()) =
-                        static_cast<const Derived&>(*this).apply()(input(idx.begin(), idx.end()));
                });
-
-                return result;
            });
-        });
+        }
+        else
+        {
+            result.visit([&](auto output) {
+                args[0].visit([&](auto input) {
+                    shape_for_each(output.get_shape(), [&](const auto& idx) {
+                        output(idx.begin(), idx.end()) = static_cast<const Derived&>(*this).apply()(
+                            input(idx.begin(), idx.end()));
+                    });
+                });
+            });
+        }

        return result;
    }

--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -13,6 +13,7 @@
 #include <migraphx/op/batch_norm.hpp>
 #include <migraphx/op/binary.hpp>
 #include <migraphx/op/broadcast.hpp>
+#include <migraphx/op/capture.hpp>
 #include <migraphx/op/clip.hpp>
 #include <migraphx/op/common.hpp>
 #include <migraphx/op/concat.hpp>
@@ -45,6 +46,8 @@
 #include <migraphx/op/outline.hpp>
 #include <migraphx/op/pad.hpp>
 #include <migraphx/op/pooling.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/op/pow.hpp>
 #include <migraphx/op/reduce_sum.hpp>
 #include <migraphx/op/reduce_mean.hpp>

--- a/src/include/migraphx/program.hpp
+++ b/src/include/migraphx/program.hpp
@@ -126,6 +126,9 @@ struct program
    friend bool operator==(const program& x, const program& y);
    friend bool operator!=(const program& x, const program& y) { return !(x == y); }

+    std::shared_ptr<std::vector<std::pair<float, float>>> int8_quant_params =
+        std::make_shared<std::vector<std::pair<float, float>>>();
+
    private:
    void assign(const program& p);


--- a/src/include/migraphx/quantization.hpp
+++ b/src/include/migraphx/quantization.hpp
@@ -15,6 +15,14 @@ struct program;
 void quantize(program& prog, const std::vector<std::string>& ins_names);
 void quantize(program& prog);

+// insert the capture operator for the inputs of each operator to be quantized
+// to int8
+void capture_arguments(program& prog,
+                       const std::vector<std::string>& ins_names,
+                       const std::function<void(std::size_t, std::vector<argument>)>& func);
+void capture_arguments(program& prog, const std::vector<std::string>& ins_names);
+void capture_arguments(program& prog);
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx


--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -85,6 +85,9 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
            offset += (element_size - (offset % element_size));
        conflict_queue.pop();
    }
+    // when int8 type is used, the offset could be any number
+    // if not 4-byte aligned, miopen int8 convolution can crash
+    offset         = (offset + 3) / 4 * 4;
    segment.offset = offset;
    MIGRAPHX_DEBUG(segment.dump());
    required_bytes = std::max(required_bytes, offset + segment.size);

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -112,7 +112,8 @@ void program::assign(const program& p)
    {
        impl->instructions.clear();
    }
-    impl->ctx = p.impl->ctx;
+    impl->ctx         = p.impl->ctx;
+    int8_quant_params = p.int8_quant_params;

    std::unordered_map<instruction_ref, instruction_ref> ins_map;
    for(auto ins : iterator_for(p))

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -156,6 +156,7 @@ PYBIND11_MODULE(migraphx, m)
    py::class_<migraphx::target>(m, "target");

    py::class_<migraphx::program>(m, "program")
+        .def("clone", [](migraphx::program& p) { return *(new migraphx::program(p)); })
        .def("get_parameter_shapes", &migraphx::program::get_parameter_shapes)
        .def("get_shape", &migraphx::program::get_shape)
        .def("compile", [](migraphx::program& p, const migraphx::target& t) { p.compile(t); })
@@ -186,6 +187,11 @@ PYBIND11_MODULE(migraphx, m)
        migraphx::quantize(p, ins_names);
    });
    m.def("quantize", [](migraphx::program& p) { migraphx::quantize(p, {"all"}); });
+    m.def("capture_arguments", [](migraphx::program& p, const std::vector<std::string>& ins_names) {
+        migraphx::capture_arguments(p, ins_names);
+    });
+
+    m.def("capture_arguments", [](migraphx::program& p) { migraphx::capture_arguments(p); });

 #ifdef HAVE_GPU
    m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);

--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -3,32 +3,53 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/op/convert.hpp>
+#include <migraphx/op/dot.hpp>
+#include <migraphx/op/mul.hpp>
+#include <migraphx/op/add.hpp>
+#include <migraphx/op/quant_dot.hpp>
+#include <migraphx/op/capture.hpp>
+#include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/op/multibroadcast.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/ranges.hpp>
 #include <utility>
+#include <iomanip>
+#include <fstream>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-instruction_ref insert_fp16(program& prog,
-                            instruction_ref& ins,
-                            shape::type_t type,
-                            std::unordered_map<instruction_ref, instruction_ref>& map_fp16)
+instruction_ref insert_quant_ins(program& prog,
+                                 instruction_ref& ins,
+                                 shape::type_t type,
+                                 std::unordered_map<instruction_ref, instruction_ref>& map_ins)
 {
-    if(map_fp16.count(ins) > 0)
+    if(map_ins.count(ins) > 0)
    {
-        return map_fp16[ins];
+        return map_ins[ins];
+    }
+
+    if(ins->name() == "undefined")
+    {
+        return ins;
    }

    assert(ins->get_shape().type() == shape::float_type ||
-           ins->get_shape().type() == shape::double_type);
-    instruction_ref ins_fp16{};
-    ins_fp16      = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
-    map_fp16[ins] = ins_fp16;
+           ins->get_shape().type() == shape::double_type ||
+           ins->get_shape().type() == shape::int32_type);
+    instruction_ref quant_ins{};
+    quant_ins    = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
+    map_ins[ins] = quant_ins;

-    return ins_fp16;
+    return quant_ins;
 }

+// This function is to convert any instructions specified in the input
+// from double or float to float16 by inserting a convert operator.
+// For the conversion, there could be cases of overflowing, but it
+// is very rare in the area of deeping learning, so we just do a
+// truncate of the input to get the fp16.
 void quantize(program& prog, const std::vector<std::string>& ins_names)
 {
    std::unordered_map<instruction_ref, instruction_ref> map_fp16;
@@ -59,7 +80,7 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
                }
                else
                {
-                    input_fp16 = insert_fp16(prog, input, shape::half_type, map_fp16);
+                    input_fp16 = insert_quant_ins(prog, input, shape::half_type, map_fp16);
                }
                converted_inputs.push_back(input_fp16);
            }
@@ -79,21 +100,13 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
        auto ins_shape = compute_shape(op, converted_inputs);
        if(ins_shape.type() != orig_type)
        {
-            // insert another convert instruction to convert it back
-            if(ins == std::prev(prog.end()))
+            // check the dead code case to avoid assert
+            bool output_empty = ins->outputs().empty();
+            auto ins_orig_type =
+                prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
+            if(!output_empty)
            {
-                prog.add_instruction(op::convert{orig_type}, ins);
-            }
-            else
-            {
-                // check the dead code case to avoid assert
-                bool output_empty = ins->outputs().empty();
-                auto ins_orig_type =
-                    prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
-                if(!output_empty)
-                {
-                    prog.replace_instruction(ins, ins_orig_type);
-                }
+                prog.replace_instruction(ins, ins_orig_type);
            }
        }

@@ -103,5 +116,80 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)

 void quantize(program& prog) { quantize(prog, {"all"}); }

+// For the input of each input argument, we need to insert a
+// capture operator to compute the scale and shift
+void capture_arguments(program& prog,
+                       const std::vector<std::string>& ins_names,
+                       const std::function<void(std::size_t, std::vector<argument>)>& func)
+{
+
+    size_t num_quant_params = 0;
+    // the int8 quantization only support dot and convolution
+    std::vector<std::string> op_names = {"dot", "convolution"};
+    if(!std::all_of(ins_names.begin(), ins_names.end(), [&](auto name) {
+           return std::find(op_names.begin(), op_names.end(), name) != op_names.end();
+       }))
+    {
+        MIGRAPHX_THROW("CAPTURE_ARGUMENTS: input operator is not supported");
+    }
+
+    std::unordered_map<instruction_ref, instruction_ref> ins_map;
+    for(auto ins : iterator_for(prog))
+    {
+        if(not contains(ins_names, ins->name()))
+        {
+            continue;
+        }
+
+        auto inputs = ins->inputs();
+        std::vector<instruction_ref> new_args;
+        for(auto input : inputs)
+        {
+            instruction_ref new_ins{};
+            if(ins_map.count(input) > 0)
+            {
+                new_ins = ins_map[input];
+            }
+            else
+            {
+                new_ins = prog.insert_instruction(
+                    std::next(input), op::capture{num_quant_params++, func}, input);
+                ins_map[input] = new_ins;
+            }
+            new_args.push_back(new_ins);
+        }
+        instruction::replace(ins, ins->get_operator(), ins->get_shape(), new_args);
+    }
+
+    // set one pair of parameter for each argument
+    prog.int8_quant_params->resize(num_quant_params, std::make_pair(-1.0f, -1.0f));
+}
+
+void capture_arguments(program& prog, const std::vector<std::string>& ins_names)
+{
+    auto calc_quant_params = [&](std::size_t ins_index, std::vector<migraphx::argument> args) {
+        std::pair<float, float> param_pair{1.0f, 0.0f};
+
+        // scale and shift is need for only int8 type, and we do not
+        // consider shift, so set shift to 0
+        std::vector<float> vec_val;
+        args.front().visit([&](auto output) { vec_val.assign(output.begin(), output.end()); });
+        auto max_val = *std::max_element(vec_val.begin(), vec_val.end());
+        auto min_val = *std::min_element(vec_val.begin(), vec_val.end());
+        auto max_abs = std::max(std::fabs(max_val), std::fabs(min_val));
+
+        param_pair.first                     = 127.0f / max_abs;
+        (*prog.int8_quant_params)[ins_index] = param_pair;
+    };
+
+    capture_arguments(prog, ins_names, calc_quant_params);
+}
+
+void capture_arguments(program& prog)
+{
+    std::vector<std::string> ins_names = {"dot", "convolution"};
+    capture_arguments(prog, ins_names);
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/cpu/gemm.cpp
+++ b/src/targets/cpu/gemm.cpp
@@ -44,13 +44,9 @@ struct is_fast_gemm_type<float> : std::true_type
 {
 };

-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::true_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::true_type)
 {
    visit_mat(amat, [&](const auto& a) {
        visit_mat(bmat, [&](const auto& b) {
@@ -66,13 +62,9 @@ void migemm_impl(tensor_view<T> cmat,
    });
 }

-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::false_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::false_type)
 {
    std::size_t n_dims = cmat.get_shape().lens().size();
    std::size_t dim_0  = n_dims - 2;
@@ -95,9 +87,8 @@ void migemm_impl(tensor_view<T> cmat,
    });
 }

-template <class T>
-void migemm_impl(
-    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
+template <class T, class F>
+void migemm_impl(tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta)
 {
    auto lens = amat.get_shape().lens();
    bool batch_mul =
@@ -113,13 +104,29 @@ void migemm_impl(
    }
 }

-void migemm(
-    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
+template <class F>
+void migemm_tpl(
+    const argument& c_arg, const argument& a_arg, const argument& b_arg, F alpha, F beta)
 {
    visit_all(c_arg, a_arg, b_arg)(
        [&](auto cmat, auto amat, auto bmat) { migemm_impl(cmat, amat, bmat, alpha, beta); });
 }

+void migemm(
+    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
+{
+    migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
+}
+
+void migemm(const argument& c_arg,
+            const argument& a_arg,
+            const argument& b_arg,
+            int32_t alpha,
+            int32_t beta)
+{
+    migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
+}
+
 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/cpu/include/migraphx/cpu/gemm.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/gemm.hpp
@@ -10,6 +10,11 @@ namespace cpu {

 void migemm(
    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta);
+void migemm(const argument& c_arg,
+            const argument& a_arg,
+            const argument& b_arg,
+            int32_t alpha,
+            int32_t beta);

 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -4,7 +4,9 @@
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/batch_norm.hpp>
 #include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/dot.hpp>
+#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/op/elu.hpp>
 #include <migraphx/op/im2col.hpp>
 #include <migraphx/op/leaky_relu.hpp>
@@ -216,6 +218,61 @@ struct cpu_convolution
    }
 };

+struct cpu_quant_convolution
+{
+    op::quant_convolution op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "cpu::quant_convolution"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        auto output = result.get<int32_t>();
+        visit_all(args[0], args[1])([&](auto input, auto weights) {
+            auto in   = input.get_shape().lens();
+            auto in_h = in[2];
+            auto in_w = in[3];
+
+            auto wei   = weights.get_shape().lens();
+            auto wei_n = wei[0];
+            auto wei_c = wei[1];
+            auto wei_h = wei[2];
+            auto wei_w = wei[3];
+
+            par_dfor(output_shape.lens()[0],
+                     output_shape.lens()[1],
+                     output_shape.lens()[2],
+                     output_shape.lens()[3])(
+                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
+                    const auto start_x  = i * op.stride[0] - op.padding[0];
+                    const auto start_y  = j * op.stride[1] - op.padding[1];
+                    const auto group_id = w / (wei_n / op.group);
+
+                    int32_t acc = 0;
+                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
+                        const auto in_x  = start_x + x;
+                        const auto in_y  = start_y + y;
+                        const auto in_ch = group_id * wei_c + k;
+                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
+                        {
+                            acc += static_cast<int32_t>(input(o, in_ch, in_x, in_y)) *
+                                   weights(w, k, x, y);
+                        }
+                    });
+                    output(o, w, i, j) = acc;
+                });
+        });
+
+        return result;
+    }
+};
+
 struct cpu_im2col
 {
    op::im2col op;
@@ -433,7 +490,7 @@ struct cpu_gemm
    {
        argument result{output_shape};
        // 3 inputs, it is alpha * A * B + beta * C, then
-        // A and B are matrics, and C is broadcastable to A * B
+        // A and B are matrices, and C is of the same shape as A * B
        if(args.size() == 3)
        {
            // no need to consider the value of args[2]
@@ -460,6 +517,72 @@ struct cpu_gemm
    }
 };

+struct cpu_quant_gemm
+{
+    op::quant_dot op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "cpu::quant_dot"; }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        if(inputs.size() == 3)
+        {
+            auto c_shape = inputs.at(2);
+            check_shapes{{c_shape}}.not_broadcasted();
+        }
+        return op.compute_shape(inputs);
+    }
+
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        // 3 inputs, it is alpha * A * B + beta * C, then
+        // A and B are matrices, and C is of the same shape to A * B
+
+        // first, convert the args[0] and args[1] from int8_t to int32_t
+        argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
+        argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
+        arg_0.visit([&](auto output) {
+            args.at(0).visit(
+                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
+        });
+
+        arg_1.visit([&](auto output) {
+            args.at(1).visit(
+                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
+        });
+
+        if(args.size() == 3)
+        {
+            // no need to consider the value of args[2]
+            if(op.beta == 0)
+            {
+                result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
+            }
+            else
+            {
+                visit_all(result, args[2])([&](auto output, auto input) {
+                    std::copy(input.begin(), input.end(), output.begin());
+                });
+            }
+
+            migemm(result, arg_0, arg_1, op.alpha, op.beta);
+
+            return result;
+        }
+
+        // 2 input arguments
+        migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
+
+        return result;
+    }
+};
+
 struct leaky_relu_op
 {
    op::leaky_relu op;
@@ -671,15 +794,17 @@ struct cpu_apply
    {
        apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
-        apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
-        apply_map["elu"]         = extend_op<cpu_unary<elu_op>, op::elu>();
-        apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
-        apply_map["leaky_relu"]  = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["logsoftmax"]  = extend_op<cpu_logsoftmax, op::logsoftmax>();
-        apply_map["lrn"]         = extend_op<cpu_lrn, op::lrn>();
-        apply_map["pad"]         = extend_op<cpu_pad, op::pad>();
-        apply_map["softmax"]     = extend_op<cpu_softmax, op::softmax>();
+        apply_map["convolution"]       = extend_op<cpu_convolution, op::convolution>();
+        apply_map["dot"]               = extend_op<cpu_gemm, op::dot>();
+        apply_map["quant_dot"]         = extend_op<cpu_quant_gemm, op::quant_dot>();
+        apply_map["quant_convolution"] = extend_op<cpu_quant_convolution, op::quant_convolution>();
+        apply_map["elu"]               = extend_op<cpu_unary<elu_op>, op::elu>();
+        apply_map["im2col"]            = extend_op<cpu_im2col, op::im2col>();
+        apply_map["leaky_relu"]        = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
+        apply_map["logsoftmax"]        = extend_op<cpu_logsoftmax, op::logsoftmax>();
+        apply_map["lrn"]               = extend_op<cpu_lrn, op::lrn>();
+        apply_map["pad"]               = extend_op<cpu_pad, op::pad>();
+        apply_map["softmax"]           = extend_op<cpu_softmax, op::softmax>();
    }

    void apply()

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -39,6 +39,7 @@ add_library(migraphx_device
    device/pad.cpp
    device/gather.cpp
    device/sub.cpp
+    device/int8_gemm_pack.cpp
    device/div.cpp
    device/clip.cpp
    device/reduce_sum.cpp
@@ -64,8 +65,10 @@ add_library(migraphx_gpu
    target.cpp
    lowering.cpp
    gemm.cpp
+    quant_gemm.cpp
    pooling.cpp
    convolution.cpp
+    quant_convolution.cpp
    softmax.cpp
    logsoftmax.cpp
    contiguous.cpp
@@ -79,12 +82,16 @@ add_library(migraphx_gpu
    elu.cpp
    pad.cpp
    gather.cpp
+    convert.cpp
    lrn.cpp
    schedule_model.cpp
    adjust_allocation.cpp
+    pack_int8_args.cpp
    clip.cpp
    reduce_sum.cpp
    reduce_mean.cpp
+    int8_gemm_pack.cpp
+    int8_conv_pack.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)

--- a/src/targets/gpu/convert.cpp
+++ b/src/targets/gpu/convert.cpp
+#include <migraphx/gpu/convert.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/convert.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+shape hip_convert::compute_shape(std::vector<shape> inputs) const
+{
+    inputs.pop_back();
+    check_shapes{inputs}.packed();
+    return op.compute_shape(inputs);
+}
+
+argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    device::convert(ctx.get_stream().get(), args[1], args[0]);
+    return args[1];
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
@@ -31,6 +31,7 @@ struct hip_tensor_descriptor
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
        }
+
        return result;
    }
    __device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const

--- a/src/targets/gpu/device/int8_gemm_pack.cpp
+++ b/src/targets/gpu/device/int8_gemm_pack.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/int8_gemm_pack.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/hip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto comp_shape    = arg.get_shape();
+    auto out_lens      = comp_shape.lens();
+    auto dim_0         = out_lens.size() - 2;
+    auto dim_1         = out_lens.size() - 1;
+    std::size_t lda    = comp_shape.strides()[dim_0];
+    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    visit_all(result, arg)([&](auto output, auto input) {
+        std::size_t nelements = comp_shape.elements();
+        auto* out_ptr         = device_cast(output.data());
+        auto* in_ptr          = device_cast(input.data());
+        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
+            hip_tensor_descriptor<out_dim> desc(comp_shape);
+            gs_launch(stream, nelements, 256)([=](auto ii) {
+                const size_t nb    = 4;
+                auto idx           = desc.multi(ii);
+                std::size_t i_m    = idx[dim_1];
+                std::size_t i_k    = idx[dim_0];
+                std::size_t offset = ii / m_size * m_size;
+                out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
+                    in_ptr[i_m + i_k * lda + offset];
+            });
+        });
+    });
+}
+
+void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto trans_shape = arg.get_shape();
+    auto out_lens    = trans_shape.lens();
+    auto dim_0       = trans_shape.lens().size() - 2;
+    auto dim_1       = trans_shape.lens().size() - 1;
+    std::size_t ldb  = trans_shape.strides()[dim_1];
+
+    auto wrap_lens = out_lens;
+    std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
+    shape comp_shape{trans_shape.type(), wrap_lens};
+    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    visit_all(result, arg)([&](auto output, auto input) {
+        std::size_t nelements = comp_shape.elements();
+        auto* out_ptr         = device_cast(output.data());
+        auto* in_ptr          = device_cast(input.data());
+        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
+            hip_tensor_descriptor<out_dim> desc(comp_shape);
+            gs_launch(stream, nelements, 256)([=](auto ii) {
+                const size_t nb    = 4;
+                auto idx           = desc.multi(ii);
+                std::size_t i_n    = idx[dim_1];
+                std::size_t i_k    = idx[dim_0];
+                std::size_t offset = ii / m_size * m_size;
+                out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
+                    in_ptr[i_n + i_k * ldb + offset];
+            });
+        });
+    });
+}
+
+void sync_stream(hipStream_t stream) { hipStreamSynchronize(stream); }
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx