"src/vscode:/vscode.git/clone" did not exist on "554a628606a993b7b17f57574bd2e0d0117549aa"
Commit 98fd5e1d authored by Paul's avatar Paul
Browse files

Merge branch 'develop' into eliminate-more-contiguous

parents f7a6d87f a1c7e7a5
@@ -168,6 +168,7 @@ bool verify_range(R1&& r1, R2&& r2, double tolerance = 80, double* out_error = n
{
double threshold = std::numeric_limits<range_value<R1>>::epsilon() * tolerance;
auto error = rms_range(r1, r2);
// cppcheck-suppress uninitvar
if(out_error != nullptr)
*out_error = error;
return error <= threshold;
......
@@ -55,6 +55,7 @@ struct onnx_parser
add_generic_op("Acos", op::acos{});
add_generic_op("Atan", op::atan{});
add_generic_op("Sqrt", op::sqrt{});
add_generic_op("Round", op::round{});
add_generic_op("Sign", op::sign{});
add_binary_op("Add", op::add{});
@@ -206,6 +207,16 @@ struct onnx_parser
return out_lens;
}
instruction_ref make_contiguous(instruction_ref ins)
{
if(ins->get_shape().standard())
{
return ins;
}
return prog.add_instruction(op::contiguous{}, ins);
}
template <class T>
instruction_ref add_broadcastable_binary_op(instruction_ref arg0, instruction_ref arg1, T x)
{
@@ -313,7 +324,11 @@ struct onnx_parser
{
if(contains(attributes, "auto_pad"))
{
-            MIGRAPHX_THROW("auto_pad and padding cannot be specified simultaneously");
+            auto s = attributes["auto_pad"].s();
+            if(contains(attributes, "pads") and to_upper(s) != "NOTSET")
+            {
+                MIGRAPHX_THROW("auto_pad and padding cannot be specified simultaneously");
+            }
}
std::vector<std::int64_t> padding;
copy(attributes["pads"].ints(), std::back_inserter(padding));
@@ -361,7 +376,7 @@ struct onnx_parser
if(args.size() == 3)
{
uint64_t axis = 1;
-            auto l1 = prog.add_instruction(op, args[0], args[1]);
+            auto l1 = prog.add_instruction(op, l0, args[1]);
auto l2 = prog.add_instruction(op::broadcast{axis, l1->get_shape().lens()}, args[2]);
return prog.add_instruction(op::add{}, l1, l2);
}
@@ -437,12 +452,7 @@ struct onnx_parser
s.visit([&](auto v) { copy(v, std::back_inserter(op.dims)); });
}
-        if(!args[0]->get_shape().standard())
-        {
-            args[0] = prog.add_instruction(op::contiguous{}, args[0]);
-        }
-        return prog.add_instruction(op, args[0]);
+        return prog.add_instruction(op, make_contiguous(args[0]));
}
instruction_ref
@@ -490,23 +500,41 @@ struct onnx_parser
{
axis = parse_value(attributes.at("axis")).at<int>();
}
op::gather op{axis};
-        return prog.add_instruction(op, std::move(args));
+        return prog.add_instruction(op, make_contiguous(args[0]), make_contiguous(args[1]));
}
instruction_ref
parse_slice(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
op::slice op;
std::vector<size_t> dims = args[0]->get_shape().lens();
size_t num_dims = dims.size();
if(contains(attributes, "axes"))
{
literal s = parse_value(attributes.at("axes"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.axes)); });
}
else
{
op.axes = std::vector<int64_t>(num_dims);
std::iota(op.axes.begin(), op.axes.end(), 0);
}
if(contains(attributes, "ends"))
{
literal s = parse_value(attributes.at("ends"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.ends)); });
for(size_t i = 0; i < num_dims; i++)
{
if(static_cast<size_t>(op.ends[i]) > dims[i])
{
op.ends[i] = dims[i];
}
}
}
if(contains(attributes, "starts"))
{
literal s = parse_value(attributes.at("starts"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.starts)); });
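
A quick illustration of the new ends clamping above, on assumed values (not part of the commit): ONNX allows out-of-range ends, so they are reduced to the dimension size before slicing.

    // Minimal sketch of the clamping rule, with hypothetical values.
    #include <cstdint>
    #include <vector>

    void clamp_ends(std::vector<std::int64_t>& ends, const std::vector<std::size_t>& dims)
    {
        for(std::size_t i = 0; i < dims.size(); i++)
        {
            if(static_cast<std::size_t>(ends[i]) > dims[i])
                ends[i] = dims[i]; // e.g. ends = {1000, 2}, dims = {3, 4} -> ends = {3, 2}
        }
    }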
@@ -1011,9 +1039,10 @@ struct onnx_parser
}
std::vector<operation> vec_actv_funcs(vec_names.size());
-        std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& fn) {
-            return map_actv_funcs[fn];
-        });
+        std::transform(vec_names.begin(),
+                       vec_names.end(),
+                       vec_actv_funcs.begin(),
+                       [&](const auto& fn) { return map_actv_funcs[fn]; });
// To be added later
float clip = 0.0;
@@ -1127,9 +1156,10 @@ struct onnx_parser
}
std::vector<operation> vec_actv_funcs(vec_names.size());
-        std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& name) {
-            return map_actv_funcs[name];
-        });
+        std::transform(vec_names.begin(),
+                       vec_names.end(),
+                       vec_actv_funcs.begin(),
+                       [&](const auto& name) { return map_actv_funcs[name]; });
float clip = 0.0;
if(contains(attributes, "clip"))
@@ -1299,9 +1329,10 @@ struct onnx_parser
}
std::vector<operation> vec_actv_funcs(vec_names.size());
-        std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& name) {
-            return map_actv_funcs[name];
-        });
+        std::transform(vec_names.begin(),
+                       vec_names.end(),
+                       vec_actv_funcs.begin(),
+                       [&](const auto& name) { return map_actv_funcs[name]; });
float clip = 0.0;
if(contains(attributes, "clip"))
......
@@ -85,6 +85,9 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
offset += (element_size - (offset % element_size));
conflict_queue.pop();
}
// when the int8 type is used, the offset could be any number;
// if it is not 4-byte aligned, the MIOpen int8 convolution can crash
offset = (offset + 3) / 4 * 4;
segment.offset = offset;
MIGRAPHX_DEBUG(segment.dump());
required_bytes = std::max(required_bytes, offset + segment.size);
......
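
The rounding expression above can be read as "round up to the next multiple of 4"; a minimal check of the arithmetic (illustration only, not part of the commit):

    // (offset + 3) / 4 * 4 rounds up to a 4-byte boundary:
    //   offset = 0 -> 0, 1 -> 4, 5 -> 8, 8 -> 8
    std::size_t align4(std::size_t offset) { return (offset + 3) / 4 * 4; }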
@@ -107,7 +107,7 @@ struct memory_coloring_impl
return ins->name() == "check_context";
}
-    static bool is_disjoin(live_range& range1, live_range& range2)
+    static bool is_disjoin(const live_range& range1, const live_range& range2)
{
if((range1.size == 0) || (range2.size == 0))
return false;
......
@@ -241,7 +241,7 @@ instruction_ref program::remove_instructions(instruction_ref first, instruction_
// TODO: Check every element
assert(has_instruction(first));
std::for_each(first, last, [&](instruction& ins) { ins.clear_arguments(); });
-    assert(std::all_of(first, last, [&](instruction& ins) { return ins.outputs().empty(); }));
+    assert(std::all_of(first, last, [&](const instruction& ins) { return ins.outputs().empty(); }));
return impl->instructions.erase(first, last);
}
......
@@ -156,6 +156,7 @@ PYBIND11_MODULE(migraphx, m)
py::class_<migraphx::target>(m, "target");
py::class_<migraphx::program>(m, "program")
.def("clone", [](migraphx::program& p) { return *(new migraphx::program(p)); })
.def("get_parameter_shapes", &migraphx::program::get_parameter_shapes)
.def("get_shape", &migraphx::program::get_shape)
.def("compile", [](migraphx::program& p, const migraphx::target& t) { p.compile(t); })
......
@@ -3,32 +3,53 @@
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/capture.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/multibroadcast.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <utility>
#include <iomanip>
#include <fstream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
-instruction_ref insert_fp16(program& prog,
-                            instruction_ref& ins,
-                            shape::type_t type,
-                            std::unordered_map<instruction_ref, instruction_ref>& map_fp16)
+instruction_ref insert_quant_ins(program& prog,
+                                 instruction_ref& ins,
+                                 shape::type_t type,
+                                 std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
-    if(map_fp16.count(ins) > 0)
+    if(map_ins.count(ins) > 0)
    {
-        return map_fp16[ins];
+        return map_ins[ins];
}
if(ins->name() == "undefined")
{
return ins;
}
assert(ins->get_shape().type() == shape::float_type ||
-           ins->get_shape().type() == shape::double_type);
-    instruction_ref ins_fp16{};
-    ins_fp16 = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
-    map_fp16[ins] = ins_fp16;
+           ins->get_shape().type() == shape::double_type ||
+           ins->get_shape().type() == shape::int32_type);
+    instruction_ref quant_ins{};
+    quant_ins = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
+    map_ins[ins] = quant_ins;
-    return ins_fp16;
+    return quant_ins;
}
// This function converts any instructions specified in the input
// from double or float to float16 by inserting a convert operator.
// The conversion can overflow, but that is very rare in deep
// learning, so we simply truncate the input to get the fp16.
void quantize(program& prog, const std::vector<std::string>& ins_names)
{
std::unordered_map<instruction_ref, instruction_ref> map_fp16;
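
For context, a minimal driver for this pass might look like the sketch below; build_simple_program and the target choice are assumptions for illustration, while quantize itself is the function defined in this file.

    // Hypothetical usage sketch: let quantize() insert fp16 convert operators
    // around every float/double instruction before compiling.
    migraphx::program p = build_simple_program(); // assumed helper, not in this commit
    migraphx::quantize(p, {"all"});               // same effect as the quantize(p) overload
    p.compile(migraphx::cpu::target{});           // target choice assumed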
@@ -53,13 +74,14 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
            // if the input is a convert operator, use its input
            // as the current input
            instruction_ref input_fp16{};
-            if(input->name() == "convert")
+            if(input->name() == "convert" and
+               input->inputs().front()->get_shape().type() == shape::half_type)
{
input_fp16 = input->inputs().front();
}
else
{
-                input_fp16 = insert_fp16(prog, input, shape::half_type, map_fp16);
+                input_fp16 = insert_quant_ins(prog, input, shape::half_type, map_fp16);
}
converted_inputs.push_back(input_fp16);
}
@@ -79,21 +101,13 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
auto ins_shape = compute_shape(op, converted_inputs);
if(ins_shape.type() != orig_type)
{
-            // insert another convert instruction to convert it back
-            if(ins == std::prev(prog.end()))
-            {
-                prog.add_instruction(op::convert{orig_type}, ins);
-            }
-            else
-            {
-                // check the dead code case to avoid assert
-                bool output_empty = ins->outputs().empty();
-                auto ins_orig_type =
-                    prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
-                if(!output_empty)
-                {
-                    prog.replace_instruction(ins, ins_orig_type);
-                }
+            // check the dead code case to avoid assert
+            bool output_empty = ins->outputs().empty();
+            auto ins_orig_type =
+                prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
+            if(!output_empty)
+            {
+                prog.replace_instruction(ins, ins_orig_type);
}
}
@@ -103,5 +117,91 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
void quantize(program& prog) { quantize(prog, {"all"}); }
// For each input argument of the specified instructions, insert a
// capture operator to compute the scale and shift
std::size_t capture_arguments(program& prog,
const std::vector<std::string>& ins_names,
const std::function<void(std::size_t, std::vector<argument>)>& func)
{
size_t num_quant_params = 0;
// the int8 quantization only supports dot and convolution
std::vector<std::string> op_names = {"dot", "convolution"};
if(!std::all_of(ins_names.begin(), ins_names.end(), [&](auto name) {
return std::find(op_names.begin(), op_names.end(), name) != op_names.end();
}))
{
MIGRAPHX_THROW("CAPTURE_ARGUMENTS: input operator is not supported");
}
std::unordered_map<instruction_ref, instruction_ref> ins_map;
for(auto ins : iterator_for(prog))
{
if(not contains(ins_names, ins->name()))
{
continue;
}
auto inputs = ins->inputs();
std::vector<instruction_ref> new_args;
for(auto input : inputs)
{
instruction_ref new_ins{};
if(ins_map.count(input) > 0)
{
new_ins = ins_map[input];
}
else
{
new_ins = prog.insert_instruction(
std::next(input), op::capture{num_quant_params++, func}, input);
ins_map[input] = new_ins;
}
new_args.push_back(new_ins);
}
instruction::replace(ins, ins->get_operator(), ins->get_shape(), new_args);
}
return num_quant_params;
}
std::shared_ptr<std::vector<std::pair<float, float>>>
capture_arguments(program& prog, const std::vector<std::string>& ins_names)
{
std::shared_ptr<std::vector<std::pair<float, float>>> int8_quant_params =
std::make_shared<std::vector<std::pair<float, float>>>();
std::shared_ptr<std::vector<float>> max_abs_vals = std::make_shared<std::vector<float>>();
auto calc_quant_params = [int8_quant_params, max_abs_vals](
std::size_t ins_index, std::vector<migraphx::argument> args) {
std::pair<float, float> param_pair{64.0f, 0.0f};
        // scale and shift are needed only for the int8 type; we do not
        // consider shift, so it is set to 0
std::vector<float> vec_val;
args.front().visit([&](auto output) { vec_val.assign(output.begin(), output.end()); });
auto max_val = *std::max_element(vec_val.begin(), vec_val.end());
auto min_val = *std::min_element(vec_val.begin(), vec_val.end());
auto max_abs = std::max(std::fabs(max_val), std::fabs(min_val));
max_abs_vals->at(ins_index) = std::max(max_abs_vals->at(ins_index), max_abs);
param_pair.first = 127.0f / max_abs_vals->at(ins_index);
int8_quant_params->at(ins_index) = param_pair;
};
auto num_params = capture_arguments(prog, ins_names, calc_quant_params);
int8_quant_params->resize(num_params, std::pair<float, float>(64.0f, 0.0f));
max_abs_vals->resize(num_params, 0.0f);
return int8_quant_params;
}
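
The scale computed in calc_quant_params maps the largest magnitude seen so far onto the int8 range; on assumed numbers:

    // Worked example (values assumed): running max |x| for an argument is 3.5f.
    float max_abs = 3.5f;
    std::pair<float, float> param{127.0f / max_abs, 0.0f}; // scale ~= 36.29, shift = 0
    // A value x is then quantized as round(x * param.first), staying in [-127, 127].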
std::shared_ptr<std::vector<std::pair<float, float>>> capture_arguments(program& prog)
{
std::vector<std::string> ins_names = {"dot", "convolution"};
return capture_arguments(prog, ins_names);
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -674,7 +674,6 @@ void rewrite_rnn::apply_lstm(program& prog, instruction_ref ins) const
std::vector<float> ihc_data(ihc_shape.elements(), 0.0);
migraphx::shape pph_shape{type, {1, 3 * hidden_size}};
std::vector<float> pph_data(pph_shape.elements(), 0.0);
auto actv_funcs = lstm_actv_funcs(ins);
auto lstm_op = any_cast<op::lstm>(ins->get_operator());
......
@@ -52,6 +52,7 @@ struct find_mul_conv
}
};
// a * (x + b) => a * x + a * b
struct find_mul_add
{
auto matcher() const
@@ -60,7 +61,7 @@ struct find_mul_add
match::name("add")(
match::either_arg(0, 1)(
match::any().bind("x"),
match::any_of(conv_const_weights(), match::is_constant()).bind("y")),
match::any_of(conv_const_weights(), match::is_constant()).bind("b")),
match::none_of(match::args(match::is_constant(), match::is_constant())),
match::used_once()),
match::is_constant().bind("a")));
@@ -70,12 +71,13 @@ struct find_mul_add
{
auto ins = r.result;
auto a_ins = r.instructions["a"];
auto b_ins = r.instructions["b"];
auto x_ins = r.instructions["x"];
auto y_ins = r.instructions["y"];
assert(x_ins != b_ins);
auto xa_ins = p.insert_instruction(ins, op::mul{}, x_ins, a_ins);
auto ya_ins = p.insert_instruction(ins, op::mul{}, y_ins, a_ins);
p.replace_instruction(ins, op::add{}, xa_ins, ya_ins);
auto ax_ins = p.insert_instruction(ins, op::mul{}, a_ins, x_ins);
auto ab_ins = p.insert_instruction(ins, op::mul{}, a_ins, b_ins);
p.replace_instruction(ins, op::add{}, ax_ins, ab_ins);
}
};
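
The rewrite applies the identity from the comment, a * (x + b) = a * x + a * b; since a and b are both constant, the a * b term can be folded later. Sketch of the effect on an assumed scalar program:

    // Before: ins = mul(a, add(x, b))          -- a, b constant, x arbitrary
    // After:  ins = add(mul(a, x), mul(a, b))  -- mul(a, b) is now constant-foldable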
......
@@ -44,13 +44,9 @@ struct is_fast_gemm_type<float> : std::true_type
{
};
-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::true_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::true_type)
{
visit_mat(amat, [&](const auto& a) {
visit_mat(bmat, [&](const auto& b) {
@@ -66,13 +62,9 @@ void migemm_impl(tensor_view<T> cmat,
});
}
-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::false_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::false_type)
{
std::size_t n_dims = cmat.get_shape().lens().size();
std::size_t dim_0 = n_dims - 2;
@@ -95,9 +87,8 @@ void migemm_impl(tensor_view<T> cmat,
});
}
-template <class T>
-void migemm_impl(
-    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
+template <class T, class F>
+void migemm_impl(tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta)
{
auto lens = amat.get_shape().lens();
bool batch_mul =
@@ -113,13 +104,29 @@ void migemm_impl(
}
}
-void migemm(
-    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
+template <class F>
+void migemm_tpl(
+    const argument& c_arg, const argument& a_arg, const argument& b_arg, F alpha, F beta)
{
visit_all(c_arg, a_arg, b_arg)(
[&](auto cmat, auto amat, auto bmat) { migemm_impl(cmat, amat, bmat, alpha, beta); });
}
void migemm(
const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
{
migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
}
void migemm(const argument& c_arg,
const argument& a_arg,
const argument& b_arg,
int32_t alpha,
int32_t beta)
{
migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -10,6 +10,11 @@ namespace cpu {
void migemm(
const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta);
void migemm(const argument& c_arg,
const argument& a_arg,
const argument& b_arg,
int32_t alpha,
int32_t beta);
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
......
@@ -4,7 +4,9 @@
#include <migraphx/dfor.hpp>
#include <migraphx/op/batch_norm.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
@@ -216,6 +218,61 @@ struct cpu_convolution
}
};
struct cpu_quant_convolution
{
op::quant_convolution op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::quant_convolution"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
auto output = result.get<int32_t>();
visit_all(args[0], args[1])([&](auto input, auto weights) {
auto in = input.get_shape().lens();
auto in_h = in[2];
auto in_w = in[3];
auto wei = weights.get_shape().lens();
auto wei_n = wei[0];
auto wei_c = wei[1];
auto wei_h = wei[2];
auto wei_w = wei[3];
par_dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const auto start_x = i * op.stride[0] - op.padding[0];
const auto start_y = j * op.stride[1] - op.padding[1];
const auto group_id = w / (wei_n / op.group);
int32_t acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const auto in_x = start_x + x;
const auto in_y = start_y + y;
const auto in_ch = group_id * wei_c + k;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += static_cast<int32_t>(input(o, in_ch, in_x, in_y)) *
weights(w, k, x, y);
}
});
output(o, w, i, j) = acc;
});
});
return result;
}
};
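
A note on why acc above is int32_t (reasoning inferred, with assumed sizes): int8 products reach 127 * 127 = 16129, and even a modest reduction window sums thousands of them, far beyond what int8 or int16 can hold.

    // Illustrative worst-case arithmetic for an int8 convolution window.
    const std::int32_t max_product = 127 * 127;   // 16129 per multiply-accumulate
    const std::int32_t taps        = 3 * 3 * 256; // assumed 3x3 kernel, 256 channels
    const std::int64_t worst_case  =
        static_cast<std::int64_t>(max_product) * taps; // ~37.2 million: needs 32 bits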
struct cpu_im2col
{
op::im2col op;
@@ -245,17 +302,17 @@ struct cpu_im2col
const std::size_t& stride_h = op.stride[0];
const std::size_t& stride_w = op.stride[1];
-        auto kdiv2_h = kernel_h / 2;
-        auto kdiv2_w = kernel_w / 2;
+        long kdiv2_h = long(kernel_h) / 2;
+        long kdiv2_w = long(kernel_w) / 2;
// calculate output sizes
const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1;
// account for padding for the starting position of the input pixels
-        std::size_t iinput = kdiv2_h - pad_h;
+        long iinput = kdiv2_h - long(pad_h);
// loop over output pixels (ioutput, joutput)
for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
{
-            std::size_t jinput = kdiv2_w - pad_w;
+            long jinput = kdiv2_w - long(pad_w);
for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
{
// compute linear index for output
@@ -264,8 +321,8 @@
dfor(channels,
kernel_h,
kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
-                    auto idx = iinput + koffset - kdiv2_h;
-                    auto jdx = jinput + loffset - kdiv2_w;
+                    auto idx = iinput + long(koffset) - kdiv2_h;
+                    auto jdx = jinput + long(loffset) - kdiv2_w;
col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
? input(0, c, idx, jdx)
: 0;
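
The std::size_t to long changes in this hunk matter because kdiv2_h - pad_h can be negative; with unsigned arithmetic the subtraction wraps, so the idx >= 0 bounds check above could never fire. A minimal demonstration (not from the commit):

    // kernel_h = 3, pad_h = 2: kdiv2_h = 1, so iinput should start at -1.
    std::size_t u = std::size_t(1) - std::size_t(2); // wraps to 18446744073709551615
    long s        = long(1) - long(2);               // -1, as the bounds check expects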
@@ -433,7 +490,7 @@ struct cpu_gemm
{
argument result{output_shape};
        // with 3 inputs, it computes alpha * A * B + beta * C, where
-        // A and B are matrics, and C is broadcastable to A * B
+        // A and B are matrices, and C is of the same shape as A * B
if(args.size() == 3)
{
// no need to consider the value of args[2]
@@ -460,13 +517,79 @@
}
};
struct cpu_quant_gemm
{
op::quant_dot op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::quant_dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.size() == 3)
{
auto c_shape = inputs.at(2);
check_shapes{{c_shape}}.not_broadcasted();
}
return op.compute_shape(inputs);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
        // with 3 inputs, it computes alpha * A * B + beta * C, where
        // A and B are matrices, and C is of the same shape as A * B
// first, convert the args[0] and args[1] from int8_t to int32_t
argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
arg_0.visit([&](auto output) {
args.at(0).visit(
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
arg_1.visit([&](auto output) {
args.at(1).visit(
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
if(args.size() == 3)
{
// no need to consider the value of args[2]
if(op.beta == 0)
{
result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
}
else
{
visit_all(result, args[2])([&](auto output, auto input) {
std::copy(input.begin(), input.end(), output.begin());
});
}
migemm(result, arg_0, arg_1, op.alpha, op.beta);
return result;
}
// 2 input arguments
migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
return result;
}
};
struct leaky_relu_op
{
op::leaky_relu op;
std::string name() const { return "cpu::leaky_relu"; }
auto fcn() const
{
-        auto& a = op.alpha;
+        auto a = op.alpha;
return [a](auto x) { return x > 0 ? x : x * a; };
}
};
@@ -477,7 +600,7 @@ struct elu_op
std::string name() const { return "cpu::elu"; }
auto fcn() const
{
-        auto& a = op.alpha;
+        auto a = op.alpha;
return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
}
};
@@ -671,15 +794,17 @@ struct cpu_apply
{
apply_map["batch_norm_inference"] =
extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
apply_map["pad"] = extend_op<cpu_pad, op::pad>();
apply_map["softmax"] = extend_op<cpu_softmax, op::softmax>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["quant_dot"] = extend_op<cpu_quant_gemm, op::quant_dot>();
apply_map["quant_convolution"] = extend_op<cpu_quant_convolution, op::quant_convolution>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
apply_map["pad"] = extend_op<cpu_pad, op::pad>();
apply_map["softmax"] = extend_op<cpu_softmax, op::softmax>();
}
void apply()
......
@@ -34,16 +34,19 @@ add_library(migraphx_device
device/contiguous.cpp
device/logsoftmax.cpp
device/softmax.cpp
device/sigmoid.cpp
device/convert.cpp
device/mul.cpp
device/concat.cpp
device/pad.cpp
device/gather.cpp
device/sub.cpp
device/int8_gemm_pack.cpp
device/div.cpp
device/clip.cpp
device/reduce_sum.cpp
device/rsqrt.cpp
device/round.cpp
device/sqrt.cpp
device/reduce_mean.cpp
device/pow.cpp
@@ -65,8 +68,10 @@ add_library(migraphx_gpu
target.cpp
lowering.cpp
gemm.cpp
quant_gemm.cpp
pooling.cpp
convolution.cpp
quant_convolution.cpp
softmax.cpp
logsoftmax.cpp
contiguous.cpp
@@ -75,17 +80,20 @@ add_library(migraphx_gpu
batchnorm.cpp
write_literals.cpp
rocblas.cpp
sigmoid.cpp
abs.cpp
elu.cpp
pad.cpp
gather.cpp
convert.cpp
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
pack_int8_args.cpp
clip.cpp
reduce_sum.cpp
reduce_mean.cpp
int8_gemm_pack.cpp
int8_conv_pack.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
......
#include <migraphx/gpu/convert.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/convert.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_convert::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
check_shapes{inputs}.packed();
return op.compute_shape(inputs);
}
argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::convert(ctx.get_stream().get(), args[1], args[0]);
return args[1];
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -155,8 +155,8 @@ __device__ void dpp_reduce(T& in, Op op)
__device__ inline void dpp_reduce(float& x, sum)
{
-#ifdef MIGRAPHX_USE_CLANG_TIDY
-    (void)x;
+#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK)
+    x = 1;
#else
__asm__ volatile("s_nop 4\n"
"v_add_f32 %0 %0 %0 row_shr:1\n"
@@ -245,8 +245,7 @@ void reduce_standard_impl(hipStream_t stream,
T init,
Input read_input,
Output read_output,
-                          std::size_t relements,
-                          std::size_t stride)
+                          std::size_t relements)
{
hip_visit_all(result, arg)([&](auto output, auto input) {
auto nelements = result.get_shape().elements();
@@ -255,7 +254,7 @@
const std::size_t block_size = compute_block_size(relements, max_block_size);
gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
const auto out_idx = i / block_size;
-            const auto base_idx = out_idx * stride;
+            const auto base_idx = out_idx * relements;
auto r = block_reduce<max_block_size>(idx, op, init, relements, [&](auto j) __device__ {
return read_input(input.data()[base_idx + j]);
});
@@ -276,25 +275,15 @@ void reduce(hipStream_t stream,
{
auto&& output_shape = result.get_shape();
auto&& input_shape = arg.get_shape();
+    assert(output_shape.lens().size() == input_shape.lens().size());
    if(input_shape.standard() and output_shape.standard() and
       output_shape.lens().back() != input_shape.lens().back() and
       std::equal(output_shape.lens().begin(),
                  std::prev(output_shape.lens().end()),
                  input_shape.lens().begin()))
    {
-        std::size_t stride = std::accumulate(input_shape.strides().begin(),
-                                             input_shape.strides().end(),
-                                             1,
-                                             std::multiplies<size_t>());
-        reduce_standard_impl(stream,
-                             result,
-                             arg,
-                             op,
-                             init,
-                             read_input,
-                             read_output,
-                             input_shape.lens().back(),
-                             stride);
+        reduce_standard_impl(
+            stream, result, arg, op, init, read_input, read_output, input_shape.lens().back());
}
else
{
......
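
Dropping the stride parameter also fixes the indexing: in the standard-layout case this branch handles, consecutive reduction rows are exactly relements apart, whereas the removed code multiplied all strides together. Indexing sketch on an assumed shape:

    // Input {2, 3, 4} reduced over its last axis: relements = 4, out_idx = 0..5.
    // Row out_idx of the input starts at:
    std::size_t base_idx(std::size_t out_idx, std::size_t relements)
    {
        return out_idx * relements; // out_idx 0 -> [0,4), 1 -> [4,8), ..., 5 -> [20,24)
    }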
@@ -31,6 +31,7 @@ struct hip_tensor_descriptor
result[is] = tidx / strides[is];
tidx = tidx % strides[is];
}
return result;
}
__device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
......
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
auto comp_shape = arg.get_shape();
auto out_lens = comp_shape.lens();
auto dim_0 = out_lens.size() - 2;
auto dim_1 = out_lens.size() - 1;
std::size_t lda = comp_shape.strides()[dim_0];
std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
visit_all(result, arg)([&](auto output, auto input) {
std::size_t nelements = comp_shape.elements();
auto* out_ptr = device_cast(output.data());
auto* in_ptr = device_cast(input.data());
visit_tensor_size(out_lens.size(), [&](auto out_dim) {
hip_tensor_descriptor<out_dim> desc(comp_shape);
gs_launch(stream, nelements, 256)([=](auto ii) {
const size_t nb = 4;
auto idx = desc.multi(ii);
std::size_t i_m = idx[dim_1];
std::size_t i_k = idx[dim_0];
std::size_t offset = ii / m_size * m_size;
out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
in_ptr[i_m + i_k * lda + offset];
});
});
});
}
void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
auto trans_shape = arg.get_shape();
auto out_lens = trans_shape.lens();
auto dim_0 = trans_shape.lens().size() - 2;
auto dim_1 = trans_shape.lens().size() - 1;
std::size_t ldb = trans_shape.strides()[dim_1];
auto wrap_lens = out_lens;
std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
shape comp_shape{trans_shape.type(), wrap_lens};
std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
visit_all(result, arg)([&](auto output, auto input) {
std::size_t nelements = comp_shape.elements();
auto* out_ptr = device_cast(output.data());
auto* in_ptr = device_cast(input.data());
visit_tensor_size(out_lens.size(), [&](auto out_dim) {
hip_tensor_descriptor<out_dim> desc(comp_shape);
gs_launch(stream, nelements, 256)([=](auto ii) {
const size_t nb = 4;
auto idx = desc.multi(ii);
std::size_t i_n = idx[dim_1];
std::size_t i_k = idx[dim_0];
std::size_t offset = ii / m_size * m_size;
out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
in_ptr[i_n + i_k * ldb + offset];
});
});
});
}
void sync_stream(hipStream_t stream) { hipStreamSynchronize(stream); }
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
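
As I read the index arithmetic in both kernels, k is grouped into blocks of nb = 4 and each group's four k-values are stored contiguously next to their m (or n) element, which is the packed layout the int8 GEMM expects. A worked index with assumed values:

    // i_m = 2, i_k = 5, lda = 8, nb = 4 (all values assumed):
    //   source index: i_m + i_k * lda                    = 2 + 5 * 8 = 42
    //   packed index: i_k % nb + (i_m + (i_k / nb) * lda) * nb
    //               = 1 + (2 + 1 * 8) * 4                = 41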
#include <migraphx/gpu/device/round.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void round(hipStream_t stream, const argument& result, const argument& arg)
{
nary(stream, result, arg)([](auto x) { return ::round(to_hip_type(x)); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/sigmoid.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void sigmoid(hipStream_t stream, const argument& result, const argument& arg)
{
nary(stream, result, arg)([](auto x) { return 1.f / (1.f + ::exp(to_hip_type(-x))); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -233,6 +233,10 @@ argument miopen_gemm::compute(context& ctx,
auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
if(num_matrices == 1)
{
        // the rocblas_gemm API treats input and output matrices as
        // column-major. When computing C = A * B, we actually compute
        // C^T = (B^T) * (A^T), which is why args[1] is passed as
        // A and args[0] as B in the rocblas_gemm call.
generic_rocblas_gemm(as,
ctx.get_stream().get_rocblas(),
transb ? rocblas_operation_transpose : rocblas_operation_none,
......
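
This is the usual trick for driving a column-major BLAS from row-major data: since (A * B)^T = B^T * A^T, and a row-major buffer reinterpreted as column-major is its transpose, swapping the operand order yields C in row-major layout with no explicit transposes. In outline:

    // Row-major C(M x N) = A(M x K) * B(K x N), handed to a column-major GEMM:
    //   reinterpret each buffer as its column-major transpose, then compute
    //   C^T(N x M) = B^T(N x K) * A^T(K x M)
    // which is why args[1] (B) is passed where the BLAS expects A.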