Commit 4b0c3753 authored by Shucai Xiao

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into rewrite_sigmoid_to_hip_kernel
parents c546c166 de1d5919
@@ -125,6 +125,8 @@ rocm_enable_cppcheck(
     functionConst:*program.*
     shadowFunction
     shadowVar
+    shadowVariable
+    unsafeClassDivZero
     definePrefix:*test/include/test.hpp
     FORCE
     INCONCLUSIVE
...
@@ -99,13 +99,14 @@ rocmtest tidy: rocmnode('rocmtest') { cmake_build ->
         | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-5.0 -style=file {} | diff - {}\'
     '''
     }
-}, clang: rocmnode('vega') { cmake_build ->
+}, clang_debug: rocmnode('vega') { cmake_build ->
     stage('Clang Debug') {
-        // TODO: Enanle integer
+        // TODO: Enable integer
         def sanitizers = "undefined"
         def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=${sanitizers} -fno-sanitize-recover=${sanitizers}"
         cmake_build("hcc", "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'")
     }
+}, clang_release: rocmnode('vega') { cmake_build ->
     stage('Clang Release') {
         cmake_build("hcc", "-DCMAKE_BUILD_TYPE=release")
     }
...
 pfultz2/rocm-recipes
-danmar/cppcheck@8aa68ee297c2d9ebadf5bcfd00c66ea8d9291e35 -DHAVE_RULES=1
+danmar/cppcheck@ef714225bb31e9a76ac2484796763572386955ae -DHAVE_RULES=1
 ROCm-Developer-Tools/HIP@2490e42baa7d90458f0632fd9fbead2d395f41b9
 python/cpython@v3.6.6 -X autotools -H sha256:92aa914572c695c0aeb01b0a214813f414da4b51a371234df514a74761f2bb36
 -f requirements.txt
@@ -36,7 +36,7 @@ struct argument : raw_data<argument>
     }
     /// Provides a raw pointer to the data
-    std::function<char*()> data;
+    std::function<char*()> data = nullptr;
     /// Whether data is available
     bool empty() const { return not data; }
...
@@ -353,14 +353,14 @@ MIGRAPHX_PRED_MATCHER(same_input_shapes, instruction_ref ins)
         ins->inputs().begin(), ins->inputs().end(), [&](auto x) { return x->get_shape() == s; });
 }
 
-MIGRAPHX_BASIC_MATCHER(output, matcher_context& ctx, instruction_ref ins)
+MIGRAPHX_BASIC_MATCHER(output, const matcher_context& ctx, instruction_ref ins)
 {
     if(ins->outputs().size() == 1)
         return ins->outputs().front();
     return ctx.not_found();
 }
 
-MIGRAPHX_BASIC_MATCHER(used_once, matcher_context& ctx, instruction_ref ins)
+MIGRAPHX_BASIC_MATCHER(used_once, const matcher_context& ctx, instruction_ref ins)
 {
     if(ins->outputs().size() == 1)
         return ins;
@@ -411,7 +411,7 @@ inline auto nargs(std::size_t n)
 inline auto arg(std::size_t i)
 {
-    return make_basic_fun_matcher([=](matcher_context& ctx, instruction_ref ins) {
+    return make_basic_fun_matcher([=](const matcher_context& ctx, instruction_ref ins) {
         if(i < ins->inputs().size())
             return ins->inputs()[i];
         return ctx.not_found();
...
#ifndef MIGRAPHX_GUARD_OPERATORS_QUANT_CONVOLUTION_HPP
#define MIGRAPHX_GUARD_OPERATORS_QUANT_CONVOLUTION_HPP
#include <array>
#include <migraphx/op/common.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct quant_convolution
{
std::array<std::size_t, 2> padding = {{0, 0}};
std::array<std::size_t, 2> stride = {{1, 1}};
std::array<std::size_t, 2> dilation = {{1, 1}};
padding_mode_t padding_mode = default_;
int group = 1;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.padding, "padding"),
f(self.stride, "stride"),
f(self.dilation, "dilation"),
f(self.padding_mode, "padding_mode"),
f(self.group, "group"));
}
std::string name() const { return "quant_convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2).same_type().same_ndims().only_dims(4);
const shape& input = inputs.at(0);
const shape& weights = inputs.at(1);
auto t = input.type();
        // all inputs must be of int8_type; the output is int32_type
if(t != shape::int8_type)
{
MIGRAPHX_THROW("QUANT_CONVOLUTION: only accept input and weights of type int8_t");
}
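        // int8 products are accumulated in 32 bits, hence the int32 result type below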
t = shape::int32_type;
return {t,
{
input.lens()[0],
weights.lens()[0],
std::size_t(std::max<std::ptrdiff_t>(
1,
(input.lens()[2] - (1 + dilation[0] * (weights.lens()[2] - 1)) +
2 * padding[0]) /
stride[0] +
1)),
std::size_t(std::max<std::ptrdiff_t>(
1,
(input.lens()[3] - (1 + dilation[1] * (weights.lens()[3] - 1)) +
2 * padding[1]) /
stride[1] +
1)),
}};
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
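
A quick hypothetical walk-through of compute_shape (the shapes below are invented for illustration and do not appear in this commit): an int8 NCHW input of {1, 3, 32, 32} with weights {16, 3, 3, 3}, padding {1, 1}, stride {1, 1}, and dilation {1, 1} yields an int32 output of {1, 16, 32, 32}, since each spatial dimension works out to max(1, (32 - (1 + 1 * (3 - 1)) + 2 * 1) / 1 + 1) = 32:

// hypothetical usage sketch of op::quant_convolution::compute_shape
migraphx::shape input{migraphx::shape::int8_type, {1, 3, 32, 32}};
migraphx::shape weights{migraphx::shape::int8_type, {16, 3, 3, 3}};
migraphx::op::quant_convolution conv{{1, 1}, {1, 1}, {1, 1}}; // padding, stride, dilation
auto out = conv.compute_shape({input, weights}); // {int32_type, {1, 16, 32, 32}}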
#ifndef MIGRAPHX_GUARD_OPERATORS_QUANT_DOT_HPP
#define MIGRAPHX_GUARD_OPERATORS_QUANT_DOT_HPP
#include <array>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct quant_dot
{
int32_t alpha = 1;
int32_t beta = 1;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(as_number(self.alpha), "alpha"), f(as_number(self.beta), "beta"));
}
std::string name() const { return "quant_dot"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type();
const shape& a = inputs.at(0);
const shape& b = inputs.at(1);
auto t = a.type();
if(t != shape::int8_type)
{
MIGRAPHX_THROW("QUANT_DOT: only support data type int8_t");
}
if(!std::all_of(inputs.begin(), inputs.end(), [](auto s) { return s.lens().size() >= 2; }))
{
MIGRAPHX_THROW("QUANT_DOT: dot only accept 2 or more dims operands");
}
        // only handle the case where the batch sizes of a and b are the same
if(!std::equal(
a.lens().rbegin() + 2, a.lens().rend(), b.lens().rbegin() + 2, b.lens().rend()))
{
MIGRAPHX_THROW("QUANT_DOT: batch size of A and B mismatch: {" +
to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) + "}");
}
std::size_t dim_0 = a.lens().size() - 2;
std::size_t dim_1 = a.lens().size() - 1;
if(a.lens()[dim_1] != b.lens()[dim_0])
{
MIGRAPHX_THROW("QUANT_DOT: inner dimensions do not match: {" +
to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) + "}");
}
        // the inner dimension k must be a multiple of 4
        if((a.lens()[dim_1] % 4) != 0)
        {
            MIGRAPHX_THROW("QUANT_DOT: inner dimension of A {" + to_string_range(a.lens()) +
                           "} and B {" + to_string_range(b.lens()) +
                           "} must be a multiple of 4 for the int8 type");
}
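        // the output keeps a's batch and row dimensions and takes b's column dimension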
auto out_lens = a.lens();
out_lens[dim_1] = b.lens()[dim_1];
if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
{
MIGRAPHX_THROW("QUANT_DOT: dimension mismatch, operand C: {" +
to_string_range(inputs.at(2).lens()) +
"}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
}
if(inputs.size() == 3 && inputs.at(2).type() != shape::int32_type)
{
MIGRAPHX_THROW("QUANT_DOT: operand C type must be int32");
}
return {shape::int32_type, out_lens};
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
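
A similar hypothetical example for quant_dot (dimensions invented for illustration): with int8 operands A = {2, 4, 8} and B = {2, 8, 6}, the batch dimensions {2} match, the inner dimensions agree (8 == 8), and k = 8 is a multiple of 4, so compute_shape returns {int32_type, {2, 4, 6}}:

// hypothetical usage sketch of op::quant_dot::compute_shape
migraphx::shape a{migraphx::shape::int8_type, {2, 4, 8}};
migraphx::shape b{migraphx::shape::int8_type, {2, 8, 6}};
migraphx::op::quant_dot qdot; // alpha = 1, beta = 1 by default
auto out = qdot.compute_shape({a, b}); // {int32_type, {2, 4, 6}}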
@@ -45,6 +45,8 @@
 #include <migraphx/op/outline.hpp>
 #include <migraphx/op/pad.hpp>
 #include <migraphx/op/pooling.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/op/pow.hpp>
 #include <migraphx/op/reduce_sum.hpp>
 #include <migraphx/op/reduce_mean.hpp>
...
@@ -168,6 +168,7 @@ bool verify_range(R1&& r1, R2&& r2, double tolerance = 80, double* out_error = nullptr)
 {
     double threshold = std::numeric_limits<range_value<R1>>::epsilon() * tolerance;
     auto error = rms_range(r1, r2);
+    // cppcheck-suppress uninitvar
     if(out_error != nullptr)
         *out_error = error;
     return error <= threshold;
...
@@ -1011,9 +1011,10 @@ struct onnx_parser
     }
 
     std::vector<operation> vec_actv_funcs(vec_names.size());
-    std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& fn) {
-        return map_actv_funcs[fn];
-    });
+    std::transform(vec_names.begin(),
+                   vec_names.end(),
+                   vec_actv_funcs.begin(),
+                   [&](const auto& fn) { return map_actv_funcs[fn]; });
 
     // To be added later
     float clip = 0.0;
@@ -1127,9 +1128,10 @@ struct onnx_parser
     }
 
     std::vector<operation> vec_actv_funcs(vec_names.size());
-    std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& name) {
-        return map_actv_funcs[name];
-    });
+    std::transform(vec_names.begin(),
+                   vec_names.end(),
+                   vec_actv_funcs.begin(),
+                   [&](const auto& name) { return map_actv_funcs[name]; });
 
     float clip = 0.0;
     if(contains(attributes, "clip"))
@@ -1299,9 +1301,10 @@ struct onnx_parser
     }
 
     std::vector<operation> vec_actv_funcs(vec_names.size());
-    std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& name) {
-        return map_actv_funcs[name];
-    });
+    std::transform(vec_names.begin(),
+                   vec_names.end(),
+                   vec_actv_funcs.begin(),
+                   [&](const auto& name) { return map_actv_funcs[name]; });
 
     float clip = 0.0;
     if(contains(attributes, "clip"))
...
@@ -85,6 +85,9 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
         offset += (element_size - (offset % element_size));
         conflict_queue.pop();
     }
+    // when int8 type is used, the offset could be any number;
+    // if not 4-byte aligned, miopen int8 convolution can crash
+    offset = (offset + 3) / 4 * 4;
     segment.offset = offset;
     MIGRAPHX_DEBUG(segment.dump());
     required_bytes = std::max(required_bytes, offset + segment.size);
...
@@ -107,7 +107,7 @@ struct memory_coloring_impl
         return ins->name() == "check_context";
     }
 
-    static bool is_disjoin(live_range& range1, live_range& range2)
+    static bool is_disjoin(const live_range& range1, const live_range& range2)
     {
         if((range1.size == 0) || (range2.size == 0))
             return false;
...
@@ -241,7 +241,7 @@ instruction_ref program::remove_instructions(instruction_ref first, instruction_ref last)
     // TODO: Check every element
     assert(has_instruction(first));
     std::for_each(first, last, [&](instruction& ins) { ins.clear_arguments(); });
-    assert(std::all_of(first, last, [&](instruction& ins) { return ins.outputs().empty(); }));
+    assert(std::all_of(first, last, [&](const instruction& ins) { return ins.outputs().empty(); }));
     return impl->instructions.erase(first, last);
 }
...
@@ -674,7 +674,6 @@ void rewrite_rnn::apply_lstm(program& prog, instruction_ref ins) const
     std::vector<float> ihc_data(ihc_shape.elements(), 0.0);
 
     migraphx::shape pph_shape{type, {1, 3 * hidden_size}};
-    std::vector<float> pph_data(pph_shape.elements(), 0.0);
 
     auto actv_funcs = lstm_actv_funcs(ins);
     auto lstm_op = any_cast<op::lstm>(ins->get_operator());
...
@@ -44,13 +44,9 @@ struct is_fast_gemm_type<float> : std::true_type
 {
 };
 
-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::true_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::true_type)
 {
     visit_mat(amat, [&](const auto& a) {
         visit_mat(bmat, [&](const auto& b) {
@@ -66,13 +62,9 @@ void migemm_impl(tensor_view<T> cmat,
     });
 }
 
-template <class T>
-void migemm_impl(tensor_view<T> cmat,
-                 tensor_view<T> amat,
-                 tensor_view<T> bmat,
-                 float alpha,
-                 float beta,
-                 std::false_type)
+template <class T, class F>
+void migemm_impl(
+    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::false_type)
 {
     std::size_t n_dims = cmat.get_shape().lens().size();
     std::size_t dim_0  = n_dims - 2;
@@ -95,9 +87,8 @@ void migemm_impl(tensor_view<T> cmat,
     });
 }
 
-template <class T>
-void migemm_impl(
-    tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
+template <class T, class F>
+void migemm_impl(tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta)
 {
     auto lens = amat.get_shape().lens();
     bool batch_mul =
@@ -113,13 +104,29 @@ void migemm_impl(
     }
 }
 
-void migemm(
-    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
+template <class F>
+void migemm_tpl(
+    const argument& c_arg, const argument& a_arg, const argument& b_arg, F alpha, F beta)
 {
     visit_all(c_arg, a_arg, b_arg)(
         [&](auto cmat, auto amat, auto bmat) { migemm_impl(cmat, amat, bmat, alpha, beta); });
 }
 
+void migemm(
+    const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
+{
+    migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
+}
+
+void migemm(const argument& c_arg,
+            const argument& a_arg,
+            const argument& b_arg,
+            int32_t alpha,
+            int32_t beta)
+{
+    migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
+}
+
 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
@@ -10,6 +10,11 @@ namespace cpu {
 
 void migemm(
     const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta);
+void migemm(const argument& c_arg,
+            const argument& a_arg,
+            const argument& b_arg,
+            int32_t alpha,
+            int32_t beta);
 
 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
...
@@ -4,7 +4,9 @@
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/batch_norm.hpp>
 #include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/dot.hpp>
+#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/op/elu.hpp>
 #include <migraphx/op/im2col.hpp>
 #include <migraphx/op/leaky_relu.hpp>
@@ -216,6 +218,60 @@ struct cpu_convolution
     }
 };
struct cpu_quant_convolution
{
op::quant_convolution op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::quant_convolution"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
auto output = result.get<int32_t>();
visit_all(args[0], args[1])([&](auto input, auto weights) {
auto in = input.get_shape().lens();
auto in_h = in[2];
auto in_w = in[3];
auto wei = weights.get_shape().lens();
auto wei_n = wei[0];
auto wei_c = wei[1];
auto wei_h = wei[2];
auto wei_w = wei[3];
par_dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
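                    // start_x/start_y: top-left corner of the input window for output pixel (i, j)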
const auto start_x = i * op.stride[0] - op.padding[0];
const auto start_y = j * op.stride[1] - op.padding[1];
const auto group_id = w / (wei_n / op.group);
int32_t acc = 0;
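                    // accumulate int8 * int8 products in a 32-bit accumulator to avoid overflow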
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const auto in_x = start_x + x;
const auto in_y = start_y + y;
const auto in_ch = group_id * wei_c + k;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
}
});
output(o, w, i, j) = acc;
});
});
return result;
}
};
 struct cpu_im2col
 {
     op::im2col op;
@@ -245,17 +301,17 @@ struct cpu_im2col
         const std::size_t& stride_h = op.stride[0];
         const std::size_t& stride_w = op.stride[1];
 
-        auto kdiv2_h = kernel_h / 2;
-        auto kdiv2_w = kernel_w / 2;
+        long kdiv2_h = long(kernel_h) / 2;
+        long kdiv2_w = long(kernel_w) / 2;
 
         // calculate output sizes
         const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
         const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1;
 
         // account for padding for the starting position of the input pixels
-        std::size_t iinput = kdiv2_h - pad_h;
+        long iinput = kdiv2_h - long(pad_h);
 
         // loop over output pixels (ioutput, joutput)
         for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
         {
-            std::size_t jinput = kdiv2_w - pad_w;
+            long jinput = kdiv2_w - long(pad_w);
             for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
             {
                 // compute linear index for output
@@ -264,8 +320,8 @@ struct cpu_im2col
                 dfor(channels,
                      kernel_h,
                      kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
-                    auto idx = iinput + koffset - kdiv2_h;
-                    auto jdx = jinput + loffset - kdiv2_w;
+                    auto idx = iinput + long(koffset) - kdiv2_h;
+                    auto jdx = jinput + long(loffset) - kdiv2_w;
                     col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
                                       ? input(0, c, idx, jdx)
                                       : 0;
@@ -433,7 +489,7 @@ struct cpu_gemm
     {
         argument result{output_shape};
         // 3 inputs, it is alpha * A * B + beta * C, then
-        // A and B are matrics, and C is broadcastable to A * B
+        // A and B are matrices, and C is of the same shape as A * B
         if(args.size() == 3)
         {
             // no need to consider the value of args[2]
@@ -460,13 +516,80 @@ struct cpu_gemm
     }
 };
struct cpu_quant_gemm
{
op::quant_dot op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::quant_dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.size() == 3)
{
auto c_shape = inputs.at(2);
check_shapes{{c_shape}}.not_broadcasted();
}
return op.compute_shape(inputs);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
        // with 3 inputs the result is alpha * A * B + beta * C, where
        // A and B are matrices and C has the same shape as A * B
// first, convert the args[0] and args[1] from int8_t to int32_t
argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
arg_0.visit([&](auto output) {
args.at(0).visit(
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
arg_1.visit([&](auto output) {
args.at(1).visit(
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
if(args.size() == 3)
{
// no need to consider the value of args[2]
if(op.beta == 0)
{
result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
}
else
{
visit_all(result, args[2])([&](auto output, auto input) {
std::copy(input.begin(), input.end(), output.begin());
});
}
migemm(result, arg_0, arg_1, op.alpha, op.beta);
return result;
}
// 2 input arguments
int32_t beta = 0;
migemm(result, arg_0, arg_1, op.alpha, beta);
return result;
}
};
 struct leaky_relu_op
 {
     op::leaky_relu op;
     std::string name() const { return "cpu::leaky_relu"; }
     auto fcn() const
     {
-        auto& a = op.alpha;
+        auto a = op.alpha;
         return [a](auto x) { return x > 0 ? x : x * a; };
     }
 };
@@ -477,7 +600,7 @@ struct elu_op
     std::string name() const { return "cpu::elu"; }
     auto fcn() const
     {
-        auto& a = op.alpha;
+        auto a = op.alpha;
         return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
     }
 };
@@ -671,15 +794,17 @@ struct cpu_apply
     {
         apply_map["batch_norm_inference"] =
             extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
-        apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
-        apply_map["elu"]         = extend_op<cpu_unary<elu_op>, op::elu>();
-        apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
-        apply_map["leaky_relu"]  = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["logsoftmax"]  = extend_op<cpu_logsoftmax, op::logsoftmax>();
-        apply_map["lrn"]         = extend_op<cpu_lrn, op::lrn>();
-        apply_map["pad"]         = extend_op<cpu_pad, op::pad>();
-        apply_map["softmax"]     = extend_op<cpu_softmax, op::softmax>();
+        apply_map["convolution"]       = extend_op<cpu_convolution, op::convolution>();
+        apply_map["dot"]               = extend_op<cpu_gemm, op::dot>();
+        apply_map["quant_dot"]         = extend_op<cpu_quant_gemm, op::quant_dot>();
+        apply_map["quant_convolution"] = extend_op<cpu_quant_convolution, op::quant_convolution>();
+        apply_map["elu"]               = extend_op<cpu_unary<elu_op>, op::elu>();
+        apply_map["im2col"]            = extend_op<cpu_im2col, op::im2col>();
+        apply_map["leaky_relu"]        = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
+        apply_map["logsoftmax"]        = extend_op<cpu_logsoftmax, op::logsoftmax>();
+        apply_map["lrn"]               = extend_op<cpu_lrn, op::lrn>();
+        apply_map["pad"]               = extend_op<cpu_pad, op::pad>();
+        apply_map["softmax"]           = extend_op<cpu_softmax, op::softmax>();
     }
 
     void apply()
...
@@ -40,6 +40,7 @@ add_library(migraphx_device
     device/pad.cpp
     device/gather.cpp
     device/sub.cpp
+    device/int8_gemm_pack.cpp
     device/div.cpp
     device/clip.cpp
     device/reduce_sum.cpp
@@ -65,8 +66,10 @@ add_library(migraphx_gpu
     target.cpp
     lowering.cpp
     gemm.cpp
+    quant_gemm.cpp
     pooling.cpp
     convolution.cpp
+    quant_convolution.cpp
     softmax.cpp
     logsoftmax.cpp
     contiguous.cpp
@@ -82,9 +85,12 @@ add_library(migraphx_gpu
     lrn.cpp
     schedule_model.cpp
     adjust_allocation.cpp
+    pack_int8_args.cpp
     clip.cpp
     reduce_sum.cpp
     reduce_mean.cpp
+    int8_gemm_pack.cpp
+    int8_conv_pack.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)
...
@@ -155,8 +155,8 @@ __device__ void dpp_reduce(T& in, Op op)
 __device__ inline void dpp_reduce(float& x, sum)
 {
-#ifdef MIGRAPHX_USE_CLANG_TIDY
-    (void)x;
+#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK)
+    x = 1;
 #else
     __asm__ volatile("s_nop 4\n"
                      "v_add_f32 %0 %0 %0 row_shr:1\n"
...
@@ -31,6 +31,7 @@ struct hip_tensor_descriptor
         result[is] = tidx / strides[is];
         tidx = tidx % strides[is];
     }
+
     return result;
 }
 __device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
...