Unverified commit 406afeb8, authored by Paul Fultz II and committed by GitHub

Use dnnl for cpu backend (#688)



* Add flag to enable cpu backend

* Make buffers shared

* Enable optimizations

* Add onednn

* Formatting

* Formatting

* Add dnnl header

* Formatting

* Rewrite rnn first

* Formatting

* Call reference implementation

* Formatting

* Make literal data shared

* Formatting

* Add convolution

* Formatting

* Compensate for dilation

* Formatting

* Use name/make_op instead

* Formatting

* Rename gemm header

* Formatting

* Add dnnl convolution/gemm operators

* Formatting

* Add eliminate_contiguous

* Add faster pointwise operators

* Formatting

* Formatting

* Formatting

* Add dnnl op class

* Formatting

* Add add op

* Formatting

* Add concat operator

* Formatting

* Add more ops

* Create descriptor during finalization

* Formatting

* Don't rewrite pooling

* Enable memory coloring

* Formatting

* Add output aliases

* Formatting

* Fix errors

* Formatting

* Convert literals

* Add missing file

* Remove batch_norm

* Formatting

* Use strides

* Formatting

* Add some debug checks

* Formatting

* Fix bug in adjusting shape for gemm

* Formatting

* Fix fallback dot operator

* Zero initialize buffers

* Add support for group convolutions

* Formatting

* Make adjust allocation target independent

* Formatting

* Enable adjust_allocation for gpu/cpu

* Formatting

* Add copy to allocation model

* Formatting

* Add copy operator

* Formatting

* Better handling of output parameters in adjust_allocation

* Formatting

* Build with dnnl

* Make dnnl required

* Fix compile error

* Tidy fixes

* Formatting

* Tidy fixes

* Formatting

* Fix more tidy issues

* Formatting

* Add mul op

* Add mul op

* Set C compiler to clang as well

* Compensate for normalized compute shape

* Formatting

* Fix cppcheck errors

* Formatting

* Add onednn library to hcc

* Guard clang pragmas

* Disable cpu mode for gcc for now

* Leave it enabled for gcc 7

* Fix cppcheck suppression

* Fix compile error on gcc 5

* Remove unused code
Co-authored-by: Shucai Xiao <shucai.xiao@amd.com>
Co-authored-by: mvermeulen <5479696+mvermeulen@users.noreply.github.com>
parent 8698cd2c
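The commits above wire a oneDNN (DNNL) code path into the cpu target: lowering rewrites supported operators to dnnl::* ops, a shared dnnl engine/stream executes them, and the reference implementations remain as a fallback. For orientation only, a hypothetical end-to-end use of the target might look like the sketch below; the program-building calls (add_parameter, add_instruction, compile) and the cpu target header path are assumptions, since this API has shifted across MIGraphX versions and is not shown in this PR.

#include <migraphx/program.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/cpu/target.hpp> // assumed header path for the cpu target

int main()
{
    migraphx::program p;
    migraphx::shape s{migraphx::shape::float_type, {2, 2}};
    auto x = p.add_parameter("x", s);
    auto y = p.add_parameter("y", s);
    p.add_instruction(migraphx::op::add{}, x, y);
    // Lowering can now replace the add with the DNNL-backed dnnl::add when USE_DNNL is set.
    p.compile(migraphx::cpu::target{});
    return 0;
}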
add_library(migraphx_cpu
    allocation_model.cpp
    allocate.cpp
    add.cpp
    contiguous.cpp
    concat.cpp
    convolution.cpp
    copy.cpp
    mul.cpp
    pooling.cpp
    relu.cpp
    gemm.cpp
    target.cpp
    lowering.cpp
    migemm.cpp
)
set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
find_path(BLAZE_INCLUDE blaze/Blaze.h)
find_package(Threads)
find_package(dnnl REQUIRED)
rocm_clang_tidy_check(migraphx_cpu)
target_link_libraries(migraphx_cpu PRIVATE migraphx Threads::Threads)
target_include_directories(migraphx_cpu PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_cpu PRIVATE -DBLAZE_USE_CPP_THREADS)
if(dnnl_FOUND)
    target_link_libraries(migraphx_cpu PRIVATE DNNL::dnnl)
    target_compile_definitions(migraphx_cpu PRIVATE -DUSE_DNNL)
    find_package(OpenMP)
    target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
    # Add library path to rpath to workaround issues with our broken packages
    foreach(LIBRARY ${OpenMP_CXX_LIBRARIES})
        if(LIBRARY MATCHES "libomp")
            get_filename_component(LIBRARY_PATH "${LIBRARY}" PATH)
            target_link_libraries(migraphx_cpu PUBLIC -Wl,-rpath=${LIBRARY_PATH} -Wl,-rpath-link=${LIBRARY_PATH})
        endif()
    endforeach()
endif()
target_link_libraries(migraphx_all_targets INTERFACE migraphx_cpu)
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/add.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_binary<op::add>;
#if USE_DNNL
struct dnnl_add : dnnl_extend_op<dnnl_add, dnnl::binary, op::add>
{
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::algorithm::binary_add,
m.at(DNNL_ARG_SRC_0),
m.at(DNNL_ARG_SRC_1),
m.at(DNNL_ARG_DST)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
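dnnl_add above is the whole cost of adding a DNNL-backed binary operator: derive from dnnl_extend_op and return a descriptor. As a hypothetical illustration (not part of this PR), an elementwise max could follow the same pattern, assuming MIGraphX's op::max header and an oneDNN release that exposes dnnl::algorithm::binary_max.

#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/max.hpp> // assumed: migraphx's elementwise max op
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
// Hypothetical operator, mirroring dnnl_add/dnnl_mul from this PR.
struct dnnl_max : dnnl_extend_op<dnnl_max, dnnl::binary, op::max>
{
    dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {dnnl::algorithm::binary_max, // availability depends on the oneDNN version
                m.at(DNNL_ARG_SRC_0),
                m.at(DNNL_ARG_SRC_1),
                m.at(DNNL_ARG_DST)};
    }
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx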
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_allocate : auto_register_op<cpu_allocate>
{
shape s;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.s, "shape"));
}
std::string name() const { return "cpu::allocate"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(0);
return s;
}
argument compute(context&, const shape& output_shape, const std::vector<argument>&) const
{
argument result{output_shape};
return result;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/cpu/allocation_model.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
std::string cpu_allocation_model::name() const { return "cpu::allocate"; }
operation cpu_allocation_model::allocate(const shape& s) const
{
return make_op(name(), {{"shape", to_value(s)}});
}
std::string cpu_allocation_model::copy() const { return "cpu::copy"; }
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
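cpu_allocation_model lets target-independent passes (the adjust_allocation and memory-coloring work mentioned in the commit list) create buffers and copies without hard-coding cpu op names. A small illustrative fragment of how it could be queried follows; the surrounding pass is assumed, not shown in the PR.

#include <migraphx/cpu/allocation_model.hpp>
#include <migraphx/shape.hpp>
#include <iostream>

int main()
{
    migraphx::cpu::cpu_allocation_model am;
    migraphx::shape s{migraphx::shape::float_type, {4, 4}};
    migraphx::operation alloc = am.allocate(s); // a "cpu::allocate" op carrying the shape
    std::cout << alloc.name() << " / copy op: " << am.copy() << "\n";
    return 0;
}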
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/concat.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
struct dnnl_concat : dnnl_extend_op<dnnl_concat, dnnl::concat, op::concat>
{
std::vector<int> arg_map(int size) const
{
std::vector<int> result(size);
std::iota(result.begin(), result.end(), DNNL_ARG_MULTIPLE_SRC);
return result;
}
// Custom desc class since it's missing in dnnl
struct desc
{
dnnl::memory::desc dst;
std::size_t axis = 1;
std::vector<dnnl::memory::desc> srcs;
};
desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
std::vector<dnnl::memory::desc> srcs;
srcs.reserve(m.size() - 1);
for(auto i = 0; i < m.size() - 1; i++)
{
srcs.push_back(m.at(DNNL_ARG_MULTIPLE_SRC + i));
}
return {m.at(DNNL_ARG_DST), std::size_t(op.axis), srcs};
}
auto get_primitive_desc(const desc& d) const
{
return dnnl::concat::primitive_desc(d.dst, d.axis, d.srcs, get_dnnl_context().engine);
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/contiguous.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_unary<op::contiguous>;
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template <class V, class T, class... Ts>
void visit_quantize_impl(V&& v, T&& x, Ts&&... xs)
{
x.visit([&](auto y) { visit_all(xs...)([&](auto... ys) { v(y, ys...); }); });
}
template <class T, class... Ts>
auto visit_quantize(T&& x, Ts&&... xs)
{
return [&](auto v) {
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70100
visit_quantize_impl(v, x, xs...);
};
}
template <class Op>
struct cpu_convolution : auto_register_op<cpu_convolution<Op>>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
visit_quantize(args.back(), args[0], args[1])([&](auto output, auto input, auto weights) {
auto in_lens = input.get_shape().lens();
auto wei_lens = weights.get_shape().lens();
auto wei_n = wei_lens[0];
auto wei_c = wei_lens[1];
std::vector<std::size_t> win_size(wei_lens.begin() + 1, wei_lens.end());
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto w = idx_o[1];
auto n_dim = idx_o.size();
std::vector<std::ptrdiff_t> win_start;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
win_start.push_back(std::ptrdiff_t(idx_o[dim] * op.stride[d_2]) -
std::ptrdiff_t(op.padding[d_2]));
}
const auto group_id = w / (wei_n / op.group);
shape win_shape{output_shape.type(), win_size};
double acc = 0.0;
shape_for_each(win_shape, [&](auto idx_win) {
auto k = idx_win[0];
const auto in_ch = group_id * wei_c + k;
std::vector<std::ptrdiff_t> idx(idx_o.begin(), idx_o.end());
idx[1] = in_ch;
std::transform(idx_win.begin() + 1,
idx_win.end(),
win_start.begin(),
idx.begin() + 2,
[](std::ptrdiff_t ii, std::ptrdiff_t jj) { return ii + jj; });
std::vector<std::ptrdiff_t> idx_wei(idx_o.size());
idx_wei[0] = w;
std::copy(idx_win.begin(), idx_win.end(), idx_wei.begin() + 1);
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
std::equal(idx.begin(),
idx.end(),
in_lens.begin(),
in_lens.end(),
std::less<std::ptrdiff_t>{}))
{
acc +=
input(idx.begin(), idx.end()) * weights(idx_wei.begin(), idx_wei.end());
}
});
output[i] = acc;
});
});
return args.back();
}
};
template struct cpu_convolution<op::quant_convolution>;
template struct cpu_convolution<op::convolution>;
#if USE_DNNL
struct dnnl_convolution
: dnnl_extend_op<dnnl_convolution, dnnl::convolution_forward, op::convolution>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
shape adjust_shape(const shape& x, int i) const
{
auto s = base_adjust_shape(x);
if(i == 1 and op.group > 1)
{
// TODO: Add support for transposed weights
if(not s.standard())
MIGRAPHX_THROW("Weights for grouped convolution must be standard");
auto lens = s.lens();
lens.insert(lens.begin(), op.group);
lens.at(1) /= op.group;
return shape{s.type(), lens};
}
return s;
}
dnnl::convolution_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
// In DNNL dilation is zero-based
auto dilation = op.dilation;
std::transform(
dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; });
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::convolution_auto,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_WEIGHTS),
m.at(DNNL_ARG_DST),
to_dnnl_dims(op.stride),
to_dnnl_dims(dilation),
to_dnnl_dims(op.padding),
to_dnnl_dims(op.padding)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
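dnnl_convolution::adjust_shape reshapes grouped weights because MIGraphX stores them as {output_channels, input_channels/group, k...} while oneDNN expects a leading groups dimension. A worked example of that reshape (shapes chosen here for illustration, not taken from the PR):

#include <migraphx/shape.hpp>
#include <cassert>

int main()
{
    // 8 output channels, 8 input channels, group = 2, 3x3 kernel:
    // MIGraphX weights {8, 4, 3, 3} become grouped weights {2, 4, 4, 3, 3},
    // i.e. insert the group count in front and divide the output channels by it.
    migraphx::shape w{migraphx::shape::float_type, {8, 4, 3, 3}};
    migraphx::shape grouped{migraphx::shape::float_type, {2, 4, 4, 3, 3}};
    assert(w.elements() == grouped.elements());
    (void)w;
    (void)grouped;
    return 0;
}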
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_copy : reduce_dims_base, auto_register_op<cpu_copy>
{
template <class Self, class F>
static auto reflect(Self&, F)
{
return pack();
}
std::string name() const { return "cpu::copy"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2);
return inputs.at(1);
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
visit_all(result, get_arg(args, 0))([&](auto output, auto input) {
pointwise(output, input)(ctx, output.get_shape(), 1024, [](auto& y, auto x) { y = x; });
});
return result.reshape(output_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/cpu/migemm.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
{
    std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
    // Batch must be a single dimension
    shape adjust_shape(shape x, int) const
    {
        auto s     = base_adjust_shape(x);
        auto ndims = s.lens().size();
        if(ndims > 3)
        {
            if(not std::is_sorted(
                   s.strides().begin(), s.strides().begin() + (ndims - 2), std::greater<>{}))
                MIGRAPHX_THROW("Batch transposed");
            std::size_t batch = std::accumulate(
                s.lens().begin(), s.lens().begin() + (ndims - 2), 1, std::multiplies<>{});
            shape s3d{s.type(),
                      {batch, s.lens()[ndims - 2], s.lens()[ndims - 1]},
                      {s.lens()[ndims - 2] * s.lens()[ndims - 1],
                       s.strides()[ndims - 2],
                       s.strides()[ndims - 1]}};
            return s3d;
        }
        else
        {
            return s;
        }
    }
    dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {m.at(DNNL_ARG_SRC), m.at(DNNL_ARG_WEIGHTS), m.at(DNNL_ARG_DST)};
    }
};
#endif
struct cpu_gemm : auto_register_op<cpu_gemm>
{
    op::dot op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::dot"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.standard();
        inputs.pop_back();
        return op.compute_shape(inputs);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
    argument compute(context&, const shape&, std::vector<argument> args) const
    {
        // 3 inputs, it is alpha * A * B + beta * C, then
        // A and B are matrices, and C is of the same shape as A * B
        if(args.size() == 4)
        {
            // no need to consider the value of args[2]
            if(op.beta == 0.0f)
            {
                args.back().visit(
                    [&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                visit_all(args.back(), args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }
            migemm(args.back(), args[0], args[1], op.alpha, op.beta);
            return args.back();
        }
        // 2 input arguments
        migemm(args.back(), args[0], args[1], op.alpha, 0.0f);
        return args.back();
    }
};
struct cpu_quant_gemm : auto_register_op<cpu_quant_gemm>
{
    op::quant_dot op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::quant_dot"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.standard();
        inputs.pop_back();
        return op.compute_shape(inputs);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
    argument compute(context&, const shape&, std::vector<argument> args) const
    {
        // 3 inputs, it is alpha * A * B + beta * C, then
        // A and B are matrices, and C is of the same shape to A * B
        // first, convert the args[0] and args[1] from int8_t to int32_t
        argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
        argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
        arg_0.visit([&](auto output) {
            args.at(0).visit(
                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
        });
        arg_1.visit([&](auto output) {
            args.at(1).visit(
                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
        });
        if(args.size() == 4)
        {
            // no need to consider the value of args[2]
            if(op.beta == 0)
            {
                args.back().visit(
                    [&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                visit_all(args.back(), args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }
            migemm(args.back(), arg_0, arg_1, op.alpha, op.beta);
            return args.back();
        }
        // 2 input arguments
        migemm(args.back(), arg_0, arg_1, op.alpha, int32_t{0});
        return args.back();
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
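dnnl_gemm::adjust_shape collapses any leading batch dimensions into one because dnnl::matmul accepts only a single batch dimension. A worked example of that flattening on a concrete 4-D shape (illustrative, not from the PR):

#include <migraphx/shape.hpp>
#include <cassert>

int main()
{
    // A standard 4-D dot input {2, 3, 4, 5} (strides {60, 20, 5, 1}) is presented to
    // dnnl::matmul as a single-batch 3-D shape {6, 4, 5} with strides {20, 5, 1}.
    migraphx::shape s{migraphx::shape::float_type, {2, 3, 4, 5}};
    migraphx::shape s3d{migraphx::shape::float_type, {6, 4, 5}, {20, 5, 1}};
    assert(s.elements() == s3d.elements());
    (void)s;
    (void)s3d;
    return 0;
}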
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_allocation_model
{
std::string name() const;
std::string copy() const;
operation allocate(const shape& s) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/cpu/parallel.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#ifdef USE_DNNL
struct context
{
    void finish() const {}
    template <class F>
    void bulk_execute(std::size_t n, std::size_t min_grain, F f)
    {
        cpu::parallel_for(n, min_grain, f);
    }
    template <class F>
    void bulk_execute(std::size_t n, F f)
    {
        this->bulk_execute(n, 256, f);
    }
};
#else
struct context
{
    void finish() const {}
    template <class F>
    void bulk_execute(std::size_t n, std::size_t min_grain, F f)
    {
        const auto threadsize =
            std::min<std::size_t>(std::thread::hardware_concurrency(), n / min_grain);
        std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
        par_for(threadsize, 1, [&](auto tid) {
            std::size_t work = tid * grainsize;
            f(work, std::min(n, work + grainsize));
        });
    }
    template <class F>
    void bulk_execute(std::size_t n, F f)
    {
        this->bulk_execute(n, 256, f);
    }
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
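Both context variants expose the same bulk_execute interface: the caller passes the element count, a minimum grain size, and a callable that receives a half-open [start, end) range. A minimal usage sketch, assuming only the cpu context header above is on the include path:

#include <migraphx/cpu/context.hpp>
#include <cstddef>
#include <vector>

int main()
{
    migraphx::cpu::context ctx;
    std::vector<float> data(1 << 20, 1.0f);
    // Each worker receives a half-open [start, end) range of at least ~1024 elements.
    ctx.bulk_execute(data.size(), 1024, [&](std::size_t start, std::size_t end) {
        for(std::size_t i = start; i < end; i++)
            data[i] *= 2.0f;
    });
    ctx.finish();
    return 0;
}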
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/check_shapes.hpp>
#include <unordered_map>
#ifdef USE_DNNL
#include <dnnl.hpp>
#include <migraphx/errors.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_context
{
dnnl::engine engine;
dnnl::stream stream;
dnnl_context() : engine(dnnl::engine::kind::cpu, 0), stream(engine) {}
};
inline dnnl_context& get_dnnl_context()
{
static dnnl_context ctx{}; // NOLINT
return ctx;
}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wswitch-enum"
#endif
inline dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
{
using dt = dnnl::memory::data_type;
using st = shape::type_t;
switch(t)
{
case st::half_type: return dt::f16;
case st::float_type: return dt::f32;
case st::int32_type: return dt::s32;
case st::int8_type: return dt::s8;
case st::uint8_type: return dt::u8;
default: MIGRAPHX_THROW("Unsupported data type");
}
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
inline dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
{
switch(n)
{
case 1: return dnnl::memory::format_tag::a;
case 2: return dnnl::memory::format_tag::ab;
case 3: return dnnl::memory::format_tag::abc;
case 4: return dnnl::memory::format_tag::abcd;
case 5: return dnnl::memory::format_tag::abcde;
case 6: return dnnl::memory::format_tag::abcdef;
default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
}
}
template <class R>
inline dnnl::memory::dims to_dnnl_dims(R&& r)
{
return {r.begin(), r.end()};
}
inline dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
{
return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
}
inline dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
}
inline dnnl::memory to_dnnl_memory(const argument& a)
{
return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
}
template <class Derived, class Primitive>
struct dnnl_op : auto_register_op<Derived>
{
std::function<argument(context& ctx, const std::vector<argument>& args)> execute;
static std::vector<shape> to_shapes(const std::vector<argument>& args)
{
std::vector<shape> shapes(args.size());
std::transform(args.begin(), args.end(), shapes.begin(), [](const argument& a) {
return a.get_shape();
});
return shapes;
}
// Map arg index to arg in dnnl
std::vector<int> arg_map(int size) const
{
std::vector<int> result(size);
std::iota(result.begin(), result.end(), DNNL_ARG_SRC_0);
return result;
}
shape base_adjust_shape(const shape& s) const
{
if(s.broadcasted())
{
auto lens = s.lens();
auto strides = s.strides();
std::transform(strides.begin(),
strides.end(),
lens.begin(),
lens.begin(),
[](auto stride, auto len) -> std::size_t {
if(stride == 0)
return 1;
else
return len;
});
return shape{s.type(), lens};
}
return s;
}
shape adjust_shape(const shape& s, int) const { return base_adjust_shape(s); }
std::unordered_map<int, dnnl::memory::desc>
to_memory_desc(const shape& output_shape, const std::vector<shape>& inputs) const
{
const auto& self = static_cast<const Derived&>(*this);
std::unordered_map<int, dnnl::memory::desc> result;
result[DNNL_ARG_DST] = to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size()));
auto m = self.arg_map(inputs.size());
for(int i = 0; i < inputs.size(); i++)
{
result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i));
}
return result;
}
template <class T>
auto get_primitive_desc(const T& desc) const
-> decltype(typename Primitive::primitive_desc(desc, get_dnnl_context().engine))
{
return typename Primitive::primitive_desc(desc, get_dnnl_context().engine);
}
Primitive get_primitive(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
const auto& self = static_cast<const Derived&>(*this);
auto desc = self.get_desc(m);
auto pd = self.get_primitive_desc(desc);
return Primitive(pd);
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return execute(ctx, args);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
void finalize(context&, const shape& output_shape, std::vector<shape> inputs)
{
// Compensate for allocation
inputs.pop_back();
const auto& self = static_cast<const Derived&>(*this);
auto md = to_memory_desc(output_shape, inputs);
auto prim = get_primitive(md);
auto arg_lookup = self.arg_map(inputs.size());
execute = [=](context&, const std::vector<argument>& args) {
#ifndef NDEBUG
// Check that the memory descriptors have not changed
auto debug_args = args;
debug_args.pop_back();
auto debug_md = to_memory_desc(output_shape, to_shapes(debug_args));
for(auto&& p : debug_md)
{
if(p.second == md.at(p.first))
continue;
MIGRAPHX_THROW("Memory descriptor has changed for: " + std::to_string(p.first));
}
#endif
std::unordered_map<int, dnnl::memory> m;
m[DNNL_ARG_DST] = to_dnnl_memory(md.at(DNNL_ARG_DST), args.back());
for(int i = 0; i < args.size() - 1; i++)
m[arg_lookup[i]] = to_dnnl_memory(md.at(arg_lookup[i]), args[i]);
prim.execute(get_dnnl_context().stream, m);
return args.back();
};
}
};
template <class Derived, class Primitive, class Op>
struct dnnl_extend_op : dnnl_op<Derived, Primitive>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "dnnl::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
// check_shapes(inputs, *this).standard();
auto r = migraphx::compute_shape(op, inputs);
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#endif
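dnnl_op hides the usual oneDNN boilerplate: build memory descriptors from migraphx shapes, create the primitive descriptor on the shared engine at finalize time, then execute with a DNNL_ARG_* argument map on the shared stream. For reference, the raw oneDNN v1.x calls it automates look roughly like this standalone sketch (assumes oneDNN is installed and linked; not part of the PR):

#include <dnnl.hpp>
#include <vector>

int main()
{
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng);
    // Descriptor -> primitive descriptor -> primitive, as in dnnl_op::finalize.
    dnnl::memory::desc md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
    dnnl::binary::desc bd(dnnl::algorithm::binary_add, md, md, md);
    dnnl::binary::primitive_desc pd(bd, eng);
    dnnl::binary prim(pd);
    std::vector<float> a(6, 1.0f), b(6, 2.0f), c(6, 0.0f);
    dnnl::memory am(md, eng, a.data()), bm(md, eng, b.data()), cm(md, eng, c.data());
    // Execute with the same DNNL_ARG_* map that dnnl_op builds from arg_map().
    prim.execute(strm, {{DNNL_ARG_SRC_0, am}, {DNNL_ARG_SRC_1, bm}, {DNNL_ARG_DST, cm}});
    strm.wait(); // c now holds 3.0f in every element
    return 0;
}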
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
#include <migraphx/config.hpp>
#if USE_DNNL
#include <omp.h>
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
template <class F>
void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
{
if(threadsize <= 1)
{
f(std::size_t{0}, n);
}
else
{
std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
#pragma omp parallel num_threads(threadsize)
{
std::size_t tid = omp_get_thread_num();
std::size_t work = tid * grainsize;
f(work, std::min(n, work + grainsize));
}
}
}
template <class F>
void parallel_for(std::size_t n, std::size_t min_grain, F f)
{
const auto threadsize = std::min<std::size_t>(omp_get_num_threads(), n / min_grain);
parallel_for_impl(n, threadsize, f);
}
template <class F>
void parallel_for(std::size_t n, F f)
{
const int min_grain = 8;
parallel_for(n, min_grain, f);
}
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/context.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct multi_index
{
multi_index(const shape& s, std::size_t i) : n(s.lens().size())
{
assert(n < max_size);
std::copy(s.lens().begin(), s.lens().end(), dims);
s.multi_copy(i, index, index + max_size);
}
std::size_t size() const { return n; }
std::size_t* begin() { return index; }
const std::size_t* begin() const { return index; }
std::size_t* end() { return index + size(); }
const std::size_t* end() const { return index + size(); }
std::size_t offset(const shape& s) const { return s.index(begin(), end()); }
void carry()
{
std::size_t overflow = 0;
for(std::ptrdiff_t i = size() - 1; i > 0; i--)
{
auto z = index[i] + overflow;
// Reset overflow
overflow = 0;
// Compute overflow using while loop instead of mod
while(z >= dims[i])
{
z -= dims[i];
overflow += 1;
}
index[i] = z;
}
index[0] += overflow;
}
void increment(std::size_t i)
{
index[size() - 1] += i;
carry();
}
multi_index& operator+=(std::size_t i)
{
increment(i);
return *this;
}
multi_index& operator++()
{
increment(1);
return *this;
}
multi_index operator++(int) // NOLINT
{
multi_index result = *this;
increment(1);
return result;
}
private:
static const std::size_t max_size = 5;
std::size_t index[max_size];
std::size_t dims[max_size];
std::size_t n;
};
struct reduce_dims_base
{
std::vector<shape> reduce_shapes;
void finalize(context&, const shape&, const std::vector<shape>& inputs)
{
reduce_shapes = reduce_dims(inputs);
}
argument get_arg(const std::vector<argument>& args, std::size_t i) const
{
if(reduce_shapes.empty())
return args[i];
return args.at(i).reshape(reduce_shapes.at(i));
}
argument get_output() const
{
argument a{reduce_shapes[0]};
return a;
}
};
template <class X, class... Xs>
bool is_standard_offset(const X& x, const Xs&... xs)
{
if(all_of({x, xs...}, [](const auto& s) { return s.standard(); }))
return true;
if(all_of({x, xs...}, [](const auto& s) { return s.packed(); }) and
all_of({xs...}, [&](const auto& s) { return s == x; }))
return true;
return false;
}
template <class... Ts>
auto pointwise(Ts... xs)
{
return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
if(is_standard_offset(xs.get_shape()...))
{
ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
for(auto i = start; i < end; i++)
{
f(xs.data()[i]...);
}
});
}
else
{
assert(base_shape.lens().size() <= 6);
ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
multi_index mi(base_shape, start);
for(auto i = start; i < end; i++)
{
f(xs.data()[mi.offset(xs.get_shape())]...);
++mi;
}
});
}
};
}
template <class Op>
struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2);
auto s = inputs.at(0);
return {s.type(), s.lens()};
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
visit_all(result, get_arg(args, 0))([&](auto output, auto input) {
auto op2 = op;
pointwise(output, input)(
ctx, output.get_shape(), 1024, [op2](auto& y, auto x) { y = op2.apply()(x); });
});
return result.reshape(output_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
template <class Op>
struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(3);
auto s = inputs.at(0);
return {s.type(), s.lens()};
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
visit_all(result, get_arg(args, 0), get_arg(args, 1))(
[&](auto output, auto input1, auto input2) {
auto op2 = op;
pointwise(output, input1, input2)(
ctx, output.get_shape(), 1024, [op2](auto& z, auto x, auto y) {
z = op2.apply()(x, y);
});
});
return result.reshape(output_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
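pointwise() takes the fast path when every tensor has a standard layout or all share the same packed layout, and otherwise walks the iteration space with multi_index, converting each multi-dimensional position into a per-tensor offset. A small illustrative walk over a transposed view (values chosen here, not from the PR):

#include <migraphx/cpu/pointwise.hpp>
#include <iostream>

int main()
{
    migraphx::shape base{migraphx::shape::float_type, {2, 3}};
    // A transposed view over the same 2x3 data: lens {2, 3}, strides {1, 2}.
    migraphx::shape view{migraphx::shape::float_type, {2, 3}, {1, 2}};
    migraphx::cpu::multi_index mi(base, 0);
    for(std::size_t i = 0; i < base.elements(); i++, ++mi)
        std::cout << mi.offset(view) << " "; // prints: 0 2 4 1 3 5
    std::cout << "\n";
    return 0;
}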
#include <migraphx/cpu/migemm.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/requires.hpp>
#include <migraphx/shape_for_each.hpp>
#include <blaze/math/CustomMatrix.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template <class T, class F>
void migemm_impl(
tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::false_type)
{
std::size_t n_dims = cmat.get_shape().lens().size();
std::size_t dim_0 = n_dims - 2;
std::size_t dim_1 = n_dims - 1;
auto k = amat.get_shape().lens()[dim_1];
assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);
shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
auto a_idx = c_idx;
auto b_idx = c_idx;
double s = 0.0;
dfor(k)([&](auto kk) {
a_idx[dim_1] = b_idx[dim_0] = kk;
s += amat(a_idx.begin(), a_idx.end()) * bmat(b_idx.begin(), b_idx.end());
});
cmat(c_idx.begin(), c_idx.end()) = alpha * s + cmat(c_idx.begin(), c_idx.end()) * beta;
});
}
template <class T, class F>
void migemm_impl(tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta)
{
migemm_impl(cmat, amat, bmat, alpha, beta, std::false_type{});
}
template <class F>
void migemm_tpl(
const argument& c_arg, const argument& a_arg, const argument& b_arg, F alpha, F beta)
{
visit_all(c_arg, a_arg, b_arg)(
[&](auto cmat, auto amat, auto bmat) { migemm_impl(cmat, amat, bmat, alpha, beta); });
}
void migemm(
const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
{
migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
}
void migemm(const argument& c_arg,
const argument& a_arg,
const argument& b_arg,
int32_t alpha,
int32_t beta)
{
migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
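migemm is the reference fallback used by cpu_gemm and cpu_quant_gemm: it computes C = alpha * A * B + beta * C element by element with shape_for_each and dfor. A hypothetical driver for it (assuming the migemm.hpp declarations used above; C is zeroed first so that beta = 0 cannot pick up uninitialized values):

#include <migraphx/cpu/migemm.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/shape.hpp>
#include <algorithm>
#include <numeric>

int main()
{
    migraphx::shape s{migraphx::shape::float_type, {2, 2}};
    migraphx::argument a{s}, b{s}, c{s};
    a.visit([](auto v) { std::iota(v.begin(), v.end(), 1.0f); }); // A = [[1, 2], [3, 4]]
    b.visit([](auto v) { std::fill(v.begin(), v.end(), 1.0f); }); // B = [[1, 1], [1, 1]]
    c.visit([](auto v) { std::fill(v.begin(), v.end(), 0.0f); }); // zero C so beta = 0 is safe
    migraphx::cpu::migemm(c, a, b, 1.0f, 0.0f);                   // C = [[3, 3], [7, 7]]
    return 0;
}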
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/mul.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_binary<op::mul>;
#if USE_DNNL
struct dnnl_mul : dnnl_extend_op<dnnl_mul, dnnl::binary, op::mul>
{
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::algorithm::binary_mul,
m.at(DNNL_ARG_SRC_0),
m.at(DNNL_ARG_SRC_1),
m.at(DNNL_ARG_DST)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/pooling.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct max_pool
{
static std::string name() { return "max"; }
template <class T>
static T start()
{
return std::numeric_limits<T>::lowest();
}
static double apply(double x, double y)
{
double m = std::max(x, y);
return (m);
}
static double final(double x, std::size_t) { return (x); }
};
struct avg_pool
{
static std::string name() { return "average"; }
template <class T>
static double start()
{
return 0.0;
}
static double apply(double x, double y) { return x + y; }
static double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
template <class Op>
struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
{
cpu_pooling() = default;
cpu_pooling(op::pooling pop) : op(std::move(pop)) {}
op::pooling op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
visit_all(args.back(), args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
auto in_s = input.get_shape();
auto in_lens = in_s.lens();
std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto n_dim = idx_o.size();
std::vector<std::size_t> win_start;
std::vector<std::size_t> win_size;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -
static_cast<int>(op.padding[d_2]);
int end = std::min(start + op.lengths[d_2], in_lens[dim]);
start = std::max(start, 0);
win_start.push_back(start);
win_size.push_back(end - start);
}
shape win_shape{output_shape.type(), win_size};
auto pool_size = win_shape.elements();
double acc = Op::template start<type>();
shape_for_each(win_shape, [&](auto idx_w) {
auto idx = idx_o;
std::transform(idx_w.begin(),
idx_w.end(),
win_start.begin(),
idx.begin() + 2,
[](auto ii, auto jj) { return ii + jj; });
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
idx < in_lens)
{
acc = Op::apply(acc, input[in_s.index(idx)]);
}
});
output[i] = type(Op::final(acc, pool_size));
});
});
return args.back();
}
};
template struct cpu_pooling<avg_pool>;
template struct cpu_pooling<max_pool>;
#if USE_DNNL
struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::pooling>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC}; }
dnnl::pooling_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
auto algo = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
return {dnnl::prop_kind::forward_inference,
algo,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_DST),
to_dnnl_dims(op.stride),
to_dnnl_dims(op.lengths),
to_dnnl_dims(op.padding),
to_dnnl_dims(op.padding)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/relu.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_unary<op::relu>;
#if USE_DNNL
struct dnnl_relu : dnnl_extend_op<dnnl_relu, dnnl::eltwise_forward, op::relu>
{
dnnl::eltwise_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::eltwise_relu,
m.at(DNNL_ARG_SRC_0)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx