Unverified commit a0b570b2, authored by Paul Fultz II, committed by GitHub

Add more supported operators and optimizations for the cpu backend (#746)



* Add eliminate_data_type pass

* Formatting

* Auto convert quant ops

* Formatting

* Flip the order of decompose

* Compute max size differently

* Formatting

* Clamp values in convert

* Formatting

* Fix loss of precision in reduce

* Formatting

* Fix bugs in reduction

* Fix accumulator type in reference softmax implementation

* Formatting

* Update convert test

* Remove unused variables

* Remove unnecessary quant_dot check

* Formatting

* Add tests

* Formatting

* Remove unused code

* Remove duplicate ops

* Remove blaze dependency

* Use set since shape::type_t is not hashable on gcc 5

* Formatting

* Add dnnl binary op

* Formatting

* Add binary and eltwise

* Formatting

* Add softmax

* Formatting

* Remove unused operators

* Add missing files

* Formatting

* Add lrn

* Formatting

* Add deconvolution

* Formatting

* Change allocate default

* Add reorder

* Formatting

* Add reductions

* Formatting

* Sort lines

* Change literals in another loop

* Add pow operator

* Formatting

* Add pow operator

* Formatting

* Make sure shapes are packed

* Allow broadcasted inputs

* Remove unused operators

* Simplify functions

* Remove softmax

* Add sub and erf functions

* Formatting

* Fix bug

* Formatting

* Improve parallelism

* Formatting

* Allow multiple batch dimensions

* Formatting

* Move literal transforms out of lowering

* Formatting

* Add gather operator

* Sort lines

* Add early exit for carry

* Formatting

* Add missing concat

* Rename macro

* Fix deep nesting

* Formatting

* Fix cppcheck issues

* Remove else

* Move attribute to typedef

* Formatting

* Disable maybe-uninitialized warning since it's broken on gcc

* Add constexpr default constructor

* Formatting

* Fix compiler warnings

* Fix adjust_allocation test
Co-authored-by: Shucai Xiao <shucai@gmail.com>
Co-authored-by: mvermeulen <5479696+mvermeulen@users.noreply.github.com>
parent 165d1a17
@@ -177,8 +177,7 @@ rocm_enable_cppcheck(
     passedByValue
     unusedStructMember
     functionStatic
-    functionConst:*program.*
-    functionConst:*module.*
+    functionConst
     shadowFunction
     shadowVar
     shadowVariable
@@ -107,9 +107,12 @@ else()
 else()
     list(APPEND CMAKE_COMPILER_WARNINGS
         -Wno-missing-field-initializers
+        -Wno-maybe-uninitialized
         # -Wno-deprecated-declarations
     )
 endif()
-add_definitions(${CMAKE_COMPILER_WARNINGS})
+foreach(COMPILER_WARNING ${CMAKE_COMPILER_WARNINGS})
+    add_compile_options($<$<COMPILE_LANGUAGE:${COMPILER}>:${COMPILER_WARNING}>)
+endforeach()
 endforeach()
 endif ()
eliminate_contiguous.cpp:
@@ -77,9 +77,7 @@ void eliminate_contiguous::apply(module& p) const
     auto args = ins->inputs();
     for(auto arg : ins->inputs())
     {
-        // TODO: Pass in names for the operator in the constructor instead
-        // of using ends_with
-        if(ends_with(arg->name(), "contiguous"))
+        if(arg->name() == op_name)
         {
             auto new_args = args;
             auto prev = arg->inputs().front();
check_shapes.hpp:
@@ -151,6 +151,13 @@ struct check_shapes
         return *this;
     }

+    const check_shapes& packed_or_broadcasted() const
+    {
+        if(!this->all_of([](const shape& s) { return s.packed() or s.broadcasted(); }))
+            MIGRAPHX_THROW(prefix() + "Shapes are not packed nor broadcasted");
+        return *this;
+    }
+
     const check_shapes& not_transposed() const
     {
         if(!this->all_of([](const shape& s) { return not s.transposed(); }))
eliminate_contiguous.hpp:
@@ -15,6 +15,7 @@ struct module;
  */
 struct eliminate_contiguous
 {
+    std::string op_name;
     std::string name() const { return "eliminate_contiguous"; }
     void apply(module& p) const;
 };
cpu target CMakeLists.txt:
+include(CheckCXXCompilerFlag)
 add_library(migraphx_cpu
-    allocation_model.cpp
     allocate.cpp
-    add.cpp
-    contiguous.cpp
+    allocation_model.cpp
+    binary.cpp
     concat.cpp
     convolution.cpp
     copy.cpp
-    mul.cpp
-    pooling.cpp
-    relu.cpp
+    deconvolution.cpp
+    dnnl.cpp
+    eltwise.cpp
+    erf.cpp
+    gather.cpp
     gemm.cpp
-    target.cpp
+    logsoftmax.cpp
     lowering.cpp
+    lrn.cpp
+    pooling.cpp
+    reduction.cpp
+    reorder.cpp
+    softmax.cpp
+    sub.cpp
+    target.cpp
+    write_literals.cpp
 )
 set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
 rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
@@ -23,6 +34,7 @@ find_package(dnnl REQUIRED)
 rocm_clang_tidy_check(migraphx_cpu)
 target_link_libraries(migraphx_cpu PRIVATE migraphx Threads::Threads)
 target_link_libraries(migraphx_cpu PRIVATE DNNL::dnnl)
 find_package(OpenMP)
 target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
 # Add library path to rpath to workaround issues with our broken packages
new file binary.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
{
std::string algo;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.algo, "algo"));
}
std::string name() const { return "dnnl::binary"; }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes{inputs, *this}.has(2);
auto s0 = inputs.at(0);
auto s1 = inputs.at(1);
auto r = s0;
if(s0 != s1 or !s0.packed())
{
r = shape{s0.type(), s0.lens()};
}
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {to_dnnl_algo(algo), m.at(DNNL_ARG_SRC_0), m.at(DNNL_ARG_SRC_1), m.at(DNNL_ARG_DST)};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
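
A convention worth noting in compute_shape above: the backend appends the preallocated output buffer as a final input, which is why validation starts by popping it off. The following standalone sketch (toy names, not from the codebase) illustrates just that calling convention:

#include <cassert>
#include <vector>

// Stand-in for a shape; the real check validates types and lens as well.
struct toy_shape
{
    int elements;
};

// Ops that write into a caller-provided buffer take it as the last input;
// shape computation pops it off before validating the real operands.
toy_shape compute_shape_with_allocation(std::vector<toy_shape> inputs)
{
    assert(inputs.size() >= 2);
    inputs.pop_back();          // drop the trailing allocation argument
    assert(inputs.size() == 2); // a binary op sees exactly two operands
    return inputs.front();      // result shape follows the first operand here
}

int main()
{
    // two operands plus the output allocation
    toy_shape r = compute_shape_with_allocation({{16}, {16}, {16}});
    assert(r.elements == 16);
}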
new file deconvolution.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/deconvolution.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_deconvolution
: dnnl_extend_op<dnnl_deconvolution, dnnl::deconvolution_forward, op::deconvolution>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
shape adjust_shape(const shape& x, int i) const
{
auto s = base_adjust_shape(x);
if(i == 1)
{
// The input and output channels are flipped for dnnl
auto lens = s.lens();
std::swap(lens[0], lens[1]);
auto strides = s.strides();
std::swap(strides[0], strides[1]);
return {s.type(), lens, strides};
}
return s;
}
dnnl::deconvolution_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
// In DNNL dilation is zero-based
auto dilation = op.dilation;
std::transform(
dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; });
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::deconvolution_direct,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_WEIGHTS),
m.at(DNNL_ARG_DST),
to_dnnl_dims(op.stride),
to_dnnl_dims(dilation),
to_dnnl_dims(op.padding),
to_dnnl_dims(op.padding)};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
new file dnnl.cpp:
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
dnnl_context& get_dnnl_context()
{
static dnnl_context ctx{}; // NOLINT
return ctx;
}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wswitch-enum"
#endif
dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
{
using dt = dnnl::memory::data_type;
using st = shape::type_t;
switch(t)
{
case st::half_type: return dt::f16;
case st::float_type: return dt::f32;
case st::int32_type: return dt::s32;
case st::int8_type: return dt::s8;
case st::uint8_type: return dt::u8;
default: MIGRAPHX_THROW("Unsupported data type");
}
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
{
switch(n)
{
case 1: return dnnl::memory::format_tag::a;
case 2: return dnnl::memory::format_tag::ab;
case 3: return dnnl::memory::format_tag::abc;
case 4: return dnnl::memory::format_tag::abcd;
case 5: return dnnl::memory::format_tag::abcde;
case 6: return dnnl::memory::format_tag::abcdef;
default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
}
}
dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
{
return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
}
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
}
dnnl::memory to_dnnl_memory(const argument& a)
{
return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
}
// clang-format off
#define MIGRAPHX_VISIT_DNNL_ALGO(m) \
m(undef) \
m(convolution_auto) \
m(convolution_direct) \
m(convolution_winograd) \
m(deconvolution_direct) \
m(deconvolution_winograd) \
m(eltwise_relu) \
m(eltwise_tanh) \
m(eltwise_elu) \
m(eltwise_square) \
m(eltwise_abs) \
m(eltwise_sqrt) \
m(eltwise_swish) \
m(eltwise_linear) \
m(eltwise_bounded_relu) \
m(eltwise_soft_relu) \
m(eltwise_logistic) \
m(eltwise_exp) \
m(eltwise_gelu) \
m(eltwise_gelu_tanh) \
m(eltwise_gelu_erf) \
m(eltwise_log) \
m(eltwise_clip) \
m(eltwise_pow) \
m(eltwise_round) \
m(eltwise_relu_use_dst_for_bwd) \
m(eltwise_tanh_use_dst_for_bwd) \
m(eltwise_elu_use_dst_for_bwd) \
m(eltwise_sqrt_use_dst_for_bwd) \
m(eltwise_logistic_use_dst_for_bwd) \
m(eltwise_exp_use_dst_for_bwd) \
m(lrn_across_channels) \
m(lrn_within_channel) \
m(pooling_max) \
m(pooling_avg) \
m(pooling_avg_include_padding) \
m(pooling_avg_exclude_padding) \
m(vanilla_rnn) \
m(vanilla_lstm) \
m(vanilla_gru) \
m(lbr_gru) \
m(binary_add) \
m(binary_mul) \
m(binary_max) \
m(binary_min) \
m(binary_div) \
m(resampling_nearest) \
m(resampling_linear) \
m(reduction_max) \
m(reduction_min) \
m(reduction_sum) \
m(reduction_mul) \
m(reduction_mean) \
m(reduction_norm_lp_max) \
m(reduction_norm_lp_sum) \
m(reduction_norm_lp_power_p_max) \
m(reduction_norm_lp_power_p_sum)
// clang-format on
const std::unordered_map<std::string, dnnl::algorithm>& dnnl_algo_map()
{
static const std::unordered_map<std::string, dnnl::algorithm> m = {
#define MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR(x) {#x, dnnl::algorithm::x},
MIGRAPHX_VISIT_DNNL_ALGO(MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR)
#undef MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR
};
return m;
}
dnnl::algorithm to_dnnl_algo(const std::string& name)
{
if(dnnl_algo_map().count(name) == 0)
MIGRAPHX_THROW("Missing dnnl algo: " + name);
return dnnl_algo_map().at(name);
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
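
The MIGRAPHX_VISIT_DNNL_ALGO list above is an X-macro: one list macro, expanded at each use site with a different visitor to generate code such as the string-to-enum table. A minimal self-contained sketch of the same pattern (toy names, not the real algo list):

#include <iostream>
#include <string>
#include <unordered_map>

enum class algo { binary_add, binary_mul, binary_max };

// The list is written once; each entry is passed through the visitor macro m.
#define VISIT_ALGO(m) m(binary_add) m(binary_mul) m(binary_max)

const std::unordered_map<std::string, algo>& algo_map()
{
    static const std::unordered_map<std::string, algo> m = {
#define GENERATE_ENTRY(x) {#x, algo::x},
        VISIT_ALGO(GENERATE_ENTRY)
#undef GENERATE_ENTRY
    };
    return m;
}

int main()
{
    std::cout << (algo_map().count("binary_mul") ? "found" : "missing") << "\n";
}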
new file eltwise.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_eltwise : dnnl_op<dnnl_eltwise, dnnl::eltwise_forward>
{
std::string algo;
float alpha = 0;
float beta = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.algo, "algo"), f(self.alpha, "alpha"), f(self.beta, "beta"));
}
std::string name() const { return "dnnl::eltwise"; }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes{inputs, *this}.has(1).packed();
auto s = inputs.at(0);
auto r = s;
if(not s.packed())
r = shape{s.type(), s.lens()};
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
dnnl::eltwise_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::prop_kind::forward_inference,
to_dnnl_algo(algo),
m.at(DNNL_ARG_SRC_0),
alpha,
beta};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
contiguous.cpp renamed to erf.cpp:
 #include <migraphx/config.hpp>
 #include <migraphx/cpu/pointwise.hpp>
-#include <migraphx/op/contiguous.hpp>
+#include <migraphx/op/erf.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

-template struct cpu_unary<op::contiguous>;
+template struct cpu_unary<op::erf>;

 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
new file gather.cpp:
#include <migraphx/config.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/op/gather.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_gather : auto_register_op<cpu_gather>
{
op::gather op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes(inputs, *this).standard();
return migraphx::compute_shape(op, inputs);
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
std::size_t nelements = output_shape.elements();
auto lens = args[0].get_shape().lens();
auto axis_dim_size = lens[op.axis];
lens[op.axis] = args[1].get_shape().elements();
shape out_comp{output_shape.type(), lens};
visit_all(args.back(), args[0])([&](auto output, auto input) {
args[1].visit([&](auto indices) {
const auto* indices_ptr = indices.data();
auto* output_ptr = output.data();
ctx.bulk_execute(nelements, 1024, [=](auto start, auto end) {
for(auto i = start; i < end; i++)
{
auto idx = out_comp.multi(i);
auto in_index = indices_ptr[idx[op.axis]];
in_index = (in_index < 0) ? in_index + axis_dim_size : in_index;
idx[op.axis] = in_index;
output_ptr[i] = input(idx.begin(), idx.end());
}
});
});
});
return args.back();
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
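
For reference, gather picks elements along one axis, wrapping negative indices by the axis length exactly as the in_index adjustment above does. A simplified standalone sketch of those semantics for a 2-D row gather (hypothetical helper, not the operator itself):

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> gather_axis0(const std::vector<float>& in,
                                std::size_t rows, std::size_t cols,
                                const std::vector<long>& indices)
{
    std::vector<float> out;
    for(long idx : indices)
    {
        long r = idx < 0 ? idx + static_cast<long>(rows) : idx; // wrap negatives
        for(std::size_t c = 0; c < cols; c++)
            out.push_back(in[r * cols + c]);
    }
    return out;
}

int main()
{
    std::vector<float> m = {1, 2, 3, 4, 5, 6}; // a 3x2 matrix, row-major
    auto g = gather_axis0(m, 3, 2, {-1, 0});   // rows 2 then 0
    for(float x : g)
        std::cout << x << " "; // prints: 5 6 1 2
    std::cout << "\n";
}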
gemm.cpp:
@@ -15,30 +15,7 @@ struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
 {
     std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }

-    // Batch must be a single dimension
-    shape adjust_shape(shape x, int) const
-    {
-        auto s = base_adjust_shape(x);
-        auto ndims = s.lens().size();
-        if(ndims > 3)
-        {
-            if(not std::is_sorted(
-                   s.strides().begin(), s.strides().begin() + (ndims - 2), std::greater<>{}))
-                MIGRAPHX_THROW("Batch transposed");
-            std::size_t batch = std::accumulate(
-                s.lens().begin(), s.lens().begin() + (ndims - 2), 1, std::multiplies<>{});
-            shape s3d{s.type(),
-                      {batch, s.lens()[ndims - 2], s.lens()[ndims - 1]},
-                      {s.lens()[ndims - 2] * s.lens()[ndims - 1],
-                       s.strides()[ndims - 2],
-                       s.strides()[ndims - 1]}};
-            return s3d;
-        }
-        else
-        {
-            return s;
-        }
-    }
+    void required(const check_shapes& cs) const { cs.not_broadcasted(); }

     dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
     {
cpu dnnl.hpp:
@@ -21,46 +21,11 @@ struct dnnl_context
     dnnl_context() : engine(dnnl::engine::kind::cpu, 0), stream(engine) {}
 };

-inline dnnl_context& get_dnnl_context()
-{
-    static dnnl_context ctx{}; // NOLINT
-    return ctx;
-}
+dnnl_context& get_dnnl_context();

-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wswitch-enum"
-#endif
-inline dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
-{
-    using dt = dnnl::memory::data_type;
-    using st = shape::type_t;
-    switch(t)
-    {
-    case st::half_type: return dt::f16;
-    case st::float_type: return dt::f32;
-    case st::int32_type: return dt::s32;
-    case st::int8_type: return dt::s8;
-    case st::uint8_type: return dt::u8;
-    default: MIGRAPHX_THROW("Unsupported data type");
-    }
-}
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
+dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t);

-inline dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
-{
-    switch(n)
-    {
-    case 1: return dnnl::memory::format_tag::a;
-    case 2: return dnnl::memory::format_tag::ab;
-    case 3: return dnnl::memory::format_tag::abc;
-    case 4: return dnnl::memory::format_tag::abcd;
-    case 5: return dnnl::memory::format_tag::abcde;
-    case 6: return dnnl::memory::format_tag::abcdef;
-    default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
-    }
-}
+dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n);

 template <class R>
 inline dnnl::memory::dims to_dnnl_dims(R&& r)
@@ -68,20 +33,13 @@ inline dnnl::memory::dims to_dnnl_dims(R&& r)
     return {r.begin(), r.end()};
 }

-inline dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
-{
-    return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
-}
+dnnl::memory::desc to_dnnl_memory_desc(const shape& s);

-inline dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
-{
-    return dnnl::memory(desc, get_dnnl_context().engine, a.data());
-}
+dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a);

-inline dnnl::memory to_dnnl_memory(const argument& a)
-{
-    return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
-}
+dnnl::memory to_dnnl_memory(const argument& a);
+
+dnnl::algorithm to_dnnl_algo(const std::string& name);

 template <class Derived, class Primitive>
 struct dnnl_op : auto_register_op<Derived>
@@ -208,12 +166,16 @@ struct dnnl_extend_op : dnnl_op<Derived, Primitive>
         return migraphx::reflect(self.op, f);
     }

+    // dnnl has some issues with non-packed inputs
+    void required(const check_shapes& cs) const { cs.packed_or_broadcasted(); }
+
     std::string name() const { return "dnnl::" + op.name(); }
     shape compute_shape(std::vector<shape> inputs) const
     {
+        const auto& self = static_cast<const Derived&>(*this);
         // Compensate for allocation
         inputs.pop_back();
-        // check_shapes(inputs, *this).standard();
+        self.required(check_shapes(inputs, self));
         auto r = migraphx::compute_shape(op, inputs);
         // Call to get_primitive to make sure an algo is available
         this->get_primitive(this->to_memory_desc(r, inputs));
cpu parallel.hpp:
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP

+// #define MIGRAPHX_DISABLE_OMP
 #include <migraphx/config.hpp>
+#ifdef MIGRAPHX_DISABLE_OMP
+#include <migraphx/par_for.hpp>
+#else
 #include <omp.h>
+#endif

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

+#ifdef MIGRAPHX_DISABLE_OMP
+inline std::size_t max_threads() { return std::thread::hardware_concurrency(); }
+
+template <class F>
+void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+{
+    if(threadsize <= 1)
+    {
+        f(std::size_t{0}, n);
+    }
+    else
+    {
+        std::vector<joinable_thread> threads(threadsize);
+// Using const here causes gcc 5 to ICE
+#if(!defined(__GNUC__) || __GNUC__ != 5)
+        const
+#endif
+            std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+        std::size_t work = 0;
+        std::generate(threads.begin(), threads.end(), [=, &work] {
+            auto result =
+                joinable_thread([=]() mutable { f(work, std::min(n, work + grainsize)); });
+            work += grainsize;
+            return result;
+        });
+        // cppcheck-suppress unsignedLessThanZero
+        assert(work >= n);
+    }
+}
+#else
+inline std::size_t max_threads() { return omp_get_max_threads(); }
+
 template <class F>
 void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 {
@@ -18,19 +59,19 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
     else
     {
         std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
-#pragma omp parallel num_threads(threadsize)
+#pragma omp parallel for num_threads(threadsize) schedule(static, 1) private(grainsize, n)
+        for(std::size_t tid = 0; tid < threadsize; tid++)
         {
-            std::size_t tid = omp_get_thread_num();
             std::size_t work = tid * grainsize;
             f(work, std::min(n, work + grainsize));
         }
     }
 }
+#endif

 template <class F>
 void parallel_for(std::size_t n, std::size_t min_grain, F f)
 {
-    const auto threadsize = std::min<std::size_t>(omp_get_num_threads(), n / min_grain);
+    const auto threadsize = std::min<std::size_t>(max_threads(), n / min_grain);
     parallel_for_impl(n, threadsize, f);
 }
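
The OMP path above hands each thread one contiguous chunk of ceil(n/threadsize) elements, clamping the last chunk to n. A standalone sketch of that partitioning (run sequentially here so the ranges are easy to print; names are illustrative):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

template <class F>
void split_ranges(std::size_t n, std::size_t threadsize, F f)
{
    std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
    for(std::size_t tid = 0; tid < threadsize; tid++)
    {
        std::size_t work = tid * grainsize;
        f(work, std::min(n, work + grainsize)); // [start, end) for this "thread"
    }
}

int main()
{
    // 10 items over 4 workers: [0,3) [3,6) [6,9) [9,10)
    split_ranges(10, 4, [](std::size_t b, std::size_t e) {
        std::cout << "[" << b << ", " << e << ")\n";
    });
}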
cpu pointwise.hpp:
@@ -14,6 +14,8 @@ namespace cpu {
 struct multi_index
 {
+    constexpr multi_index() = default;
+
     multi_index(const shape& s, std::size_t i) : n(s.lens().size())
     {
         assert(n < max_size);
@@ -21,17 +23,17 @@ struct multi_index
         s.multi_copy(i, index, index + max_size);
     }

-    std::size_t size() const { return n; }
+    constexpr std::size_t size() const { return n; }

-    std::size_t* begin() { return index; }
-    const std::size_t* begin() const { return index; }
+    constexpr std::size_t* begin() { return index; }
+    constexpr const std::size_t* begin() const { return index; }

-    std::size_t* end() { return index + size(); }
-    const std::size_t* end() const { return index + size(); }
+    constexpr std::size_t* end() { return index + size(); }
+    constexpr const std::size_t* end() const { return index + size(); }

     std::size_t offset(const shape& s) const { return s.index(begin(), end()); }

-    void carry()
+    constexpr void carry()
     {
         std::size_t overflow = 0;
         for(std::ptrdiff_t i = size() - 1; i > 0; i--)
@@ -40,29 +42,34 @@ struct multi_index
             // Reset overflow
             overflow = 0;
             // Compute overflow using while loop instead of mod
+            // overflow = z / dims[i];
+            // z = z % dims[i];
             while(z >= dims[i])
             {
                 z -= dims[i];
                 overflow += 1;
             }
             index[i] = z;
+            // Exit if there is no overflow
+            if(overflow == 0)
+                return;
         }
         index[0] += overflow;
     }

-    void increment(std::size_t i)
+    constexpr void increment(std::size_t i)
     {
         index[size() - 1] += i;
         carry();
     }

-    multi_index& operator+=(std::size_t i)
+    constexpr multi_index& operator+=(std::size_t i)
     {
         increment(i);
         return *this;
     }

-    multi_index& operator++()
+    constexpr multi_index& operator++()
     {
         increment(1);
         return *this;
@@ -76,9 +83,9 @@ struct multi_index

     private:
     static const std::size_t max_size = 5;
-    std::size_t index[max_size];
-    std::size_t dims[max_size];
-    std::size_t n;
+    std::size_t index[max_size] = {};
+    std::size_t dims[max_size]  = {};
+    std::size_t n               = 0;
 };
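
The carry routine treats the index as a mixed-radix number: overflow propagates from the last digit toward the first, using repeated subtraction instead of division, and the new early exit stops as soon as a digit absorbs the overflow. A standalone sketch with a worked example:

#include <cstddef>
#include <iostream>

void carry(std::size_t* index, const std::size_t* dims, std::size_t n)
{
    std::size_t overflow = 0;
    for(std::ptrdiff_t i = n - 1; i > 0; i--)
    {
        std::size_t z = index[i] + overflow;
        overflow      = 0;
        while(z >= dims[i]) // cheaper than z / dims[i] when overflow is small
        {
            z -= dims[i];
            overflow += 1;
        }
        index[i] = z;
        if(overflow == 0) // early exit: nothing left to propagate
            return;
    }
    index[0] += overflow;
}

int main()
{
    std::size_t idx[3]  = {0, 1, 5};
    std::size_t dims[3] = {2, 2, 3};
    carry(idx, dims, 3); // {0,1,5} in radices {2,2,3} normalizes to {1,0,2}
    std::cout << idx[0] << idx[1] << idx[2] << "\n"; // prints 102
}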
struct reduce_dims_base struct reduce_dims_base
...@@ -104,6 +111,153 @@ struct reduce_dims_base ...@@ -104,6 +111,153 @@ struct reduce_dims_base
} }
}; };
template <class T, std::size_t N>
struct vec
{
using array_type = std::array<T, N>;
using vector_type __attribute__((vector_size(N * sizeof(T)))) = T;
union
{
array_type array;
vector_type vector;
};
static_assert(sizeof(array_type) == sizeof(vector_type), "Not the same size");
};
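
The vec union above overlays a std::array with a GCC/Clang vector_size type, so elements stay individually addressable while arithmetic can operate on the whole SIMD register. A standalone sketch assuming a compiler with these vector extensions (toy names):

#include <array>
#include <cstddef>
#include <iostream>

template <class T, std::size_t N>
struct simd_vec
{
    using array_type = std::array<T, N>;
    using vector_type __attribute__((vector_size(N * sizeof(T)))) = T;
    union
    {
        array_type array;   // element-wise, addressable access
        vector_type vector; // whole-register SIMD arithmetic
    };
};

int main()
{
    simd_vec<float, 4> a, b, c;
    for(std::size_t i = 0; i < 4; i++)
    {
        a.array[i] = i + 1;        // 1 2 3 4
        b.array[i] = 10 * (i + 1); // 10 20 30 40
    }
    c.vector = a.vector + b.vector; // one vector add across all four lanes
    for(float x : c.array)
        std::cout << x << " "; // 11 22 33 44
    std::cout << "\n";
}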
template <class T>
constexpr std::integral_constant<std::size_t, 0> vec_size(const T&)
{
return {};
}
template <class T, std::size_t N>
constexpr std::integral_constant<std::size_t, N> vec_size(const vec<T, N>&)
{
return {};
}
template <class T>
constexpr std::size_t vec_size()
{
return decltype(vec_size(std::declval<T>())){};
}
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() > 0))>
void vec_apply(F f, V& v, Vs... vs)
{
assert(all_of({vec_size<Vs>()...}, [&](auto n) { return n == vec_size<V>(); }));
assert(vec_size<V>() == v.array.size());
for(std::size_t i = 0; i < vec_size<V>(); i++)
f(v.array[i], vs.vector[i]...);
}
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() == 0))>
void vec_apply(F f, V& v, Vs&... vs)
{
f(v, vs...);
}
inline std::size_t find_packed_len(const shape& s)
{
for(std::size_t i = 0; i < s.lens().size(); i++)
{
if(s.lens()[i] > 1 and s.strides()[i] == 1)
{
return i;
}
}
return -1;
}
template <std::size_t N>
shape vectorize(const shape& s)
{
assert(s.standard() or s.broadcasted());
auto lens = s.lens();
if(s.broadcasted())
{
auto n = find_packed_len(s);
assert(n != -1);
assert((lens[n] % N) == 0);
lens[n] /= N;
return {s.type(), lens, s.strides()};
}
assert((lens.back() % N) == 0);
lens.back() /= N;
return {s.type(), lens};
}
template <std::size_t N, class T>
tensor_view<vec<T, N>> vectorize(tensor_view<T> tv)
{
return {vectorize<N>(tv.get_shape()), reinterpret_cast<vec<T, N>*>(tv.data())};
}
template <class T>
struct is_vector_type : std::false_type
{
};
template <>
struct is_vector_type<float> : std::true_type
{
};
template <class... Ts>
struct is_vector_tensor_view : and_<is_vector_type<typename Ts::value_type>{}...>
{
};
template <std::size_t N, class... Xs>
bool is_vectorizable(const Xs&... xs)
{
return all_of({xs...}, [](const auto& s) {
if(s.standard() and (s.lens().back() % N) == 0)
return true;
if(s.broadcasted())
{
auto n = std::inner_product(s.lens().begin(),
s.lens().end(),
s.strides().begin(),
0,
std::plus<>{},
[&](auto len, auto stride) -> std::size_t {
if(stride > 0 and len == 1)
return 0;
return stride;
});
if(n == 1)
{
auto i = find_packed_len(s);
assert(i != -1);
return (s.lens()[i] % N) == 0;
}
}
return false;
});
}
template <class... Ts, MIGRAPHX_REQUIRES(is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
return [=](auto f) {
if(is_vectorizable<32>(base_shape, xs.get_shape()...))
f(vectorize<32>(base_shape), vectorize<32>(xs)...);
else if(is_vectorizable<8>(base_shape, xs.get_shape()...))
f(vectorize<8>(base_shape), vectorize<8>(xs)...);
else
f(base_shape, xs...);
};
}
template <class... Ts, MIGRAPHX_REQUIRES(not is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
return [=](auto f) { f(base_shape, xs...); };
}
 template <class X, class... Xs>
 bool is_standard_offset(const X& x, const Xs&... xs)
 {
@@ -116,15 +270,15 @@ bool is_standard_offset(const X& x, const Xs&... xs)
 }

 template <class... Ts>
-auto pointwise(Ts... xs)
+auto pointwise_apply(Ts... ts)
 {
     return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
-        if(is_standard_offset(xs.get_shape()...))
+        if(is_standard_offset(ts.get_shape()...))
         {
             ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
                 for(auto i = start; i < end; i++)
                 {
-                    f(xs.data()[i]...);
+                    vec_apply(f, ts.data()[i]...);
                 }
             });
         }
@@ -135,7 +289,7 @@ auto pointwise(Ts... xs)
             multi_index mi(base_shape, start);
             for(auto i = start; i < end; i++)
             {
-                f(xs.data()[mi.offset(xs.get_shape())]...);
+                vec_apply(f, ts.data()[mi.offset(ts.get_shape())]...);
                 ++mi;
             }
         });
@@ -143,6 +297,15 @@ auto pointwise(Ts... xs)
     };
 }

+template <class... Ts>
+auto pointwise(Ts... ts)
+{
+    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+        auto_vectorize(base_shape, ts...)(
+            [&](auto bs, auto... xs) { pointwise_apply(xs...)(ctx, bs, min_grain, f); });
+    };
+}
+
 template <class Op>
 struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
 {
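
auto_vectorize and the pointwise wrapper above use continuation-passing dispatch: a runtime check picks the widest applicable vector width, then invokes a generic lambda so each width gets its own instantiation. A minimal standalone sketch of that dispatch shape (toy widths, not the real trait machinery):

#include <cstddef>
#include <iostream>
#include <type_traits>

// Pick a width at runtime, then call the continuation with it as a
// compile-time constant so each width compiles to its own kernel.
template <class F>
void dispatch_width(std::size_t n, F f)
{
    if(n % 8 == 0)
        f(std::integral_constant<std::size_t, 8>{}); // vectorized path
    else
        f(std::integral_constant<std::size_t, 1>{}); // scalar fallback
}

int main()
{
    dispatch_width(24, [](auto width) {
        std::cout << "processing in chunks of " << width() << "\n";
    });
}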
new file write_literals.hpp:
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#include <migraphx/config.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace cpu {
struct write_literals
{
std::string name() const { return "cpu::write_literals"; }
void apply(module& m) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
new file logsoftmax.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/logsoftmax.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_logsoftmax : dnnl_extend_op<dnnl_logsoftmax, dnnl::logsoftmax_forward, op::logsoftmax>
{
dnnl::logsoftmax_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
int axis = this->op.axis;
return {dnnl::prop_kind::forward_inference, m.at(DNNL_ARG_SRC_0), axis};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
lowering.cpp:
@@ -51,145 +51,6 @@ typename std::conditional_t<std::is_integral<T>{}, std::make_signed<T>, std::ena
     return x;
 }

-struct cpu_lrn
-{
-    op::lrn op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "cpu::lrn"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            int n_batch = output_shape.lens()[0];
-            int channels = output_shape.lens()[1];
-            int height = output_shape.lens()[2];
-            int width = output_shape.lens()[3];
-            float alphaoverarea = op.alpha / float(op.size);
-            int radius_lower = (op.size - 1) / 2;
-            int radius_upper = op.size / 2 + 1;
-
-            par_dfor(n_batch, height, width)([&](int b, int h, int w) {
-                float scale = 0;
-                dfor(channels)([&](int c) {
-                    auto start = (c - radius_lower) < 0 ? 0 : (c - radius_lower);
-                    auto end = (c + radius_upper) > channels ? channels : (c + radius_upper);
-                    for(auto k = start; k < end; ++k)
-                    {
-                        scale += std::pow(input(b, k, h, w), 2);
-                    }
-                    scale *= alphaoverarea;
-                    scale += op.bias;
-                    scale = std::pow(scale, -op.beta);
-                    output(b, c, h, w) = input(b, c, h, w) * scale;
-                });
-            });
-        });
-        return result;
-    }
-};
-MIGRAPHX_REGISTER_OP(cpu_lrn)
-
-template <class Op>
-struct cpu_deconvolution : auto_register_op<cpu_deconvolution<Op>>
-{
-    cpu_deconvolution() = default;
-    cpu_deconvolution(Op pop) : op(std::move(pop)) {}
-    Op op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "cpu::" + op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
-            using type = typename decltype(output)::value_type;
-            std::fill(output.begin(), output.end(), type{0});
-
-            auto in_lens = input.get_shape().lens();
-            auto in_n = in_lens[0];
-            auto in_c = in_lens[1];
-
-            auto wei = weights.get_shape().lens();
-            auto wei_n = wei[0];
-            auto wei_c = wei[1];
-
-            auto out_lens = output_shape.lens();
-            auto kdims = op.kdims();
-
-            std::vector<std::size_t> win_size{in_c};
-            std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
-            std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
-            shape win_shape{output_shape.type(), win_size};
-
-            par_dfor(in_n, wei_c)([&](int o, int k) {
-                shape_for_each(win_shape, [&](auto idx_win) {
-                    const int w = idx_win[0];
-                    auto input_dims_start = idx_win.begin() + 1;
-                    auto wei_dims_start = idx_win.begin() + kdims + 1;
-                    std::vector<std::ptrdiff_t> win_start;
-                    for(std::size_t n = 0; n < kdims; ++n)
-                    {
-                        win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * op.stride[n]) -
-                                            std::ptrdiff_t(op.padding[n]));
-                    }
-                    const int group_id = w / (wei_n / op.group);
-                    const int in_ch = group_id * wei_c + k;
-
-                    std::vector<std::ptrdiff_t> idx_out{o, in_ch};
-                    for(size_t n = 0; n < kdims; n++)
-                    {
-                        idx_out.push_back(win_start[n] + *(wei_dims_start + n) * op.dilation[n]);
-                    }
-                    std::vector<std::ptrdiff_t> idx_wei{w, k};
-                    std::copy(wei_dims_start, idx_win.end(), std::back_inserter(idx_wei));
-                    std::vector<std::ptrdiff_t> idx_in{o, w};
-                    std::copy(input_dims_start, wei_dims_start, std::back_inserter(idx_in));
-
-                    if(std::all_of(
-                           idx_out.begin() + 2, idx_out.end(), [&](auto ii) { return ii >= 0; }) and
-                       std::equal(idx_out.begin() + 2,
-                                  idx_out.end(),
-                                  out_lens.begin() + 2,
-                                  out_lens.end(),
-                                  std::less<std::ptrdiff_t>{}))
-                    {
-                        output(idx_out.begin(), idx_out.end()) +=
-                            input(idx_in.begin(), idx_in.end()) *
-                            weights(idx_wei.begin(), idx_wei.end());
-                    }
-                });
-            });
-        });
-        return result;
-    }
-};
-template struct cpu_deconvolution<op::deconvolution>;

 struct cpu_im2col
 {
     op::im2col op;
@@ -334,17 +195,6 @@ struct leaky_relu_op
     }
 };

-struct elu_op
-{
-    op::elu op;
-    std::string name() const { return "cpu::elu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
-    }
-};

 template <typename Op>
 struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
 {
@@ -382,78 +232,6 @@ struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
     }
 };
 template struct cpu_unary2<leaky_relu_op>;
-template struct cpu_unary2<elu_op>;
-
-template <class Op>
-struct cpu_softmax : auto_register_op<cpu_softmax<Op>>
-{
-    cpu_softmax() = default;
-    cpu_softmax(Op pop) : op(std::move(pop)) {}
-    Op op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "cpu::" + op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(1).standard();
-        return op.normalize_compute_shape(inputs);
-    }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        auto batch_lens = output_shape.lens();
-        int64_t tuned_axis = tune_axis(args[0].get_shape().lens().size(), op.axis, op.name());
-        std::size_t n_dims = batch_lens[tuned_axis];
-        batch_lens[tuned_axis] = 1;
-        shape batch_shape{shape::int32_type, batch_lens};
-
-        visit_all(result, args[0])([&](auto output, auto input) {
-            using value_type = typename decltype(input)::value_type;
-            std::vector<value_type> batch_max(batch_shape.elements(),
-                                              std::numeric_limits<value_type>::lowest());
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            par_for(batch_shape.elements(), [&](auto i) {
-                auto idx = batch_shape.multi(i);
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    std::size_t index = output_shape.index(idx);
-                    output[index] = std::exp(input[index] - batch_max[i]);
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    batch_sum[i] += output(idx.begin(), idx.end());
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    output(idx.begin(), idx.end()) =
-                        op.output()(output(idx.begin(), idx.end()), batch_sum[i]);
-                }
-            });
-        });
-        return result;
-    }
-};
-template struct cpu_softmax<op::softmax>;
-template struct cpu_softmax<op::logsoftmax>;

 struct cpu_rnn_var_sl_last_output
 {
@@ -502,29 +280,6 @@ struct cpu_rnn_var_sl_last_output
 };
 MIGRAPHX_REGISTER_OP(cpu_rnn_var_sl_last_output)

-struct cpu_literal
-{
-    argument data;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(f(self.data, "data"));
-    }
-
-    std::string name() const { return "cpu::literal"; }
-
-    shape compute_shape(const std::vector<shape>&) const { return data.get_shape(); }
-
-    argument compute(const shape&, const std::vector<argument>&) const { return data; }
-
-    friend std::ostream& operator<<(std::ostream& os, const cpu_literal& x)
-    {
-        os << x.name();
-        return os;
-    }
-};

 struct cpu_apply
 {
     module* modl;
@@ -553,49 +308,97 @@ struct cpu_apply
         }
     }

-    void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = false)
+    void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = true)
     {
         apply_map.emplace(op_name, [=](instruction_ref ins) {
             auto&& op = ins->get_operator();
             if(allocate)
-                replace(ins, make_op(cpu_name, op.to_value()));
+                return replace(ins, make_op(cpu_name, op.to_value()));
             return modl->replace_instruction(ins, make_op(cpu_name, op.to_value()), ins->inputs());
         });
     }

+    void extend_dnnl_algos(const std::string& dnnl_name,
+                           const std::vector<std::pair<std::string, std::string>>& algos)
+    {
+        for(auto&& pp : algos)
+        {
+            std::string op_name = pp.first;
+            std::string algo    = pp.second;
+            apply_map.emplace(op_name, [=](instruction_ref ins) {
+                auto v = ins->get_operator().to_value();
+                if(not v.is_object())
+                    return ins;
+                v["algo"] = algo;
+                auto op   = make_op(dnnl_name, v);
+                return replace(ins, op);
+            });
+        }
+    }
+
     void init()
     {
         create_output_names();

-        extend_op("add", "dnnl::add", true);
-        extend_op("mul", "dnnl::mul", true);
-        extend_op("convolution", "dnnl::convolution", true);
-        extend_op("dot", "dnnl::dot", true);
-        extend_op("relu", "dnnl::relu", true);
-
-        extend_op("contiguous", "cpu::contiguous", true);
-        extend_op("deconvolution", "cpu::deconvolution");
-        extend_op("elu", "cpu::elu");
-        extend_op("im2col", "cpu::im2col");
-        extend_op("leaky_relu", "cpu::leaky_relu");
-        extend_op("logsoftmax", "cpu::logsoftmax");
-        extend_op("lrn", "cpu::lrn");
-        extend_op("pad", "cpu::pad");
-        extend_op("quant_convolution", "cpu::quant_convolution", true);
-        extend_op("quant_dot", "cpu::quant_dot", true);
-        extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output");
-        extend_op("softmax", "cpu::softmax");
+        extend_dnnl_algos("dnnl::binary",
+                          {
+                              {"add", "binary_add"},
+                              {"div", "binary_div"},
+                              {"max", "binary_max"},
+                              {"min", "binary_min"},
+                              {"mul", "binary_mul"},
+                          });
+
+        extend_dnnl_algos("dnnl::eltwise",
+                          {
+                              {"abs", "eltwise_abs"},
+                              {"elu", "eltwise_elu"},
+                              {"exp", "eltwise_exp"},
+                              {"log", "eltwise_log"},
+                              {"relu", "eltwise_relu"},
+                              {"sqrt", "eltwise_sqrt"},
+                              {"tanh", "eltwise_tanh"},
+                          });
+
+        extend_dnnl_algos("dnnl::reduction",
+                          {
+                              {"reduce_max", "reduction_max"},
+                              {"reduce_mean", "reduction_mean"},
+                              {"reduce_min", "reduction_min"},
+                              {"reduce_sum", "reduction_sum"},
+                          });
+
+        extend_op("concat", "dnnl::concat");
+        extend_op("contiguous", "dnnl::reorder");
+        extend_op("convolution", "dnnl::convolution");
+        extend_op("deconvolution", "dnnl::deconvolution");
+        extend_op("dot", "dnnl::dot");
+        extend_op("erf", "cpu::erf");
+        extend_op("gather", "cpu::gather");
+        extend_op("logsoftmax", "dnnl::logsoftmax");
+        extend_op("lrn", "dnnl::lrn");
+        extend_op("softmax", "dnnl::softmax");
+        extend_op("sub", "cpu::sub");
+
+        extend_op("im2col", "cpu::im2col", false);
+        extend_op("leaky_relu", "cpu::leaky_relu", false);
+        extend_op("pad", "cpu::pad", false);
+        extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output", false);
     }

     void apply()
     {
         init();
+        // Apply these operators first so the inputs can be const folded
         for(auto it : iterator_for(*modl))
         {
-            if(it->name() == "@literal")
+            if(it->name() == "pow")
             {
-                apply_literal(it);
+                apply_pow(it);
             }
-            else if(it->name() == "pooling")
+        }
+        for(auto it : iterator_for(*modl))
+        {
+            if(it->name() == "pooling")
             {
                 apply_pooling(it);
             }
@@ -606,9 +409,15 @@ struct cpu_apply
         }
     }

-    instruction_ref apply_literal(instruction_ref ins) const
+    instruction_ref apply_pow(instruction_ref ins)
     {
-        return modl->replace_instruction(ins, cpu_literal{ins->get_literal().get_argument()});
+        auto beta = read_scalar<float>(ins->inputs()[1]);
+        if(beta.empty())
+            return ins;
+        return replace(ins,
+                       make_op("dnnl::eltwise",
+                               {{"algo", "eltwise_pow"}, {"alpha", 1.0}, {"beta", beta.front()}}),
+                       {ins->inputs().front()});
     }

     instruction_ref apply_pooling(instruction_ref ins)
@@ -626,9 +435,27 @@ struct cpu_apply
         return ins;
     }

+    template <class T>
+    static std::vector<T> read_scalar(instruction_ref ins)
+    {
+        if(ins->name() == "contiguous")
+            return read_scalar<T>(ins->inputs().front());
+        if(ins->get_shape().elements() != 1 and not ins->get_shape().scalar())
+            return {};
+        auto r = ins->eval();
+        if(r.empty())
+            return {};
+        return {r.at<T>()};
+    }
+
     instruction_ref replace(instruction_ref ins, const operation& op)
     {
-        auto inputs = ins->inputs();
+        return replace(ins, op, ins->inputs());
+    }
+
+    instruction_ref
+    replace(instruction_ref ins, const operation& op, std::vector<instruction_ref> inputs)
+    {
         inputs.push_back(insert_allocation(ins, ins->get_shape()));
         return modl->replace_instruction(ins, op, inputs);
     }
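
apply_pow above folds a constant exponent into dnnl's eltwise_pow: read_scalar succeeds only when the exponent operand evaluates to a single value, returning an empty vector otherwise. A standalone sketch of that fold (simplified, plain vectors instead of instructions):

#include <cmath>
#include <iostream>
#include <vector>

// Returns the value only when the operand is a single scalar, mirroring the
// "empty vector means not foldable" style of read_scalar above.
std::vector<float> read_scalar(const std::vector<float>& operand)
{
    if(operand.size() != 1)
        return {};
    return {operand.front()};
}

int main()
{
    std::vector<float> exponent = {2.0f};
    auto beta = read_scalar(exponent);
    if(not beta.empty())
    {
        // pow(x, 2) lowered to a unary eltwise_pow kernel with beta = 2
        std::cout << std::pow(3.0f, beta.front()) << "\n"; // prints 9
    }
}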
mul.cpp renamed to lrn.cpp:
 #include <migraphx/config.hpp>
-#include <migraphx/cpu/pointwise.hpp>
-#include <migraphx/op/mul.hpp>
+#include <migraphx/cpu/dnnl.hpp>
+#include <migraphx/op/lrn.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

-struct dnnl_mul : dnnl_extend_op<dnnl_mul, dnnl::binary, op::mul>
+struct dnnl_lrn : dnnl_extend_op<dnnl_lrn, dnnl::lrn_forward, op::lrn>
 {
-    dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
+    dnnl::lrn_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
     {
-        return {dnnl::algorithm::binary_mul,
+        return {dnnl::prop_kind::forward_inference,
+                dnnl::algorithm::lrn_across_channels,
                 m.at(DNNL_ARG_SRC_0),
-                m.at(DNNL_ARG_SRC_1),
-                m.at(DNNL_ARG_DST)};
+                this->op.size,
+                this->op.alpha,
+                this->op.beta,
+                this->op.bias};
     }
 };