Unverified Commit 406afeb8 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Use dnnl for cpu backend (#688)



* Add flag to enable cpu backend

* Make buffers shared

* Enable optimizations

* Add onednn

* Formatting

* Formatting

* Add dnnl header

* Formatting

* Rewrite rnn first

* Formatting

* Call reference implementation

* Formatting

* Make literal data shared

* Formatting

* Add convolution

* Formatting

* Compensate for dilation

* Formatting

* Use name/make_op instead

* Formatting

* Rename gemm header

* Formatting

* Add dnnl convolution/gemm operators

* Formatting

* Add eliminate_contiguous

* Add faster pointwise operators

* Formatting

* Formatting

* Formatting

* Add dnnl op class

* Formatting

* Add add op

* Formatting

* Add concat operator

* Formatting

* Add more ops

* Create descriptor during finalization

* Formatting

* Don't rewrite pooling

* Enable memory coloring

* Formatting

* Add output aliases

* Formatting

* Fix errors

* Formatting

* Convert literals

* Add missing file

* Remove batch_norm

* Formatting

* Use strides

* Formatting

* Add some debug checks

* Formatting

* Fix bug in adjusting shape for gemm

* Formatting

* Fix fallback dot operator

* Zero initialize buffers

* Add support for group convolutions

* Formatting

* Make adjust allocation target independent

* Formatting

* Enable adjust_allocation for gpu/cpu

* Formatting

* Add copy to allocation model

* Formatting

* Add copy operator

* Formatting

* Better handling of output parameters in adjust_allocation

* Formatting

* Build with dnnl

* Make dnnl required

* Fix compile error

* Tidy fixes

* Formatting

* Tidy fixes

* Formatting

* Fix more tidy issues

* Formatting

* Add mul op

* Add mul op

* Set c compiler to clang as well

* Compensate for normalized compute shape

* Formatting

* Fix cppcheck errors

* Formatting

* Add onednn library to hcc

* Guard clang pragmas

* Disable cpu mode for gcc for now

* Leave it enabled for gcc 7

* Fix cppcheck suppression

* Fix compile error on gcc 5

* Remove unused code
Co-authored-by: default avatarShucai Xiao <shucai.xiao@amd.com>
Co-authored-by: default avatarmvermeulen <5479696+mvermeulen@users.noreply.github.com>
parent 8698cd2c
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/check_context.hpp>
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/decompose.hpp>
#include <migraphx/eliminate_allocation.hpp>
......@@ -17,8 +18,10 @@
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp>
#include <migraphx/memory_coloring.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/cpu/allocation_model.hpp>
#include <migraphx/cpu/target.hpp>
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/pass.hpp>
......@@ -44,8 +47,6 @@ std::vector<pass> target::get_passes(migraphx::context&, const compile_options&)
dead_code_elimination{},
rewrite_rnn{},
dead_code_elimination{},
rewrite_pooling{},
dead_code_elimination{},
eliminate_common_subexpression{},
dead_code_elimination{},
simplify_algebra{},
......@@ -56,6 +57,11 @@ std::vector<pass> target::get_passes(migraphx::context&, const compile_options&)
propagate_constant{},
dead_code_elimination{},
lowering{},
eliminate_contiguous{},
dead_code_elimination{},
adjust_allocation{cpu_allocation_model{}},
dead_code_elimination{},
memory_coloring{"cpu::allocate"},
dead_code_elimination{}};
}
......
......@@ -97,6 +97,7 @@ target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CUR
add_library(migraphx_gpu
analyze_streams.cpp
allocation_model.cpp
argmax.cpp
argmin.cpp
eliminate_workspace.cpp
......@@ -123,7 +124,6 @@ add_library(migraphx_gpu
convert.cpp
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
pack_int8_args.cpp
clip.cpp
int8_gemm_pack.cpp
......
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
std::string gpu_allocation_model::name() const { return "hip::allocate"; }
// Build a hip::allocate operation whose output buffer has shape s.
operation gpu_allocation_model::allocate(const shape& s) const
{
    const auto op_name = name();
    return make_op(op_name, {{"shape", to_value(s)}});
}
std::string gpu_allocation_model::copy() const { return "hip::copy"; }
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// GPU implementation of the allocation_model interface: tells generic
// passes (e.g. adjust_allocation{gpu_allocation_model{}}) which operators
// allocate and copy device buffers.
struct gpu_allocation_model
{
    // Name of the target-dependent allocate operator ("hip::allocate").
    std::string name() const;
    // Name of the target-dependent copy operator ("hip::copy").
    std::string copy() const;
    // Create an allocation operation producing a buffer of shape s.
    operation allocate(const shape& s) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -135,7 +135,7 @@ struct hip_copy
std::string name() const { return "hip::copy"; }
// Output shape of a hip::copy: the shape of the destination argument.
// Exactly two inputs (source, destination) are required. The commit this
// page shows deliberately dropped the .standard() layout requirement, but
// the scraped diff retained both the old and new check line — keep only
// the relaxed check so non-standard layouts are accepted.
shape compute_shape(std::vector<shape> inputs) const
{
    check_shapes{inputs, *this}.has(2);
    return inputs.at(1);
}
argument compute(context& ctx, const shape&, std::vector<argument> args) const
......
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/check_context.hpp>
#include <migraphx/dead_code_elimination.hpp>
......@@ -19,7 +20,7 @@
#include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/gpu/adjust_allocation.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/eliminate_workspace.hpp>
......@@ -71,7 +72,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
eliminate_concat{concat_gpu_optimization{}},
dead_code_elimination{},
adjust_allocation{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
pack_int8_args{},
dead_code_elimination{},
......
File mode changed from 100644 to 100755
......@@ -310,7 +310,7 @@ value& value::at(const std::string& pkey)
if(r == nullptr)
MIGRAPHX_THROW("Not an object");
if(r == end())
MIGRAPHX_THROW("Key not found");
MIGRAPHX_THROW("Key not found: " + pkey);
return *r;
}
const value& value::at(const std::string& pkey) const
......
#include <migraphx/gpu/adjust_allocation.hpp>
#include <migraphx/gpu/target.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/target.hpp>
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_contiguous.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/transpose.hpp>
#include <migraphx/op/contiguous.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/op/tanh.hpp>
#include <migraphx/op/transpose.hpp>
#include <migraphx/pass_manager.hpp>
#include <basic_ops.hpp>
#include <test.hpp>
......@@ -61,7 +62,8 @@ TEST_CASE(tanh_shape)
EXPECT(p1 != p2);
migraphx::run_passes(*p2.get_main_module(),
{migraphx::gpu::adjust_allocation{}, migraphx::dead_code_elimination{}});
{migraphx::adjust_allocation{migraphx::gpu::gpu_allocation_model{}},
migraphx::dead_code_elimination{}});
EXPECT(p1 == p2);
}
......
#include "verify_program.hpp"
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/operators.hpp>
// Verifies a dot product whose first operand is a literal (2x4) and whose
// second operand is a runtime parameter (4x4).
struct gemm_literal : verify_program<gemm_literal>
{
    migraphx::program create_program() const
    {
        migraphx::program prog;
        migraphx::shape lhs_shape{migraphx::shape::float_type, {2, 4}};
        migraphx::shape rhs_shape{migraphx::shape::float_type, {4, 4}};
        auto lhs = prog.add_literal(migraphx::generate_literal(lhs_shape));
        auto rhs = prog.add_parameter("b", rhs_shape);
        prog.add_instruction(migraphx::op::dot{}, lhs, rhs);
        return prog;
    }
};
File mode changed from 100644 to 100755
......@@ -12,6 +12,7 @@
#include <utility>
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_TEST_COMPILE)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_TEST)
// An improved async, that doesn't block
template <class Function>
......@@ -167,7 +168,7 @@ void run_verify::verify(const std::string& name, const migraphx::program& p) con
passed &= migraphx::verify_args(tname, gold[i], result[i]);
}
if(not passed)
if(not passed or migraphx::enabled(MIGRAPHX_TRACE_TEST{}))
{
std::cout << p << std::endl;
std::cout << "ref:\n" << p << std::endl;
......
#ifndef MIGRAPHX_GUARD_ALLOCATION_MODEL_HPP
#define MIGRAPHX_GUARD_ALLOCATION_MODEL_HPP
#include <cassert>
#include <string>
#include <functional>
#include <memory>
#include <type_traits>
#include <utility>
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
#ifdef DOXYGEN
/// An interface for target-dependent allocation. Concrete models (e.g.
/// cpu_allocation_model, gpu_allocation_model) are passed to generic
/// passes such as adjust_allocation to create target-specific buffers.
struct allocation_model
{
    /// The name of the target-dependent allocate operator
    std::string name() const;
    /// The name of the target-dependent copy operator
    std::string copy() const;
    /// Create an allocation operator for the given shape
    operation allocate(const shape& s) const;
};
#else
<%
interface('allocation_model',
virtual('name', returns='std::string', const=True),
virtual('copy', returns='std::string', const=True),
virtual('allocate', s='const shape&', returns='operation', const=True)
)
%>
#endif
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -15,8 +15,6 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct program;
#ifdef DOXYGEN
/// An interface for target-dependent optimization for the concat instruction
......
......@@ -324,6 +324,25 @@ void from_value_op(T& x, const value& v)
return !(x == y);
}
// Compute the output shape of a type-erased operation for the given inputs.
inline shape compute_shape(const operation& op, const std::vector<shape>& inputs)
{
    return op.compute_shape(inputs);
}
// Overload for concrete operator types that provide compute_shape directly.
// SFINAE: participates in overload resolution only when
// op.compute_shape(inputs) is a well-formed expression.
template <class T>
inline auto compute_shape(const T& op, const std::vector<shape>& inputs)
-> decltype(op.compute_shape(inputs))
{
    return op.compute_shape(inputs);
}
// Overload for operator types that declare normalize_compute_shape; the
// shape is computed through detail::normalize_compute_shape_op, which the
// commit notes "compensate for normalized compute shape".
// NOTE(review): this template has the same signature as the overload above,
// differing only in the trailing-return SFINAE expression. Resolution is
// only unambiguous if an op defines exactly one of
// compute_shape/normalize_compute_shape — confirm against the full header,
// which may disambiguate with rank-style dispatch not visible in this hunk.
template <class T>
inline auto compute_shape(const T& op, const std::vector<shape>& inputs)
-> decltype(op.normalize_compute_shape(inputs))
{
    return detail::normalize_compute_shape_op(op, inputs);
}
inline bool is_context_free(const operation& op) { return op.is_context_free(); }
template <class T>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment