Commit 85d789f7 authored by Khalique

manual merge and biasadd change

parents 864fafb2 a5b0afa0
...@@ -103,6 +103,43 @@ struct cpu_batch_norm_inference
    }
};
struct cpu_lrn
{
    op::lrn op;
    std::string name() const { return "cpu::lrn"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0])([&](auto output, auto input) {
            int n_batch         = output_shape.lens()[0];
            int channels        = output_shape.lens()[1];
            int height          = output_shape.lens()[2];
            int width           = output_shape.lens()[3];
            float alphaoverarea = op.alpha / op.size;
            int radius          = (op.size - 1) / 2;

            par_dfor(n_batch, height, width)([&](int b, int h, int w) {
                dfor(channels)([&](int c) {
                    // accumulate the squared window for this channel only
                    float scale = 0;
                    auto start  = (c - radius) < 0 ? 0 : (c - radius);
                    // the window includes c + radius, clamped to the channel count;
                    // this matches the gold values in the lrn_test added in this commit
                    auto end = (c + radius + 1) > channels ? channels : (c + radius + 1);
                    for(auto k = start; k < end; ++k)
                    {
                        scale += std::pow(input(b, k, h, w), 2);
                    }
                    scale *= alphaoverarea;
                    scale += op.bias;
                    scale = std::pow(scale, -op.beta);
                    output(b, c, h, w) = input(b, c, h, w) * scale;
                });
            });
        });
        return result;
    }
};
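The kernel above computes the standard cross-channel LRN, which the `lrn_test` added later in this commit checks. With window size n = op.size, alpha = op.alpha, beta = op.beta and k = op.bias, one way to write it is:

$$ y_{b,c,h,w} = x_{b,c,h,w}\,\Big(k + \tfrac{\alpha}{n}\sum_{j=\max(0,\,c-r)}^{\min(C-1,\,c+r)} x_{b,j,h,w}^{2}\Big)^{-\beta}, \qquad r = \lfloor (n-1)/2 \rfloor $$

where C is the number of channels.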
struct cpu_convolution
{
    op::convolution op;
...@@ -287,14 +324,7 @@ struct cpu_contiguous
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
-       assert(output_shape.standard());
-       argument result{output_shape};
-       visit_all(result, args[0])([&](auto output, auto input) {
-           shape_for_each(output.get_shape(), [&](const auto& idx) {
-               output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
-           });
-       });
-       return result;
+       return op.compute(output_shape, std::move(args));
    }
};
...@@ -688,6 +718,7 @@ struct cpu_apply
        apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
        apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
        apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
        apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
        apply_map["pad"] = extend_op<cpu_pad, op::pad>();
        apply_map["concat"] = extend_op<cpu_concat, op::concat>();
...
...@@ -2,6 +2,8 @@
#include <migraphx/cpu/target.hpp>
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/dead_code_elimination.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...@@ -11,7 +13,11 @@ std::string target::name() const { return "cpu"; }
std::vector<pass> target::get_passes(migraphx::context&) const
{
-    return {auto_contiguous{}, lowering{}};
+    return {auto_contiguous{},
+            rewrite_rnn{},
+            dead_code_elimination{},
+            lowering{},
+            dead_code_elimination{}};
}
} // namespace cpu
...
...@@ -30,6 +30,7 @@ add_library(migraphx_device
    device/concat.cpp
    device/pad.cpp
    device/gather.cpp
    device/sub.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
...@@ -60,6 +61,7 @@ add_library(migraphx_gpu
    elu.cpp
    pad.cpp
    gather.cpp
    lrn.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
...
#include <migraphx/gpu/device/sub.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void sub(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
{
    nary(stream, result, arg1, arg2)([](auto x, auto y) { return y - x; });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -107,6 +107,7 @@ argument miopen_gemm::compute(context& ctx,
                  ldc);
    });
    return args[2];
}
...
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SUB_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SUB_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void sub(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_LRN_HPP
#define MIGRAPHX_GUARD_RTGLIB_LRN_HPP
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/contiguous.hpp>
#include <migraphx/gpu/device/add.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/context.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct miopen_lrn
{
    shared<lrn_descriptor> ldesc;
    std::string name() const { return "gpu::lrn"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
...@@ -23,6 +23,8 @@ using fusion_plan_descriptor = MIGRAPHX_MANAGE_PTR(miopenFusionPlanDescriptor_t,
                                                   miopenDestroyFusionPlan);
using fused_operator_args = MIGRAPHX_MANAGE_PTR(miopenOperatorArgs_t, miopenDestroyOperatorArgs);

using lrn_descriptor = MIGRAPHX_MANAGE_PTR(miopenLRNDescriptor_t, miopenDestroyLRNDescriptor);

template <class Result, class F, class... Ts>
Result make_obj(F f, Ts... xs)
{
...@@ -89,6 +91,13 @@ inline pooling_descriptor make_pooling(const migraphx::op::pooling& op)
    return p;
}

inline lrn_descriptor make_lrn(const migraphx::op::lrn& op)
{
    auto ldesc = make_obj<lrn_descriptor>(&miopenCreateLRNDescriptor);
    miopenSetLRNDescriptor(ldesc.get(), miopenLRNCrossChannel, op.size, op.alpha, op.beta, op.bias);
    return ldesc;
}

inline activation_descriptor make_relu()
{
    auto ad = make_obj<activation_descriptor>(&miopenCreateActivationDescriptor);
...
#ifndef MIGRAPHX_GUARD_RTGLIB_SUB_HPP
#define MIGRAPHX_GUARD_RTGLIB_SUB_HPP
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/oper.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/contiguous.hpp>
#include <migraphx/gpu/device/sub.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/config.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct hip_sub : binary_device<hip_sub, device::sub>
{
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
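As a quick sanity check of the new `sub` support, a minimal program can be written in the same style as the tests added in this commit. This is an illustrative sketch only: it assumes `op::sub` is already defined in `migraphx/operators.hpp`, which the `add_generic_op<hip_sub>("sub")` registration below implies, and it runs on the reference CPU target rather than the GPU.

// Illustrative sketch: build a 3-element elementwise subtraction and evaluate it.
#include <migraphx/program.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/cpu/target.hpp>
#include <vector>

std::vector<float> run_sub_example()
{
    migraphx::program p;
    migraphx::shape s{migraphx::shape::float_type, {3}};
    auto a = p.add_literal(migraphx::literal{s, {3.0f, 5.0f, 7.0f}});
    auto b = p.add_literal(migraphx::literal{s, {1.0f, 2.0f, 3.0f}});
    p.add_instruction(migraphx::op::sub{}, a, b); // a - b elementwise
    p.compile(migraphx::cpu::target{});
    auto result = p.eval({});
    std::vector<float> out(3);
    result.visit([&](auto v) { out.assign(v.begin(), v.end()); });
    return out; // expected: {2, 3, 4}
}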
...@@ -22,6 +22,7 @@
#include <migraphx/gpu/elu.hpp>
#include <migraphx/gpu/softmax.hpp>
#include <migraphx/gpu/add.hpp>
#include <migraphx/gpu/sub.hpp>
#include <migraphx/gpu/exp.hpp>
#include <migraphx/gpu/log.hpp>
#include <migraphx/gpu/sin.hpp>
...@@ -42,6 +43,7 @@
#include <migraphx/gpu/concat.hpp>
#include <migraphx/gpu/pad.hpp>
#include <migraphx/gpu/gather.hpp>
#include <migraphx/gpu/lrn.hpp>
#include <utility>
#include <functional>
#include <algorithm>
...@@ -55,6 +57,7 @@ struct miopen_apply
    program* prog = nullptr;
    context ctx{};
    std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
    instruction_ref last{};

    void check_shape(shape x, instruction_ref i)
    {
...@@ -65,6 +68,7 @@ struct miopen_apply
    void init()
    {
        this->last = instruction::get_output_alias(std::prev(prog->end()));
        add_miopen_simple_op<miopen_relu>("relu", make_relu);
        add_miopen_simple_op<miopen_sigmoid>("sigmoid", make_sigmoid);
        add_miopen_simple_op<miopen_abs>("abs", make_abs);
...@@ -74,6 +78,7 @@ struct miopen_apply
        add_miopen_extend_op<miopen_elu, op::elu>("elu", make_elu);

        add_generic_op<hip_add>("add");
        add_generic_op<hip_sub>("sub");
        add_generic_op<hip_exp>("exp");
        add_generic_op<hip_log>("log");
        add_generic_op<hip_sin>("sin");
...@@ -95,6 +100,7 @@ struct miopen_apply
        add_extend_op<hip_gather, op::gather>("gather");
        add_extend_op<hip_pad, op::pad>("pad");

        add_lrn_op();
        add_convolution_op();
        add_pooling_op();
        add_batch_norm_inference_op();
...@@ -115,7 +121,7 @@ struct miopen_apply
    instruction_ref insert_allocation(instruction_ref ins, const shape& s, std::string tag = "")
    {
-       if(ins == --prog->end() and tag.empty())
+       if(ins == last and tag.empty())
        {
            return prog->add_parameter("output", s);
        }
...@@ -155,6 +161,17 @@ struct miopen_apply
        });
    }

    void add_lrn_op()
    {
        apply_map.emplace("lrn", [=](instruction_ref ins) {
            auto&& op   = any_cast<op::lrn>(ins->get_operator());
            auto ldesc  = make_lrn(op);
            auto output = insert_allocation(ins, ins->get_shape());
            return prog->replace_instruction(
                ins, miopen_lrn{std::move(ldesc)}, ins->inputs().at(0), output);
        });
    }

    template <class T>
    void add_generic_op(std::string name)
    {
...
#include <migraphx/gpu/lrn.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape miopen_lrn::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2).not_broadcasted();
    return inputs.at(1);
}
argument miopen_lrn::compute(context& ctx,
                             const shape& output_shape,
                             const std::vector<argument>& args) const
{
    // MIOpen blend factors: y = alpha * lrn(x) + beta * y
    float alpha = 1;
    float beta  = 0;
    auto x_desc = make_tensor(args[0].get_shape());
    auto y_desc = make_tensor(output_shape);
    miopenLRNForward(ctx.get_stream().get_miopen(),
                     ldesc.get(),
                     &alpha,
                     x_desc.get(),
                     args[0].implicit(),
                     &beta,
                     y_desc.get(),
                     args[1].implicit(),
                     false,    // inference only, no backward pass requested
                     nullptr); // no workspace
    return args[1];
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -15,6 +15,7 @@
#include <migraphx/eliminate_contiguous.hpp>
#include <migraphx/common_subexpression_elimination.hpp>
#include <migraphx/fwd_conv_batchnorm_rewrite.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/eliminate_concat.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp>
...@@ -31,14 +32,16 @@ std::vector<pass> target::get_passes(migraphx::context& gctx) const
            dead_code_elimination{},
            fwd_conv_batchnorm_rewrite{},
            dead_code_elimination{},
-           common_subexpression_elimination{},
+           rewrite_rnn{},
            dead_code_elimination{},
+           //common_subexpression_elimination{},
+           //dead_code_elimination{},
            simplify_algebra{},
            dead_code_elimination{},
            constant_propagate{},
            dead_code_elimination{},
            auto_contiguous{},
-           simplify_reshapes{},
+           //simplify_reshapes{},
            dead_code_elimination{},
            lowering{ctx},
            eliminate_concat{concat_gpu_optimization{}},
...
...@@ -175,10 +175,8 @@ struct tf_parser
    instruction_ref
    parse_biasadd(const std::string&, attribute_map, std::vector<instruction_ref> args)
    {
-       // assume second arg is bias
-       std::vector<int64_t> dims;
-       copy(args[0]->get_shape().lens(), std::back_inserter(dims));
-       auto l0 = prog.add_instruction(op::reshape{dims}, args[1]);
+       uint64_t axis = 1;
+       auto l0 = prog.add_instruction(op::broadcast{axis, args[0]->get_shape()}, args[1]);
        return prog.add_instruction(op::add{}, args[0], l0);
    }
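The BiasAdd change above replaces the old reshape of the bias with a broadcast along axis 1 (the channel axis of an NCHW input) before the elementwise add. A minimal sketch of the resulting pattern, with a hypothetical helper name; the `op::broadcast{axis, shape}` construction mirrors the parser code above:

// Hypothetical helper: broadcast a 1-D bias across the channel axis and add it.
#include <migraphx/program.hpp>
#include <migraphx/operators.hpp>
#include <cstdint>

migraphx::instruction_ref add_bias_nchw(migraphx::program& prog,
                                        migraphx::instruction_ref input,
                                        migraphx::instruction_ref bias)
{
    uint64_t axis = 1; // channel axis for NCHW
    auto b = prog.add_instruction(migraphx::op::broadcast{axis, input->get_shape()}, bias);
    return prog.add_instruction(migraphx::op::add{}, input, b);
}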
...@@ -519,7 +517,33 @@ struct tf_parser
    break; // throw std::runtime_error("Unsupported type VARIANT");
case tensorflow::DataType::DT_UINT32: shape_type = shape::uint32_type; break;
case tensorflow::DataType::DT_UINT64: shape_type = shape::uint64_type; break;
-default: break;
// tf pb should not use these types
case tensorflow::DataType::DT_FLOAT_REF: break;
case tensorflow::DataType::DT_DOUBLE_REF: break;
case tensorflow::DataType::DT_INT32_REF: break;
case tensorflow::DataType::DT_UINT8_REF: break;
case tensorflow::DataType::DT_INT16_REF: break;
case tensorflow::DataType::DT_INT8_REF: break;
case tensorflow::DataType::DT_STRING_REF: break;
case tensorflow::DataType::DT_COMPLEX64_REF: break;
case tensorflow::DataType::DT_INT64_REF: break;
case tensorflow::DataType::DT_BOOL_REF: break;
case tensorflow::DataType::DT_QINT8_REF: break;
case tensorflow::DataType::DT_QUINT8_REF: break;
case tensorflow::DataType::DT_QINT32_REF: break;
case tensorflow::DataType::DT_BFLOAT16_REF: break;
case tensorflow::DataType::DT_QINT16_REF: break;
case tensorflow::DataType::DT_QUINT16_REF: break;
case tensorflow::DataType::DT_UINT16_REF: break;
case tensorflow::DataType::DT_COMPLEX128_REF: break;
case tensorflow::DataType::DT_HALF_REF: break;
case tensorflow::DataType::DT_RESOURCE_REF: break;
case tensorflow::DataType::DT_VARIANT_REF: break;
case tensorflow::DataType::DT_UINT32_REF: break;
case tensorflow::DataType::DT_UINT64_REF: break;
case tensorflow::DataType::DataType_INT_MAX_SENTINEL_DO_NOT_USE_: break;
case tensorflow::DataType::DataType_INT_MIN_SENTINEL_DO_NOT_USE_: break;
}
return shape_type;
}
...@@ -559,7 +583,39 @@ struct tf_parser
case tensorflow::DataType::DT_UINT64: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX64: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX128: throw std::runtime_error("");
-default: break;
case tensorflow::DataType::DT_QINT8: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT8: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT32: throw std::runtime_error("");
case tensorflow::DataType::DT_BFLOAT16: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT16: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT16: throw std::runtime_error("");
case tensorflow::DataType::DT_RESOURCE: throw std::runtime_error("");
case tensorflow::DataType::DT_VARIANT: throw std::runtime_error("");
case tensorflow::DataType::DT_FLOAT_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_DOUBLE_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT32_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_STRING_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX64_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT64_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_BOOL_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT32_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_BFLOAT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX128_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_HALF_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_RESOURCE_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_VARIANT_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT32_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT64_REF: throw std::runtime_error("");
case tensorflow::DataType::DataType_INT_MAX_SENTINEL_DO_NOT_USE_: throw std::runtime_error("");
case tensorflow::DataType::DataType_INT_MIN_SENTINEL_DO_NOT_USE_: throw std::runtime_error("");
}
MIGRAPHX_THROW("Invalid tensor type");
}
...@@ -591,7 +647,39 @@ struct tf_parser
case tensorflow::DataType::DT_UINT64: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX64: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX128: throw std::runtime_error("");
-default: break;
case tensorflow::DataType::DT_QINT8: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT8: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT32: throw std::runtime_error("");
case tensorflow::DataType::DT_BFLOAT16: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT16: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT16: throw std::runtime_error("");
case tensorflow::DataType::DT_RESOURCE: throw std::runtime_error("");
case tensorflow::DataType::DT_VARIANT: throw std::runtime_error("");
case tensorflow::DataType::DT_FLOAT_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_DOUBLE_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT32_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_STRING_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX64_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_INT64_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_BOOL_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT8_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT32_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_BFLOAT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QINT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_QUINT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT16_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_COMPLEX128_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_HALF_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_RESOURCE_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_VARIANT_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT32_REF: throw std::runtime_error("");
case tensorflow::DataType::DT_UINT64_REF: throw std::runtime_error("");
case tensorflow::DataType::DataType_INT_MAX_SENTINEL_DO_NOT_USE_: throw std::runtime_error("");
case tensorflow::DataType::DataType_INT_MIN_SENTINEL_DO_NOT_USE_: throw std::runtime_error("");
}
MIGRAPHX_THROW("Invalid tensor type");
}
...
...@@ -112,7 +112,8 @@ if(MIGRAPHX_ENABLE_GPU)
endif()

# Onnx test
-add_executable(test_onnx onnx/onnx_test.cpp)
+set(TEST_ONNX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/onnx)
+add_executable(test_onnx ${TEST_ONNX_DIR}/onnx_test.cpp)
rocm_clang_tidy_check(test_onnx)
target_link_libraries(test_onnx migraphx_onnx)
target_include_directories(test_onnx PUBLIC include)
...@@ -129,6 +130,10 @@ add_test(NAME test_tf COMMAND $<TARGET_FILE:test_tf> WORKING_DIRECTORY ${CMAKE_C
add_dependencies(tests test_tf)
add_dependencies(check test_tf)

if(MIGRAPHX_ENABLE_PYTHON)
    add_subdirectory(py)
endif()

function(test_header NAME HEADER)
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
...
...@@ -5,6 +5,7 @@
#include <migraphx/instruction.hpp>
#include <migraphx/cpu/target.hpp>
#include <migraphx/verify.hpp>
#include <migraphx/onnx.hpp>
#include "test.hpp"

float sigmoid(float x) { return 1 / (1 + expf(-x)); }
...@@ -731,6 +732,20 @@ TEST_CASE(leaky_relu_test)
    EXPECT(migraphx::verify_range(results_vector, gold));
}
TEST_CASE(lrn_test)
{
    migraphx::program p;
    migraphx::shape s{migraphx::shape::float_type, {1, 5, 1, 1}};
    auto l = p.add_literal(migraphx::literal{s, {-2.0f, 1.0f, 0.f, 1.0f, 2.0f}});
    p.add_instruction(migraphx::op::lrn{0.0001, 0.75, 1, 5}, l);
    p.compile(migraphx::cpu::target{});
    auto result = p.eval({});
    std::vector<float> results_vector(5);
    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
    std::vector<float> gold = {-2 / 1.000075, 1 / 1.00009, 0 / 1.000145, 1 / 1.00009, 2 / 1.000075};
    EXPECT(migraphx::verify_range(results_vector, gold));
}
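As a quick check of the first gold value: for channel 0 of the input (-2, 1, 0, 1, 2), the clamped window covers channels 0 to 2, so

$$ y_0 = -2\,\Big(1 + \tfrac{0.0001}{5}\big((-2)^2 + 1^2 + 0^2\big)\Big)^{-0.75} = -2\,(1.0001)^{-0.75} \approx \frac{-2}{1.000075}, $$

which matches the first entry of `gold`.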
TEST_CASE(imagescaler_test)
{
    migraphx::program p;
...
#include <migraphx/dead_code_elimination.hpp>
#include <basic_ops.hpp>
#include <migraphx/operators.hpp>
#include <test.hpp>

struct dce_target
...@@ -111,4 +112,21 @@ TEST_CASE(depth_test)
    EXPECT(result != migraphx::literal{4});
}
TEST_CASE(undefined_test)
{
    migraphx::program p;
    auto one   = p.add_literal(1);
    auto two   = p.add_literal(2);
    auto undef = p.add_instruction(migraphx::op::undefined{});
    p.add_instruction(sum_op{}, one, two);
    auto count = std::distance(p.begin(), p.end());
    p.compile(dce_target{});
    EXPECT(std::distance(p.begin(), p.end()) == count - 1);
    EXPECT(not p.has_instruction(undef));
    auto result = p.eval({});
    EXPECT(result == migraphx::literal{3});
    EXPECT(result != migraphx::literal{4});
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
...@@ -78,7 +78,7 @@ TEST_CASE(test3)
    auto p3 = add_alloc(p, {migraphx::shape::float_type, {40}});
    p.add_instruction(pass_op{}, p3, p1);
    p.compile(memory_coloring_target{});
-   CHECK(p.get_parameter_shape("scratch").bytes() == 704); // The optimal solution is actually 672
+   CHECK(p.get_parameter_shape("scratch").bytes() == 672);
    CHECK(no_allocate(p));
}
...@@ -487,7 +487,7 @@ TEST_CASE(test33)
    auto a5 = add_alloc(p, {migraphx::shape::float_type, {40}});
    p.add_instruction(pass_op{}, a5, p1);
    p.compile(memory_coloring_target{});
-   CHECK(p.get_parameter_shape("scratch").bytes() == 224);
+   CHECK(p.get_parameter_shape("scratch").bytes() == 192);
    CHECK(no_allocate(p));
}
...@@ -594,7 +594,7 @@ TEST_CASE(test38)
    auto p83 = p.add_instruction(pass_op{}, p78, p77);
    p.add_instruction(pass_op{}, output, p83, p63);
    p.compile(memory_coloring_target{});
-   CHECK(p.get_parameter_shape("scratch").bytes() == 6422528);
+   CHECK(p.get_parameter_shape("scratch").bytes() == 7225344); // Optimal solution is 6422528
    CHECK(no_allocate(p));
}
...
(binary file added: TensorFlow GraphDef protobuf for the Sub test, graph `subtraction2` with inputs `0` and `1` and output `out`)