Commit fb75dfaf authored by Paul

Only use no-cache on jenkins

parents e596eec2 f0604d78
#ifndef MIGRAPH_GUARD_MIGRAPHLIB_HIP_HPP
#define MIGRAPH_GUARD_MIGRAPHLIB_HIP_HPP
#include <migraph/operators.hpp>
namespace migraph {
namespace gpu {
migraph::argument allocate_gpu(migraph::shape s);
migraph::argument to_gpu(migraph::argument arg);
migraph::argument from_gpu(migraph::argument arg);
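// Usage sketch (illustrative only): moving data between host and device with
// the helpers above.
//   auto dev = to_gpu(host_arg);            // copy a host argument into fresh GPU memory
//   auto buf = allocate_gpu(result_shape);  // uninitialized device buffer for an op's output
//   auto out = from_gpu(dev);               // copy device data back to the host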
struct hip_allocate
{
std::string name() const { return "hip::allocate"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(1);
return inputs.front();
}
argument compute(context&, shape output_shape, std::vector<argument>) const
{
return allocate_gpu(output_shape);
}
};
struct hip_write
{
std::string name() const { return "hip::write"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(1);
return inputs.front();
}
argument compute(context&, shape, std::vector<argument> args) const
{
return to_gpu(args.front());
}
};
} // namespace gpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_MIGRAPHLIB_KERNELS_HPP
#define MIGRAPH_GUARD_MIGRAPHLIB_KERNELS_HPP
namespace migraph {
namespace gpu {
void hip_contiguous(migraph::shape output_shape, migraph::argument arg, migraph::argument result);
} // namespace gpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_RTGLIB_MIOPEN_LOWERING_HPP
#define MIGRAPH_GUARD_RTGLIB_MIOPEN_LOWERING_HPP
#include <migraph/program.hpp>
#include <migraph/gpu/context.hpp>
namespace migraph {
namespace gpu {
struct lowering
{
context ctx;
std::string name() const { return "gpu::lowering"; }
void apply(program& p) const;
};
} // namespace gpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_MIGRAPHLIB_MIOPEN_HPP
#define MIGRAPH_GUARD_MIGRAPHLIB_MIOPEN_HPP
#include <migraph/manage_ptr.hpp>
#include <migraph/operators.hpp>
#include <miopen/miopen.h>
namespace migraph {
namespace gpu {
using miopen_handle = MIGRAPH_MANAGE_PTR(miopenHandle_t, miopenDestroy);
using tensor_descriptor = MIGRAPH_MANAGE_PTR(miopenTensorDescriptor_t,
miopenDestroyTensorDescriptor);
using convolution_descriptor = MIGRAPH_MANAGE_PTR(miopenConvolutionDescriptor_t,
miopenDestroyConvolutionDescriptor);
using pooling_descriptor = MIGRAPH_MANAGE_PTR(miopenPoolingDescriptor_t,
miopenDestroyPoolingDescriptor);
using activation_descriptor = MIGRAPH_MANAGE_PTR(miopenActivationDescriptor_t,
miopenDestroyActivationDescriptor);
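// make_obj wraps a MIOpen create-style call (miopenCreateX(&handle, ...)) in
// the matching managed-pointer type so the destroy function always runs, e.g.
//   auto t = make_obj<tensor_descriptor>(&miopenCreateTensorDescriptor);
// It throws if the create call does not return miopenStatusSuccess.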
template <class Result, class F, class... Ts>
Result make_obj(F f, Ts... xs)
{
typename Result::pointer x = nullptr;
auto status = f(&x, xs...);
Result r{x};
if(status != miopenStatusSuccess)
MIGRAPH_THROW("MIOpen call failed");
return r;
}
inline tensor_descriptor make_tensor(const migraph::shape& s)
{
auto t = make_obj<tensor_descriptor>(&miopenCreateTensorDescriptor);
// Convert to ints
std::vector<int> lens(s.lens().begin(), s.lens().end());
std::vector<int> strides(s.strides().begin(), s.strides().end());
miopenDataType_t d;
if(s.type() == shape::float_type)
d = miopenFloat;
else
MIGRAPH_THROW("Unsupported type");
miopenSetTensorDescriptor(t.get(), d, s.lens().size(), lens.data(), strides.data());
return t;
}
inline convolution_descriptor make_conv(const migraph::convolution& op)
{
auto c = make_obj<convolution_descriptor>(&miopenCreateConvolutionDescriptor);
miopenInitConvolutionDescriptor(c.get(),
miopenConvolution,
op.padding[0],
op.padding[1],
op.stride[0],
op.stride[1],
op.dilation[0],
op.dilation[1]);
return c;
}
inline pooling_descriptor make_pooling(const migraph::pooling& op)
{
miopenPoolingMode_t mode;
if(op.mode == "max")
mode = miopenPoolingMax;
else
mode = miopenPoolingAverage;
auto p = make_obj<pooling_descriptor>(&miopenCreatePoolingDescriptor);
miopenSet2dPoolingDescriptor(p.get(),
mode,
op.lengths[0],
op.lengths[1],
op.padding[0],
op.padding[1],
op.stride[0],
op.stride[1]);
return p;
}
inline activation_descriptor make_relu()
{
auto ad = make_obj<activation_descriptor>(&miopenCreateActivationDescriptor);
miopenSetActivationDescriptor(ad.get(), miopenActivationRELU, 0, 0, 0);
return ad;
}
} // namespace gpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_MIGRAPHLIB_ROCBLAS_HPP
#define MIGRAPH_GUARD_MIGRAPHLIB_ROCBLAS_HPP
#include <migraph/manage_ptr.hpp>
#include <migraph/operators.hpp>
#include <rocblas.h>
namespace migraph {
namespace gpu {
using rocblas_handle_ptr = MIGRAPH_MANAGE_PTR(rocblas_handle, rocblas_destroy_handle);
rocblas_handle_ptr create_rocblas_handle_ptr();
} // namespace gpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_MIGRAPHLIB_MIOPEN_TARGET_HPP
#define MIGRAPH_GUARD_MIGRAPHLIB_MIOPEN_TARGET_HPP
#include <migraph/program.hpp>
namespace migraph {
namespace gpu {
struct target
{
std::string name() const;
std::vector<pass> get_passes(migraph::context& gctx) const;
migraph::context get_context() const;
};
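// Typical use (sketch): compile a program for the GPU target, then evaluate it.
//   migraph::program p = ...;
//   p.compile(migraph::gpu::target{});
//   auto result = p.eval(params); // params must supply the "output" buffer added during lowering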
} // namespace gpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_RTGLIB_MIOPEN_WRITE_LITERALS_HPP
#define MIGRAPH_GUARD_RTGLIB_MIOPEN_WRITE_LITERALS_HPP
#include <migraph/program.hpp>
namespace migraph {
namespace gpu {
struct write_literals
{
std::string name() const { return "gpu::write_literals"; }
void apply(program& p) const;
};
} // namespace gpu
} // namespace migraph
#endif
#include <rocblas.h>
#include <migraph/gpu/lowering.hpp>
#include <migraph/manage_ptr.hpp>
#include <migraph/instruction.hpp>
#include <migraph/operators.hpp>
#include <migraph/generate.hpp>
#include <migraph/shape_for_each.hpp>
#include <migraph/gpu/miopen.hpp>
#include <migraph/gpu/hip.hpp>
#include <migraph/dfor.hpp>
#include <migraph/gpu/kernels.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/gpu/rocblas.hpp>
#include <migraph/gpu/context.hpp>
namespace migraph {
namespace gpu {
struct miopen_batch_norm_inference
{
batch_norm_inference op;
std::string name() const { return "gpu::batch_norm_inference"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(6);
return op.compute_shape(
{inputs.at(0), inputs.at(1), inputs.at(2), inputs.at(3), inputs.at(4)});
}
argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
{
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
auto bn_desc = make_tensor(args[3].get_shape());
float alpha = 1.0f, beta = 0.0f;
miopenBatchNormalizationForwardInference(ctx.handle.get(),
miopenBatchNormMode_t(op.bn_mode),
&alpha,
&beta,
x_desc.get(),
args[0].implicit(),
y_desc.get(),
args[5].implicit(),
bn_desc.get(),
args[3].implicit(),
args[4].implicit(),
args[1].implicit(),
args[2].implicit(),
op.epsilon);
return args[5];
}
};
struct miopen_convolution
{
convolution op;
shared<convolution_descriptor> cd;
miopenConvFwdAlgorithm_t algo{};
std::string name() const { return "gpu::convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(4).standard();
return op.compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
{
auto x_desc = make_tensor(args[0].get_shape());
auto w_desc = make_tensor(args[1].get_shape());
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
miopenConvolutionForward(ctx.handle.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
algo,
&beta,
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes());
return args[3];
}
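// compile() runs once during lowering: it queries MIOpen for the workspace
// size, benchmarks the forward algorithms on generated inputs, and records the
// fastest in `algo` so that compute() above can call miopenConvolutionForward
// directly at evaluation time.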
shape compile(context& ctx, shape output_shape, std::vector<instruction_ref> inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(inputs[0]->get_shape());
auto w_desc = make_tensor(inputs[1]->get_shape());
auto y_desc = make_tensor(output_shape);
std::size_t workspace_size = 0;
miopenConvolutionForwardGetWorkSpaceSize(
ctx.handle.get(), x_desc.get(), w_desc.get(), cd.get(), y_desc.get(), &workspace_size);
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x = to_gpu(generate_argument(inputs[0]->get_shape()));
auto w = to_gpu(generate_argument(inputs[1]->get_shape()));
auto y = to_gpu(generate_argument(output_shape));
auto workspace = allocate_gpu(workspace_shape);
int algo_count;
miopenConvAlgoPerf_t perf;
miopenFindConvolutionForwardAlgorithm(ctx.handle.get(),
x_desc.get(),
x.implicit(),
w_desc.get(),
w.implicit(),
cd.get(),
y_desc.get(),
y.implicit(),
1,
&algo_count,
&perf,
workspace.implicit(),
workspace_size,
false);
algo = perf.fwd_algo;
return workspace_shape;
}
};
struct miopen_pooling
{
pooling op;
shared<pooling_descriptor> pd;
std::string name() const { return "gpu::pooling"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2).standard();
return op.compute_shape({inputs.at(1)});
}
argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
{
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
miopenPoolingForward(ctx.handle.get(),
pd.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit(),
false,
nullptr,
0);
return args[1];
}
};
struct miopen_add
{
std::string name() const { return "gpu::add"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).not_broadcasted();
return inputs.at(0);
}
argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
{
if(args[1].get_shape().broadcasted())
{
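// Fallback for broadcasted inputs: copy both operands back to the host, add
// elementwise there, and upload the result, since the miopenOpTensor path
// below is only used for non-broadcasted shapes.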
argument result{output_shape};
visit_all(result, from_gpu(args[0]), from_gpu(args[1]))(
[&](auto output, auto input1, auto input2) {
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) =
input1(idx.begin(), idx.end()) + input2(idx.begin(), idx.end());
});
});
return to_gpu(result);
}
else
{
float alpha = 1, beta = 0;
auto a_desc = make_tensor(args[0].get_shape());
auto b_desc = make_tensor(args[1].get_shape());
auto c_desc = make_tensor(output_shape);
miopenOpTensor(ctx.handle.get(),
miopenTensorOpAdd,
&alpha,
a_desc.get(),
args[0].implicit(),
&alpha,
b_desc.get(),
args[1].implicit(),
&beta,
c_desc.get(),
args[2].implicit());
return args[2];
}
}
};
struct miopen_gemm
{
gemm op;
std::string name() const { return "gpu::gemm"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return op.compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
{
float alpha = 1.0f;
float beta = 0.0f;
bool transa = args[0].get_shape().transposed();
bool transb = args[1].get_shape().transposed();
rocblas_int lda = args[0].get_shape().strides()[transa ? 1 : 0];
rocblas_int ldb = args[1].get_shape().strides()[transb ? 1 : 0];
rocblas_int ldc = args[2].get_shape().strides()[0];
rocblas_int m = output_shape.lens()[0];
rocblas_int n = output_shape.lens()[1];
rocblas_int k = args[0].get_shape().lens()[1];
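// rocBLAS assumes column-major storage while migraph shapes are row-major, so
// compute C^T = B^T * A^T by swapping the operand order and the m/n
// dimensions; the result is C in row-major layout.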
rocblas_sgemm(ctx.rbhandle.get(),
transb ? rocblas_operation_transpose : rocblas_operation_none,
transa ? rocblas_operation_transpose : rocblas_operation_none,
n,
m,
k,
&alpha,
args[1].implicit(),
ldb,
args[0].implicit(),
lda,
&beta,
args[2].implicit(),
ldc);
return args[2];
}
};
struct miopen_contiguous
{
contiguous op;
std::string name() const { return "gpu::contiguous"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
return op.compute_shape({inputs.at(0)});
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
hip_contiguous(output_shape, args.at(0), args.at(1));
return args.at(1);
}
};
struct miopen_relu
{
shared<activation_descriptor> ad;
std::string name() const { return "gpu::relu"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2).not_broadcasted();
return inputs.at(1);
}
argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
{
float alpha = 1, beta = 0;
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
miopenActivationForward(ctx.handle.get(),
ad.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit());
return args[1];
}
};
struct miopen_apply
{
program* prog = nullptr;
context ctx{};
void apply()
{
for(auto it = prog->begin(); it != prog->end(); it++)
{
if(it->op.name() == "convolution")
{
apply_convolution(it);
}
else if(it->op.name() == "activation")
{
apply_activation(it);
}
else if(it->op.name() == "pooling")
{
apply_pooling(it);
}
else if(it->op.name() == "add")
{
apply_add(it);
}
else if(it->op.name() == "gemm")
{
apply_gemm(it);
}
else if(it->op.name() == "contiguous")
{
apply_contiguous(it);
}
else if(it->op.name() == "batch_norm_inference")
{
apply_batch_norm_inference(it);
}
}
}
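// For the final instruction the result buffer becomes a program parameter
// named "output" (supplied by the caller at evaluation time); every other
// instruction gets a fresh hip_allocate inserted just before it.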
instruction_ref insert_allocation(instruction_ref ins, const shape& s)
{
if(ins == --prog->end())
{
return prog->add_parameter("output", s);
}
else
{
auto is = prog->add_outline(s);
auto result = prog->insert_instruction(ins, hip_allocate{}, is);
return result;
}
}
void apply_convolution(instruction_ref ins)
{
auto&& op = any_cast<convolution>(ins->op);
auto conv = miopen_convolution{op, make_conv(op)};
auto ws = conv.compile(ctx, ins->result, ins->arguments);
auto workspace = insert_allocation(ins, ws);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, conv, ins->arguments.at(0), ins->arguments.at(1), workspace, output);
}
void apply_pooling(instruction_ref ins)
{
auto&& op = any_cast<pooling>(ins->op);
auto pd = make_pooling(op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_pooling{op, std::move(pd)}, ins->arguments.at(0), output);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
auto ad = make_relu();
if(op.mode == "relu")
{
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_relu{std::move(ad)}, ins->arguments.at(0), output);
}
}
void apply_add(instruction_ref ins)
{
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_add{}, ins->arguments.at(0), ins->arguments.at(1), output);
}
void apply_gemm(instruction_ref ins)
{
auto&& op = any_cast<gemm>(ins->op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_gemm{op}, ins->arguments.at(0), ins->arguments.at(1), output);
}
void apply_contiguous(instruction_ref ins)
{
auto&& op = any_cast<contiguous>(ins->op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(ins, miopen_contiguous{op}, ins->arguments.at(0), output);
}
void apply_batch_norm_inference(instruction_ref ins)
{
auto&& op = any_cast<batch_norm_inference>(ins->op);
auto output = insert_allocation(ins, ins->result);
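// MIOpen expects the per-channel statistics as 4-D 1xCx1x1 tensors, so each
// 1-D argument after x is reshaped before lowering the op.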
shape old_shape = ins->arguments.at(1)->get_shape();
std::vector<int64_t> new_shape{1, static_cast<int64_t>(old_shape.elements()), 1, 1};
auto reshape_op = reshape{new_shape};
std::vector<instruction_ref> reshapes;
std::transform(ins->arguments.begin() + 1,
ins->arguments.end(),
std::back_inserter(reshapes),
[&](auto i) { return prog->insert_instruction(ins, reshape_op, i); });
prog->replace_instruction(ins,
miopen_batch_norm_inference{op},
ins->arguments.at(0),
reshapes[0],
reshapes[1],
reshapes[2],
reshapes[3],
output);
}
};
void lowering::apply(program& p) const { miopen_apply{&p, ctx}.apply(); }
} // namespace gpu
} // namespace migraph
#include <migraph/gpu/rocblas.hpp>
namespace migraph {
namespace gpu {
rocblas_handle_ptr create_rocblas_handle_ptr()
{
rocblas_handle handle;
rocblas_create_handle(&handle);
return rocblas_handle_ptr{handle};
}
} // namespace gpu
} // namespace migraph
#include <migraph/gpu/target.hpp>
#include <migraph/gpu/lowering.hpp>
#include <migraph/gpu/write_literals.hpp>
#include <migraph/gpu/context.hpp>
#include <migraph/check_context.hpp>
#include <migraph/auto_contiguous.hpp>
#include <migraph/dead_code_elimination.hpp>
#include <migraph/simplify_reshapes.hpp>
namespace migraph {
namespace gpu {
std::vector<pass> target::get_passes(migraph::context& gctx) const
{
auto& ctx = any_cast<context>(gctx);
// clang-format off
return
{
auto_contiguous{},
simplify_reshapes{},
lowering{ctx},
write_literals{},
check_context<context>{},
dead_code_elimination{}
};
// clang-format on
}
std::string target::name() const { return "miopen"; }
migraph::context target::get_context() const
{
return context{share(make_obj<miopen_handle>(&miopenCreate)),
share(create_rocblas_handle_ptr())};
}
} // namespace gpu
} // namespace migraph
#include <migraph/gpu/write_literals.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/gpu/hip.hpp>
#include <migraph/instruction.hpp>
namespace migraph {
namespace gpu {
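// Replace every @literal instruction with a hip::write of that literal, so the
// constant data is copied to the GPU when the program is evaluated.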
void write_literals::apply(program& p) const
{
for(auto ins : iterator_for(p))
{
if(ins->op.name() == "@literal")
{
literal l = ins->lit;
auto pre = p.add_literal(l);
p.replace_instruction(ins, hip_write{}, pre);
}
}
}
} // namespace gpu
} // namespace migraph
list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip /opt/rocm/hcc)
find_package(miopen)
if(NOT TARGET MIOpen)
message(SEND_ERROR "Can't find MIOpen")
endif()
add_library(rtg_miopen
miopen_target.cpp
)
rocm_clang_tidy_check(rtg_miopen)
target_link_libraries(rtg_miopen rtg MIOpen)
target_include_directories(rtg_miopen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
#ifndef RTG_GUARD_RTGLIB_MIOPEN_TARGET_HPP
#define RTG_GUARD_RTGLIB_MIOPEN_TARGET_HPP
#include <rtg/program.hpp>
namespace rtg {
namespace miopen {
struct miopen_target
{
std::string name() const;
void apply(program& p) const;
};
} // namespace miopen
} // namespace rtg
#endif
#include <rtg/miopen/miopen_target.hpp>
#include <rtg/manage_ptr.hpp>
#include <rtg/instruction.hpp>
#include <rtg/operators.hpp>
#include <miopen/miopen.h>
namespace rtg {
namespace miopen {
struct hip_allocate
{
std::string name() const { return "hip::allocate"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(1);
return inputs.front();
}
argument compute(shape output_shape, std::vector<argument>) const
{
char* data = nullptr;
// TODO: Check return status
hipMalloc(&data, output_shape.bytes());
return {output_shape, data};
}
};
struct hip_free
{
std::string name() const { return "hip::free"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(1);
return {};
}
argument compute(shape, std::vector<argument> args) const
{
// TODO: Check return status
hipFree(args.front().data());
return {};
}
};
using miopen_handle = RTG_MANAGE_PTR(miopenHandle_t, miopenDestroy);
using tensor_descriptor = RTG_MANAGE_PTR(miopenTensorDescriptor_t, miopenDestroyTensorDescriptor);
using convolution_descriptor = RTG_MANAGE_PTR(miopenConvolutionDescriptor_t,
miopenDestroyConvolutionDescriptor);
using activation_descriptor = RTG_MANAGE_PTR(miopenActivationDescriptor_t,
miopenDestroyActivationDescriptor);
template <class Result, class F, class... Ts>
Result make_obj(F f, Ts... xs)
{
typename Result::pointer x = nullptr;
auto status = f(&x, xs...);
Result r{x};
if(status != miopenStatusSuccess)
RTG_THROW("MIOpen call failed");
return r;
}
tensor_descriptor make_tensor(const rtg::shape& s)
{
auto t = make_obj<tensor_descriptor>(&miopenCreateTensorDescriptor);
// Convert to ints
std::vector<int> lens(s.lens().begin(), s.lens().end());
std::vector<int> strides(s.strides().begin(), s.strides().end());
miopenDataType_t d;
if(s.type() == shape::float_type)
d = miopenFloat;
else
RTG_THROW("Unsupported type");
miopenSetTensorDescriptor(t.get(), d, s.lens().size(), lens.data(), strides.data());
return t;
}
convolution_descriptor make_conv(const rtg::convolution& op)
{
auto c = make_obj<convolution_descriptor>(&miopenCreateConvolutionDescriptor);
miopenInitConvolutionDescriptor(c.get(),
miopenConvolution,
op.padding[0],
op.padding[1],
op.stride[0],
op.stride[1],
op.dilation[0],
op.dilation[1]);
return c;
}
activation_descriptor make_relu()
{
auto ad = make_obj<activation_descriptor>(&miopenCreateActivationDescriptor);
miopenSetActivationDescriptor(ad.get(), miopenActivationRELU, 0, 0, 0);
return ad;
}
struct miopen_convolution
{
convolution op;
shared<convolution_descriptor> cd;
std::string name() const { return "miopen::convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(4);
return op.compute_shape({inputs.at(1), inputs.at(2)});
}
argument compute(shape output_shape, std::vector<argument> args) const
{
auto x_desc = make_tensor(args[1].get_shape());
auto w_desc = make_tensor(args[2].get_shape());
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
int algo_count;
miopenConvAlgoPerf_t perf;
miopenFindConvolutionForwardAlgorithm(args[0].implicit(),
x_desc.get(),
args[1].implicit(),
w_desc.get(),
args[2].implicit(),
cd.get(),
y_desc.get(),
args[3].implicit(),
1,
&algo_count,
&perf,
nullptr,
0,
false);
miopenConvolutionForward(args[0].implicit(),
&alpha,
x_desc.get(),
args[1].implicit(),
w_desc.get(),
args[2].implicit(),
cd.get(),
perf.fwd_algo,
&beta,
y_desc.get(),
args[3].implicit(),
nullptr,
0);
return args[3];
}
};
struct miopen_relu
{
shared<activation_descriptor> ad;
std::string name() const { return "miopen::relu"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(3);
return inputs.at(1);
}
argument compute(shape output_shape, std::vector<argument> args) const
{
float alpha = 1, beta = 0;
auto x_desc = make_tensor(args[1].get_shape());
auto y_desc = make_tensor(output_shape);
miopenActivationForward(args[0].implicit(),
ad.get(),
&alpha,
x_desc.get(),
args[1].implicit(),
&beta,
y_desc.get(),
args[2].implicit());
return args[2];
}
};
struct miopen_apply
{
program* prog = nullptr;
instruction_ref handle{};
void apply()
{
handle = prog->add_parameter("handle", shape{shape::any_type});
for(auto it = prog->begin(); it != prog->end(); it++)
{
if(it->op.name() == "convolution")
{
apply_convolution(it);
}
else if(it->op.name() == "activation")
{
apply_activation(it);
}
}
}
instruction_ref insert_allocation(instruction_ref ins, const shape& s)
{
if(ins == --prog->end())
{
return prog->add_parameter("output", s);
}
else
{
auto is = prog->add_outline(s);
auto result = prog->insert_instruction(ins, hip_allocate{}, is);
prog->insert_instruction(++ins, hip_free{}, result);
return result;
}
}
void apply_convolution(instruction_ref ins)
{
auto&& op = any_cast<convolution>(ins->op);
auto cd = make_conv(op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(ins,
miopen_convolution{op, std::move(cd)},
handle,
ins->arguments.at(0),
ins->arguments.at(1),
output);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
auto ad = make_relu();
if(op.mode == "relu")
{
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_relu{std::move(ad)}, handle, ins->arguments.at(0), output);
}
}
};
std::string miopen_target::name() const { return "miopen"; }
void miopen_target::apply(program& p) const { miopen_apply{&p}.apply(); }
} // namespace miopen
} // namespace rtg
@@ -10,12 +10,12 @@ set(CTEST_PARALLEL_LEVEL ${N} CACHE STRING "CTest parallel level")
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -j ${CTEST_PARALLEL_LEVEL} -C ${CMAKE_CFG_INTDIR})
add_custom_target(tests)
find_program(MIGRAPH_GDB gdb)
if(MIGRAPH_GDB)
set(MIGRAPH_TEST_GDB On CACHE BOOL "")
else()
set(MIGRAPH_TEST_GDB Off CACHE BOOL "")
endif()
set(SKIP_TESTS)
@@ -34,26 +34,32 @@ function(add_test_command NAME EXE)
%1 ${ARGN}")
add_test(NAME ${NAME} COMMAND ${WINE_CMD} cmd /c "${CMAKE_CURRENT_BINARY_DIR}/test_${NAME}.cmd" $<TARGET_FILE:${EXE}>)
else()
if(MIGRAPH_TEST_GDB)
# add_test(NAME ${NAME} COMMAND ${MIGRAPH_GDB}
# --batch
# --return-child-result
# -ex "set disable-randomization off"
# -ex run
# -ex backtrace
# --args $<TARGET_FILE:${EXE}> ${ARGN})
set(TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/gdb/test_${NAME})
file(MAKE_DIRECTORY ${TEST_DIR})
file(GENERATE OUTPUT "${TEST_DIR}/run.cmake"
CONTENT "
# Remove previous core dump
file(REMOVE ${TEST_DIR}/core)
execute_process(COMMAND $<TARGET_FILE:${EXE}> ${ARGN} WORKING_DIRECTORY ${TEST_DIR} RESULT_VARIABLE RESULT)
if(NOT RESULT EQUAL 0)
# TODO: check for core files based on pid when setting /proc/sys/kernel/core_uses_pid
if(EXISTS ${TEST_DIR}/core)
set(\$ENV{UBSAN_OPTIONS} print_stacktrace=1)
set(\$ENV{ASAN_OPTIONS} print_stacktrace=1)
execute_process(COMMAND ${MIGRAPH_GDB} $<TARGET_FILE:${EXE}> ${TEST_DIR}/core -batch -ex bt)
endif()
message(FATAL_ERROR \"Test failed\")
endif()
")
add_test(NAME ${NAME} COMMAND ${CMAKE_COMMAND} -P "${TEST_DIR}/run.cmake")
else()
add_test(NAME ${NAME} COMMAND ${EXE} ${ARGN})
endif()
@@ -78,7 +84,7 @@ function(add_test_executable TEST_NAME)
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
set_tests_properties(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION "FAILED")
target_link_libraries(${TEST_NAME} migraph migraph_cpu migraph_onnx)
target_include_directories(${TEST_NAME} PUBLIC include)
endfunction(add_test_executable)
@@ -89,13 +95,42 @@ foreach(TEST ${TESTS})
add_test_executable(test_${BASE_NAME} ${TEST})
endforeach()
if(MIGRAPH_ENABLE_GPU)
# gpu tests
file(GLOB GPU_TESTS gpu/*.cpp)
foreach(TEST ${GPU_TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
add_test_executable(test_gpu_${BASE_NAME} ${TEST})
target_link_libraries(test_gpu_${BASE_NAME} migraph_gpu)
endforeach()
endif()
# Onnx test
add_executable(test_onnx onnx/onnx_test.cpp)
target_link_libraries(test_onnx migraph_onnx)
target_include_directories(test_onnx PUBLIC include)
add_test(NAME test_onnx COMMAND $<TARGET_FILE:test_onnx> WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/onnx)
add_dependencies(tests test_onnx)
add_dependencies(check test_onnx)
function(test_header NAME HEADER)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
"#include <${HEADER}>\nint main() {}\n"
)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-static-include-${NAME}.cpp
"#include <${HEADER}>\n"
)
add_test_executable(${NAME}
${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
${CMAKE_CURRENT_BINARY_DIR}/header-static-include-${NAME}.cpp
)
endfunction()
file(GLOB HEADERS ${CMAKE_SOURCE_DIR}/src/include/migraph/*.hpp)
foreach(HEADER ${HEADERS})
get_filename_component(BASE_NAME ${HEADER} NAME_WE)
test_header(header_${BASE_NAME} migraph/${BASE_NAME}.hpp)
endforeach()
#include <migraph/auto_contiguous.hpp>
#include <migraph/operators.hpp>
#include <basic_ops.hpp>
#include <test.hpp>
struct contiguous_target
{
std::string name() const { return "contiguous"; }
std::vector<migraph::pass> get_passes(migraph::context&) const
{
return {migraph::auto_contiguous{}};
}
migraph::context get_context() const { return {}; }
};
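// A minimal test target: its only pass is auto_contiguous, so compiling with
// it should rewrite any non-standard (transposed or broadcasted) shape into a
// standard one.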
migraph::literal get_2x2()
{
return migraph::literal{{migraph::shape::float_type, {2, 2}}, {1, 2, 3, 4}};
}
migraph::literal get_2x2_transposed()
{
return migraph::literal{{migraph::shape::float_type, {2, 2}, {1, 2}}, {1, 2, 3, 4}};
}
migraph::literal get_2() { return migraph::literal{{migraph::shape::float_type, {2}}, {1, 2}}; }
migraph::literal get_2_broadcasted()
{
return migraph::literal{{migraph::shape::float_type, {2, 1}, {1, 0}}, {1, 2}};
}
void literal_broadcast()
{
migraph::program p;
p.add_literal(get_2_broadcasted());
EXPECT(not p.get_shape().standard());
EXPECT(p.get_shape().broadcasted());
p.compile(contiguous_target{});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().broadcasted());
}
void literal_transpose()
{
migraph::program p;
p.add_literal(get_2x2_transposed());
EXPECT(not p.get_shape().standard());
EXPECT(p.get_shape().transposed());
p.compile(contiguous_target{});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().transposed());
}
void after_literal_transpose()
{
migraph::program p;
auto l = p.add_literal(get_2x2());
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().transposed());
auto t = p.add_instruction(migraph::transpose{{1, 0}}, l);
p.add_instruction(pass_op{}, t);
EXPECT(not p.get_shape().standard());
EXPECT(p.get_shape().transposed());
p.compile(contiguous_target{});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().transposed());
}
void after_literal_broadcast()
{
migraph::program p;
auto l1 = p.add_literal(get_2x2());
auto l2 = p.add_literal(get_2());
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().broadcasted());
auto b = p.add_instruction(migraph::broadcast{}, l1, l2);
p.add_instruction(pass_op{}, b);
EXPECT(not p.get_shape().standard());
EXPECT(p.get_shape().broadcasted());
p.compile(contiguous_target{});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().broadcasted());
}
void after_param_transpose()
{
migraph::program p;
auto l = p.add_parameter("2x2", {migraph::shape::float_type, {2, 2}});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().transposed());
auto t = p.add_instruction(migraph::transpose{{1, 0}}, l);
p.add_instruction(pass_op{}, t);
EXPECT(not p.get_shape().standard());
EXPECT(p.get_shape().transposed());
p.compile(contiguous_target{});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().transposed());
}
void after_param_broadcast()
{
migraph::program p;
auto l1 = p.add_parameter("2x2", {migraph::shape::float_type, {2, 2}});
auto l2 = p.add_parameter("2", {migraph::shape::float_type, {2}});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().broadcasted());
auto b = p.add_instruction(migraph::broadcast{}, l1, l2);
p.add_instruction(pass_op{}, b);
EXPECT(not p.get_shape().standard());
EXPECT(p.get_shape().broadcasted());
p.compile(contiguous_target{});
EXPECT(p.get_shape().standard());
EXPECT(not p.get_shape().broadcasted());
}
int main()
{
// literal_broadcast();
literal_transpose();
after_literal_transpose();
after_literal_broadcast();
after_param_transpose();
after_param_broadcast();
}
#include <iostream>
#include <vector>
#include <migraph/literal.hpp>
#include <migraph/operators.hpp>
#include <migraph/cpu/cpu_target.hpp>
#include "test.hpp"
#include "verify.hpp"
void batch_norm_inference_test()
{
migraph::program p;
const size_t width = 2, height = 2, channels = 4, batches = 2;
const float x_val = 8.0f, mean_val = 2.0f, variance_val = 4.0f, scale_val = 2.0f,
bias_val = 1.0f;
const float output_val = scale_val * (x_val - mean_val) / (std::sqrt(variance_val)) + bias_val;
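// The gold value follows y = scale * (x - mean) / sqrt(variance + epsilon) + bias;
// epsilon is dropped here on the assumption it is negligible at the test tolerance.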
migraph::shape s{migraph::shape::float_type, {batches, channels, height, width}};
migraph::shape vars{migraph::shape::float_type, {channels}};
std::vector<float> x_data(width * height * channels * batches);
std::vector<float> scale_data(channels);
std::vector<float> bias_data(channels);
std::vector<float> mean_data(channels);
std::vector<float> variance_data(channels);
std::fill(x_data.begin(), x_data.end(), x_val);
std::fill(mean_data.begin(), mean_data.end(), mean_val);
std::fill(variance_data.begin(), variance_data.end(), variance_val);
std::fill(scale_data.begin(), scale_data.end(), scale_val);
std::fill(bias_data.begin(), bias_data.end(), bias_val);
auto x = p.add_literal(migraph::literal{s, x_data});
auto scale = p.add_literal(migraph::literal{vars, scale_data});
auto bias = p.add_literal(migraph::literal{vars, bias_data});
auto mean = p.add_literal(migraph::literal{vars, mean_data});
auto variance = p.add_literal(migraph::literal{vars, variance_data});
p.add_instruction(migraph::batch_norm_inference{}, x, mean, variance, scale, bias);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> result_vector(width * height * channels * batches);
std::vector<float> gold(width * height * channels * batches);
std::fill(gold.begin(), gold.end(), output_val);
result.visit([&](auto output) { result_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(result_vector, gold));
}
void exp_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l = p.add_literal(migraph::literal{s, {-1, 0, 1}});
p.add_instruction(migraph::exp{}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {0.36787944f, 1.f, 2.71828183f};
EXPECT(test::verify_range(results_vector, gold));
}
void sin_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l = p.add_literal(migraph::literal{s, {-1, 0, 1}});
p.add_instruction(migraph::sin{}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {-0.84147098f, 0.f, 0.84147098f};
EXPECT(test::verify_range(results_vector, gold));
}
void cos_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l = p.add_literal(migraph::literal{s, {-1, 0, 1}});
p.add_instruction(migraph::cos{}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {0.54030231f, 1.f, 0.54030231f};
EXPECT(test::verify_range(results_vector, gold));
}
void tan_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l = p.add_literal(migraph::literal{s, {-1, 0, 1}});
p.add_instruction(migraph::tan{}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {-1.55740772f, 0.0f, 1.55740772f};
EXPECT(test::verify_range(results_vector, gold));
}
void add_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l1 = p.add_literal(migraph::literal{s, {-1, 0, 1}});
auto l2 = p.add_literal(migraph::literal{s, {1, 2, 3}});
p.add_instruction(migraph::add{}, l1, l2);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {0, 2, 4};
EXPECT(test::verify_range(results_vector, gold));
}
void broadcast_test()
{
migraph::program p;
migraph::shape a_shape{migraph::shape::int32_type, {2, 2}};
std::vector<int32_t> a_data{0, 0, 0, 0};
migraph::shape b_shape{migraph::shape::int32_type, {2}};
std::vector<int32_t> b_data{-2, -3};
uint64_t axis = 0;
auto l1 = p.add_literal(migraph::literal{a_shape, a_data});
auto l2 = p.add_literal(migraph::literal{b_shape, b_data});
p.add_instruction(migraph::broadcast{axis}, l1, l2);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
auto output = result.get<int32_t>();
EXPECT(output(0, 0) == -2);
EXPECT(output(0, 1) == -2);
EXPECT(output(1, 0) == -3);
EXPECT(output(1, 1) == -3);
}
void add_broadcast_test()
{
migraph::program p;
migraph::shape a_shape{migraph::shape::float_type, {2, 2, 3}};
std::vector<float> a_data{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
migraph::shape b_shape{migraph::shape::float_type, {2, 2}};
std::vector<float> b_data{0, -1, -2, -3};
uint64_t axis = 0;
auto l1 = p.add_literal(migraph::literal{a_shape, a_data});
auto l2 = p.add_literal(migraph::literal{b_shape, b_data});
auto l3 = p.add_instruction(migraph::broadcast{axis}, l1, l2);
p.add_instruction(migraph::add{}, l1, l3);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
EXPECT(result.get_shape().packed());
std::vector<float> results_vector(12);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8};
EXPECT(test::verify_range(results_vector, gold));
}
void sub_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l1 = p.add_literal(migraph::literal{s, {-1, 0, 1}});
auto l2 = p.add_literal(migraph::literal{s, {1, 2, 3}});
p.add_instruction(migraph::sub{}, l1, l2);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {-2, -2, -2};
EXPECT(test::verify_range(results_vector, gold));
}
void mul_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l1 = p.add_literal(migraph::literal{s, {-1, 0, 1}});
auto l2 = p.add_literal(migraph::literal{s, {1, 2, 3}});
p.add_instruction(migraph::mul{}, l1, l2);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {-1, 0, 3};
EXPECT(test::verify_range(results_vector, gold));
}
void div_test()
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto l1 = p.add_literal(migraph::literal{s, {-1.0f, 0.5f, 1.0f}});
auto l2 = p.add_literal(migraph::literal{s, {1.0f, 2.0f, 4.0f}});
p.add_instruction(migraph::div{}, l1, l2);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(3);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {-1.f, 0.25f, 0.25f};
EXPECT(test::verify_range(results_vector, gold));
}
void reshape_test()
{
migraph::shape a_shape{migraph::shape::float_type, {24, 1, 1, 1}};
std::vector<float> data(24);
std::iota(data.begin(), data.end(), -3);
{
migraph::program p;
auto l = p.add_literal(migraph::literal{a_shape, data});
std::vector<int64_t> new_shape = {8, 3, 1, 1};
p.add_instruction(migraph::reshape{new_shape}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(24);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(results_vector, data));
}
{
migraph::program p;
auto l = p.add_literal(migraph::literal{a_shape, data});
std::vector<int64_t> new_shape = {1, 3, 4, 2};
p.add_instruction(migraph::reshape{new_shape}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(24);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(results_vector, data));
}
{
migraph::program p;
auto l = p.add_literal(migraph::literal{a_shape, data});
std::vector<int64_t> new_shape = {1, 3, 4, 2};
p.add_instruction(migraph::reshape{new_shape}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(24);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(results_vector, data));
}
}
template <class T>
void gemm_test()
{
migraph::program p;
std::vector<T> a = {-0.00925222, 0.56250403, 0.70107397, 0.75402161, -0.505885,
1.33628943, -0.11413, -0.31270559, 1.59336732, -0.19361027,
-0.91620867, 0.40108416, -0.06969921, 0.68483471, -0.39906632,
-1.66423624, 0.69040076, -1.31490171, -0.11282616, -0.79391814};
std::vector<T> b = {6.09568541e-01,
-6.10527007e-01,
3.66646462e-01,
1.18951101e-01,
5.58777432e-01,
-3.21296298e-01,
-5.95997198e-01,
-5.01425721e-01,
-2.84606807e-01,
-5.73673557e-01,
-8.99430260e-01,
-4.25103093e-01,
1.53027987e+00,
-3.81407415e-04,
-3.29650255e-01};
std::vector<T> c = {-1.56327541e+00,
-7.09570140e-01,
-5.37424982e-01,
-2.22994831e-01,
-2.15586437e+00,
2.09177941e-03,
-1.47279677e+00,
2.02627040e-01,
-6.04527691e-01,
-1.29885596e+00,
2.16294914e+00,
-1.48101497e-01};
migraph::shape a_shape{migraph::shape::get_type<T>{}, {4, 5}};
auto al = p.add_literal(migraph::literal{a_shape, a});
migraph::shape b_shape{migraph::shape::get_type<T>{}, {5, 3}};
auto bl = p.add_literal(migraph::literal{b_shape, b});
p.add_instruction(migraph::gemm{}, al, bl);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<T> results_vector(12);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
float tol = 1e-6;
for(std::size_t i = 0; i < results_vector.size(); i++)
{
EXPECT(std::abs(results_vector[i] - c[i]) < tol);
}
}
void maxpool_test()
{
migraph::program p;
std::vector<float> a = {
-2.1314404, -1.63041711, 1.54562736, 1.04625261, -1.42931843, -0.48703974, 0.4065806,
-0.1524526, 1.30775225, 0.45538983, -0.06631992, -1.75332725, 1.33493888, 0.47327688,
0.36873096, 1.18358743, -0.34640595, 1.22098756, 0.01946825, -0.20238149, 0.43348005,
-0.67991608, -0.83041084, 0.93537551, 0.70241445, -0.5654031, -1.30899191, -0.26735824,
-0.52444768, 1.99097753, 1.86504853, -0.26506025, 0.26236168, 0.43763575, 0.95300823,
-1.02733946, -0.74655169, -0.5374338, -0.28901565, -0.59789604, 0.5310151, 0.99125904,
0.40609556, -1.57175648, 0.22031412, 1.45862222, 0.53217483, 1.39087725, 1.00170159,
-0.87175864, -1.7204628, -1.72008383, -0.38656762, -0.01443311, 1.46645272, -1.39995027,
0.22505587, -0.43461126, -0.05511411, -0.79950953, -0.01439556, 0.08795211, 1.18943918,
-0.84079367, -1.73383629, -0.55662078, -0.30626822, -0.67339015, 0.44179603, 0.54316711,
0.40899998, -0.27831686, -1.11900508, -0.0881724, 0.35483059, 2.36277103, -0.04765317,
-0.36865309, 0.73814237, 1.47151589, 1.36546791, -0.32649881, -1.0517807, 2.24768877,
0.68883753, 0.58646208, -0.91017133, -0.50462508, -0.4013325, -0.72348958, -0.47368807,
0.35285577, -1.01817429, -0.5152272, 0.60321307, 0.43521205, -0.23733577, 0.66427642,
0.82949388, 0.82443929, 0.71550399, 0.34561086, 0.68570769, -0.40718508, -1.20350206,
0.15793853, -2.31013632, -0.07934658, -0.09348056, 0.36576006, 2.46601582, 0.11090943,
0.9144392, 0.56759721, -0.22112127, -0.21955389, 0.72474903, -1.28448462, 1.53285873,
0.37437943, 0.31409341, 1.95433736, 0.91620457, 0.86205518, 1.24365854, 0.19248386,
0.22526583, 0.13462132, -0.27561715, -2.06446075, -0.02306402, -1.38278747, 1.1411345,
1.31293464, -1.86041689, 1.06763375, -0.26541466, 1.4545635, 1.11430049, -0.66491818,
0.87101674, 0.67768967, -1.02062869, -1.05031872, -2.2764678, -2.0200038, 0.37592548,
-0.26701379, -0.83388507, 0.19403623, 1.00968623, 0.11020003, 1.16736257, -1.1160326,
0.47346735, 0.6126079, -0.19135755, 1.33624589, -0.29802522, -0.57873946, -1.06555879,
-0.20686582, 1.36892557, -0.19937795, 0.8649236, -1.40126073, 1.53441942, 0.34682792,
-1.31724346, -1.32898355, 2.40126371, 0.07845283, 1.35732043, -0.63678312, 0.39429256,
-1.36487007, -0.31026676, -0.44981545, -0.28994772, -0.14657612, -1.75206447, -0.70612341,
1.20071781, -1.64647579, -0.7133292, 0.88494766, 0.52119428, -2.77387547, 2.07681108,
-0.90133125, 0.2847338, 0.6174528, -0.20616426, -0.64263535, -1.08496261, 0.54275119,
-0.88503587, 0.6629802, 1.47319221, -1.05829155, -0.97027361, -0.93187737, -1.39954746,
-0.52359426, -0.14743951, 1.51522756, 0.2078452, -1.28156149, -1.19363916, -0.78680223,
-0.89094824, 1.30212069, -0.77974445, -0.58411664, 0.48764706, -0.67132682};
std::vector<float> c = {1.33493888, 1.54562736, 1.22098756, 1.33493888, 1.18358743, 1.99097753,
1.00170159, 1.45862222, 1.39087725, 1.46645272, 1.18943918, -0.01443311,
1.47151589, 2.36277103, 2.24768877, 0.68883753, 0.82949388, 0.71550399,
1.95433736, 2.46601582, 1.53285873, 1.95433736, 1.06763375, 1.4545635,
1.33624589, 1.16736257, 0.6126079, 1.36892557, 2.40126371, 1.53441942,
0.52119428, 2.07681108, 0.88494766, 1.51522756, 0.54275119, 0.6629802};
migraph::shape a_shape{migraph::shape::float_type, {2, 3, 6, 6}};
auto al = p.add_literal(migraph::literal{a_shape, a});
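// pooling is aggregate-initialized as {mode, padding, stride, lengths} (field
// order assumed from the op definition): a 3x2 max window with stride 2 and no
// padding over each 6x6 feature map, giving a 2x3 spatial output.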
p.add_instruction(migraph::pooling{"max", {{0, 0}}, {{2, 2}}, {{3, 2}}}, al);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::cout << result.get_shape() << std::endl;
std::vector<float> results_vector(36);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
float tol = 1e-6;
for(std::size_t i = 0; i < results_vector.size(); i++)
{
// std::cout << results_vector[i] << " " << c[i] << std::endl;
EXPECT(std::abs(results_vector[i] - c[i]) < tol);
}
}
void softmax_test()
{
migraph::program p;
std::vector<float> a = {
-5.61869681e-01, 9.07827199e-01, 1.29255986e+00, 3.18533443e-02, -1.22183852e-03,
-2.83830553e-01, -1.03245842e+00, -9.28322077e-01, -8.82696748e-01, 1.11327164e-01,
-9.20038462e-01, 8.47388089e-01, 2.51734018e-01, 1.50563884e+00, 2.23056650e+00,
-6.17576987e-02, -1.00264274e-01, -6.10369384e-01, 1.17537189e+00, -2.51560897e-01,
-8.50333512e-01, -8.03578615e-01, -6.51194930e-01, -2.58137047e-01, 4.65528190e-01,
3.23284641e-02, -1.54700470e+00, 1.38096774e+00, 5.39869189e-01, -7.56884992e-01,
1.81503093e+00, -2.11269641e+00, 1.92466557e+00, 1.77230799e+00, 2.21660900e+00,
1.56777036e+00, -2.08995026e-03, 3.50566894e-01, -1.15042710e+00, -1.18577778e+00,
8.90633047e-01, -6.63949102e-02, 1.44661188e+00, 1.59215283e+00, -2.56262213e-01,
9.39079225e-01, 4.07298543e-02, 3.86590779e-01, 6.09607756e-01, 8.22331488e-01,
-2.82126725e-01, -9.49052632e-01, -4.24012303e-01, -5.32990396e-01, -3.18386006e+00,
3.27092171e-01, -1.33315325e+00, 3.62459183e-01, 3.74710828e-01, -1.30302286e+00,
1.79680198e-01, -4.51832324e-01, 4.34282750e-01, -7.09520102e-01, 6.20333970e-01,
-1.28712380e+00, 2.04130828e-01, -7.70607769e-01, 1.61889160e+00, -1.50951004e+00,
-4.10505563e-01, -3.56566496e-02, -1.29747534e+00, -1.49967879e-01, 7.77626812e-01,
-8.28408226e-02, 2.73412596e-02, 5.79780899e-03, 9.87900198e-02, -7.95276761e-01,
-1.38536084e+00, -6.63573861e-01, 3.89783204e-01, -1.30670881e+00, -7.62425125e-01,
-4.04883057e-01, 6.24344349e-01, 3.68128955e-01, -1.01577950e+00, -3.06715906e-01,
5.67961395e-01, 2.98198581e-01, -1.63613629e+00, -3.75131965e-01, -6.75393403e-01,
2.59172034e+00, 6.75538957e-01, 9.07939598e-02, 1.92257717e-01, -1.21592450e+00,
-2.73682117e-01, 1.25232983e+00, -1.39969170e+00, -1.91483587e-01, 2.57732719e-01,
3.10056299e-01, 1.41833842e+00, -1.81386679e-01, 3.92868072e-01, -8.14771175e-01,
2.02392387e+00, -9.42091495e-02, -3.77683818e-01, 2.05638766e+00, 2.93796062e-01,
-6.02131486e-01, 2.70461679e-01, -8.92358482e-01, 1.04388881e+00, 2.66154885e-01};
std::vector<float> s = {
0.30191708, 0.59879845, 0.50029165, 0.24915339, 0.36823985, 0.13190967, 0.0349741,
0.18750034, 0.21905553, 0.27000085, 0.0547399, 0.56318235, 0.47422904, 0.78964758,
0.91381913, 0.44601166, 0.47902739, 0.13120073, 0.4449684, 0.18766427, 0.15753111,
0.07844277, 0.05120674, 0.36648798, 0.14637007, 0.13152322, 0.01560997, 0.29065287,
0.49196178, 0.10550152, 0.81890774, 0.06369215, 0.62972021, 0.74931765, 0.67285055,
0.35034987, 0.28612873, 0.31931475, 0.04220394, 0.16093165, 0.22390974, 0.11915915,
0.3115395, 0.35899726, 0.22190949, 0.57518375, 0.13888834, 0.7753762, 0.4642328,
0.57055861, 0.21954368, 0.34515455, 0.09486015, 0.40631217, 0.01842281, 0.48770609,
0.06652815, 0.36023033, 0.42343026, 0.24226256, 0.17348589, 0.44066274, 0.6865865,
0.17296699, 0.46923906, 0.06921105, 0.3570261, 0.4125829, 0.73165393, 0.15302512,
0.29499072, 0.33932695, 0.30852377, 0.40762195, 0.40170741, 0.36259529, 0.60848355,
0.42618036, 0.31721094, 0.02960522, 0.28256637, 0.24389413, 0.2725659, 0.10663581,
0.27622163, 0.28264219, 0.53652936, 0.09476089, 0.40890986, 0.34848392, 0.32572666,
0.53076893, 0.11529481, 0.29117745, 0.14625968, 0.8756339, 0.49818122, 0.10656087,
0.1813329, 0.17664003, 0.21410346, 0.80408043, 0.02315119, 0.27155462, 0.32804728,
0.13268511, 0.61795473, 0.49703068, 0.41696799, 0.10175809, 0.71028161, 0.29929739,
0.17377149, 0.76075399, 0.20071237, 0.32632929, 0.36892858, 0.09416146, 0.26656723,
0.42914796};
migraph::shape a_shape{migraph::shape::float_type, {5, 3, 4, 2}};
auto al = p.add_literal(migraph::literal{a_shape, a});
p.add_instruction(migraph::softmax{}, al);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(120);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(results_vector, s));
}
void conv2d_test()
{
migraph::program p;
std::vector<float> a = {
2.71567607, -0.9960829, 0.91671127, 0.28140706, 0.63235772, 0.08077253, 0.80927712,
-0.59108931, -1.05421555, -2.76622486, -0.85044265, -0.52049929, 0.67726439, -0.65290606,
0.02345525, -0.33579525, 0.38901961, 1.05473483, -1.31188095, 1.8963089, -0.07265259,
0.947339, 0.41949373, -0.70814759, 0.25892952, 1.07311416, 1.2571274, -0.62318051,
-0.19951548, -0.94232577, -0.29393643, 0.42292568, -0.80230367, 1.40909171, 0.63617158,
0.13900366, 1.09253144, -0.15265895, 1.54781747, 0.72780299, 1.09189606, -0.38068101,
0.97057933, -0.58958799, 1.56188643, 0.21474874, 0.58725154, -1.27097559, -0.03024297,
1.09437096, -0.4897908, 0.34838957, -1.31042492, -1.69069934, 0.86956722, -0.40457946,
0.46691212, 1.29273605, 0.26464137, 0.22073045, -1.02178168, 0.22163901, -1.84387338,
0.75522131, -0.45775682, -0.42241111, -1.50944722, 1.07256448, -1.95876884, -0.28106022,
0.3341668, 2.13129425, -1.14728117, -1.06555498, -0.298444, -0.88322699, -0.65866792,
-2.06007552, 0.01374334, 0.45612028, 0.52715492, 1.01914406, -1.72659791, 0.80650896,
0.16860051, 2.24112225, -0.78620857, 0.36566174, -0.07020134, -0.47976932, -0.68230027,
-0.94711417, -0.54506505, 1.66504931, -0.71860826, 0.61132306};
std::vector<float> c = {
2.82721668e-02, 6.44195229e-02, 1.53499246e-02, 1.72468081e-01, -6.33238107e-02,
9.49496776e-02, 1.40258059e-01, -7.92879611e-02, -1.29301161e-01, 3.11307609e-03,
-1.90624535e-01, 1.13238767e-01, -2.80647576e-02, 3.12882811e-02, -3.52091640e-02,
3.33581865e-02, 6.43158704e-02, 7.40238279e-02, -1.00106120e-01, -9.56912562e-02,
1.44342467e-01, 9.40258950e-02, 6.36333972e-02, 1.66158378e-03, -8.91554281e-02,
2.58734226e-02, 1.70919895e-02, 1.78214177e-01, 8.84564668e-02, 8.98126513e-02,
-1.63809001e-01, 1.37802169e-01, 1.66439757e-01, -1.45631135e-02, 1.88469887e-04,
4.76950556e-02, -1.91969007e-01, -1.76233292e-01, -7.70473927e-02, 1.14828631e-01,
1.76608220e-01, -1.50728196e-01, 1.99946314e-02, -5.88052124e-02, 1.31612435e-01,
1.61106288e-02, -1.35080189e-01, 1.49512306e-01, 3.86456847e-02, 1.29330024e-01,
-3.22975963e-02, -5.60784787e-02, -5.41997552e-02, 4.78562862e-02};
std::vector<float> s = {0.27039781,
0.19105849,
-0.06339942,
-0.65087199,
0.40867025,
0.05063812,
-0.14907975,
0.49018705,
-0.49197209,
0.33236548,
-0.39374301,
0.16012701,
0.06574871,
0.71606487,
-0.55201721,
-0.46427044};
migraph::shape a_shape{migraph::shape::float_type, {2, 3, 4, 4}};
auto al = p.add_literal(migraph::literal{a_shape, a});
migraph::shape c_shape{migraph::shape::float_type, {2, 3, 3, 3}};
auto cl = p.add_literal(migraph::literal{c_shape, c});
p.add_instruction(migraph::convolution{}, al, cl);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(16);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(results_vector, s));
}
void conv2d_padding_test()
{
migraph::program p;
std::vector<float> a = {
2.71567607, -0.9960829, 0.91671127, 0.28140706, 0.63235772, 0.08077253, 0.80927712,
-0.59108931, -1.05421555, -2.76622486, -0.85044265, -0.52049929, 0.67726439, -0.65290606,
0.02345525, -0.33579525, 0.38901961, 1.05473483, -1.31188095, 1.8963089, -0.07265259,
0.947339, 0.41949373, -0.70814759, 0.25892952, 1.07311416, 1.2571274, -0.62318051,
-0.19951548, -0.94232577, -0.29393643, 0.42292568, -0.80230367, 1.40909171, 0.63617158,
0.13900366, 1.09253144, -0.15265895, 1.54781747, 0.72780299, 1.09189606, -0.38068101,
0.97057933, -0.58958799, 1.56188643, 0.21474874, 0.58725154, -1.27097559, -0.03024297,
1.09437096, -0.4897908, 0.34838957, -1.31042492, -1.69069934, 0.86956722, -0.40457946,
0.46691212, 1.29273605, 0.26464137, 0.22073045, -1.02178168, 0.22163901, -1.84387338,
0.75522131, -0.45775682, -0.42241111, -1.50944722, 1.07256448, -1.95876884, -0.28106022,
0.3341668, 2.13129425, -1.14728117, -1.06555498, -0.298444, -0.88322699, -0.65866792,
-2.06007552, 0.01374334, 0.45612028, 0.52715492, 1.01914406, -1.72659791, 0.80650896,
0.16860051, 2.24112225, -0.78620857, 0.36566174, -0.07020134, -0.47976932, -0.68230027,
-0.94711417, -0.54506505, 1.66504931, -0.71860826, 0.61132306};
std::vector<float> c = {
-0.16115488, -0.09800646, -0.05412646, 0.10475694, 0.00555485, -0.12667653, 0.0458357,
-0.02656217, -0.16338061, 0.15037455, 0.0102711, 0.01303349, 0.05242859, 0.02034754,
0.04751867, -0.17038961, -0.1434752, -0.10770349, 0.05676742, -0.15838449, 0.10128359,
-0.18958683, 0.11954515, 0.10758857, -0.01058291, -0.12797487, 0.08971019, 0.18793164,
-0.00881396, -0.06588994, -0.13321903, -0.03300409, 0.01439607, 0.07618178, -0.11556662,
0.00764295, 0.12956454, -0.08937147, -0.12763587, 0.04674943, 0.05765297, 0.11336918,
0.14747436, -0.06199479, -0.01166052, -0.12432006, -0.04494537, -0.17581205, 0.09475745,
0.1149437, -0.1014564, 0.0274073, -0.01323579, -0.11092556};
std::vector<float> s = {
-0.0201216, 0.40407312, -0.39005592, -0.0631946, 0.37963012, -0.64611685, 0.1349397,
-0.54113752, 0.28533003, 0.27667275, -0.16442731, -0.181494, 0.30564839, 0.58744538,
0.32015014, 0.24969585, -0.27367792, -0.53308117, 0.41236052, 0.26136363, -0.01489828,
0.57652152, -0.38506854, 0.119615, 0.0437076, 0.04779706, 0.57887721, 0.23126155,
0.05695833, -0.68200272, 0.02063358, -0.10267162, 0.8062973, -0.38149622, -0.40134856,
-0.03353126, 0.38991132, -0.3478111, 0.03661491, 0.25783631, 0.62772679, -0.1961118,
0.76423508, -0.36241418, -0.20994355, -0.12368261, -0.9406727, 0.02340185, -0.08793129,
-0.02471633, -0.58163726, -0.02211772, -0.42014724, 0.77525634, 0.504951, -0.20537445,
-0.20369984, -0.83037728, -1.40423918, -0.46160448, -0.22944322, 0.36074194, 0.49579027,
0.46527559};
migraph::shape a_shape{migraph::shape::float_type, {2, 3, 4, 4}};
auto al = p.add_literal(migraph::literal{a_shape, a});
migraph::shape c_shape{migraph::shape::float_type, {2, 3, 3, 3}};
auto cl = p.add_literal(migraph::literal{c_shape, c});
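// convolution is aggregate-initialized as {padding, stride} (field order
// assumed, matching make_conv earlier in this commit): 1-pixel padding with
// unit stride keeps the 4x4 spatial size.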
p.add_instruction(migraph::convolution{{{1, 1}}, {{1, 1}}}, al, cl);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(64);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(results_vector, s));
}
void conv2d_padding_stride_test()
{
migraph::program p;
std::vector<float> a = {
2.71567607, -0.9960829, 0.91671127, 0.28140706, 0.63235772, 0.08077253, 0.80927712,
-0.59108931, -1.05421555, -2.76622486, -0.85044265, -0.52049929, 0.67726439, -0.65290606,
0.02345525, -0.33579525, 0.38901961, 1.05473483, -1.31188095, 1.8963089, -0.07265259,
0.947339, 0.41949373, -0.70814759, 0.25892952, 1.07311416, 1.2571274, -0.62318051,
-0.19951548, -0.94232577, -0.29393643, 0.42292568, -0.80230367, 1.40909171, 0.63617158,
0.13900366, 1.09253144, -0.15265895, 1.54781747, 0.72780299, 1.09189606, -0.38068101,
0.97057933, -0.58958799, 1.56188643, 0.21474874, 0.58725154, -1.27097559, -0.03024297,
1.09437096, -0.4897908, 0.34838957, -1.31042492, -1.69069934, 0.86956722, -0.40457946,
0.46691212, 1.29273605, 0.26464137, 0.22073045, -1.02178168, 0.22163901, -1.84387338,
0.75522131, -0.45775682, -0.42241111, -1.50944722, 1.07256448, -1.95876884, -0.28106022,
0.3341668, 2.13129425, -1.14728117, -1.06555498, -0.298444, -0.88322699, -0.65866792,
-2.06007552, 0.01374334, 0.45612028, 0.52715492, 1.01914406, -1.72659791, 0.80650896,
0.16860051, 2.24112225, -0.78620857, 0.36566174, -0.07020134, -0.47976932, -0.68230027,
-0.94711417, -0.54506505, 1.66504931, -0.71860826, 0.61132306};
std::vector<float> c = {
-0.14601797, -0.13000923, 0.06521662, 0.06178288, -0.11083675, 0.10154136, 0.09990512,
0.06030385, -0.11374587, -0.17523311, -0.14344215, 0.17802463, 0.06300922, -0.15325832,
0.07066704, 0.05166031, 0.00615084, -0.02606523, 0.08083995, -0.17913306, 0.0624622,
0.0735731, -0.04198661, -0.0164391, -0.06374192, 0.16569914, 0.10681538, 0.07370754,
0.02802075, 0.00282027, 0.15104802, -0.11084409, -0.00197773, 0.07924436, 0.03528272,
0.04765259, -0.15896152, 0.07917164, 0.12125669, -0.1154705, -0.11999125, 0.12749968,
-0.06269585, 0.18658121, -0.03944227, 0.0111798, -0.17731084, 0.11789055, -0.09982193,
0.08142821, 0.0729029, 0.11303909, 0.12735154, 0.03885292};
std::vector<float> s = {-0.20817225,
0.87965256,
0.14958936,
-1.24887264,
-0.06540672,
0.20778663,
0.40456355,
-0.99900877,
0.4917807,
0.1994698,
0.64205718,
0.37798831,
-0.25315839,
0.44276932,
-0.16138598,
0.79344082};
migraph::shape a_shape{migraph::shape::float_type, {2, 3, 4, 4}};
auto al = p.add_literal(migraph::literal{a_shape, a});
migraph::shape c_shape{migraph::shape::float_type, {2, 3, 3, 3}};
auto cl = p.add_literal(migraph::literal{c_shape, c});
p.add_instruction(migraph::convolution{{{1, 1}}, {{2, 2}}}, al, cl);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(16);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(test::verify_range(results_vector, s));
}
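// Sanity check on the sizes above: a 4x4 input with 3x3 kernels, padding 1
// and stride 2 gives (4 + 2*1 - 3)/2 + 1 = 2 per spatial dim, so the output
// shape is {2, 2, 2, 2}, i.e. the 16 values read into results_vector.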
void transpose_test()
{
migraph::shape a_shape{migraph::shape::float_type, {1, 2, 2, 3}};
std::vector<float> data(12);
std::iota(data.begin(), data.end(), 0);
{
migraph::program p;
auto l = p.add_literal(migraph::literal{a_shape, data});
std::vector<int64_t> perm = {0, 3, 1, 2};
p.add_instruction(migraph::transpose{perm}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
result.visit([&](auto output) {
std::vector<size_t> new_lens = {1, 3, 2, 2};
std::vector<size_t> new_strides = {12, 1, 6, 3};
EXPECT(bool{output.get_shape().lens() == new_lens});
EXPECT(bool{output.get_shape().strides() == new_strides});
});
}
{
migraph::program p;
auto l = p.add_literal(migraph::literal{a_shape, data});
std::vector<int64_t> perm = {0, 3, 1, 2};
auto result = p.add_instruction(migraph::transpose{perm}, l);
p.add_instruction(migraph::contiguous{}, result);
p.compile(migraph::cpu::cpu_target{});
auto result2 = p.eval({});
std::vector<float> results_vector(12);
result2.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
std::vector<float> gold = {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11};
EXPECT(test::verify_range(results_vector, gold));
}
}
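// transpose{0, 3, 1, 2} moves no data: it permutes the {1, 2, 2, 3} input's
// lens and strides ({12, 6, 3, 1}) into lens {1, 3, 2, 2} with strides
// {12, 1, 6, 3}, which is exactly what the first block of transpose_test
// checks. Reading that view in row-major order yields the gold sequence, and
// contiguous{} is the op that materializes it into a packed buffer.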
void contiguous_test()
{
migraph::shape a_shape{migraph::shape::float_type, {1, 3, 2, 2}, {12, 1, 6, 3}};
std::vector<float> data(12);
std::iota(data.begin(), data.end(), 0);
migraph::program p;
auto l = p.add_literal(migraph::literal{a_shape, data});
p.add_instruction(migraph::contiguous{}, l);
p.compile(migraph::cpu::cpu_target{});
auto result = p.eval({});
std::vector<float> results_vector(12);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
    std::vector<float> gold = {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11};
EXPECT(test::verify_range(results_vector, gold));
}
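// With strides {12, 1, 6, 3}, element (n, c, h, w) sits at offset
// c + 6*h + 3*w, so repacking the values 0..11 into standard layout gives
// {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11}, matching the transposed view above.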
int main()
{
exp_test();
sin_test();
cos_test();
tan_test();
add_test();
broadcast_test();
add_broadcast_test();
sub_test();
mul_test();
gemm_test<float>();
gemm_test<double>();
reshape_test();
transpose_test();
contiguous_test();
softmax_test();
// maxpool_test();
conv2d_test();
conv2d_padding_test();
conv2d_padding_stride_test();
batch_norm_inference_test();
}
#include <migraph/dead_code_elimination.hpp>
#include <basic_ops.hpp>
#include <test.hpp>
struct dce_target
{
std::string name() const { return "dce"; }
std::vector<migraph::pass> get_passes(migraph::context&) const
{
return {migraph::dead_code_elimination{}};
}
migraph::context get_context() const { return {}; }
};
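// dce_target is a minimal compile target whose only pass is
// dead_code_elimination, so the tests below measure exactly what that one
// pass removes by comparing std::distance(p.begin(), p.end()) before and
// after p.compile(dce_target{}).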
void simple_test()
{
migraph::program p;
auto one = p.add_literal(1);
auto two = p.add_literal(2);
p.add_instruction(sum_op{}, one, two);
auto count = std::distance(p.begin(), p.end());
p.compile(dce_target{});
EXPECT(std::distance(p.begin(), p.end()) == count);
auto result = p.eval({});
EXPECT(result == migraph::literal{3});
EXPECT(result != migraph::literal{4});
}
void simple_test_nop()
{
migraph::program p;
auto one = p.add_literal(1);
auto two = p.add_literal(2);
p.add_instruction(nop{});
p.add_instruction(sum_op{}, one, two);
auto count = std::distance(p.begin(), p.end());
p.compile(dce_target{});
EXPECT(std::distance(p.begin(), p.end()) == count);
auto result = p.eval({});
EXPECT(result == migraph::literal{3});
EXPECT(result != migraph::literal{4});
}
void duplicate_test1()
{
migraph::program p;
auto one = p.add_literal(1);
auto two = p.add_literal(2);
p.add_instruction(sum_op{}, one, two);
p.add_instruction(sum_op{}, one, two);
auto count = std::distance(p.begin(), p.end());
p.compile(dce_target{});
EXPECT(std::distance(p.begin(), p.end()) == (count - 1));
auto result = p.eval({});
EXPECT(result == migraph::literal{3});
EXPECT(result != migraph::literal{4});
}
void duplicate_test2()
{
migraph::program p;
auto one = p.add_literal(1);
auto two = p.add_literal(2);
p.add_instruction(sum_op{}, one, two);
p.add_instruction(minus_op{}, one, two);
p.add_instruction(sum_op{}, one, two);
auto count = std::distance(p.begin(), p.end());
p.compile(dce_target{});
EXPECT(std::distance(p.begin(), p.end()) == (count - 2));
auto result = p.eval({});
EXPECT(result == migraph::literal{3});
EXPECT(result != migraph::literal{4});
}
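// Only the final instruction is the program's result: in duplicate_test1 the
// first sum_op is never used (one removal), and in duplicate_test2 both the
// first sum_op and the minus_op are dead (two removals).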
void depth_test()
{
migraph::program p;
auto one = p.add_literal(1);
auto two = p.add_literal(2);
auto x1 = p.add_instruction(sum_op{}, one, two);
auto x2 = p.add_instruction(sum_op{}, one, two);
p.add_instruction(minus_op{}, x1, x2);
p.add_instruction(minus_op{}, x1, x2);
p.add_instruction(sum_op{}, one, two);
auto count = std::distance(p.begin(), p.end());
p.compile(dce_target{});
EXPECT(std::distance(p.begin(), p.end()) == (count - 4));
auto result = p.eval({});
EXPECT(result == migraph::literal{3});
EXPECT(result != migraph::literal{4});
}
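// depth_test expects four removals: neither minus_op result is used, and once
// they are gone x1 and x2 become dead too; the final sum_op and the two
// literals remain.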
int main()
{
simple_test();
simple_test_nop();
duplicate_test1();
duplicate_test2();
depth_test();
}
#include <migraph/program.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/instruction.hpp>
#include <sstream>
#include "test.hpp"
#include <basic_ops.hpp>
struct id_target
{
    std::string name() const { return "id"; }
    std::vector<migraph::pass> get_passes(migraph::context&) const { return {}; }
    migraph::context get_context() const { return {}; }
};
struct reverse_pass
{
    std::string name() const { return "reverse_pass"; }
    void apply(migraph::program& p) const
    {
        for(auto ins : migraph::iterator_for(p))
        {
            if(ins->op.name() == "sum")
            {
                p.replace_instruction(ins, minus_op{}, ins->arguments);
            }
            else if(ins->op.name() == "minus")
            {
                p.replace_instruction(ins, sum_op{}, ins->arguments);
            }
        }
    }
};
struct reverse_target
{
    std::string name() const { return "reverse"; }
    std::vector<migraph::pass> get_passes(migraph::context&) const { return {reverse_pass{}}; }
    migraph::context get_context() const { return {}; }
};
struct double_reverse_target
{
    std::string name() const { return "double_reverse"; }
    std::vector<migraph::pass> get_passes(migraph::context&) const
    {
        return {reverse_pass{}, reverse_pass{}};
    }
    migraph::context get_context() const { return {}; }
};
void literal_test1()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
    p.add_instruction(sum_op{}, one, two);
    auto result = p.eval({});
    EXPECT(result == migraph::literal{3});
    EXPECT(result != migraph::literal{4});
}
void literal_test2()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
@@ -91,15 +72,15 @@ void literal_test2()
    p.add_instruction(sum_op{}, sum1, two);
    auto result = p.eval({});
    EXPECT(result == migraph::literal{5});
    EXPECT(result != migraph::literal{3});
}
void print_test()
{
    migraph::program p;
    auto x = p.add_parameter("x", {migraph::shape::int64_type});
    auto two = p.add_literal(2);
    p.add_instruction(sum_op{}, x, two);
@@ -111,35 +92,69 @@ void print_test()
void param_test()
{
    migraph::program p;
    auto x = p.add_parameter("x", {migraph::shape::int64_type});
    auto y = p.add_parameter("y", {migraph::shape::int64_type});
    p.add_instruction(sum_op{}, x, y);
    auto result = p.eval(
        {{"x", migraph::literal{1}.get_argument()}, {"y", migraph::literal{2}.get_argument()}});
    EXPECT(result == migraph::literal{3});
    EXPECT(result != migraph::literal{4});
}
void replace_test()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
    auto sum = p.add_instruction(sum_op{}, one, two);
    p.replace_instruction(sum, minus_op{}, two, one);
    EXPECT(bool{p.validate() == p.end()});
    auto result = p.eval({});
    EXPECT(result == migraph::literal{1});
    EXPECT(result != migraph::literal{3});
}
void replace_ins_test()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
    auto sum = p.add_instruction(sum_op{}, one, two);
    auto minus = p.add_instruction(minus_op{}, two, one);
    p.replace_instruction(sum, minus);
    EXPECT(bool{p.validate() == p.end()});
    auto result = p.eval({});
    EXPECT(result == migraph::literal{1});
    EXPECT(result != migraph::literal{3});
}
void replace_ins_test2()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
    auto sum = p.add_instruction(sum_op{}, one, two);
    auto minus = p.add_instruction(minus_op{}, two, one);
    p.add_instruction(pass_op{}, minus);
    p.replace_instruction(two, sum);
    EXPECT(bool{p.validate() == p.end()});
    auto result = p.eval({});
    EXPECT(result == migraph::literal{2});
    EXPECT(result != migraph::literal{3});
}
void insert_replace_test()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
@@ -148,23 +163,50 @@ void insert_replace_test()
    auto sum0 = p.insert_instruction(sum1, sum_op{}, two, two);
    p.replace_instruction(sum1, minus_op{}, sum0, two);
    EXPECT(bool{p.validate() == p.end()});
    auto result = p.eval({});
    EXPECT(result == migraph::literal{4});
    EXPECT(result != migraph::literal{5});
}
void target_test()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
    p.add_instruction(sum_op{}, one, two);
    p.compile(id_target{});
    auto result = p.eval({});
    EXPECT(result == migraph::literal{3});
    EXPECT(result != migraph::literal{4});
}
void reverse_target_test()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
    p.add_instruction(sum_op{}, two, one);
    p.compile(reverse_target{});
    auto result = p.eval({});
    EXPECT(result == migraph::literal{1});
    EXPECT(result != migraph::literal{4});
}
void double_reverse_target_test()
{
    migraph::program p;
    auto one = p.add_literal(1);
    auto two = p.add_literal(2);
    p.add_instruction(sum_op{}, two, one);
    p.compile(double_reverse_target{});
    auto result = p.eval({});
    EXPECT(result == migraph::literal{3});
    EXPECT(result != migraph::literal{4});
}
int main()
@@ -174,6 +216,9 @@ int main()
    print_test();
    param_test();
    replace_test();
    replace_ins_test();
    replace_ins_test2();
    insert_replace_test();
    target_test();
    reverse_target_test();
}
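// Note on the targets above: reverse_pass rewrites every "sum" into a
// minus_op and every "minus" into a sum_op, so reverse_target turns
// sum(two, one) into 2 - 1 == 1, while double_reverse_target applies the
// pass twice and restores the original behavior (result 3).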
#include <migraph/program.hpp>
#include <migraph/operators.hpp>
#include <migraph/generate.hpp>
#include <migraph/cpu/cpu_target.hpp>
#include <migraph/gpu/target.hpp>
#include <migraph/gpu/miopen.hpp>
#include <migraph/gpu/hip.hpp>
#include <migraph/manage_ptr.hpp>
#include <migraph/type_name.hpp>
#include <miopen/miopen.h>
#include "test.hpp"
#include "verify.hpp"
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
#endif
struct auto_print
{
static std::array<std::function<void()>, 2> handlers;
int index;
template <class T>
auto_print(T& x, int i) : index(i)
{
handlers[index] = [&x] { std::cout << x << std::endl; };
}
~auto_print()
{
handlers[index] = [] {};
}
};
std::array<std::function<void()>, 2> auto_print::handlers = {};
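// auto_print keeps a printer for a live program in one of two fixed slots
// (0 for the CPU run, 1 for the GPU run) and clears the slot on destruction;
// the terminate handler installed below can then dump whichever programs are
// still alive when a test aborts.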
template <class V>
migraph::argument run_cpu()
{
V v;
auto p = v.create_program();
auto_print pp{p, 0};
p.compile(migraph::cpu::cpu_target{});
migraph::program::parameter_map m;
for(auto&& x : p.get_parameter_shapes())
{
m[x.first] = migraph::generate_argument(x.second);
}
return p.eval(m);
}
template <class V>
migraph::argument run_gpu()
{
V v;
auto p = v.create_program();
auto_print pp{p, 1};
p.compile(migraph::gpu::target{});
migraph::program::parameter_map m;
for(auto&& x : p.get_parameter_shapes())
{
m[x.first] = migraph::gpu::to_gpu(migraph::generate_argument(x.second));
}
return migraph::gpu::from_gpu(p.eval(m));
}
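// run_cpu and run_gpu compile the same generated program for their
// respective targets; the GPU path copies each generated input to device
// memory before eval and copies the result back, so both functions return
// host-side arguments that can be compared directly.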
template <class V>
void verify_program()
{
std::set_terminate(+[] {
std::cout << "FAILED: " << migraph::get_type_name<V>() << std::endl;
try
{
std::rethrow_exception(std::current_exception());
}
catch(const std::exception& e)
{
std::cout << " what(): " << e.what() << std::endl;
}
std::cout << std::endl;
for(auto&& handle : auto_print::handlers)
handle();
});
auto cpu_arg = run_cpu<V>();
auto gpu_arg = run_gpu<V>();
visit_all(cpu_arg, gpu_arg)([](auto cpu, auto gpu) {
if(not test::verify_range(cpu, gpu))
{
std::cout << "FAILED: " << migraph::get_type_name<V>() << std::endl;
}
});
std::set_terminate(nullptr);
}
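// verify_program compares the two results element-wise via visit_all; the
// temporary terminate handler makes an uncaught failure print the test's
// type name, the exception message, and both captured programs before
// aborting.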
struct test_literals
{
migraph::program create_program() const
{
migraph::program p;
auto input = p.add_literal(
generate_literal(migraph::shape{migraph::shape::float_type, {4, 3, 3, 3}}));
auto weights = p.add_literal(
generate_literal(migraph::shape{migraph::shape::float_type, {4, 3, 3, 3}}));
auto conv = p.add_instruction(migraph::convolution{}, input, weights);
p.add_instruction(migraph::activation{"relu"}, conv);
return p;
}
};
struct test_add
{
migraph::program create_program() const
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {3}};
auto x = p.add_parameter("x", s);
auto y = p.add_parameter("y", s);
p.add_instruction(migraph::add{}, x, y);
return p;
}
};
struct test_add_broadcast
{
migraph::program create_program() const
{
migraph::program p;
auto x = p.add_parameter("x", {migraph::shape::float_type, {2, 2, 3}});
auto y = p.add_parameter("y", {migraph::shape::float_type, {2, 2}});
auto by = p.add_instruction(migraph::broadcast{0}, x, y);
p.add_instruction(migraph::add{}, x, by);
return p;
}
};
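// broadcast{0} here builds a view of y with x's lens ({2, 2, 3}) whose
// strides are zero outside y's own {2, 2} dimensions, so add{} sees y's
// values repeated along the trailing axis without any copy.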
struct test_conv_relu
{
migraph::program create_program() const
{
migraph::program p;
auto input = p.add_parameter("x", migraph::shape{migraph::shape::float_type, {4, 3, 3, 3}});
auto weights =
p.add_parameter("w", migraph::shape{migraph::shape::float_type, {4, 3, 3, 3}});
auto conv = p.add_instruction(migraph::convolution{}, input, weights);
p.add_instruction(migraph::activation{"relu"}, conv);
return p;
}
};
struct test_conv_pooling
{
migraph::program create_program() const
{
migraph::program p;
auto input =
p.add_parameter("x", migraph::shape{migraph::shape::float_type, {4, 3, 32, 32}});
auto weights =
p.add_parameter("w", migraph::shape{migraph::shape::float_type, {4, 3, 3, 3}});
auto conv = p.add_instruction(migraph::convolution{}, input, weights);
auto pooling = p.add_instruction(migraph::pooling{"max"}, conv);
p.add_instruction(migraph::activation{"relu"}, pooling);
return p;
}
};
struct test_gemm
{
migraph::program create_program() const
{
migraph::program p;
auto a = p.add_parameter("a", migraph::shape{migraph::shape::float_type, {4, 5}});
auto b = p.add_parameter("b", migraph::shape{migraph::shape::float_type, {5, 3}});
p.add_instruction(migraph::gemm{}, a, b);
return p;
}
};
struct test_gemm_ld
{
migraph::program create_program() const
{
migraph::program p;
auto a = p.add_parameter("a", migraph::shape{migraph::shape::float_type, {4, 5}, {10, 1}});
auto b = p.add_parameter("b", migraph::shape{migraph::shape::float_type, {5, 3}, {20, 1}});
p.add_instruction(migraph::gemm{}, a, b);
return p;
}
};
struct test_gemm_transposeb
{
migraph::program create_program() const
{
migraph::program p;
auto a = p.add_parameter("a", migraph::shape{migraph::shape::float_type, {4, 5}});
auto b = p.add_parameter("b", migraph::shape{migraph::shape::float_type, {3, 5}});
auto bt = p.add_instruction(migraph::transpose{{1, 0}}, b);
p.add_instruction(migraph::gemm{}, a, bt);
return p;
}
};
struct test_gemm_transposea
{
migraph::program create_program() const
{
migraph::program p;
auto a = p.add_parameter("a", migraph::shape{migraph::shape::float_type, {5, 4}});
auto b = p.add_parameter("b", migraph::shape{migraph::shape::float_type, {5, 3}});
auto at = p.add_instruction(migraph::transpose{{1, 0}}, a);
p.add_instruction(migraph::gemm{}, at, b);
return p;
}
};
struct test_gemm_transposeab
{
migraph::program create_program() const
{
migraph::program p;
auto a = p.add_parameter("a", migraph::shape{migraph::shape::float_type, {5, 4}});
auto b = p.add_parameter("b", migraph::shape{migraph::shape::float_type, {3, 5}});
auto at = p.add_instruction(migraph::transpose{{1, 0}}, a);
auto bt = p.add_instruction(migraph::transpose{{1, 0}}, b);
p.add_instruction(migraph::gemm{}, at, bt);
return p;
}
};
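// The transpose variants hand gemm strided views that encode A^T and/or B^T,
// checking that both targets honor non-packed input layouts instead of
// assuming packed row-major matrices.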
struct test_contiguous
{
migraph::program create_program() const
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {4, 4, 4, 3}, {48, 4, 1, 16}};
auto x = p.add_parameter("x", s);
p.add_instruction(migraph::contiguous{}, x);
return p;
}
};
struct test_transpose
{
migraph::program create_program() const
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {4, 3, 4, 4}};
auto x = p.add_parameter("x", s);
std::vector<int64_t> perm = {0, 2, 3, 1};
auto l = p.add_instruction(migraph::transpose{perm}, x);
p.add_instruction(migraph::contiguous{}, l);
return p;
}
};
struct test_batchnorm_inference
{
const size_t width = 3;
const size_t height = 3;
const size_t channels = 3;
const size_t batches = 4;
migraph::program create_program() const
{
migraph::program p;
migraph::shape s{migraph::shape::float_type, {batches, channels, height, width}};
migraph::shape vars{migraph::shape::float_type, {channels}};
auto x = p.add_parameter("x", s);
auto mean = p.add_parameter("mean", vars);
auto variance = p.add_parameter("variance", vars);
auto scale = p.add_parameter("scale", vars);
auto bias = p.add_parameter("bias", vars);
p.add_instruction(migraph::batch_norm_inference{}, x, mean, variance, scale, bias);
return p;
}
};
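// batch_norm_inference consumes per-channel mean, variance, scale and bias
// vectors of shape {channels} and applies them to each of the 4 batches'
// 3 channels of 3x3 feature maps.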
int main()
{
verify_program<test_add>();
verify_program<test_add_broadcast>();
verify_program<test_conv_relu>();
verify_program<test_conv_pooling>();
verify_program<test_gemm>();
// verify_program<test_gemm_ld>();
verify_program<test_gemm_transposeb>();
verify_program<test_gemm_transposea>();
verify_program<test_gemm_transposeab>();
verify_program<test_contiguous>();
verify_program<test_transpose>();
verify_program<test_batchnorm_inference>();
}