Commit afa4a833 authored by wsttiger

merged master into rocblas-integration

parents 23354075 a47f8e4b
...@@ -2,9 +2,9 @@
namespace migraph {
-migraph::argument generate_argument(migraph::shape s, std::mt19937::result_type seed)
+argument generate_argument(shape s, std::mt19937::result_type seed)
{
-migraph::argument result;
+argument result;
s.visit_type([&](auto as) {
using type = typename decltype(as)::type;
auto v = generate_tensor_data<type>(s, seed);
...@@ -13,4 +13,15 @@ migraph::argument generate_argument(migraph::shape s, std::mt19937::result_type
return result;
}
literal generate_literal(shape s, std::mt19937::result_type seed)
{
literal result;
s.visit_type([&](auto as) {
using type = typename decltype(as)::type;
auto v = generate_tensor_data<type>(s, seed);
result = {s, v};
});
return result;
}
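// generate_literal mirrors generate_argument above but returns an owning
// literal, so randomly generated tensors can be baked into a program as
// constants. A minimal usage sketch (the program p is hypothetical):
//   auto l = generate_literal(shape{shape::float_type, {4, 3, 3, 3}}, /*seed=*/1);
//   auto ins = p.add_literal(l);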
} // namespace migraph
#ifndef MIGRAPH_GUARD_CONTEXT_HPP
#define MIGRAPH_GUARD_CONTEXT_HPP
#include <string>
#include <functional>
#include <memory>
#include <type_traits>
#include <utility>
namespace migraph {
/*
...
...@@ -2,6 +2,7 @@
#define MIGRAPH_GUARD_MIGRAPHLIB_GENERATE_HPP
#include <migraph/argument.hpp>
#include <migraph/literal.hpp>
#include <random>
namespace migraph {
...@@ -16,7 +17,9 @@ std::vector<T> generate_tensor_data(migraph::shape s, std::mt19937::result_type
return result;
}
-migraph::argument generate_argument(migraph::shape s, std::mt19937::result_type seed = 0);
+argument generate_argument(shape s, std::mt19937::result_type seed = 0);
literal generate_literal(shape s, std::mt19937::result_type seed = 0);
} // namespace migraph
...
...@@ -5,6 +5,7 @@
#include <migraph/shape.hpp>
#include <migraph/builtin.hpp>
#include <migraph/instruction_ref.hpp>
#include <migraph/operation.hpp>
#include <migraph/erase.hpp>
#include <string>
...
add_library(migraph_cpu
cpu_target.cpp
cpu_lowering.cpp
)
rocm_clang_tidy_check(migraph_cpu)
target_link_libraries(migraph_cpu migraph)
...
#include <migraph/cpu/cpu_lowering.hpp>
#include <migraph/instruction.hpp>
#include <migraph/dfor.hpp>
#include <migraph/operators.hpp>
#include <migraph/shape_for_each.hpp>
#include <migraph/iterator_for.hpp>
#include <unordered_map>
namespace migraph {
namespace cpu {
template <typename T>
T zero(const T&)
{
return T(0);
}
struct cpu_convolution
{
convolution op;
std::string name() const { return "cpu::convolution"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
auto in_h = input.get_shape().lens()[2];
auto in_w = input.get_shape().lens()[3];
auto wei_c = weights.get_shape().lens()[1];
auto wei_h = weights.get_shape().lens()[2];
auto wei_w = weights.get_shape().lens()[3];
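// Direct convolution over an NCHW batch: for each output element (o, w, i, j)
// the w-th filter is slid across the input. start_x/start_y may be negative
// near the borders, and the bounds check below realizes implicit zero
// padding. Accumulation is in double regardless of the element type.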
dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
double acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, k, in_x, in_y) * weights(w, k, x, y);
}
});
output(o, w, i, j) = acc;
});
});
return result;
}
};
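// Pooling is expressed as a policy pattern: cpu_pooling<Op> below provides
// the traversal, while max_pool/avg_pool supply the identity element (start),
// the pairwise reduction (apply), and a finalizer (final) that divides by the
// window size for averaging and ignores it for max.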
struct max_pool
{
static std::string name() { return "max"; }
static double start() { return std::numeric_limits<double>::lowest(); }
static double apply(double x, double y)
{
double m = std::max(x, y);
return (m);
}
static double final(double x, double) { return (x); }
};
struct avg_pool
{
static std::string name() { return "average"; }
static double start() { return 0.0; }
static double apply(double x, double y) { return x + y; }
static double final(double x, double y) { return x / y; }
};
template <class Op>
struct cpu_pooling
{
pooling op;
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
auto in_h = input.get_shape().lens()[2];
auto in_w = input.get_shape().lens()[3];
dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x0 = i * op.stride[0] - op.padding[0];
const int start_y0 = j * op.stride[1] - op.padding[1];
const int hend = std::min(start_x0 + op.lengths[0], in_h);
const int wend = std::min(start_y0 + op.lengths[1], in_w);
const int start_x = std::max(start_x0, 0);
const int start_y = std::max(start_y0, 0);
const int w_h = (hend - start_x);
const int w_w = (wend - start_y);
const int pool_size = std::max(w_h * w_w, 1);
double acc = Op::start();
dfor(w_h, w_w)([&](int x, int y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc = Op::apply(acc, input(o, w, in_x, in_y));
}
});
output(o, w, i, j) = type(Op::final(acc, pool_size));
});
});
return result;
}
};
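// transpose and reshape (below) are zero-copy on this target: they re-wrap
// the existing data buffer under a new shape instead of moving elements,
// relying on a separate contiguous op when a packed layout is needed.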
struct cpu_transpose
{
transpose op;
std::string name() const { return "cpu::transpose"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
return {output_shape, std::move(args.front().data)};
}
};
struct cpu_contiguous
{
contiguous op;
std::string name() const { return "cpu::contiguous"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
});
});
return result;
}
};
struct cpu_reshape
{
reshape op;
std::string name() const { return "cpu::reshape"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
return {output_shape, std::move(args.front().data)};
}
};
struct cpu_gemm
{
gemm op;
std::string name() const { return "cpu::gemm"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto cmat, auto amat, auto bmat) {
auto m = amat.get_shape().lens()[0];
auto n = bmat.get_shape().lens()[1];
auto k = bmat.get_shape().lens()[0];
auto a = amat.data();
auto b = bmat.data();
auto c = cmat.data();
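// Naive row-major GEMM in i-k-j loop order: the innermost j loop walks rows
// of both B and C contiguously, which is more cache friendly than the
// textbook i-j-k order. C is zeroed first because the kernel accumulates.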
for(int ii = 0; ii < m; ii++)
{
for(int jj = 0; jj < n; jj++)
{
c[ii * n + jj] = 0;
}
}
for(int ii = 0; ii < m; ii++)
{
for(int kk = 0; kk < k; kk++)
{
auto aik = a[ii * k + kk];
auto* bkj = &b[kk * n];
auto* cij = &c[ii * n];
for(int jj = 0; jj < n; jj++, cij++, bkj++)
{
*cij += aik * (*bkj);
}
}
}
});
return result;
}
};
struct identity_op
{
std::string name() const { return "cpu::identity"; }
auto fcn() const
{
return [](auto x) { return x; };
}
};
struct abs_op
{
std::string name() const { return "cpu::abs"; }
auto fcn() const
{
return [](auto x) { return std::abs(x); };
}
};
struct exp_op
{
std::string name() const { return "cpu::exp"; }
auto fcn() const
{
return [](auto x) { return std::exp(x); };
}
};
struct sin_op
{
std::string name() const { return "cpu::sin"; }
auto fcn() const
{
return [](auto x) { return std::sin(x); };
}
};
struct cos_op
{
std::string name() const { return "cpu::cos"; }
auto fcn() const
{
return [](auto x) { return std::cos(x); };
}
};
struct tan_op
{
std::string name() const { return "cpu::tan"; }
auto fcn() const
{
return [](auto x) { return std::tan(x); };
}
};
struct asin_op
{
std::string name() const { return "cpu::asin"; }
auto fcn() const
{
return [](auto x) { return std::asin(x); };
}
};
struct acos_op
{
std::string name() const { return "cpu::acos"; }
auto fcn() const
{
return [](auto x) { return std::acos(x); };
}
};
struct atan_op
{
std::string name() const { return "cpu::atan"; }
auto fcn() const
{
return [](auto x) { return std::atan(x); };
}
};
struct tanh_op
{
std::string name() const { return "cpu::tanh"; }
auto fcn() const
{
return [](auto x) { return std::tanh(x); };
}
};
struct sigmoid_op
{
std::string name() const { return "cpu::sigmoid"; }
auto fcn() const
{
return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
}
};
struct neg_op
{
std::string name() const { return "cpu::neg"; }
auto fcn() const
{
return [](auto x) { return -x; };
}
};
struct relu_op
{
std::string name() const { return "cpu::relu"; }
auto fcn() const
{
return [](auto x) { return x > 0 ? x : 0; };
}
};
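// cpu_unary lifts one of the *_op functors above into a full operator: the
// output shape is the input shape, and fcn() is applied elementwise via
// std::transform.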
template <typename Op>
struct cpu_unary
{
Op op;
std::string name() const { return op.name(); }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
args[0].visit([&](auto input) {
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
});
});
return result;
}
};
struct softmax2d
{
std::string name() const { return "cpu::softmax2d"; }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
auto nb = input.get_shape().lens()[0];
auto nc = input.get_shape().lens()[1];
auto nh = input.get_shape().lens()[2];
auto nw = input.get_shape().lens()[3];
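// Per-pixel softmax across the channel dimension of an NCHW tensor. The
// channel maximum is subtracted before exponentiating, the usual guard
// against overflow in std::exp; softmax is invariant to this shift.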
dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
value_type cmax = std::numeric_limits<value_type>::lowest();
for(int c = 0; c < nc; c++)
{
cmax = std::max(cmax, input(b, c, i, j));
}
for(int c = 0; c < nc; c++)
{
output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
}
value_type sum = value_type(0);
for(int c = 0; c < nc; c++)
{
sum += output(b, c, i, j);
}
for(int c = 0; c < nc; c++)
{
output(b, c, i, j) = output(b, c, i, j) / sum;
}
});
});
return result;
}
};
struct add_op
{
std::string name() const { return "add"; }
auto fcn() const
{
return [](auto x, auto y) { return x + y; };
}
};
struct sub_op
{
std::string name() const { return "sub"; }
auto fcn() const
{
return [](auto x, auto y) { return x - y; };
}
};
struct mul_op
{
std::string name() const { return "mul"; }
auto fcn() const
{
return [](auto x, auto y) { return x * y; };
}
};
struct div_op
{
std::string name() const { return "div"; }
auto fcn() const
{
return [](auto x, auto y) { return x / y; };
}
};
template <typename Op>
struct cpu_binary
{
Op op;
std::string name() const { return op.name(); }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
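// Fast path: when both inputs are packed (dense, contiguous) the elements
// can be combined linearly with std::transform; otherwise iterate index by
// index over the output shape so that non-trivial strides are honored.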
if(input1.get_shape().packed() and input2.get_shape().packed())
{
std::transform(
input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
}
else
{
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) =
op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
});
}
});
return result;
}
};
struct cpu_apply
{
program* prog;
std::unordered_map<std::string, std::function<void(instruction_ref)>> apply_map{};
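// Lowering table from reference operator names to rewrite actions: simple_op
// handles stateless ops, while extend_op copies the attributes of the
// original op (e.g. convolution strides) into its cpu::* replacement.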
template <class T>
auto simple_op()
{
return [this](instruction_ref ins) { apply_simple_op<T>(ins); };
}
template <class T, class Op>
auto extend_op()
{
return [this](instruction_ref ins) { apply_extend_op<T, Op>(ins); };
}
void init()
{
apply_map["convolution"] = extend_op<cpu_convolution, convolution>();
apply_map["gemm"] = extend_op<cpu_gemm, gemm>();
apply_map["reshape"] = extend_op<cpu_reshape, reshape>();
apply_map["contiguous"] = extend_op<cpu_contiguous, contiguous>();
apply_map["transpose"] = extend_op<cpu_transpose, transpose>();
apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
apply_map["tanh"] = simple_op<cpu_unary<tanh_op>>();
apply_map["sigmoid"] = simple_op<cpu_unary<sigmoid_op>>();
apply_map["exp"] = simple_op<cpu_unary<exp_op>>();
apply_map["neg"] = simple_op<cpu_unary<neg_op>>();
apply_map["sin"] = simple_op<cpu_unary<sin_op>>();
apply_map["cos"] = simple_op<cpu_unary<cos_op>>();
apply_map["tan"] = simple_op<cpu_unary<tan_op>>();
apply_map["add"] = simple_op<cpu_binary<add_op>>();
apply_map["sub"] = simple_op<cpu_binary<sub_op>>();
apply_map["mul"] = simple_op<cpu_binary<mul_op>>();
apply_map["div"] = simple_op<cpu_binary<div_op>>();
apply_map["softmax"] = simple_op<softmax2d>();
}
void apply()
{
init();
for(auto it : iterator_for(*prog))
{
if(it->op.name() == "activation")
{
apply_activation(it);
}
else if(it->op.name() == "pooling")
{
apply_pooling(it);
}
else if(apply_map.count(it->op.name()) > 0)
{
apply_map.at(it->op.name())(it);
}
}
}
template <class T>
void apply_simple_op(instruction_ref ins)
{
prog->replace_instruction(ins, T{}, ins->arguments);
}
template <class T, class Op>
void apply_extend_op(instruction_ref ins)
{
auto&& op = any_cast<Op>(ins->op);
prog->replace_instruction(ins, T{op}, ins->arguments);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
if(op.mode == "relu")
prog->replace_instruction(ins, cpu_unary<relu_op>{}, ins->arguments);
}
void apply_pooling(instruction_ref ins)
{
auto&& op = any_cast<pooling>(ins->op);
if(op.mode == "max")
prog->replace_instruction(ins, cpu_pooling<max_pool>{op}, ins->arguments);
else if(op.mode == "average")
prog->replace_instruction(ins, cpu_pooling<avg_pool>{op}, ins->arguments);
}
};
void cpu_lowering::apply(program& p) const { cpu_apply{&p}.apply(); }
} // namespace cpu
} // namespace migraph
#include <migraph/cpu/cpu_target.hpp>
-#include <migraph/instruction.hpp>
+#include <migraph/cpu/cpu_lowering.hpp>
#include <migraph/dfor.hpp>
#include <migraph/operators.hpp>
#include <migraph/shape_for_each.hpp>
#include <migraph/iterator_for.hpp>
namespace migraph {
namespace cpu {
template <typename T>
T zero(const T&)
{
return T(0);
}
struct cpu_convolution
{
convolution op;
std::string name() const { return "cpu::convolution"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
auto in_h = input.get_shape().lens()[2];
auto in_w = input.get_shape().lens()[3];
auto wei_c = weights.get_shape().lens()[1];
auto wei_h = weights.get_shape().lens()[2];
auto wei_w = weights.get_shape().lens()[3];
dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
double acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, k, in_x, in_y) * weights(w, k, x, y);
}
});
output(o, w, i, j) = acc;
});
});
return result;
}
};
struct max_pool
{
static std::string name() { return "max"; }
static double start() { return std::numeric_limits<double>::lowest(); }
static double apply(double x, double y)
{
double m = std::max(x, y);
return (m);
}
static double final(double x, double) { return (x); }
};
struct avg_pool
{
static std::string name() { return "average"; }
static double start() { return 0.0; }
static double apply(double x, double y) { return x + y; }
static double final(double x, double y) { return x / y; }
};
template <class Op>
struct cpu_pooling
{
pooling op;
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
auto in_h = input.get_shape().lens()[2];
auto in_w = input.get_shape().lens()[3];
dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x0 = i * op.stride[0] - op.padding[0];
const int start_y0 = j * op.stride[1] - op.padding[1];
const int hend = std::min(start_x0 + op.lengths[0], in_h);
const int wend = std::min(start_y0 + op.lengths[1], in_w);
const int start_x = std::max(start_x0, 0);
const int start_y = std::max(start_y0, 0);
const int w_h = (hend - start_x);
const int w_w = (wend - start_y);
const int pool_size = std::max(w_h * w_w, 1);
double acc = Op::start();
dfor(w_h, w_w)([&](int x, int y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc = Op::apply(acc, input(o, w, in_x, in_y));
}
});
output(o, w, i, j) = type(Op::final(acc, pool_size));
});
});
return result;
}
};
struct cpu_transpose
{
transpose op;
std::string name() const { return "cpu::transpose"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
return {output_shape, std::move(args.front().data)};
}
};
struct cpu_contiguous
{
contiguous op;
std::string name() const { return "cpu::contiguous"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
});
});
return result;
}
};
struct cpu_reshape
{
reshape op;
std::string name() const { return "cpu::reshape"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
return {output_shape, std::move(args.front().data)};
}
};
struct cpu_gemm
{
gemm op;
std::string name() const { return "cpu::gemm"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto cmat, auto amat, auto bmat) {
auto m = amat.get_shape().lens()[0];
auto n = bmat.get_shape().lens()[1];
auto k = bmat.get_shape().lens()[0];
auto a = amat.data();
auto b = bmat.data();
auto c = cmat.data();
for(int ii = 0; ii < m; ii++)
{
for(int jj = 0; jj < n; jj++)
{
c[ii * n + jj] = 0;
}
}
for(int ii = 0; ii < m; ii++)
{
for(int kk = 0; kk < k; kk++)
{
auto aik = a[ii * k + kk];
auto* bkj = &b[kk * n];
auto* cij = &c[ii * n];
for(int jj = 0; jj < n; jj++, cij++, bkj++)
{
*cij += aik * (*bkj);
}
}
}
});
return result;
}
};
struct identity_op
{
std::string name() const { return "cpu::identity"; }
auto fcn() const
{
return [](auto x) { return x; };
}
};
struct abs_op
{
std::string name() const { return "cpu::abs"; }
auto fcn() const
{
return [](auto x) { return std::abs(x); };
}
};
struct exp_op
{
std::string name() const { return "cpu::exp"; }
auto fcn() const
{
return [](auto x) { return std::exp(x); };
}
};
struct sin_op
{
std::string name() const { return "cpu::sin"; }
auto fcn() const
{
return [](auto x) { return std::sin(x); };
}
};
struct cos_op
{
std::string name() const { return "cpu::cos"; }
auto fcn() const
{
return [](auto x) { return std::cos(x); };
}
};
struct tan_op
{
std::string name() const { return "cpu::tan"; }
auto fcn() const
{
return [](auto x) { return std::tan(x); };
}
};
struct asin_op
{
std::string name() const { return "cpu::asin"; }
auto fcn() const
{
return [](auto x) { return std::asin(x); };
}
};
struct acos_op
{
std::string name() const { return "cpu::acos"; }
auto fcn() const
{
return [](auto x) { return std::acos(x); };
}
};
struct atan_op
{
std::string name() const { return "cpu::atan"; }
auto fcn() const
{
return [](auto x) { return std::atan(x); };
}
};
struct tanh_op
{
std::string name() const { return "cpu::tanh"; }
auto fcn() const
{
return [](auto x) { return std::tanh(x); };
}
};
struct sigmoid_op
{
std::string name() const { return "cpu::sigmoid"; }
auto fcn() const
{
return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
}
};
struct neg_op
{
std::string name() const { return "cpu::neg"; }
auto fcn() const
{
return [](auto x) { return -x; };
}
};
struct relu_op
{
std::string name() const { return "cpu::relu"; }
auto fcn() const
{
return [](auto x) { return x > 0 ? x : 0; };
}
};
template <typename Op>
struct cpu_unary
{
Op op;
std::string name() const { return op.name(); }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
args[0].visit([&](auto input) {
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
});
});
return result;
}
};
struct softmax2d
{
std::string name() const { return "cpu::softmax2d"; }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
auto nb = input.get_shape().lens()[0];
auto nc = input.get_shape().lens()[1];
auto nh = input.get_shape().lens()[2];
auto nw = input.get_shape().lens()[3];
dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
value_type cmax = std::numeric_limits<value_type>::lowest();
for(int c = 0; c < nc; c++)
{
cmax = std::max(cmax, input(b, c, i, j));
}
for(int c = 0; c < nc; c++)
{
output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
}
value_type sum = value_type(0);
for(int c = 0; c < nc; c++)
{
sum += output(b, c, i, j);
}
for(int c = 0; c < nc; c++)
{
output(b, c, i, j) = output(b, c, i, j) / sum;
}
});
});
return result;
}
};
struct add_op
{
std::string name() const { return "add"; }
auto fcn() const
{
return [](auto x, auto y) { return x + y; };
}
};
struct sub_op
{
std::string name() const { return "sub"; }
auto fcn() const
{
return [](auto x, auto y) { return x - y; };
}
};
struct mul_op
{
std::string name() const { return "mul"; }
auto fcn() const
{
return [](auto x, auto y) { return x * y; };
}
};
struct div_op
{
std::string name() const { return "div"; }
auto fcn() const
{
return [](auto x, auto y) { return x / y; };
}
};
template <typename Op>
struct cpu_binary
{
Op op;
std::string name() const { return op.name(); }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
if(input1.get_shape().packed() and input2.get_shape().packed())
{
std::transform(
input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
}
else
{
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) =
op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
});
}
});
return result;
}
};
struct cpu_apply
{
program* prog;
std::unordered_map<std::string, std::function<void(instruction_ref)>> apply_map{};
template <class T>
auto simple_op()
{
return [this](instruction_ref ins) { apply_simple_op<T>(ins); };
}
template <class T, class Op>
auto extend_op()
{
return [this](instruction_ref ins) { apply_extend_op<T, Op>(ins); };
}
void init()
{
apply_map["convolution"] = extend_op<cpu_convolution, convolution>();
apply_map["gemm"] = extend_op<cpu_gemm, gemm>();
apply_map["reshape"] = extend_op<cpu_reshape, reshape>();
apply_map["contiguous"] = extend_op<cpu_contiguous, contiguous>();
apply_map["transpose"] = extend_op<cpu_transpose, transpose>();
apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
apply_map["tanh"] = simple_op<cpu_unary<tanh_op>>();
apply_map["sigmoid"] = simple_op<cpu_unary<sigmoid_op>>();
apply_map["exp"] = simple_op<cpu_unary<exp_op>>();
apply_map["neg"] = simple_op<cpu_unary<neg_op>>();
apply_map["sin"] = simple_op<cpu_unary<sin_op>>();
apply_map["cos"] = simple_op<cpu_unary<cos_op>>();
apply_map["tan"] = simple_op<cpu_unary<tan_op>>();
apply_map["add"] = simple_op<cpu_binary<add_op>>();
apply_map["sub"] = simple_op<cpu_binary<sub_op>>();
apply_map["mul"] = simple_op<cpu_binary<mul_op>>();
apply_map["div"] = simple_op<cpu_binary<div_op>>();
apply_map["softmax"] = simple_op<softmax2d>();
}
void apply()
{
init();
for(auto it : iterator_for(*prog))
{
if(it->op.name() == "activation")
{
apply_activation(it);
}
else if(it->op.name() == "pooling")
{
apply_pooling(it);
}
else if(apply_map.count(it->op.name()) > 0)
{
apply_map.at(it->op.name())(it);
}
}
}
template <class T>
void apply_simple_op(instruction_ref ins)
{
prog->replace_instruction(ins, T{}, ins->arguments);
}
template <class T, class Op>
void apply_extend_op(instruction_ref ins)
{
auto&& op = any_cast<Op>(ins->op);
prog->replace_instruction(ins, T{op}, ins->arguments);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
if(op.mode == "relu")
prog->replace_instruction(ins, cpu_unary<relu_op>{}, ins->arguments);
}
void apply_pooling(instruction_ref ins)
{
auto&& op = any_cast<pooling>(ins->op);
if(op.mode == "max")
prog->replace_instruction(ins, cpu_pooling<max_pool>{op}, ins->arguments);
else if(op.mode == "average")
prog->replace_instruction(ins, cpu_pooling<avg_pool>{op}, ins->arguments);
}
};
struct cpu_pass
{
std::string name() const { return "cpu::pass"; }
void apply(program& p) const { cpu_apply{&p}.apply(); }
};
std::string cpu_target::name() const { return "cpu"; }
-std::vector<pass> cpu_target::get_passes(context&) const { return {cpu_pass{}}; }
+std::vector<pass> cpu_target::get_passes(context&) const { return {cpu_lowering{}}; }
} // namespace cpu
...
#ifndef MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP
#define MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP
#include <migraph/program.hpp>
namespace migraph {
namespace cpu {
struct cpu_lowering
{
std::string name() const { return "cpu::lowering"; }
void apply(program& p) const;
};
} // namespace cpu
} // namespace migraph
#endif
...@@ -9,6 +9,8 @@ endif()
add_library(migraph_miopen
hip.cpp
miopen_target.cpp
miopen_lowering.cpp
miopen_write_literals.cpp
)
rocm_clang_tidy_check(migraph_miopen)
target_link_libraries(migraph_miopen migraph MIOpen rocblas)
...
#ifndef MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP
#define MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP
#include <migraph/miopen/miopen.hpp>
namespace migraph {
namespace miopen {
struct miopen_context
{
shared<miopen_handle> handle;
};
} // namespace miopen
} // namespace migraph
#endif
...@@ -26,6 +26,20 @@ struct hip_allocate
}
};
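// hip_write copies its single host-side input to device memory when
// evaluated; miopen_write_literals (later in this commit) uses it to stage
// program literals on the GPU.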
struct hip_write
{
std::string name() const { return "hip::write"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(1);
return inputs.front();
}
argument compute(context&, shape, std::vector<argument> args) const
{
return to_gpu(args.front());
}
};
} // namespace miopen
} // namespace migraph
...
...@@ -2,7 +2,7 @@
#define MIGRAPH_GUARD_MIGRAPHLIB_MIOPEN_HPP
#include <migraph/manage_ptr.hpp>
#include <migraph/operators.hpp>
#include <miopen/miopen.h>
namespace migraph {
...
#ifndef MIGRAPH_GUARD_RTGLIB_MIOPEN_LOWERING_HPP
#define MIGRAPH_GUARD_RTGLIB_MIOPEN_LOWERING_HPP
#include <migraph/program.hpp>
namespace migraph {
namespace miopen {
struct miopen_lowering
{
std::string name() const { return "miopen::lowering"; }
void apply(program& p) const;
};
} // namespace miopen
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_RTGLIB_MIOPEN_WRITE_LITERALS_HPP
#define MIGRAPH_GUARD_RTGLIB_MIOPEN_WRITE_LITERALS_HPP
#include <migraph/program.hpp>
namespace migraph {
namespace miopen {
struct miopen_write_literals
{
std::string name() const { return "miopen::write_literals"; }
void apply(program& p) const;
};
} // namespace miopen
} // namespace migraph
#endif
#include <migraph/miopen/miopen_lowering.hpp>
#include <migraph/manage_ptr.hpp>
#include <migraph/instruction.hpp>
#include <migraph/operators.hpp>
#include <migraph/shape_for_each.hpp>
#include <migraph/miopen/miopen.hpp>
#include <migraph/miopen/hip.hpp>
#include <migraph/dfor.hpp>
#include <migraph/iterator_for.hpp>
#include <rocblas.h>
namespace migraph {
namespace miopen {
struct miopen_context
{
shared<miopen_handle> handle;
};
struct miopen_convolution
{
convolution op;
shared<convolution_descriptor> cd;
std::string name() const { return "miopen::convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return op.compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
auto& ctx = any_cast<miopen_context>(gctx);
auto x_desc = make_tensor(args[0].get_shape());
auto w_desc = make_tensor(args[1].get_shape());
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
int algo_count;
miopenConvAlgoPerf_t perf;
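// First let MIOpen benchmark candidate algorithms (requesting only the top
// result), then run the forward convolution with the algorithm it reported.
// No scratch workspace is supplied (nullptr, 0), which limits the search to
// algorithms that need none. args[2] is the preallocated output buffer.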
miopenFindConvolutionForwardAlgorithm(ctx.handle.get(),
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
y_desc.get(),
args[2].implicit(),
1,
&algo_count,
&perf,
nullptr,
0,
false);
miopenConvolutionForward(ctx.handle.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
perf.fwd_algo,
&beta,
y_desc.get(),
args[2].implicit(),
nullptr,
0);
return args[2];
}
};
struct miopen_pooling
{
pooling op;
shared<pooling_descriptor> pd;
std::string name() const { return "miopen::pooling"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
return op.compute_shape({inputs.at(1)});
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
auto& ctx = any_cast<miopen_context>(gctx);
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
miopenPoolingForward(ctx.handle.get(),
pd.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit(),
false,
nullptr,
0);
return args[1];
}
};
struct miopen_add
{
std::string name() const { return "miopen::add"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return inputs.at(0);
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
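// miopenOpTensor does not understand the broadcast layouts the reference IR
// produces, so broadcasted adds fall back to the host: copy both operands
// from the GPU, add elementwise, and copy the result back. Packed adds stay
// on the device.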
if(args[1].get_shape().broadcasted())
{
argument result{output_shape};
visit_all(result, from_gpu(args[0]), from_gpu(args[1]))(
[&](auto output, auto input1, auto input2) {
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) =
input1(idx.begin(), idx.end()) + input2(idx.begin(), idx.end());
});
});
return to_gpu(result);
}
else
{
auto& ctx = any_cast<miopen_context>(gctx);
float alpha = 1, beta = 0;
auto a_desc = make_tensor(args[0].get_shape());
auto b_desc = make_tensor(args[1].get_shape());
auto c_desc = make_tensor(output_shape);
miopenOpTensor(ctx.handle.get(),
miopenTensorOpAdd,
&alpha,
a_desc.get(),
args[0].implicit(),
&alpha,
b_desc.get(),
args[1].implicit(),
&beta,
c_desc.get(),
args[2].implicit());
return args[2];
}
}
};
struct miopen_gemm
{
gemm op;
std::string name() const { return "miopen::convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return op.compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
rocblas_handle rochandle;
rocblas_create_handle(&rochandle);
float alpha = 1.0f;
float beta = 0.0f;
rocblas_int lda = args[0].get_shape().lens()[1];
rocblas_int ldb = args[1].get_shape().lens()[1];
rocblas_int ldc = args[2].get_shape().lens()[1];
rocblas_int m = output_shape.lens()[0];
rocblas_int n = output_shape.lens()[1];
rocblas_int k = args[0].get_shape().lens()[1];
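// rocBLAS assumes column-major storage. A row-major C = A*B is obtained by
// computing B^T * A^T = C^T, hence the swapped operand order and the swapped
// m/n dimensions in the call below.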
rocblas_sgemm(rochandle,
rocblas_operation_none,
rocblas_operation_none,
n,
m,
k,
&alpha,
args[1].implicit(),
ldb,
args[0].implicit(),
lda,
&beta,
args[2].implicit(),
ldc);
rocblas_destroy_handle(rochandle); // release the handle created at the top of compute
return args[2];
}
};
struct miopen_relu
{
shared<activation_descriptor> ad;
std::string name() const { return "miopen::relu"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
return inputs.at(1);
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
auto& ctx = any_cast<miopen_context>(gctx);
float alpha = 1, beta = 0;
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
miopenActivationForward(ctx.handle.get(),
ad.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit());
return args[1];
}
};
struct miopen_apply
{
program* prog = nullptr;
void apply()
{
prog->insert_instruction(prog->begin(), check_context<miopen_context>{});
for(auto it = prog->begin(); it != prog->end(); it++)
{
if(it->op.name() == "convolution")
{
apply_convolution(it);
}
else if(it->op.name() == "activation")
{
apply_activation(it);
}
else if(it->op.name() == "pooling")
{
apply_pooling(it);
}
else if(it->op.name() == "add")
{
apply_add(it);
}
else if(it->op.name() == "gemm")
{
apply_gemm(it);
}
}
}
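// Output buffers are allocated ahead of each op: the final instruction of
// the program writes into a user-visible "output" parameter, while every
// other instruction gets a fresh device buffer via hip_allocate.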
instruction_ref insert_allocation(instruction_ref ins, const shape& s)
{
if(ins == --prog->end())
{
return prog->add_parameter("output", s);
}
else
{
auto is = prog->add_outline(s);
auto result = prog->insert_instruction(ins, hip_allocate{}, is);
return result;
}
}
void apply_convolution(instruction_ref ins)
{
auto&& op = any_cast<convolution>(ins->op);
auto cd = make_conv(op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(ins,
miopen_convolution{op, std::move(cd)},
ins->arguments.at(0),
ins->arguments.at(1),
output);
}
void apply_pooling(instruction_ref ins)
{
auto&& op = any_cast<pooling>(ins->op);
auto pd = make_pooling(op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_pooling{op, std::move(pd)}, ins->arguments.at(0), output);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
auto ad = make_relu();
if(op.mode == "relu")
{
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_relu{std::move(ad)}, ins->arguments.at(0), output);
}
}
void apply_add(instruction_ref ins)
{
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_add{}, ins->arguments.at(0), ins->arguments.at(1), output);
}
void apply_gemm(instruction_ref ins)
{
auto&& op = any_cast<gemm>(ins->op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_gemm{op}, ins->arguments.at(0), ins->arguments.at(1), output);
}
};
void miopen_lowering::apply(program& p) const { miopen_apply{&p}.apply(); }
} // namespace miopen
} // namespace migraph
#include <rocblas.h>
#include <migraph/miopen/miopen_target.hpp>
-#include <migraph/manage_ptr.hpp>
+#include <migraph/miopen/miopen_lowering.hpp>
-#include <migraph/instruction.hpp>
+#include <migraph/miopen/miopen_write_literals.hpp>
-#include <migraph/operators.hpp>
+#include <migraph/miopen/context.hpp>
#include <migraph/shape_for_each.hpp>
#include <migraph/miopen/miopen.hpp>
#include <migraph/miopen/hip.hpp>
#include <migraph/dfor.hpp>
namespace migraph {
namespace miopen {
-struct miopen_context
-{
-shared<miopen_handle> handle;
-};
+std::vector<pass> miopen_target::get_passes(context&) const
+{
+return {miopen_lowering{}, miopen_write_literals{}};
+}
struct miopen_convolution
{
convolution op;
shared<convolution_descriptor> cd;
std::string name() const { return "miopen::convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return op.compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
auto& ctx = any_cast<miopen_context>(gctx);
auto x_desc = make_tensor(args[0].get_shape());
auto w_desc = make_tensor(args[1].get_shape());
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
int algo_count;
miopenConvAlgoPerf_t perf;
miopenFindConvolutionForwardAlgorithm(ctx.handle.get(),
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
y_desc.get(),
args[2].implicit(),
1,
&algo_count,
&perf,
nullptr,
0,
false);
miopenConvolutionForward(ctx.handle.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
perf.fwd_algo,
&beta,
y_desc.get(),
args[2].implicit(),
nullptr,
0);
return args[2];
}
};
struct miopen_pooling
{
pooling op;
shared<pooling_descriptor> pd;
std::string name() const { return "miopen::pooling"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
return op.compute_shape({inputs.at(1)});
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
auto& ctx = any_cast<miopen_context>(gctx);
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
float alpha = 1, beta = 0;
miopenPoolingForward(ctx.handle.get(),
pd.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit(),
false,
nullptr,
0);
return args[1];
}
};
struct miopen_add
{
std::string name() const { return "miopen::add"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return inputs.at(0);
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
if(args[1].get_shape().broadcasted())
{
argument result{output_shape};
visit_all(result, from_gpu(args[0]), from_gpu(args[1]))(
[&](auto output, auto input1, auto input2) {
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) =
input1(idx.begin(), idx.end()) + input2(idx.begin(), idx.end());
});
});
return to_gpu(result);
}
else
{
auto& ctx = any_cast<miopen_context>(gctx);
float alpha = 1, beta = 0;
auto a_desc = make_tensor(args[0].get_shape());
auto b_desc = make_tensor(args[1].get_shape());
auto c_desc = make_tensor(output_shape);
miopenOpTensor(ctx.handle.get(),
miopenTensorOpAdd,
&alpha,
a_desc.get(),
args[0].implicit(),
&alpha,
b_desc.get(),
args[1].implicit(),
&beta,
c_desc.get(),
args[2].implicit());
return args[2];
}
}
};
struct miopen_gemm
{
gemm op;
std::string name() const { return "miopen::convolution"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return op.compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
rocblas_handle rochandle;
rocblas_create_handle(&rochandle);
float alpha = 1.0f;
float beta = 0.0f;
rocblas_int lda = args[0].get_shape().lens()[1];
rocblas_int ldb = args[1].get_shape().lens()[1];
rocblas_int ldc = args[2].get_shape().lens()[1];
rocblas_int m = output_shape.lens()[0];
rocblas_int n = output_shape.lens()[1];
rocblas_int k = args[0].get_shape().lens()[1];
rocblas_sgemm(rochandle,
rocblas_operation_none,
rocblas_operation_none,
n,
m,
k,
&alpha,
args[1].implicit(),
ldb,
args[0].implicit(),
lda,
&beta,
args[2].implicit(),
ldc);
return args[2];
}
};
struct miopen_relu
{
shared<activation_descriptor> ad;
std::string name() const { return "miopen::relu"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
return inputs.at(1);
}
argument compute(context& gctx, shape output_shape, std::vector<argument> args) const
{
auto& ctx = any_cast<miopen_context>(gctx);
float alpha = 1, beta = 0;
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
miopenActivationForward(ctx.handle.get(),
ad.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit());
return args[1];
}
};
struct miopen_apply
{
program* prog = nullptr;
void apply()
{
prog->insert_instruction(prog->begin(), check_context<miopen_context>{});
for(auto it = prog->begin(); it != prog->end(); it++)
{
if(it->op.name() == "convolution")
{
apply_convolution(it);
}
else if(it->op.name() == "activation")
{
apply_activation(it);
}
else if(it->op.name() == "pooling")
{
apply_pooling(it);
}
else if(it->op.name() == "add")
{
apply_add(it);
}
else if(it->op.name() == "gemm")
{
apply_gemm(it);
}
}
}
instruction_ref insert_allocation(instruction_ref ins, const shape& s)
{
if(ins == --prog->end())
{
return prog->add_parameter("output", s);
}
else
{
auto is = prog->add_outline(s);
auto result = prog->insert_instruction(ins, hip_allocate{}, is);
return result;
}
}
void apply_convolution(instruction_ref ins)
{
auto&& op = any_cast<convolution>(ins->op);
auto cd = make_conv(op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(ins,
miopen_convolution{op, std::move(cd)},
ins->arguments.at(0),
ins->arguments.at(1),
output);
}
void apply_pooling(instruction_ref ins)
{
auto&& op = any_cast<pooling>(ins->op);
auto pd = make_pooling(op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_pooling{op, std::move(pd)}, ins->arguments.at(0), output);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
auto ad = make_relu();
if(op.mode == "relu")
{
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_relu{std::move(ad)}, ins->arguments.at(0), output);
}
}
void apply_add(instruction_ref ins)
{
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_add{}, ins->arguments.at(0), ins->arguments.at(1), output);
}
void apply_gemm(instruction_ref ins)
{
auto&& op = any_cast<gemm>(ins->op);
auto output = insert_allocation(ins, ins->result);
prog->replace_instruction(
ins, miopen_gemm{op}, ins->arguments.at(0), ins->arguments.at(1), output);
}
};
struct miopen_pass
{
std::string name() const { return "miopen::pass"; }
void apply(program& p) const { miopen_apply{&p}.apply(); }
};
std::vector<pass> miopen_target::get_passes(context&) const { return {miopen_pass{}}; }
std::string miopen_target::name() const { return "miopen"; }
...
#include <migraph/miopen/miopen_write_literals.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/miopen/hip.hpp>
#include <migraph/instruction.hpp>
namespace migraph {
namespace miopen {
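// Rewrite every @literal instruction as a hip_write of that literal so the
// constant is copied to device memory during evaluation instead of being
// read from host memory by the GPU kernels.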
void miopen_write_literals::apply(program& p) const
{
for(auto ins : iterator_for(p))
{
if(ins->op.name() == "@literal")
{
literal l = ins->lit;
auto pre = p.add_literal(l);
p.replace_instruction(ins, hip_write{}, pre);
}
}
}
} // namespace miopen
} // namespace migraph
...@@ -49,6 +49,23 @@ void verify_program()
visit_all(cpu_arg, gpu_arg)([](auto cpu, auto gpu) { EXPECT(test::verify_range(cpu, gpu)); });
}
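// New test case: build a convolution+relu program whose inputs are random
// literals (rather than parameters) on both targets; verify_program above
// then checks that the CPU and GPU results agree.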
struct test_literals
{
migraph::program create_program() const
{
migraph::program p;
auto input = p.add_literal(
generate_literal(migraph::shape{migraph::shape::float_type, {4, 3, 3, 3}}));
auto weights = p.add_literal(
generate_literal(migraph::shape{migraph::shape::float_type, {4, 3, 3, 3}}));
auto conv = p.add_instruction(migraph::convolution{}, input, weights);
p.add_instruction(migraph::activation{"relu"}, conv);
return p;
}
migraph::program::parameter_map create_params() const { return {}; }
};
struct test_add
{
migraph::program create_program() const
...
#ifndef MIGRAPH_GUARD_CONTEXT_HPP
#define MIGRAPH_GUARD_CONTEXT_HPP
#include <string>
#include <functional>
#include <memory>
#include <type_traits>
#include <utility>
namespace migraph {
<%
...