gaoqiong / MIGraphX / Commits / 76f68df4

Commit 76f68df4, authored Jul 19, 2018 by wsttiger

    Merged from master

Parents: dc0c4810, 8ae3ffea
Changes: 35. Showing 20 changed files with 928 additions and 553 deletions (+928 / -553); the remaining files are on the second page of the diff.
requirements.txt                                          +1    -0
src/generate.cpp                                          +13   -2
src/include/migraph/auto_any_cast.hpp                     +37   -0
src/include/migraph/context.hpp                           +6    -0
src/include/migraph/generate.hpp                          +4    -1
src/include/migraph/instruction.hpp                       +1    -0
src/include/migraph/operation.hpp                         +8    -1
src/include/migraph/operators.hpp                         +20   -0
src/include/migraph/shape.hpp                             +5    -0
src/include/migraph/tensor_view.hpp                       +2    -0
src/onnx/CMakeLists.txt                                   +4    -0
src/onnx/mnist.cpp                                        +141  -0
src/onnx/onnx.cpp                                         +39   -3
src/onnx/verify_onnx.cpp                                  +1    -1
src/targets/cpu/CMakeLists.txt                            +1    -0
src/targets/cpu/cpu_lowering.cpp                          +595  -0
src/targets/cpu/cpu_target.cpp                            +2    -543
src/targets/cpu/include/migraph/cpu/cpu_lowering.hpp      +19   -0
src/targets/miopen/CMakeLists.txt                         +9    -2
src/targets/miopen/include/migraph/miopen/context.hpp     +20   -0
requirements.txt (View file @ 76f68df4)

google/protobuf -DCMAKE_POSITION_INDEPENDENT_CODE=On
RadeonOpenCompute/rocm-cmake@3f43e2d493f24abbab4dc189a9ab12cc3ad33baf --build
ROCmSoftwarePlatform/MIOpen,http://gitlab1.amd.com/pfultz/miopen/-/archive/57dd8372fa043f4bb500ac297f2c479e1c819e89/miopen-57dd8372fa043f4bb500ac297f2c479e1c819e89.tar.bz2 -DMIOPEN_CACHE_DIR=""
ROCmSoftwarePlatform/rocBLAS@v14.0.1
src/generate.cpp (View file @ 76f68df4)

@@ -2,9 +2,9 @@
 namespace migraph {

-migraph::argument generate_argument(migraph::shape s, std::mt19937::result_type seed)
+argument generate_argument(shape s, std::mt19937::result_type seed)
 {
-    migraph::argument result;
+    argument result;
     s.visit_type([&](auto as) {
         using type = typename decltype(as)::type;
         auto v     = generate_tensor_data<type>(s, seed);
@@ -13,4 +13,15 @@ migraph::argument generate_argument(migraph::shape s, std::mt19937::result_type
     return result;
 }

+literal generate_literal(shape s, std::mt19937::result_type seed)
+{
+    literal result;
+    s.visit_type([&](auto as) {
+        using type = typename decltype(as)::type;
+        auto v     = generate_tensor_data<type>(s, seed);
+        result     = {s, v};
+    });
+    return result;
+}
 } // namespace migraph
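For context, a minimal usage sketch of the pair of helpers this diff leaves in place (the shape constructor is the one declared in shape.hpp below, and the seed defaults to 0 per generate.hpp; the ownership comments are assumptions, not taken from this diff):

#include <migraph/generate.hpp>

int main()
{
    // Both helpers fill a tensor of the given shape with deterministic
    // pseudo-random data derived from the seed.
    migraph::shape s{migraph::shape::float_type, {2, 3}};
    auto arg = migraph::generate_argument(s, 1); // runtime buffer + shape
    auto lit = migraph::generate_literal(s, 1);  // immutable literal data
    (void)arg;
    (void)lit;
}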
src/include/migraph/auto_any_cast.hpp (new file, mode 0 → 100644; View file @ 76f68df4)

#ifndef MIGRAPH_GUARD_RTGLIB_AUTO_ANY_CAST_HPP
#define MIGRAPH_GUARD_RTGLIB_AUTO_ANY_CAST_HPP

namespace migraph {
namespace detail {

template <class U>
void any_cast()
{
}

template <class T>
struct auto_any_caster
{
    T& x;
    template <class U>
    operator U&()
    {
        return any_cast<U>(x);
    }
    operator T&() { return x; }
};

} // namespace detail

template <class T>
detail::auto_any_caster<T> auto_any_cast(T& x)
{
    return {x};
}

} // namespace migraph

#endif
src/include/migraph/context.hpp (View file @ 76f68df4)

#ifndef MIGRAPH_GUARD_CONTEXT_HPP
#define MIGRAPH_GUARD_CONTEXT_HPP

#include <string>
#include <functional>
#include <memory>
#include <type_traits>
#include <utility>

namespace migraph {

/*
...
src/include/migraph/generate.hpp (View file @ 76f68df4)

@@ -2,6 +2,7 @@
 #define MIGRAPH_GUARD_MIGRAPHLIB_GENERATE_HPP
 #include <migraph/argument.hpp>
 #include <migraph/literal.hpp>
+#include <random>

 namespace migraph {
@@ -16,7 +17,9 @@ std::vector<T> generate_tensor_data(migraph::shape s, std::mt19937::result_type
     return result;
 }

-migraph::argument generate_argument(migraph::shape s, std::mt19937::result_type seed = 0);
+argument generate_argument(shape s, std::mt19937::result_type seed = 0);
+
+literal generate_literal(shape s, std::mt19937::result_type seed = 0);

 } // namespace migraph
...
src/include/migraph/instruction.hpp (View file @ 76f68df4)

@@ -5,6 +5,7 @@
 #include <migraph/shape.hpp>
 #include <migraph/builtin.hpp>
 #include <migraph/instruction_ref.hpp>
+#include <migraph/operation.hpp>
 #include <migraph/erase.hpp>
 #include <string>
...
src/include/migraph/operation.hpp (View file @ 76f68df4)

@@ -9,6 +9,7 @@
 #include <migraph/shape.hpp>
 #include <migraph/argument.hpp>
 #include <migraph/context.hpp>
+#include <migraph/auto_any_cast.hpp>

 namespace migraph {
@@ -22,6 +23,12 @@ auto operator<<(std::ostream& os, const T& x) -> decltype(os << x.name())
 } // namespace operation_stream

+template <class T>
+argument compute_op(const T& x, context& ctx, shape output_shape, std::vector<argument> input)
+{
+    return x.compute(auto_any_cast(ctx), output_shape, input);
+}
+
 /*
  * Type-erased interface for:
  *
@@ -169,7 +176,7 @@ struct operation
     argument compute(context& ctx, shape output, std::vector<argument> input) const override
     {
-        return private_detail_te_value.compute(ctx, std::move(output), std::move(input));
+        return compute_op(private_detail_te_value, ctx, std::move(output), std::move(input));
     }

     std::ostream& operator_shift_left(std::ostream& os) const override
...
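Taken together with auto_any_cast.hpp above, this is the piece that lets a concrete op declare compute() against its own target-specific context type while the type-erased operation interface only ever handles the generic context&. The following self-contained sketch shows the pattern; every name in it (cpu_ctx, any_ctx, my_op, and this toy any_cast) is invented here for illustration and is not migraph's actual machinery:

#include <cassert>

struct cpu_ctx
{
    int threads = 4;
};

struct any_ctx
{
    cpu_ctx* impl; // stand-in for real type erasure
};

template <class U>
U& any_cast(any_ctx& a)
{
    return *a.impl;
}

struct auto_caster
{
    any_ctx& x;
    template <class U>
    operator U&() // converts to whatever reference type the callee declares
    {
        return any_cast<U>(x);
    }
};

auto_caster auto_any_cast(any_ctx& x) { return {x}; }

struct my_op
{
    // The op asks for the concrete context type it actually needs.
    int compute(cpu_ctx& ctx) const { return ctx.threads; }
};

template <class T>
int compute_op(const T& x, any_ctx& ctx)
{
    // Mirrors the forwarding in the diff: the proxy picks the cast
    // from the parameter type of T::compute.
    return x.compute(auto_any_cast(ctx));
}

int main()
{
    cpu_ctx c;
    any_ctx a{&c};
    assert(compute_op(my_op{}, a) == 4);
}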
src/include/migraph/operators.hpp (View file @ 76f68df4)

@@ -103,6 +103,24 @@ struct not_computable
     }
 };

+struct batch_norm_inference
+{
+    double epsilon = 1.0e-6;
+    std::string name() const { return "batch_norm_inference"; }
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(5);
+        return inputs.front();
+    }
+    argument compute(context&, shape, std::vector<argument>) const
+    {
+        MIGRAPH_THROW("not computable");
+    }
+};
+
 struct convolution
 {
     std::array<std::size_t, 2> padding = {{0, 0}};
@@ -193,6 +211,7 @@ struct pooling
     std::array<std::size_t, 2> stride  = {{1, 1}};
     std::array<std::size_t, 2> lengths = {{1, 1}};
     std::string name() const { return "pooling"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
         check_shapes{inputs, *this}.has(1).only_dims(4);
@@ -474,6 +493,7 @@ struct broadcast
     auto input = inputs.at(1);
     std::vector<size_t> bcast_strides(result.lens().size(), 0);
     if(std::all_of(result.lens().cbegin(), result.lens().cend(), [&](auto x) { return x == 1; }))
     {
...
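Note that the generic batch_norm_inference operator added here only validates its five inputs and propagates the input shape; its compute() deliberately throws "not computable", so a target lowering pass has to replace it with a concrete kernel. The CPU version of that replacement appears in cpu_lowering.cpp later in this diff.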
src/include/migraph/shape.hpp (View file @ 76f68df4)

@@ -45,6 +45,11 @@ struct shape
     MIGRAPH_SHAPE_VISIT_TYPES(MIGRAPH_SHAPE_GET_TYPE)
 #undef MIGRAPH_SHAPE_GET_TYPE

+    template <class T>
+    struct get_type<const T> : get_type<T>
+    {
+    };
+
     shape();
     shape(type_t t);
     shape(type_t t, std::vector<std::size_t> l);
...
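A minimal standalone sketch (not migraph's actual trait machinery, which is generated by MIGRAPH_SHAPE_VISIT_TYPES) of what the new const specialization buys: lookups on const-qualified element types forward to the existing mapping instead of needing a second set of specializations.

#include <type_traits>

// Primary template: one specialization per supported element type.
template <class T>
struct get_type;

template <>
struct get_type<float> : std::integral_constant<int, 0>
{
};

// The addition in the diff: const-qualified lookups reuse the non-const mapping.
template <class T>
struct get_type<const T> : get_type<T>
{
};

static_assert(get_type<const float>::value == get_type<float>::value,
              "const element types map to the same type id");

int main() {}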
src/include/migraph/tensor_view.hpp (View file @ 76f68df4)

@@ -29,12 +29,14 @@ struct tensor_view
     template <class... Ts, MIGRAPH_REQUIRES(std::is_integral<Ts>{}...)>
     const T& operator()(Ts... xs) const
     {
+        assert(m_shape.index({static_cast<std::size_t>(xs)...}) < m_shape.bytes() / sizeof(T));
         return m_data[m_shape.index({static_cast<std::size_t>(xs)...})];
     }

     template <class... Ts, MIGRAPH_REQUIRES(std::is_integral<Ts>{}...)>
     T& operator()(Ts... xs)
     {
+        assert(m_shape.index({static_cast<std::size_t>(xs)...}) < m_shape.bytes() / sizeof(T));
        return m_data[m_shape.index({static_cast<std::size_t>(xs)...})];
     }
...
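Each new assert bounds the linear index computed from the coordinate pack by the element count (bytes() / sizeof(T)), so an out-of-range index now trips in debug builds instead of silently reading or writing past the buffer.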
src/onnx/CMakeLists.txt (View file @ 76f68df4)

@@ -16,6 +16,10 @@ add_executable(read_onnx read_onnx.cpp)
 rocm_clang_tidy_check(read_onnx)
 target_link_libraries(read_onnx migraph_onnx)

+add_executable(mnist mnist.cpp)
+rocm_clang_tidy_check(mnist)
+target_link_libraries(mnist migraph_cpu migraph_onnx)
+
 if(MIGRAPH_ENABLE_MIOPEN)
     add_executable(verify_onnx verify_onnx.cpp)
     rocm_clang_tidy_check(verify_onnx)
...
src/onnx/mnist.cpp (new file, mode 0 → 100644; View file @ 76f68df4)

#include <cstdio>
#include <string>
#include <fstream>
#include <numeric>
#include <stdexcept>
// Standard headers used directly below (pulled in transitively in the
// original commit): <algorithm>, <cmath>, <cstdint>, <iostream>, <vector>.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>
#include <migraph/onnx.hpp>
#include <migraph/cpu/cpu_target.hpp>
#include <migraph/generate.hpp>

// Byte-swap a big-endian 32-bit header field from the MNIST idx format.
auto reverse_int(unsigned int i)
{
    unsigned char c1, c2, c3, c4;
    c1 = i & 255u;
    c2 = (i >> 8u) & 255u;
    c3 = (i >> 16u) & 255u;
    c4 = (i >> 24u) & 255u;
    return (static_cast<unsigned int>(c1) << 24u) + (static_cast<unsigned int>(c2) << 16u) +
           (static_cast<unsigned int>(c3) << 8u) + c4;
};

std::vector<float> read_mnist_images(std::string full_path, int& number_of_images, int& image_size)
{
    using uchar = unsigned char;
    std::ifstream file(full_path, std::ios::binary);
    if(file.is_open())
    {
        int magic_number = 0, n_rows = 0, n_cols = 0;
        file.read(reinterpret_cast<char*>(&magic_number), sizeof(magic_number));
        magic_number = reverse_int(magic_number);
        if(magic_number != 2051)
            throw std::runtime_error("Invalid MNIST image file!");
        file.read(reinterpret_cast<char*>(&number_of_images), sizeof(number_of_images));
        number_of_images = reverse_int(number_of_images);
        file.read(reinterpret_cast<char*>(&n_rows), sizeof(n_rows));
        n_rows = reverse_int(n_rows);
        file.read(reinterpret_cast<char*>(&n_cols), sizeof(n_cols));
        n_cols = reverse_int(n_cols);
        image_size = n_rows * n_cols;
        std::vector<float> result(number_of_images * image_size);
        for(int i = 0; i < number_of_images; i++)
        {
            for(int j = 0; j < image_size; j++)
            {
                uchar tmp;
                file.read(reinterpret_cast<char*>(&tmp), 1);
                result[i * image_size + j] = tmp / 255.0;
            }
        }
        return result;
    }
    else
    {
        throw std::runtime_error("Cannot open file `" + full_path + "`!");
    }
}

std::vector<int32_t> read_mnist_labels(std::string full_path, int& number_of_labels)
{
    using uchar = unsigned char;
    std::ifstream file(full_path, std::ios::binary);
    if(file.is_open())
    {
        int magic_number = 0;
        file.read(reinterpret_cast<char*>(&magic_number), sizeof(magic_number));
        magic_number = reverse_int(magic_number);
        if(magic_number != 2049)
            throw std::runtime_error("Invalid MNIST label file!");
        file.read(reinterpret_cast<char*>(&number_of_labels), sizeof(number_of_labels));
        number_of_labels = reverse_int(number_of_labels);
        std::vector<int32_t> result(number_of_labels);
        for(int i = 0; i < number_of_labels; i++)
        {
            uchar tmp;
            file.read(reinterpret_cast<char*>(&tmp), 1);
            result[i] = tmp;
        }
        return result;
    }
    else
    {
        throw std::runtime_error("Unable to open file `" + full_path + "`!");
    }
}

std::vector<float> softmax(std::vector<float> p)
{
    size_t n = p.size();
    std::vector<float> result(n);
    std::transform(p.begin(), p.end(), result.begin(), [](auto x) { return std::exp(x); });
    float s = std::accumulate(result.begin(), result.end(), 0.0f, std::plus<float>());
    std::transform(result.begin(), result.end(), result.begin(), [=](auto x) { return x / s; });
    return result;
}

int main(int argc, char const* argv[])
{
    if(argc > 3)
    {
        std::string datafile  = argv[2];
        std::string labelfile = argv[3];
        int nimages           = -1;
        int image_size        = -1;
        int nlabels           = -1;
        std::vector<float> input    = read_mnist_images(datafile, nimages, image_size);
        std::vector<int32_t> labels = read_mnist_labels(labelfile, nlabels);

        std::string file = argv[1];
        auto prog        = migraph::parse_onnx(file);
        prog.compile(migraph::cpu::cpu_target{});

        auto s = migraph::shape{migraph::shape::float_type, {1, 1, 28, 28}};
        std::cout << s << std::endl;
        auto ptr = input.data();
        for(int i = 0; i < 20; i++)
        {
            std::cout << "label: " << labels[i] << " ----> ";
            auto input3 = migraph::argument{s, &ptr[784 * i]};
            auto result = prog.eval({{"Input3", input3}});
            std::vector<float> logits;
            result.visit([&](auto output) { logits.assign(output.begin(), output.end()); });
            std::vector<float> probs = softmax(logits);
            for(auto x : probs)
                std::cout << x << " ";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
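Going by the argument handling in main, the expected invocation is: mnist model.onnx images-file labels-file, where the image and label files are MNIST idx-format binaries (big-endian headers with magic numbers 2051 and 2049, hence reverse_int). The program compiles the model for the CPU target, then prints the label and the softmaxed class probabilities for the first 20 images.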
src/onnx/onnx.cpp (View file @ 76f68df4)

@@ -113,13 +113,49 @@ struct onnx_parser
         }
         return prog.add_instruction(add{}, args);
     });
-    add_op("Sub", [this](attribute_map, std::vector<instruction_ref> args) {
+    add_op("Sub", [this](attribute_map attributes, std::vector<instruction_ref> args) {
+        if(contains(attributes, "broadcast"))
+        {
+            uint64_t broadcasted = parse_value(attributes.at("broadcast")).at<uint64_t>();
+            if(broadcasted != 0)
+            {
+                uint64_t axis = (contains(attributes, "axis"))
+                                    ? parse_value(attributes.at("axis")).at<uint64_t>()
+                                    : 0;
+                auto l = prog.add_instruction(broadcast{axis}, args);
+                return prog.add_instruction(sub{}, args[0], l);
+            }
+        }
         return prog.add_instruction(sub{}, args);
     });
-    add_op("Mul", [this](attribute_map, std::vector<instruction_ref> args) {
+    add_op("Mul", [this](attribute_map attributes, std::vector<instruction_ref> args) {
+        if(contains(attributes, "broadcast"))
+        {
+            uint64_t broadcasted = parse_value(attributes.at("broadcast")).at<uint64_t>();
+            if(broadcasted != 0)
+            {
+                uint64_t axis = (contains(attributes, "axis"))
+                                    ? parse_value(attributes.at("axis")).at<uint64_t>()
+                                    : 0;
+                auto l = prog.add_instruction(broadcast{axis}, args);
+                return prog.add_instruction(mul{}, args[0], l);
+            }
+        }
         return prog.add_instruction(mul{}, args);
     });
-    add_op("Div", [this](attribute_map, std::vector<instruction_ref> args) {
+    add_op("Div", [this](attribute_map attributes, std::vector<instruction_ref> args) {
+        if(contains(attributes, "broadcast"))
+        {
+            uint64_t broadcasted = parse_value(attributes.at("broadcast")).at<uint64_t>();
+            if(broadcasted != 0)
+            {
+                uint64_t axis = (contains(attributes, "axis"))
+                                    ? parse_value(attributes.at("axis")).at<uint64_t>()
+                                    : 0;
+                auto l = prog.add_instruction(broadcast{axis}, args);
+                return prog.add_instruction(div{}, args[0], l);
+            }
+        }
         return prog.add_instruction(div{}, args);
     });
 }
...
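As a concrete reading of the new attribute handling (shapes invented for illustration): for Sub with broadcast=1 and axis=1 on inputs A of shape {2, 3, 4, 5} and B of shape {3}, the parser now inserts broadcast{1} to stretch B across A's shape along axis 1 and subtracts the broadcasted result from args[0]. When the broadcast attribute is absent or zero, the behavior is unchanged. Mul and Div follow the identical pattern.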
src/onnx/verify_onnx.cpp (View file @ 76f68df4)

@@ -2,7 +2,7 @@
 #include <migraph/onnx.hpp>
 #include <migraph/cpu/cpu_target.hpp>
-#include <migraph/miopen/miopen_target.hpp>
+#include <migraph/miopen/target.hpp>
 #include <migraph/miopen/hip.hpp>
 #include <migraph/generate.hpp>
 #include <miopen/miopen.h>
...
src/targets/cpu/CMakeLists.txt (View file @ 76f68df4)

 add_library(migraph_cpu
     cpu_target.cpp
+    cpu_lowering.cpp
 )
 rocm_clang_tidy_check(migraph_cpu)
 target_link_libraries(migraph_cpu migraph)
...
src/targets/cpu/cpu_lowering.cpp (new file, mode 0 → 100644; View file @ 76f68df4)

#include <migraph/cpu/cpu_lowering.hpp>
#include <migraph/instruction.hpp>
#include <migraph/dfor.hpp>
#include <migraph/operators.hpp>
#include <migraph/shape_for_each.hpp>
#include <migraph/iterator_for.hpp>
#include <unordered_map>
// Standard headers used directly below (pulled in transitively in the
// original commit): <algorithm>, <cmath>, <functional>, <limits>, <string>.
#include <algorithm>
#include <cmath>
#include <functional>
#include <limits>
#include <string>

namespace migraph {
namespace cpu {

template <typename T>
T zero(const T&)
{
    return T(0);
}

//
// cpu implementation of batch norm for inference
//
// inputs are:
// args[0] -> input data buffer
// args[1] -> mini batch mean
// args[2] -> mini batch variance
// args[3] -> gamma
// args[4] -> beta
//
// The equation to compute batch norm for inference is:
//
// output[i] = beta + gamma * (input[i] - mean) / sqrt(variance + epsilon)
//
// the input data format should be nchw
//
struct cpu_batch_norm_inference
{
    batch_norm_inference op;

    std::string name() const { return "cpu::batch_norm_inference"; }

    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }

    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument output{output_shape};
        double epsilon           = op.epsilon;
        auto input               = args[0];
        auto mini_batch_mean     = args[1].at<float>();
        auto mini_batch_variance = args[2].at<float>();
        auto gamma               = args[3].at<float>();
        auto beta                = args[4].at<float>();
        visit_all(output, input)([&](auto result, auto buffer) {
            std::transform(buffer.begin(), buffer.end(), result.begin(), [&](auto x) {
                return gamma * (x - mini_batch_mean) / std::sqrt(mini_batch_variance + epsilon) +
                       beta;
            });
        });
        return output;
    }
};

struct cpu_convolution
{
    convolution op;

    std::string name() const { return "cpu::convolution"; }
    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
            auto in_h  = input.get_shape().lens()[2];
            auto in_w  = input.get_shape().lens()[3];
            auto wei_c = weights.get_shape().lens()[1];
            auto wei_h = weights.get_shape().lens()[2];
            auto wei_w = weights.get_shape().lens()[3];
            dfor(output_shape.lens()[0],
                 output_shape.lens()[1],
                 output_shape.lens()[2],
                 output_shape.lens()[3])(
                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
                    const int start_x = i * op.stride[0] - op.padding[0];
                    const int start_y = j * op.stride[1] - op.padding[1];
                    double acc        = 0;
                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
                        const int in_x = start_x + x;
                        const int in_y = start_y + y;
                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
                        {
                            acc += input(o, k, in_x, in_y) * weights(w, k, x, y);
                        }
                    });
                    output(o, w, i, j) = acc;
                });
        });
        return result;
    }
};

struct max_pool
{
    static std::string name() { return "max"; }
    static double start() { return std::numeric_limits<double>::lowest(); }
    static double apply(double x, double y)
    {
        double m = std::max(x, y);
        return (m);
    }
    static double final(double x, double) { return (x); }
};

struct avg_pool
{
    static std::string name() { return "average"; }
    static double start() { return 0.0; }
    static double apply(double x, double y) { return x + y; }
    static double final(double x, double y) { return x / y; }
};

template <class Op>
struct cpu_pooling
{
    pooling op;

    std::string name() const { return "cpu::pooling_" + Op::name(); }
    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0])([&](auto output, auto input) {
            using type = typename decltype(output)::value_type;
            auto in_h  = input.get_shape().lens()[2];
            auto in_w  = input.get_shape().lens()[3];
            dfor(output_shape.lens()[0],
                 output_shape.lens()[1],
                 output_shape.lens()[2],
                 output_shape.lens()[3])(
                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
                    const int start_x0  = i * op.stride[0] - op.padding[0];
                    const int start_y0  = j * op.stride[1] - op.padding[1];
                    const int hend      = std::min(start_x0 + op.lengths[0], in_h);
                    const int wend      = std::min(start_y0 + op.lengths[1], in_w);
                    const int start_x   = std::max(start_x0, 0);
                    const int start_y   = std::max(start_y0, 0);
                    const int w_h       = (hend - start_x);
                    const int w_w       = (wend - start_y);
                    const int pool_size = std::max(w_h * w_w, 1);
                    double acc          = Op::start();
                    dfor(w_h, w_w)([&](int x, int y) {
                        const int in_x = start_x + x;
                        const int in_y = start_y + y;
                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
                        {
                            acc = Op::apply(acc, input(o, w, in_x, in_y));
                        }
                    });
                    output(o, w, i, j) = type(Op::final(acc, pool_size));
                });
        });
        return result;
    }
};

struct cpu_transpose
{
    transpose op;

    std::string name() const { return "cpu::transpose"; }
    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        return {output_shape, std::move(args.front().data)};
    }
};

struct cpu_contiguous
{
    contiguous op;
    std::string name() const { return "cpu::contiguous"; }
    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0])([&](auto output, auto input) {
            shape_for_each(output.get_shape(), [&](const auto& idx) {
                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
            });
        });
        return result;
    }
};

struct cpu_reshape
{
    reshape op;
    std::string name() const { return "cpu::reshape"; }
    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        return {output_shape, std::move(args.front().data)};
    }
};

struct cpu_gemm
{
    gemm op;
    std::string name() const { return "cpu::gemm"; }
    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }

    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0], args[1])([&](auto cmat, auto amat, auto bmat) {
            auto m = amat.get_shape().lens()[0];
            auto n = bmat.get_shape().lens()[1];
            auto k = bmat.get_shape().lens()[0];

            auto a = amat.data();
            auto b = bmat.data();
            auto c = cmat.data();
            for(int ii = 0; ii < m; ii++)
            {
                for(int jj = 0; jj < n; jj++)
                {
                    c[ii * n + jj] = 0;
                }
            }
            for(int ii = 0; ii < m; ii++)
            {
                for(int kk = 0; kk < k; kk++)
                {
                    auto aik  = a[ii * k + kk];
                    auto* bkj = &b[kk * n];
                    auto* cij = &c[ii * n];
                    for(int jj = 0; jj < n; jj++, cij++, bkj++)
                    {
                        *cij += aik * (*bkj);
                    }
                }
            }
        });
        return result;
    }
};

struct identity_op
{
    std::string name() const { return "cpu::identity"; }
    auto fcn() const
    {
        return [](auto x) { return x; };
    }
};

struct abs_op
{
    std::string name() const { return "cpu::abs"; }
    auto fcn() const
    {
        return [](auto x) { return std::abs(x); };
    }
};

struct exp_op
{
    std::string name() const { return "cpu::exp"; }
    auto fcn() const
    {
        return [](auto x) { return std::exp(x); };
    }
};

struct sin_op
{
    std::string name() const { return "cpu::sin"; }
    auto fcn() const
    {
        return [](auto x) { return std::sin(x); };
    }
};

struct cos_op
{
    std::string name() const { return "cpu::cos"; }
    auto fcn() const
    {
        return [](auto x) { return std::cos(x); };
    }
};

struct tan_op
{
    std::string name() const { return "cpu::tan"; }
    auto fcn() const
    {
        return [](auto x) { return std::tan(x); };
    }
};

struct asin_op
{
    std::string name() const { return "cpu::asin"; }
    auto fcn() const
    {
        return [](auto x) { return std::asin(x); };
    }
};

struct acos_op
{
    std::string name() const { return "cpu::acos"; }
    auto fcn() const
    {
        return [](auto x) { return std::acos(x); };
    }
};

struct atan_op
{
    std::string name() const { return "cpu::atan"; }
    auto fcn() const
    {
        return [](auto x) { return std::atan(x); };
    }
};

struct tanh_op
{
    std::string name() const { return "cpu::tanh"; }
    auto fcn() const
    {
        return [](auto x) { return std::tanh(x); };
    }
};

struct sigmoid_op
{
    std::string name() const { return "cpu::sigmoid"; }
    auto fcn() const
    {
        return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
    }
};

struct neg_op
{
    std::string name() const { return "cpu::neg"; }
    auto fcn() const
    {
        return [](auto x) { return -x; };
    }
};

struct relu_op
{
    std::string name() const { return "cpu::relu"; }
    auto fcn() const
    {
        return [](auto x) { return x > 0 ? x : 0; };
    }
};

template <typename Op>
struct cpu_unary
{
    Op op;
    std::string name() const { return op.name(); }
    shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        result.visit([&](auto output) {
            args[0].visit([&](auto input) {
                std::transform(input.begin(), input.end(), output.begin(), op.fcn());
            });
        });
        return result;
    }
};

struct softmax2d
{
    std::string name() const { return "cpu::softmax2d"; }
    shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0])([&](auto output, auto input) {
            using value_type = typename decltype(input)::value_type;
            auto nb          = input.get_shape().lens()[0];
            auto nc          = input.get_shape().lens()[1];
            auto nh          = input.get_shape().lens()[2];
            auto nw          = input.get_shape().lens()[3];
            dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
                value_type cmax = std::numeric_limits<value_type>::lowest();
                for(int c = 0; c < nc; c++)
                {
                    cmax = std::max(cmax, input(b, c, i, j));
                }
                for(int c = 0; c < nc; c++)
                {
                    output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
                }
                value_type sum = value_type(0);
                for(int c = 0; c < nc; c++)
                {
                    sum += output(b, c, i, j);
                }
                for(int c = 0; c < nc; c++)
                {
                    output(b, c, i, j) = output(b, c, i, j) / sum;
                }
            });
        });
        return result;
    }
};

struct add_op
{
    std::string name() const { return "add"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x + y; };
    }
};

struct sub_op
{
    std::string name() const { return "sub"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x - y; };
    }
};

struct mul_op
{
    std::string name() const { return "mul"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x * y; };
    }
};

struct div_op
{
    std::string name() const { return "div"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x / y; };
    }
};

template <typename Op>
struct cpu_binary
{
    Op op;
    std::string name() const { return op.name(); }
    shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
            if(input1.get_shape().packed() and input2.get_shape().packed())
            {
                std::transform(
                    input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
            }
            else
            {
                shape_for_each(output.get_shape(), [&](const auto& idx) {
                    output(idx.begin(), idx.end()) =
                        op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
                });
            }
        });
        return result;
    }
};

struct cpu_apply
{
    program* prog;
    std::unordered_map<std::string, std::function<void(instruction_ref)>> apply_map{};

    template <class T>
    auto simple_op()
    {
        return [this](instruction_ref ins) { apply_simple_op<T>(ins); };
    }

    template <class T, class Op>
    auto extend_op()
    {
        return [this](instruction_ref ins) { apply_extend_op<T, Op>(ins); };
    }

    void init()
    {
        apply_map["convolution"] = extend_op<cpu_convolution, convolution>();
        apply_map["gemm"]        = extend_op<cpu_gemm, gemm>();
        apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, batch_norm_inference>();
        apply_map["reshape"]    = extend_op<cpu_reshape, reshape>();
        apply_map["contiguous"] = extend_op<cpu_contiguous, contiguous>();
        apply_map["transpose"]  = extend_op<cpu_transpose, transpose>();
        apply_map["identity"]   = simple_op<cpu_unary<identity_op>>();
        apply_map["tanh"]       = simple_op<cpu_unary<tanh_op>>();
        apply_map["sigmoid"]    = simple_op<cpu_unary<sigmoid_op>>();
        apply_map["exp"]        = simple_op<cpu_unary<exp_op>>();
        apply_map["neg"]        = simple_op<cpu_unary<neg_op>>();
        apply_map["sin"]        = simple_op<cpu_unary<sin_op>>();
        apply_map["cos"]        = simple_op<cpu_unary<cos_op>>();
        apply_map["tan"]        = simple_op<cpu_unary<tan_op>>();
        apply_map["add"]        = simple_op<cpu_binary<add_op>>();
        apply_map["sub"]        = simple_op<cpu_binary<sub_op>>();
        apply_map["mul"]        = simple_op<cpu_binary<mul_op>>();
        apply_map["div"]        = simple_op<cpu_binary<div_op>>();
        apply_map["softmax"]    = simple_op<softmax2d>();
    }

    void apply()
    {
        init();
        for(auto it : iterator_for(*prog))
        {
            if(it->op.name() == "activation")
            {
                apply_activation(it);
            }
            else if(it->op.name() == "pooling")
            {
                apply_pooling(it);
            }
            else if(apply_map.count(it->op.name()) > 0)
            {
                apply_map.at(it->op.name())(it);
            }
        }
    }

    template <class T>
    void apply_simple_op(instruction_ref ins)
    {
        prog->replace_instruction(ins, T{}, ins->arguments);
    }

    template <class T, class Op>
    void apply_extend_op(instruction_ref ins)
    {
        auto&& op = any_cast<Op>(ins->op);
        prog->replace_instruction(ins, T{op}, ins->arguments);
    }

    void apply_activation(instruction_ref ins)
    {
        auto&& op = any_cast<activation>(ins->op);
        if(op.mode == "relu")
            prog->replace_instruction(ins, cpu_unary<relu_op>{}, ins->arguments);
    }

    void apply_pooling(instruction_ref ins)
    {
        auto&& op = any_cast<pooling>(ins->op);
        if(op.mode == "max")
            prog->replace_instruction(ins, cpu_pooling<max_pool>{op}, ins->arguments);
        else if(op.mode == "average")
            prog->replace_instruction(ins, cpu_pooling<avg_pool>{op}, ins->arguments);
    }
};

void cpu_lowering::apply(program& p) const { cpu_apply{&p}.apply(); }

} // namespace cpu
} // namespace migraph
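A quick standalone spot-check of the inference formula implemented by cpu_batch_norm_inference above (all values invented for illustration):

#include <cmath>
#include <cstdio>

int main()
{
    // y = gamma * (x - mean) / sqrt(variance + epsilon) + beta
    double x = 2.0, mean = 1.0, variance = 4.0;
    double gamma = 0.5, beta = 3.0, epsilon = 1.0e-6;
    double y = gamma * (x - mean) / std::sqrt(variance + epsilon) + beta;
    std::printf("%f\n", y); // ~3.25: (2-1)/2 scaled by 0.5, shifted by 3
}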
src/targets/cpu/cpu_target.cpp (View file @ 76f68df4)

 #include <migraph/cpu/cpu_target.hpp>
 #include <migraph/instruction.hpp>
 #include <migraph/dfor.hpp>
 #include <migraph/operators.hpp>
 #include <migraph/shape_for_each.hpp>
 #include <migraph/iterator_for.hpp>
+#include <migraph/cpu/cpu_lowering.hpp>

 namespace migraph {
 namespace cpu {

-[543 lines removed: the zero() helper, cpu_convolution, max_pool, avg_pool,
- cpu_pooling, cpu_transpose, cpu_contiguous, cpu_reshape, cpu_gemm, the unary
- and binary op functors, cpu_unary, cpu_binary, softmax2d, cpu_apply, and the
- cpu_pass struct — the same definitions, verbatim, that now live in
- cpu_lowering.cpp above]

 std::string cpu_target::name() const { return "cpu"; }

-std::vector<pass> cpu_target::get_passes(context&) const { return {cpu_pass{}}; }
+std::vector<pass> cpu_target::get_passes(context&) const { return {cpu_lowering{}}; }

 } // namespace cpu
...
src/targets/cpu/include/migraph/cpu/cpu_lowering.hpp (new file, mode 0 → 100644; View file @ 76f68df4)

#ifndef MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP
#define MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP

#include <migraph/program.hpp>

namespace migraph {
namespace cpu {

struct cpu_lowering
{
    std::string name() const { return "cpu::lowering"; }
    void apply(program& p) const;
};

} // namespace cpu
} // namespace migraph

#endif
src/targets/miopen/CMakeLists.txt (View file @ 76f68df4)

@@ -2,6 +2,10 @@
 list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip /opt/rocm/hcc)
 find_package(miopen)

+# rocblas
+find_package(rocblas REQUIRED PATHS /opt/rocm)
+message(STATUS "Build with rocblas")
+
 if(NOT TARGET MIOpen)
     message(SEND_ERROR "Cant find miopen")
 endif()
@@ -15,8 +19,11 @@ target_include_directories(migraph_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRE
 add_library(migraph_miopen
     hip.cpp
-    miopen_target.cpp
+    target.cpp
+    lowering.cpp
+    write_literals.cpp
+    rocblas.cpp
 )
 rocm_clang_tidy_check(migraph_miopen)
-target_link_libraries(migraph_miopen migraph MIOpen migraph_device)
+target_link_libraries(migraph_miopen migraph MIOpen migraph_device roc::rocblas)
 target_include_directories(migraph_miopen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
src/targets/miopen/include/migraph/miopen/context.hpp (new file, mode 0 → 100644; View file @ 76f68df4)

#ifndef MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP
#define MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP

#include <migraph/miopen/miopen.hpp>
#include <migraph/miopen/rocblas.hpp>

namespace migraph {
namespace miopen {

struct context
{
    shared<miopen_handle> handle;
    shared<rocblas_handle_ptr> rbhandle;
};

} // namespace miopen
} // namespace migraph

#endif
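The new context struct bundles the MIOpen handle with a handle for the rocBLAS dependency added to the build above, so a single miopen::context carries everything the target's ops need; presumably this is the concrete type that the auto_any_cast mechanism introduced earlier in this diff recovers from the generic context& on the miopen target.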