".github/vscode:/vscode.git/clone" did not exist on "7f875f1293aa4dab646e312b1e67edda372102c7"
Commit fb75dfaf authored by Paul

Only use no-cache on jenkins

parents e596eec2 f0604d78
@@ -5,7 +5,23 @@ add_library(onnx-proto STATIC ${PROTO_SRCS})
target_include_directories(onnx-proto SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${PROTOBUF_INCLUDE_DIR})
target_compile_options(onnx-proto PRIVATE -w)
target_link_libraries(onnx-proto PRIVATE ${PROTOBUF_LIBRARY})
set_target_properties(onnx-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
add_library(migraph_onnx onnx.cpp)
rocm_clang_tidy_check(migraph_onnx)
target_link_libraries(migraph_onnx PRIVATE onnx-proto)
target_link_libraries(migraph_onnx PUBLIC migraph)
add_executable(read_onnx read_onnx.cpp)
rocm_clang_tidy_check(read_onnx)
target_link_libraries(read_onnx onnx-proto rtg)
target_link_libraries(read_onnx migraph_onnx)
add_executable(mnist mnist.cpp)
rocm_clang_tidy_check(mnist)
target_link_libraries(mnist migraph_cpu migraph_onnx)
if(MIGRAPH_ENABLE_GPU)
add_executable(verify_onnx verify_onnx.cpp)
rocm_clang_tidy_check(verify_onnx)
target_link_libraries(verify_onnx migraph_onnx migraph_cpu migraph_gpu)
endif()
#include <cstdio>
#include <string>
#include <vector>
#include <fstream>
#include <iostream>
#include <numeric>
#include <algorithm>
#include <cmath>
#include <stdexcept>
#include <migraph/onnx.hpp>
#include <migraph/cpu/cpu_target.hpp>
#include <migraph/generate.hpp>
auto reverse_int(unsigned int i)
{
unsigned char c1, c2, c3, c4;
c1 = i & 255u;
c2 = (i >> 8u) & 255u;
c3 = (i >> 16u) & 255u;
c4 = (i >> 24u) & 255u;
return (static_cast<unsigned int>(c1) << 24u) + (static_cast<unsigned int>(c2) << 16u) +
(static_cast<unsigned int>(c3) << 8u) + c4;
}
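// The MNIST IDX files store their 32-bit header fields (magic number, image
// count, rows, columns) in big-endian order, so on a little-endian host each
// field read below has to be byte-swapped with reverse_int before it can be
// compared against the expected magic numbers 2051 (images) and 2049 (labels).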
std::vector<float> read_mnist_images(std::string full_path, int& number_of_images, int& image_size)
{
using uchar = unsigned char;
std::ifstream file(full_path, std::ios::binary);
if(file.is_open())
{
int magic_number = 0, n_rows = 0, n_cols = 0;
file.read(reinterpret_cast<char*>(&magic_number), sizeof(magic_number));
magic_number = reverse_int(magic_number);
if(magic_number != 2051)
throw std::runtime_error("Invalid MNIST image file!");
file.read(reinterpret_cast<char*>(&number_of_images), sizeof(number_of_images));
number_of_images = reverse_int(number_of_images);
file.read(reinterpret_cast<char*>(&n_rows), sizeof(n_rows));
n_rows = reverse_int(n_rows);
file.read(reinterpret_cast<char*>(&n_cols), sizeof(n_cols));
n_cols = reverse_int(n_cols);
image_size = n_rows * n_cols;
std::vector<float> result(number_of_images * image_size);
for(int i = 0; i < number_of_images; i++)
{
for(int j = 0; j < image_size; j++)
{
uchar tmp;
file.read(reinterpret_cast<char*>(&tmp), 1);
result[i * image_size + j] = tmp / 255.0;
}
}
return result;
}
else
{
throw std::runtime_error("Cannot open file `" + full_path + "`!");
}
}
std::vector<int32_t> read_mnist_labels(std::string full_path, int& number_of_labels)
{
using uchar = unsigned char;
std::ifstream file(full_path, std::ios::binary);
if(file.is_open())
{
int magic_number = 0;
file.read(reinterpret_cast<char*>(&magic_number), sizeof(magic_number));
magic_number = reverse_int(magic_number);
if(magic_number != 2049)
throw std::runtime_error("Invalid MNIST label file!");
file.read(reinterpret_cast<char*>(&number_of_labels), sizeof(number_of_labels));
number_of_labels = reverse_int(number_of_labels);
std::vector<int32_t> result(number_of_labels);
for(int i = 0; i < number_of_labels; i++)
{
uchar tmp;
file.read(reinterpret_cast<char*>(&tmp), 1);
result[i] = tmp;
}
return result;
}
else
{
throw std::runtime_error("Unable to open file `" + full_path + "`!");
}
}
std::vector<float> softmax(std::vector<float> p)
{
size_t n = p.size();
std::vector<float> result(n);
std::transform(p.begin(), p.end(), result.begin(), [](auto x) { return std::exp(x); });
float s = std::accumulate(result.begin(), result.end(), 0.0f, std::plus<float>());
std::transform(result.begin(), result.end(), result.begin(), [=](auto x) { return x / s; });
return result;
}
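// Note: this softmax is a minimal demo version and does not subtract the
// largest logit before exponentiating. For the small MNIST logits produced
// here that is fine; a numerically safer variant would first shift every
// entry by *std::max_element(p.begin(), p.end()) so std::exp cannot overflow.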
int main(int argc, char const* argv[])
{
if(argc > 3)
{
std::string datafile = argv[2];
std::string labelfile = argv[3];
int nimages = -1;
int image_size = -1;
int nlabels = -1;
std::vector<float> input = read_mnist_images(datafile, nimages, image_size);
std::vector<int32_t> labels = read_mnist_labels(labelfile, nlabels);
std::string file = argv[1];
auto prog = migraph::parse_onnx(file);
prog.compile(migraph::cpu::cpu_target{});
auto s = migraph::shape{migraph::shape::float_type, {1, 1, 28, 28}};
std::cout << s << std::endl;
auto ptr = input.data();
for(int i = 0; i < 20; i++)
{
std::cout << "label: " << labels[i] << " ----> ";
auto input3 = migraph::argument{s, &ptr[784 * i]};
auto result = prog.eval({{"Input3", input3}});
std::vector<float> logits;
result.visit([&](auto output) { logits.assign(output.begin(), output.end()); });
std::vector<float> probs = softmax(logits);
for(auto x : probs)
std::cout << x << " ";
std::cout << std::endl;
}
std::cout << std::endl;
}
}
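// Expected invocation, based on the argv indices used above (paths are
// placeholders): mnist <model.onnx> <mnist-image-file> <mnist-label-file>.
// The program prints each of the first 20 labels followed by the softmax
// probabilities the model assigns to the ten digit classes.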
#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <onnx.pb.h>
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <functional>
#include <array>
#include <vector>
#include <migraph/fallthrough.hpp>
#include <migraph/program.hpp>
#include <migraph/operators.hpp>
#include <migraph/ranges.hpp>
#include <migraph/instruction.hpp>
namespace migraph {
struct unknown
{
std::string op;
std::string name() const { return "unknown:" + op; }
shape compute_shape(std::vector<shape> input) const
{
if(input.empty())
return {};
else
return input.front();
}
argument compute(context&, shape, std::vector<argument>) const
{
MIGRAPH_THROW("not computable");
}
friend std::ostream& operator<<(std::ostream& os, const unknown& x)
{
os << x.name();
return os;
}
};
struct onnx_parser
{
using attribute_map = std::unordered_map<std::string, onnx::AttributeProto>;
using node_map = std::unordered_map<std::string, onnx::NodeProto>;
using op_func = std::function<instruction_ref(attribute_map, std::vector<instruction_ref>)>;
node_map nodes;
std::unordered_map<std::string, instruction_ref> instructions;
program prog = program();
std::unordered_map<std::string, op_func> ops;
onnx_parser()
{
add_generic_op("Add", add{});
add_generic_op("Div", div{});
add_generic_op("MatMul", gemm{});
add_generic_op("Mul", mul{});
add_generic_op("Relu", activation{"relu"});
add_generic_op("Sub", sub{});
add_mem_op("Constant", &onnx_parser::parse_constant);
add_mem_op("Conv", &onnx_parser::parse_conv);
add_mem_op("MaxPool", &onnx_parser::parse_pooling);
add_mem_op("Reshape", &onnx_parser::parse_reshape);
add_mem_op("BatchNormalization", &onnx_parser::parse_batchnorm);
}
template <class F>
void add_op(std::string name, F f)
{
ops.emplace(name, f);
}
template <class F>
void add_mem_op(std::string name, F f)
{
ops.emplace(name, [=](auto&&... xs) {
return std::mem_fn(f)(*this, name, std::forward<decltype(xs)>(xs)...);
});
}
template <class T>
void add_generic_op(std::string name, T x)
{
ops.emplace(name, [this, x](attribute_map attributes, std::vector<instruction_ref> args) {
if(args.size() == 2 and contains(attributes, "broadcast"))
{
uint64_t broadcasted = parse_value(attributes.at("broadcast")).at<uint64_t>();
if(broadcasted != 0)
{
uint64_t axis = (contains(attributes, "axis"))
? parse_value(attributes.at("axis")).at<uint64_t>()
: 0;
auto l = prog.add_instruction(broadcast{axis}, args);
return prog.add_instruction(x, args[0], l);
}
}
return prog.add_instruction(x, args);
});
}
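// Example of what add_generic_op handles: older ONNX opsets let a binary node
// such as Add carry broadcast=1 and an optional axis attribute, e.g. adding a
// per-channel bias B[64] to X[1,64,56,56] with axis=1 (shapes illustrative).
// In that case the second argument is first expanded with a broadcast
// instruction along the given axis before the elementwise op is applied;
// without the attribute the op is added over the arguments unchanged.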
instruction_ref
parse_conv(std::string, attribute_map attributes, std::vector<instruction_ref> args)
{
convolution op;
if(contains(attributes, "pads"))
{
copy(attributes["pads"].ints(), op.padding.begin());
}
if(contains(attributes, "strides"))
{
copy(attributes["strides"].ints(), op.stride.begin());
}
if(contains(attributes, "dilations"))
{
copy(attributes["dilations"].ints(), op.dilation.begin());
}
if(args.size() == 3)
{
uint64_t axis = 1;
auto l1 = prog.add_instruction(op, args[0], args[1]);
auto l2 = prog.add_instruction(broadcast{axis}, l1, args[2]);
return prog.add_instruction(add{}, l1, l2);
}
return prog.add_instruction(op, args);
}
instruction_ref
parse_pooling(std::string, attribute_map attributes, std::vector<instruction_ref> args)
{
pooling op{"max"};
if(contains(attributes, "pads"))
{
copy(attributes["pads"].ints(), op.padding.begin());
}
if(contains(attributes, "strides"))
{
copy(attributes["strides"].ints(), op.stride.begin());
}
if(contains(attributes, "kernel_shape"))
{
copy(attributes["kernel_shape"].ints(), op.lengths.begin());
}
return prog.add_instruction(op, args);
}
instruction_ref
parse_reshape(std::string, attribute_map attributes, std::vector<instruction_ref> args)
{
reshape op;
if(args.size() == 1)
{
literal s = parse_value(attributes.at("shape"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.dims)); });
}
if(args.size() == 2)
{
literal s = args[1]->lit;
s.visit([&](auto v) { copy(v, std::back_inserter(op.dims)); });
}
return prog.add_instruction(op, args[0]);
}
instruction_ref
parse_constant(std::string, attribute_map attributes, std::vector<instruction_ref>)
{
literal v = parse_value(attributes.at("value"));
return prog.add_literal(v);
}
instruction_ref
parse_batchnorm(std::string, attribute_map attributes, std::vector<instruction_ref> args)
{
float epsilon = 1e-5f;
float momentum = 0.9f;
batch_norm_inference::bn_infer_mode_t bn_mode = batch_norm_inference::spatial;
bool is_test = false;
if(contains(attributes, "epsilon"))
{
epsilon = parse_value(attributes.at("epsilon")).at<float>();
}
if(contains(attributes, "momentum"))
{
epsilon = parse_value(attributes.at("momentum")).at<float>();
}
if(contains(attributes, "is_test"))
{
is_test = parse_value(attributes.at("is_test")).at<uint64_t>() > 0;
}
if(contains(attributes, "spatial"))
{
bn_mode = (parse_value(attributes.at("spatial")).at<uint64_t>() > 0)
? batch_norm_inference::spatial
: batch_norm_inference::per_activation;
}
batch_norm_inference op{epsilon, momentum, bn_mode, is_test};
return prog.add_instruction(op, args);
}
void parse_from(std::istream& is)
{
onnx::ModelProto model;
if(model.ParseFromIstream(&is))
{
if(model.has_graph())
{
this->parse_graph(model.graph());
}
}
else
{
throw std::runtime_error("Failed reading");
}
}
void parse_graph(const onnx::GraphProto& graph)
{
nodes = get_nodes(graph);
for(auto&& input : graph.input())
{
const std::string& name = input.name();
// TODO: Get shape of input parameter
shape s = parse_type(input.type());
instructions[name] = prog.add_parameter(name, s);
}
for(auto&& p : nodes)
{
this->parse_node(get_name(p.second));
}
}
void parse_node(std::string name)
{
if(name.empty())
MIGRAPH_THROW("Onnx node must have a name");
if(instructions.count(name) == 0)
{
auto&& node = nodes.at(name);
std::vector<instruction_ref> args;
for(auto&& input : node.input())
{
if(nodes.count(input) > 0)
{
auto&& iname = get_name(nodes.at(input));
assert(name != iname);
this->parse_node(iname);
args.push_back(instructions.at(iname));
}
else
{
args.push_back(instructions.at(input));
}
}
if(ops.count(node.op_type()) == 0)
{
instructions[name] = prog.add_instruction(unknown{node.op_type()}, args);
}
else
{
instructions[name] = ops[node.op_type()](get_attributes(node), args);
}
}
}
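// parse_node resolves a node's inputs recursively: an input that names
// another node (or one of its outputs) is parsed first, depth-first, while
// anything else is expected to already be in the instructions map as a graph
// input parameter. Nodes are therefore appended to the program in topological
// order, and unsupported op types are kept as placeholder "unknown" ops.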
static attribute_map get_attributes(const onnx::NodeProto& node)
{
std::unordered_map<std::string, onnx::AttributeProto> result;
for(auto&& attr : node.attribute())
{
result[attr.name()] = attr;
}
return result;
}
static std::string get_name(const onnx::NodeProto& node)
{
if(node.name().empty())
{
std::string generated = "migraph_unnamed_node";
for(auto&& output : node.output())
{
generated += "_" + output;
}
return generated;
}
return node.name();
}
static node_map get_nodes(const onnx::GraphProto& graph)
{
std::unordered_map<std::string, onnx::NodeProto> result;
for(auto&& node : graph.node())
{
result[get_name(node)] = node;
for(auto&& output : node.output())
{
result[output] = node;
}
}
return result;
}
template <class T>
static literal from_repeated(shape::type_t t, const T& r)
{
std::size_t size = r.size();
return literal{{t, {size}}, r.begin(), r.end()};
}
static literal parse_value(const onnx::AttributeProto& attr)
{
switch(attr.type())
{
case onnx::AttributeProto::UNDEFINED: return {};
case onnx::AttributeProto::FLOAT: return literal{attr.f()};
case onnx::AttributeProto::INT: return literal{attr.i()};
case onnx::AttributeProto::STRING: return {};
case onnx::AttributeProto::TENSOR: return parse_tensor(attr.t());
case onnx::AttributeProto::GRAPH: return {};
case onnx::AttributeProto::FLOATS: return from_repeated(shape::float_type, attr.floats());
case onnx::AttributeProto::INTS: return from_repeated(shape::int64_type, attr.ints());
case onnx::AttributeProto::STRINGS: return {};
case onnx::AttributeProto::TENSORS: return {};
case onnx::AttributeProto::GRAPHS: return {};
}
MIGRAPH_THROW("Invalid attribute type");
}
static literal parse_tensor(const onnx::TensorProto& t)
{
std::vector<std::size_t> dims(t.dims().begin(), t.dims().end());
if(t.has_raw_data())
{
const std::string& s = t.raw_data();
switch(t.data_type())
{
case onnx::TensorProto::UNDEFINED: throw std::runtime_error("");
case onnx::TensorProto::FLOAT: return literal{{shape::float_type, dims}, s.data()};
case onnx::TensorProto::UINT8: throw std::runtime_error("");
case onnx::TensorProto::INT8: return literal{{shape::int32_type, dims}, s.data()};
case onnx::TensorProto::UINT16: return literal{{shape::int32_type, dims}, s.data()};
case onnx::TensorProto::INT16: return literal{{shape::int32_type, dims}, s.data()};
case onnx::TensorProto::INT32: return literal{{shape::int32_type, dims}, s.data()};
case onnx::TensorProto::INT64: return literal{{shape::int64_type, dims}, s.data()};
case onnx::TensorProto::STRING: throw std::runtime_error("");
case onnx::TensorProto::BOOL: return literal{{shape::int32_type, dims}, s.data()};
case onnx::TensorProto::FLOAT16: throw std::runtime_error("");
case onnx::TensorProto::DOUBLE: return literal{{shape::double_type, dims}, s.data()};
case onnx::TensorProto::UINT32: throw std::runtime_error("");
case onnx::TensorProto::UINT64: throw std::runtime_error("");
case onnx::TensorProto::COMPLEX64: throw std::runtime_error("");
case onnx::TensorProto::COMPLEX128: throw std::runtime_error("");
}
MIGRAPH_THROW("Invalid tensor type");
}
switch(t.data_type())
{
case onnx::TensorProto::UNDEFINED: throw std::runtime_error("");
case onnx::TensorProto::FLOAT:
return literal{{shape::float_type, dims}, t.float_data().begin(), t.float_data().end()};
case onnx::TensorProto::UINT8: throw std::runtime_error("");
case onnx::TensorProto::INT8:
return literal{{shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::UINT16:
return literal{{shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::INT16:
return literal{{shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::INT32:
return literal{{shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::INT64:
return literal{{shape::int64_type, dims}, t.int64_data().begin(), t.int64_data().end()};
case onnx::TensorProto::STRING: throw std::runtime_error("");
case onnx::TensorProto::BOOL:
return literal{{shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::FLOAT16: throw std::runtime_error("");
case onnx::TensorProto::DOUBLE:
return literal{
{shape::double_type, dims}, t.double_data().begin(), t.double_data().end()};
case onnx::TensorProto::UINT32: throw std::runtime_error("");
case onnx::TensorProto::UINT64: throw std::runtime_error("");
case onnx::TensorProto::COMPLEX64: throw std::runtime_error("");
case onnx::TensorProto::COMPLEX128: throw std::runtime_error("");
}
MIGRAPH_THROW("Invalid tensor type");
}
static shape parse_type(const onnx::TypeProto& t)
{
shape::type_t shape_type{};
switch(t.tensor_type().elem_type())
{
case onnx::TensorProto::UNDEFINED:
break; // throw std::runtime_error("Unsupported type UNDEFINED");
case onnx::TensorProto::FLOAT: shape_type = shape::float_type; break;
case onnx::TensorProto::UINT8:
break; // throw std::runtime_error("Unsupported type UINT8");
case onnx::TensorProto::INT8: shape_type = shape::int8_type; break;
case onnx::TensorProto::UINT16: shape_type = shape::uint16_type; break;
case onnx::TensorProto::INT16: shape_type = shape::int16_type; break;
case onnx::TensorProto::INT32: shape_type = shape::int32_type; break;
case onnx::TensorProto::INT64: shape_type = shape::int64_type; break;
case onnx::TensorProto::STRING:
break; // throw std::runtime_error("Unsupported type STRING");
case onnx::TensorProto::BOOL:
break; // throw std::runtime_error("Unsupported type BOOL");
case onnx::TensorProto::FLOAT16:
break; // throw std::runtime_error("Unsupported type FLOAT16");
case onnx::TensorProto::DOUBLE: shape_type = shape::double_type; break;
case onnx::TensorProto::UINT32: shape_type = shape::uint32_type; break;
case onnx::TensorProto::UINT64: shape_type = shape::uint64_type; break;
case onnx::TensorProto::COMPLEX64:
break; // throw std::runtime_error("Unsupported type COMPLEX64");
case onnx::TensorProto::COMPLEX128:
break; // throw std::runtime_error("Unsupported type COMPLEX128");
}
std::vector<std::size_t> dims;
// TODO: Use std::transform
for(auto&& d : t.tensor_type().shape().dim())
{
dims.push_back(d.dim_value());
}
return {shape_type, dims};
}
};
program parse_onnx(const std::string& name)
{
std::fstream input(name.c_str(), std::ios::in | std::ios::binary);
onnx_parser parser;
#ifndef NDEBUG
// Log the program when it can't be parsed
try
{
parser.parse_from(input);
}
catch(...)
{
std::cerr << parser.prog << std::endl;
throw;
}
#else
parser.parse_from(input);
#endif
return std::move(parser.prog);
}
} // namespace migraph
#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <onnx.pb.h>
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <functional>
#include <rtg/fallthrough.hpp>
#include <rtg/program.hpp>
#include <rtg/operators.hpp>
struct unknown
{
std::string op;
std::string name() const { return "unknown:" + op; }
rtg::shape compute_shape(std::vector<rtg::shape> input) const
{
if(input.empty())
return {};
else
return input.front();
}
rtg::argument compute(rtg::shape, std::vector<rtg::argument>) const
{
RTG_THROW("not computable");
}
friend std::ostream& operator<<(std::ostream& os, const unknown& x)
{
os << x.name();
return os;
}
};
template <class C, class T>
bool contains(C&& c, T&& x)
{
return c.find(x) != c.end();
}
template <class Range, class Iterator>
void copy(Range&& r, Iterator it)
{
std::copy(r.begin(), r.end(), it);
}
struct onnx_parser
{
using attribute_map = std::unordered_map<std::string, onnx::AttributeProto>;
using node_map = std::unordered_map<std::string, onnx::NodeProto>;
using op_func =
std::function<rtg::instruction_ref(attribute_map, std::vector<rtg::instruction_ref>)>;
node_map nodes;
std::unordered_map<std::string, rtg::instruction_ref> instructions;
rtg::program prog = rtg::program();
std::unordered_map<std::string, op_func> ops;
onnx_parser()
{
add_op("Conv", [this](attribute_map attributes, std::vector<rtg::instruction_ref> args) {
rtg::convolution op;
if(contains(attributes, "pads"))
{
copy(attributes["pads"].ints(), op.padding.begin());
}
if(contains(attributes, "strides"))
{
copy(attributes["strides"].ints(), op.stride.begin());
}
if(contains(attributes, "dilations"))
{
copy(attributes["dilations"].ints(), op.dilation.begin());
}
return prog.add_instruction(op, args);
});
add_op("MaxPool", [this](attribute_map attributes, std::vector<rtg::instruction_ref> args) {
rtg::pooling op{"max"};
// for(auto&& p:attributes) std::cout << p.first << std::endl;
if(contains(attributes, "pads"))
{
copy(attributes["pads"].ints(), op.padding.begin());
}
if(contains(attributes, "strides"))
{
copy(attributes["strides"].ints(), op.stride.begin());
}
if(contains(attributes, "kernel_shape"))
{
copy(attributes["kernel_shape"].ints(), op.lengths.begin());
}
return prog.add_instruction(op, args);
});
add_op("Relu", [this](attribute_map, std::vector<rtg::instruction_ref> args) {
return prog.add_instruction(rtg::activation{"relu"}, args);
});
add_op("Reshape", [this](attribute_map attributes, std::vector<rtg::instruction_ref> args) {
rtg::reshape op;
rtg::literal s = parse_value(attributes.at("shape"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.dims)); });
return prog.add_instruction(op, args);
});
add_op("Constant", [this](attribute_map attributes, std::vector<rtg::instruction_ref>) {
rtg::literal v = parse_value(attributes.at("value"));
return prog.add_literal(v);
});
}
template <class F>
void add_op(std::string name, F f)
{
ops.emplace(name, f);
}
void parse_from(std::istream& is)
{
onnx::ModelProto model;
if(model.ParseFromIstream(&is))
{
if(model.has_graph())
{
this->parse_graph(model.graph());
}
}
else
{
throw std::runtime_error("Failed reading");
}
}
void parse_graph(const onnx::GraphProto& graph)
{
nodes = get_nodes(graph);
for(auto&& input : graph.input())
{
const std::string& name = input.name();
// TODO: Get shape of input parameter
rtg::shape s = parse_type(input.type());
instructions[name] = prog.add_parameter(name, s);
}
for(auto&& p : nodes)
{
this->parse_node(p.second.name());
}
}
void parse_node(std::string name)
{
if(instructions.count(name) == 0)
{
auto&& node = nodes.at(name);
std::vector<rtg::instruction_ref> args;
for(auto&& input : node.input())
{
if(nodes.count(input) > 0)
{
auto&& iname = nodes.at(input).name();
this->parse_node(iname);
args.push_back(instructions.at(iname));
}
else
{
args.push_back(instructions.at(input));
}
}
if(ops.count(node.op_type()) == 0)
{
instructions[name] = prog.add_instruction(unknown{node.op_type()}, args);
}
else
{
instructions[name] = ops[node.op_type()](get_attributes(node), args);
}
}
}
static attribute_map get_attributes(const onnx::NodeProto& node)
{
std::unordered_map<std::string, onnx::AttributeProto> result;
for(auto&& attr : node.attribute())
{
result[attr.name()] = attr;
}
return result;
}
static node_map get_nodes(const onnx::GraphProto& graph)
{
std::unordered_map<std::string, onnx::NodeProto> result;
for(auto&& node : graph.node())
{
result[node.name()] = node;
for(auto&& output : node.output())
{
result[output] = node;
}
}
return result;
}
static rtg::literal parse_value(const onnx::AttributeProto& attr)
{
switch(attr.type())
{
case onnx::AttributeProto::UNDEFINED: return {};
case onnx::AttributeProto::FLOAT: return rtg::literal{attr.f()};
case onnx::AttributeProto::INT: return rtg::literal{attr.i()};
case onnx::AttributeProto::STRING: return {};
case onnx::AttributeProto::TENSOR: return parse_tensor(attr.t());
case onnx::AttributeProto::GRAPH: return {};
case onnx::AttributeProto::FLOATS:
return rtg::literal{rtg::shape::float_type, attr.floats().begin(), attr.floats().end()};
case onnx::AttributeProto::INTS:
return rtg::literal{rtg::shape::int32_type, attr.ints().begin(), attr.ints().end()};
case onnx::AttributeProto::STRINGS: return {};
case onnx::AttributeProto::TENSORS: return {};
case onnx::AttributeProto::GRAPHS: return {};
}
RTG_THROW("Invalid attribute type");
}
static rtg::literal parse_tensor(const onnx::TensorProto& t)
{
std::vector<std::size_t> dims(t.dims().begin(), t.dims().end());
switch(t.data_type())
{
case onnx::TensorProto::UNDEFINED: throw std::runtime_error("");
case onnx::TensorProto::FLOAT:
return rtg::literal{
{rtg::shape::float_type, dims}, t.float_data().begin(), t.float_data().end()};
case onnx::TensorProto::UINT8: throw std::runtime_error("");
case onnx::TensorProto::INT8:
return rtg::literal{
{rtg::shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::UINT16:
return rtg::literal{
{rtg::shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::INT16:
return rtg::literal{
{rtg::shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::INT32:
return rtg::literal{
{rtg::shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::INT64:
return rtg::literal{
{rtg::shape::int64_type, dims}, t.int64_data().begin(), t.int64_data().end()};
case onnx::TensorProto::STRING: throw std::runtime_error("");
case onnx::TensorProto::BOOL:
return rtg::literal{
{rtg::shape::int32_type, dims}, t.int32_data().begin(), t.int32_data().end()};
case onnx::TensorProto::FLOAT16: throw std::runtime_error("");
case onnx::TensorProto::DOUBLE:
return rtg::literal{
{rtg::shape::double_type, dims}, t.double_data().begin(), t.double_data().end()};
case onnx::TensorProto::UINT32: throw std::runtime_error("");
case onnx::TensorProto::UINT64: throw std::runtime_error("");
case onnx::TensorProto::COMPLEX64: throw std::runtime_error("");
case onnx::TensorProto::COMPLEX128: throw std::runtime_error("");
}
RTG_THROW("Invalid tensor type");
}
static rtg::shape parse_type(const onnx::TypeProto& t)
{
rtg::shape::type_t shape_type{};
switch(t.tensor_type().elem_type())
{
case onnx::TensorProto::UNDEFINED:
break; // throw std::runtime_error("Unsupported type UNDEFINED");
case onnx::TensorProto::FLOAT: shape_type = rtg::shape::float_type; break;
case onnx::TensorProto::UINT8:
break; // throw std::runtime_error("Unsupported type UINT8");
case onnx::TensorProto::INT8: shape_type = rtg::shape::int8_type; break;
case onnx::TensorProto::UINT16: shape_type = rtg::shape::uint16_type; break;
case onnx::TensorProto::INT16: shape_type = rtg::shape::int16_type; break;
case onnx::TensorProto::INT32: shape_type = rtg::shape::int32_type; break;
case onnx::TensorProto::INT64: shape_type = rtg::shape::int64_type; break;
case onnx::TensorProto::STRING:
break; // throw std::runtime_error("Unsupported type STRING");
case onnx::TensorProto::BOOL:
break; // throw std::runtime_error("Unsupported type BOOL");
case onnx::TensorProto::FLOAT16:
break; // throw std::runtime_error("Unsupported type FLOAT16");
case onnx::TensorProto::DOUBLE: shape_type = rtg::shape::double_type; break;
case onnx::TensorProto::UINT32: shape_type = rtg::shape::uint32_type; break;
case onnx::TensorProto::UINT64: shape_type = rtg::shape::uint64_type; break;
case onnx::TensorProto::COMPLEX64:
break; // throw std::runtime_error("Unsupported type COMPLEX64");
case onnx::TensorProto::COMPLEX128:
break; // throw std::runtime_error("Unsupported type COMPLEX128");
}
std::vector<std::size_t> dims;
// TODO: Use std::transform
for(auto&& d : t.tensor_type().shape().dim())
{
dims.push_back(d.dim_value());
}
return {shape_type, dims};
}
};
#include <migraph/onnx.hpp>
int main(int argc, char const* argv[])
{
if(argc > 1)
{
std::string file = argv[1];
std::fstream input(file.c_str(), std::ios::in | std::ios::binary);
onnx_parser parser;
try
{
parser.parse_from(input);
}
catch(...)
{
std::cout << parser.prog << std::endl;
throw;
}
std::cout << parser.prog << std::endl;
auto prog = migraph::parse_onnx(file);
std::cout << prog << std::endl;
}
}
#include <migraph/onnx.hpp>
#include <migraph/cpu/cpu_target.hpp>
#include <migraph/gpu/target.hpp>
#include <migraph/gpu/hip.hpp>
#include <migraph/generate.hpp>
#include <miopen/miopen.h>
#include <migraph/gpu/miopen.hpp>
migraph::argument run_cpu(std::string file)
{
auto p = migraph::parse_onnx(file);
p.compile(migraph::cpu::cpu_target{});
auto s = p.get_parameter_shape("Input3");
auto input3 = migraph::generate_argument(s);
auto out = p.eval({{"Input3", input3}});
std::cout << p << std::endl;
return out;
}
migraph::argument run_gpu(std::string file)
{
auto p = migraph::parse_onnx(file);
p.compile(migraph::cpu::cpu_target{});
auto s = p.get_parameter_shape("Input3");
auto input3 = migraph::gpu::to_gpu(migraph::generate_argument(s));
auto output = migraph::gpu::to_gpu(migraph::generate_argument(p.get_parameter_shape("output")));
auto handle = migraph::gpu::make_obj<migraph::gpu::miopen_handle>(&miopenCreate);
auto out = p.eval({{"Input3", input3}, {"output", output}});
std::cout << p << std::endl;
return migraph::gpu::from_gpu(out);
}
int main(int argc, char const* argv[])
{
if(argc > 1)
{
std::string file = argv[1];
auto x = run_cpu(file);
auto y = run_gpu(file);
if(x == y)
{
std::cout << "Passed" << std::endl;
}
else
{
std::cout << "Not equal" << std::endl;
std::cout << x << std::endl;
std::cout << y << std::endl;
}
}
}
#include <rtg/program.hpp>
#include <rtg/stringutils.hpp>
#include <rtg/instruction.hpp>
#include <migraph/program.hpp>
#include <migraph/stringutils.hpp>
#include <migraph/instruction.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
namespace rtg {
namespace migraph {
struct program_impl
{
// A list is used to keep references to an instruction stable
std::list<instruction> instructions;
context ctx;
};
const operation& get_operation(instruction_ref ins) { return ins->op; }
program::program() : impl(std::make_unique<program_impl>()) {}
program::program(program&&) noexcept = default;
@@ -28,11 +32,13 @@ program::insert_instruction(instruction_ref ins, operation op, std::vector<instr
assert(std::all_of(
args.begin(), args.end(), [&](instruction_ref x) { return has_instruction(x); }) &&
"Argument is not an exisiting instruction");
assert(not starts_with(op.name(), "@"));
// TODO: Use move
shape r = compute_shape(op, args);
auto result = impl->instructions.insert(ins, {op, r, args});
backreference(result);
assert(result->arguments == args);
assert(result->valid(begin()));
return result;
}
@@ -42,13 +48,66 @@ program::replace_instruction(instruction_ref ins, operation op, std::vector<inst
assert(std::all_of(
args.begin(), args.end(), [&](instruction_ref x) { return has_instruction(x); }) &&
"Argument is not an exisiting instruction");
assert(not starts_with(op.name(), "@"));
shape r = compute_shape(op, args);
ins->replace(op, r, args);
backreference(ins);
assert(ins->valid(begin()));
return ins;
}
instruction_ref program::replace_instruction(instruction_ref ins, instruction_ref rep)
{
assert(has_instruction(ins));
assert(has_instruction(rep));
assert(ins != rep);
// TODO: Should it be an error if the output is empty?
if(ins->output.empty())
{
return rep;
}
for(auto&& out : ins->output)
{
// TODO: Check for possible cycles
if(out != rep)
{
replace_argument(out, ins, rep);
}
assert(out->valid(begin()));
}
// Replacement should not be dead code unless its the last instruction
assert(!rep->output.empty() or rep == std::prev(end()));
assert(ins->valid(begin()));
assert(rep->valid(begin()));
return rep;
}
instruction_ref program::remove_instruction(instruction_ref ins)
{
assert(has_instruction(ins));
assert(ins->output.empty());
ins->clear_arguments();
return impl->instructions.erase(ins);
}
instruction_ref program::remove_instructions(instruction_ref first, instruction_ref last)
{
if(first == last)
return first;
// TODO: Check every element
assert(has_instruction(first));
std::for_each(first, last, [&](instruction& ins) { ins.clear_arguments(); });
assert(std::all_of(first, last, [&](instruction& ins) { return ins.output.empty(); }));
return impl->instructions.erase(first, last);
}
instruction_ref program::move_instruction(instruction_ref src, instruction_ref dst)
{
impl->instructions.splice(dst, impl->instructions, src);
return src;
}
instruction_ref program::add_literal(literal l)
{
impl->instructions.emplace_front(std::move(l));
@@ -67,7 +126,7 @@ instruction_ref program::add_parameter(std::string name, shape s)
return impl->instructions.begin();
}
shape program::get_parameter_shape(std::string name)
shape program::get_parameter_shape(std::string name) const
{
auto ins = std::find_if(
impl->instructions.begin(), impl->instructions.end(), [&](const instruction& x) {
@@ -86,6 +145,20 @@ shape program::get_parameter_shape(std::string name)
return {};
}
std::unordered_map<std::string, shape> program::get_parameter_shapes() const
{
std::unordered_map<std::string, shape> result;
for(auto&& ins : impl->instructions)
{
if(ins.op.name() == "@param")
{
auto&& name = any_cast<builtin::param>(ins.op).parameter;
result[name] = ins.result;
}
}
return result;
}
bool program::has_instruction(instruction_ref ins) const
{
return std::find_if(
@@ -94,27 +167,46 @@ bool program::has_instruction(instruction_ref ins) const
}) != impl->instructions.end();
}
instruction_ref program::begin() { return impl->instructions.begin(); }
instruction_ref program::end() { return impl->instructions.end(); }
instruction_ref program::begin() const { return impl->instructions.begin(); }
instruction_ref program::end() const { return impl->instructions.end(); }
shape program::get_shape() const { return impl->instructions.back().result; }
instruction_ref program::validate() const
{
return std::find_if(impl->instructions.begin(),
impl->instructions.end(),
[](const instruction& i) { return i.valid(); });
[&](const instruction& i) { return !i.valid(impl->instructions.begin()); });
}
void program::compile(const target& t)
{
assert(this->validate() != impl->instructions.end());
t.apply(*this);
if(this->validate() == impl->instructions.end())
RTG_THROW("Invalid program from compilation");
assert(this->validate() == impl->instructions.end());
this->impl->ctx = t.get_context();
for(auto&& p : t.get_passes(this->impl->ctx))
{
p.apply(*this);
#ifndef NDEBUG
auto invalid = this->validate();
if(invalid != impl->instructions.end())
{
auto index = std::distance(impl->instructions.begin(), invalid);
MIGRAPH_THROW(p.name() + " pass produces invalid program at instruction " +
std::to_string(index) + ": " + invalid->op.name());
}
#endif
}
auto invalid = this->validate();
if(invalid != impl->instructions.end())
{
auto index = std::distance(impl->instructions.begin(), invalid);
MIGRAPH_THROW("Invalid program from compilation at instruction " + std::to_string(index));
}
}
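// compile() applies the target's pass list against the stored context and,
// in debug builds, validates the program after every pass, reporting the
// index and op name of the first instruction a pass left in an invalid state.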
argument program::eval(std::unordered_map<std::string, argument> params) const
{
assert(this->validate() != impl->instructions.end());
assert(this->validate() == impl->instructions.end());
std::unordered_map<const instruction*, argument> results;
argument result;
for(auto& ins : impl->instructions)
@@ -138,13 +230,15 @@ argument program::eval(std::unordered_map<std::string, argument> params) const
ins.arguments.end(),
values.begin(),
[&](instruction_ref i) { return results.at(std::addressof(*i)); });
result = ins.op.compute(ins.result, values);
result = ins.op.compute(this->impl->ctx, ins.result, values);
}
results.emplace(std::addressof(ins), result);
}
return result;
}
bool operator==(const program& x, const program& y) { return to_string(x) == to_string(y); }
std::ostream& operator<<(std::ostream& os, const program& p)
{
std::unordered_map<const instruction*, std::string> names;
@@ -192,4 +286,4 @@ std::ostream& operator<<(std::ostream& os, const program& p)
return os;
}
} // namespace rtg
} // namespace migraph
#include <rtg/shape.hpp>
#include <rtg/stringutils.hpp>
#include <migraph/shape.hpp>
#include <migraph/stringutils.hpp>
#include <numeric>
#include <algorithm>
#include <functional>
#include <iostream>
namespace rtg {
namespace migraph {
shape::shape() : m_type(float_type), m_packed(false) {}
shape::shape() : m_type(float_type), m_standard(false) {}
shape::shape(type_t t) : m_type(t), m_lens({1}), m_strides({1}), m_packed(true) {}
shape::shape(type_t t, std::vector<std::size_t> l) : m_type(t), m_lens(std::move(l)), m_packed(true)
shape::shape(type_t t) : m_type(t), m_lens({1}), m_strides({1}), m_standard(true) {}
shape::shape(type_t t, std::vector<std::size_t> l)
: m_type(t), m_lens(std::move(l)), m_standard(true)
{
this->calculate_strides();
assert(m_lens.size() == m_strides.size());
@@ -19,7 +21,9 @@ shape::shape(type_t t, std::vector<std::size_t> l, std::vector<std::size_t> s)
: m_type(t), m_lens(std::move(l)), m_strides(std::move(s))
{
assert(m_lens.size() == m_strides.size());
m_packed = this->elements() == this->element_space();
assert(std::any_of(m_strides.begin(), m_strides.end(), [](auto x) { return x > 0; }) and
"At least one stride must be non-zero");
m_standard = this->packed() and not this->transposed();
}
void shape::calculate_strides()
@@ -39,6 +43,8 @@ const std::vector<std::size_t>& shape::strides() const { return this->m_strides;
std::size_t shape::elements() const
{
assert(this->lens().size() == this->strides().size());
if(this->lens().empty())
return 0;
return std::accumulate(
this->lens().begin(), this->lens().end(), std::size_t{1}, std::multiplies<std::size_t>());
}
@@ -63,27 +69,48 @@ std::size_t shape::index(const std::vector<std::size_t>& l) const
std::size_t shape::index(std::size_t i) const
{
assert(this->lens().size() == this->strides().size());
return std::inner_product(
this->lens().begin(),
this->lens().end(),
this->strides().begin(),
std::size_t{0},
std::plus<std::size_t>{},
[&](std::size_t len, std::size_t stride) { return ((i / stride) % len) * stride; });
if(this->standard())
return i;
else
return std::inner_product(this->lens().begin(),
this->lens().end(),
this->strides().begin(),
std::size_t{0},
std::plus<std::size_t>{},
[&](std::size_t len, std::size_t stride) {
assert(stride > 0 and len > 0);
return ((i / stride) % len) * stride;
});
}
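// With the standard() fast path, index(i) is simply the identity for shapes
// that are packed and not transposed; only non-standard layouts pay for the
// divide/modulo mapping from a flat element index to a strided offset.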
bool shape::packed() const { return this->m_packed; }
bool shape::packed() const { return this->elements() == this->element_space(); }
bool shape::transposed() const
{
return not std::is_sorted(this->strides().rbegin(), this->strides().rend());
}
bool shape::broadcasted() const
{
assert(this->lens().size() == this->strides().size());
return std::accumulate(this->strides().begin(),
this->strides().end(),
std::size_t{1},
std::multiplies<std::size_t>()) == 0;
}
bool shape::standard() const { return this->m_standard; }
std::size_t shape::element_space() const
{
// TODO: Get rid of intermediate vector
assert(this->lens().size() == this->strides().size());
std::vector<std::size_t> max_indices(this->lens().size());
std::transform(this->lens().begin(),
this->lens().end(),
std::vector<std::size_t>(this->lens().size(), 1).begin(),
max_indices.begin(),
std::minus<std::size_t>());
return std::inner_product(
max_indices.begin(), max_indices.end(), this->strides().begin(), std::size_t{0}) +
if(this->lens().empty())
return 0;
return std::inner_product(this->lens().begin(),
this->lens().end(),
this->strides().begin(),
std::size_t{0},
std::plus<std::size_t>{},
[](std::size_t l, std::size_t s) { return (l - 1) * s; }) +
1;
}
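// Worked example for element_space(): lens {2, 3} with strides {3, 1} gives
// (2 - 1) * 3 + (3 - 1) * 1 + 1 = 6 addressable elements, matching elements()
// for this packed layout, while a broadcast stride of 0 shrinks the result.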
@@ -91,13 +118,12 @@ std::string shape::type_string() const
{
switch(this->m_type)
{
case any_type: return "any";
#define RTG_SHAPE_TYPE_STRING_CASE(x, t) \
#define MIGRAPH_SHAPE_TYPE_STRING_CASE(x, t) \
case x: return #x;
RTG_SHAPE_VISIT_TYPES(RTG_SHAPE_TYPE_STRING_CASE)
#undef RTG_SHAPE_TYPE_STRING_CASE
MIGRAPH_SHAPE_VISIT_TYPES(MIGRAPH_SHAPE_TYPE_STRING_CASE)
#undef MIGRAPH_SHAPE_TYPE_STRING_CASE
}
RTG_THROW("Invalid type");
MIGRAPH_THROW("Invalid type");
}
bool operator==(const shape& x, const shape& y)
@@ -109,9 +135,9 @@ bool operator!=(const shape& x, const shape& y) { return !(x == y); }
std::ostream& operator<<(std::ostream& os, const shape& x)
{
os << x.type_string() << ", ";
os << "{" << to_string(x.lens()) << "}, ";
os << "{" << to_string(x.strides()) << "}";
os << "{" << to_string_range(x.lens()) << "}, ";
os << "{" << to_string_range(x.strides()) << "}";
return os;
}
} // namespace rtg
} // namespace migraph
#include <migraph/simplify_reshapes.hpp>
#include <migraph/program.hpp>
#include <migraph/instruction.hpp>
#include <migraph/operators.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/ranges.hpp>
#include <unordered_set>
namespace migraph {
bool is_reshaper(const std::string& name)
{
// clang-format off
static const std::unordered_set<std::string> names = {
"reshape",
"transpose",
// "broadcast",
"contiguous"
};
// clang-format on
return contains(names, name);
}
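// simplify_reshapes scans each chain of reshape/transpose/contiguous
// instructions that feeds a non-reshaper and, when two instructions in the
// chain produce the same shape, rewires the users of the later one onto the
// earlier one so the redundant reshapes in between become dead code.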
void simplify_reshapes::apply(program& p) const
{
for(auto ins : iterator_for(p))
{
if(not is_reshaper(ins->op.name()))
continue;
if(ins->output.size() != 1)
continue;
if(is_reshaper(ins->output.front()->op.name()))
continue;
// Gather reshapes
std::vector<instruction_ref> reshapes{ins};
while(is_reshaper(reshapes.back()->op.name()))
{
assert(!reshapes.back()->arguments.empty());
assert(p.has_instruction(reshapes.back()->arguments.front()));
reshapes.push_back(reshapes.back()->arguments.front());
}
std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
for(auto start : iterator_for(reshapes))
{
auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
return i->result == (*start)->result and i != (*start);
});
if(last != reshapes.rend())
{
r = std::make_pair(*start, *last);
break;
}
}
if(r.first != r.second)
{
p.replace_instruction(r.first, r.second);
}
}
}
} // namespace migraph
add_library(rtg_cpu
add_library(migraph_cpu
cpu_target.cpp
cpu_lowering.cpp
gemm.cpp
)
rocm_clang_tidy_check(rtg_cpu)
target_link_libraries(rtg_cpu rtg)
target_include_directories(rtg_cpu PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
find_path(BLAZE_INCLUDE blaze/Blaze.h)
find_package(Threads)
rocm_clang_tidy_check(migraph_cpu)
target_link_libraries(migraph_cpu migraph Threads::Threads)
target_include_directories(migraph_cpu PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(migraph_cpu PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraph_cpu PRIVATE -DBLAZE_USE_CPP_THREADS)
#include <migraph/cpu/cpu_lowering.hpp>
#include <migraph/instruction.hpp>
#include <migraph/dfor.hpp>
#include <migraph/operators.hpp>
#include <migraph/shape_for_each.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/cpu/gemm.hpp>
#include <unordered_map>
namespace migraph {
namespace cpu {
template <typename T>
T zero(const T&)
{
return T(0);
}
//
// cpu implementation of batch norm for inference
//
// inputs are:
// args[0] -> input data buffer
// args[1] -> mini batch mean
// args[2] -> mini batch variance
// args[3] -> gamma
// args[4] -> bias
//
// The equation to compute batch norm for inference is:
//
// output[i] = bias + gamma * (input[i] - mean) / sqrt(variance + epsilon)
//
// the input data format should be nchw
//
struct cpu_batch_norm_inference
{
batch_norm_inference op;
std::string name() const { return "cpu::batch_norm_inference"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument output{output_shape};
double epsilon = op.epsilon;
auto input = args[0];
auto mini_batch_mean = args[1];
auto mini_batch_variance = args[2];
auto arg_gamma = args[3];
auto arg_bias = args[4];
auto num_batch = output_shape.lens()[0];
auto num_channels = output_shape.lens()[1];
auto image_height = output_shape.lens()[2];
auto image_width = output_shape.lens()[3];
if(op.bn_mode == batch_norm_inference::spatial)
{
visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
[&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
dfor(num_batch, num_channels, image_height, image_width)(
[&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
result(n, c, h, w) = gamma(c) * (buffer(n, c, h, w) - mean(c)) /
std::sqrt(variance(c) + epsilon) +
bias(c);
});
});
}
if(op.bn_mode == batch_norm_inference::per_activation)
{
visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
[&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
dfor(num_batch, num_channels, image_height, image_width)(
[&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
result(n, c, h, w) = gamma(c, h, w) *
(buffer(n, c, h, w) - mean(c, h, w)) /
std::sqrt(variance(c, h, w) + epsilon) +
bias(c, h, w);
});
});
}
return output;
}
};
struct cpu_convolution
{
convolution op;
std::string name() const { return "cpu::convolution"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
auto in_h = input.get_shape().lens()[2];
auto in_w = input.get_shape().lens()[3];
auto wei_c = weights.get_shape().lens()[1];
auto wei_h = weights.get_shape().lens()[2];
auto wei_w = weights.get_shape().lens()[3];
dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
double acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, k, in_x, in_y) * weights(w, k, x, y);
}
});
output(o, w, i, j) = acc;
});
});
return result;
}
};
struct max_pool
{
static std::string name() { return "max"; }
static double start() { return std::numeric_limits<double>::lowest(); }
static double apply(double x, double y)
{
double m = std::max(x, y);
return (m);
}
static double final(double x, double) { return (x); }
};
struct avg_pool
{
static std::string name() { return "average"; }
static double start() { return 0.0; }
static double apply(double x, double y) { return x + y; }
static double final(double x, double y) { return x / y; }
};
template <class Op>
struct cpu_pooling
{
pooling op;
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
auto in_h = input.get_shape().lens()[2];
auto in_w = input.get_shape().lens()[3];
dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x0 = i * op.stride[0] - op.padding[0];
const int start_y0 = j * op.stride[1] - op.padding[1];
const int hend = std::min(start_x0 + op.lengths[0], in_h);
const int wend = std::min(start_y0 + op.lengths[1], in_w);
const int start_x = std::max(start_x0, 0);
const int start_y = std::max(start_y0, 0);
const int w_h = (hend - start_x);
const int w_w = (wend - start_y);
const int pool_size = std::max(w_h * w_w, 1);
double acc = Op::start();
dfor(w_h, w_w)([&](int x, int y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc = Op::apply(acc, input(o, w, in_x, in_y));
}
});
output(o, w, i, j) = type(Op::final(acc, pool_size));
});
});
return result;
}
};
struct cpu_contiguous
{
contiguous op;
std::string name() const { return "cpu::contiguous"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
});
});
return result;
}
};
struct cpu_gemm
{
gemm op;
std::string name() const { return "cpu::gemm"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
migemm(result, args[0], args[1], op.alpha, op.beta);
return result;
}
};
struct identity_op
{
std::string name() const { return "cpu::identity"; }
auto fcn() const
{
return [](auto x) { return x; };
}
};
struct abs_op
{
std::string name() const { return "cpu::abs"; }
auto fcn() const
{
return [](auto x) { return std::abs(x); };
}
};
struct exp_op
{
std::string name() const { return "cpu::exp"; }
auto fcn() const
{
return [](auto x) { return std::exp(x); };
}
};
struct sin_op
{
std::string name() const { return "cpu::sin"; }
auto fcn() const
{
return [](auto x) { return std::sin(x); };
}
};
struct cos_op
{
std::string name() const { return "cpu::cos"; }
auto fcn() const
{
return [](auto x) { return std::cos(x); };
}
};
struct tan_op
{
std::string name() const { return "cpu::tan"; }
auto fcn() const
{
return [](auto x) { return std::tan(x); };
}
};
struct asin_op
{
std::string name() const { return "cpu::asin"; }
auto fcn() const
{
return [](auto x) { return std::asin(x); };
}
};
struct acos_op
{
std::string name() const { return "cpu::acos"; }
auto fcn() const
{
return [](auto x) { return std::acos(x); };
}
};
struct atan_op
{
std::string name() const { return "cpu::atan"; }
auto fcn() const
{
return [](auto x) { return std::atan(x); };
}
};
struct tanh_op
{
std::string name() const { return "cpu::tanh"; }
auto fcn() const
{
return [](auto x) { return std::tanh(x); };
}
};
struct sigmoid_op
{
std::string name() const { return "cpu::sigmoid"; }
auto fcn() const
{
return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
}
};
struct neg_op
{
std::string name() const { return "cpu::neg"; }
auto fcn() const
{
return [](auto x) { return -x; };
}
};
struct relu_op
{
std::string name() const { return "cpu::relu"; }
auto fcn() const
{
return [](auto x) { return x > 0 ? x : 0; };
}
};
template <typename Op>
struct cpu_unary
{
Op op;
std::string name() const { return op.name(); }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
args[0].visit([&](auto input) {
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
});
});
return result;
}
};
struct softmax2d
{
std::string name() const { return "cpu::softmax2d"; }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
auto nb = input.get_shape().lens()[0];
auto nc = input.get_shape().lens()[1];
auto nh = input.get_shape().lens()[2];
auto nw = input.get_shape().lens()[3];
dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
value_type cmax = std::numeric_limits<value_type>::lowest();
for(int c = 0; c < nc; c++)
{
cmax = std::max(cmax, input(b, c, i, j));
}
for(int c = 0; c < nc; c++)
{
output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
}
value_type sum = value_type(0);
for(int c = 0; c < nc; c++)
{
sum += output(b, c, i, j);
}
for(int c = 0; c < nc; c++)
{
output(b, c, i, j) = output(b, c, i, j) / sum;
}
});
});
return result;
}
};
struct add_op
{
std::string name() const { return "add"; }
auto fcn() const
{
return [](auto x, auto y) { return x + y; };
}
};
struct sub_op
{
std::string name() const { return "sub"; }
auto fcn() const
{
return [](auto x, auto y) { return x - y; };
}
};
struct mul_op
{
std::string name() const { return "mul"; }
auto fcn() const
{
return [](auto x, auto y) { return x * y; };
}
};
struct div_op
{
std::string name() const { return "div"; }
auto fcn() const
{
return [](auto x, auto y) { return x / y; };
}
};
template <typename Op>
struct cpu_binary
{
Op op;
std::string name() const { return op.name(); }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
if(input1.get_shape().packed() and input2.get_shape().packed())
{
std::transform(
input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
}
else
{
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) =
op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
});
}
});
return result;
}
};
struct cpu_apply
{
program* prog;
std::unordered_map<std::string, std::function<void(instruction_ref)>> apply_map{};
template <class T>
auto simple_op()
{
return [this](instruction_ref ins) { apply_simple_op<T>(ins); };
}
template <class T, class Op>
auto extend_op()
{
return [this](instruction_ref ins) { apply_extend_op<T, Op>(ins); };
}
void init()
{
apply_map["convolution"] = extend_op<cpu_convolution, convolution>();
apply_map["gemm"] = extend_op<cpu_gemm, gemm>();
apply_map["batch_norm_inference"] =
extend_op<cpu_batch_norm_inference, batch_norm_inference>();
apply_map["contiguous"] = extend_op<cpu_contiguous, contiguous>();
apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
apply_map["tanh"] = simple_op<cpu_unary<tanh_op>>();
apply_map["sigmoid"] = simple_op<cpu_unary<sigmoid_op>>();
apply_map["exp"] = simple_op<cpu_unary<exp_op>>();
apply_map["neg"] = simple_op<cpu_unary<neg_op>>();
apply_map["sin"] = simple_op<cpu_unary<sin_op>>();
apply_map["cos"] = simple_op<cpu_unary<cos_op>>();
apply_map["tan"] = simple_op<cpu_unary<tan_op>>();
apply_map["add"] = simple_op<cpu_binary<add_op>>();
apply_map["sub"] = simple_op<cpu_binary<sub_op>>();
apply_map["mul"] = simple_op<cpu_binary<mul_op>>();
apply_map["div"] = simple_op<cpu_binary<div_op>>();
apply_map["softmax"] = simple_op<softmax2d>();
}
void apply()
{
init();
for(auto it : iterator_for(*prog))
{
if(it->op.name() == "activation")
{
apply_activation(it);
}
else if(it->op.name() == "pooling")
{
apply_pooling(it);
}
else if(apply_map.count(it->op.name()) > 0)
{
apply_map.at(it->op.name())(it);
}
}
}
template <class T>
void apply_simple_op(instruction_ref ins)
{
prog->replace_instruction(ins, T{}, ins->arguments);
}
template <class T, class Op>
void apply_extend_op(instruction_ref ins)
{
auto&& op = any_cast<Op>(ins->op);
prog->replace_instruction(ins, T{op}, ins->arguments);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
if(op.mode == "relu")
prog->replace_instruction(ins, cpu_unary<relu_op>{}, ins->arguments);
}
void apply_pooling(instruction_ref ins)
{
auto&& op = any_cast<pooling>(ins->op);
if(op.mode == "max")
prog->replace_instruction(ins, cpu_pooling<max_pool>{op}, ins->arguments);
else if(op.mode == "average")
prog->replace_instruction(ins, cpu_pooling<avg_pool>{op}, ins->arguments);
}
};
void cpu_lowering::apply(program& p) const { cpu_apply{&p}.apply(); }
} // namespace cpu
} // namespace migraph
#include <rtg/cpu/cpu_target.hpp>
#include <rtg/instruction.hpp>
#include <rtg/dfor.hpp>
#include <rtg/operators.hpp>
#include <migraph/cpu/cpu_target.hpp>
#include <migraph/cpu/cpu_lowering.hpp>
namespace rtg {
namespace migraph {
namespace cpu {
struct cpu_convolution
{
convolution op;
std::string name() const { return "cpu::convolution"; }
shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
argument compute(shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
auto in_h = input.get_shape().lens()[2];
auto in_w = input.get_shape().lens()[3];
auto wei_c = weights.get_shape().lens()[1];
auto wei_h = weights.get_shape().lens()[2];
auto wei_w = weights.get_shape().lens()[3];
dfor(output_shape.lens()[0],
output_shape.lens()[1],
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
double acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, k, in_x, in_y) * weights(w, k, x, y);
}
});
output(o, w, i, j) = acc;
});
});
return result;
}
};
struct relu
{
std::string name() const { return "cpu::relu"; }
shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
argument compute(shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
args[0].visit([&](auto input) {
std::transform(input.begin(), input.end(), output.begin(), [](auto x) {
return x > 0 ? x : 0;
});
});
});
return result;
}
};
struct cpu_apply
{
program* prog;
void apply()
{
for(auto it = prog->begin(); it != prog->end(); it++)
{
if(it->op.name() == "convolution")
{
apply_convolution(it);
}
else if(it->op.name() == "activation")
{
apply_activation(it);
}
}
}
void apply_convolution(instruction_ref ins)
{
auto&& op = any_cast<convolution>(ins->op);
prog->replace_instruction(ins, cpu_convolution{op}, ins->arguments);
}
void apply_activation(instruction_ref ins)
{
auto&& op = any_cast<activation>(ins->op);
if(op.mode == "relu")
prog->replace_instruction(ins, relu{}, ins->arguments);
}
};
std::string cpu_target::name() const { return "cpu"; }
void cpu_target::apply(program& p) const { cpu_apply{&p}.apply(); }
} // namespace cpu
} // namespace rtg
#include <migraph/cpu/gemm.hpp>
#include <migraph/dfor.hpp>
#include <migraph/requires.hpp>
#include <blaze/math/CustomMatrix.h>
namespace migraph {
namespace cpu {
template <class T>
using matrix = blaze::CustomMatrix<T, blaze::unaligned, blaze::unpadded>; // NOLINT
template <class T>
static auto make_mat(tensor_view<T> x)
{
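// Adapt a 2-d tensor_view to a Blaze CustomMatrix. Transposed views are created with
// swapped dimensions and re-transposed in visit_mat before use.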
const auto& s = x.get_shape();
assert(s.lens().size() == 2);
if(s.transposed())
return matrix<T>{x.data(), s.lens()[1], s.lens()[0], s.strides()[1]};
return matrix<T>{x.data(), s.lens()[0], s.lens()[1], s.strides()[0]};
}
template <class T, class F>
static void visit_mat(tensor_view<T> x, F f)
{
auto mat = make_mat(x);
if(x.get_shape().transposed())
f(blaze::trans(mat));
else
f(mat);
}
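// Trait used for tag dispatch in migemm_impl: float takes the Blaze fast path,
// every other element type falls back to the reference loop.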
template <class T>
struct is_fast_gemm_type : std::false_type
{
};
template <>
struct is_fast_gemm_type<float> : std::true_type
{
};
template <class T>
void migemm_impl(tensor_view<T> cmat,
tensor_view<T> amat,
tensor_view<T> bmat,
float alpha,
float beta,
std::true_type)
{
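// Fast path: view A, B and C as Blaze matrices and evaluate C = alpha * (A * B) + beta * C.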
visit_mat(amat, [&](const auto& a) {
visit_mat(bmat, [&](const auto& b) {
auto c = make_mat(cmat);
c = (a * b) * alpha + beta * c;
});
});
}
template <class T>
void migemm_impl(tensor_view<T> cmat,
tensor_view<T> amat,
tensor_view<T> bmat,
float alpha,
float beta,
std::false_type)
{
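// Reference path: triple loop computing C = alpha * (A * B) + beta * C for element
// types without a Blaze fast path.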
auto m = cmat.get_shape().lens()[0];
auto n = cmat.get_shape().lens()[1];
auto k = amat.get_shape().lens()[1];
assert(amat.get_shape().lens()[1] == bmat.get_shape().lens()[0]);
assert(m == amat.get_shape().lens()[0]);
assert(n == bmat.get_shape().lens()[1]);
dfor(m, n)([&](auto ii, auto jj) {
double s = 0.0;
dfor(k)([&](auto kk) { s += amat(ii, kk) * bmat(kk, jj); });
cmat(ii, jj) = alpha * s + beta * cmat(ii, jj);
});
}
template <class T>
void migemm_impl(
tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
{
migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
}
void migemm(
const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
{
visit_all(c_arg, a_arg, b_arg)(
[&](auto cmat, auto amat, auto bmat) { migemm_impl(cmat, amat, bmat, alpha, beta); });
}
} // namespace cpu
} // namespace migraph
#ifndef MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP
#define MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP
#include <migraph/program.hpp>
namespace migraph {
namespace cpu {
struct cpu_lowering
{
std::string name() const { return "cpu::lowering"; }
void apply(program& p) const;
};
} // namespace cpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
#define MIGRAPH_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
#include <migraph/program.hpp>
namespace migraph {
namespace cpu {
struct cpu_target
{
std::string name() const;
std::vector<pass> get_passes(context& ctx) const;
context get_context() const { return {}; }
};
} // namespace cpu
} // namespace migraph
#endif
#ifndef MIGRAPH_GUARD_RTGLIB_CPU_GEMM_HPP
#define MIGRAPH_GUARD_RTGLIB_CPU_GEMM_HPP
#include <migraph/argument.hpp>
namespace migraph {
namespace cpu {
void migemm(
const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta);
} // namespace cpu
} // namespace migraph
#endif
#ifndef RTG_GUARD_RTGLIB_CPU_TARGET_HPP
#define RTG_GUARD_RTGLIB_CPU_TARGET_HPP
#include <rtg/program.hpp>
namespace rtg {
namespace cpu {
struct cpu_target
{
std::string name() const;
void apply(program& p) const;
};
} // namespace cpu
} // namespace rtg
#endif
list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip /opt/rocm/hcc)
find_package(miopen)
# rocblas
find_package(rocblas REQUIRED PATHS /opt/rocm)
message(STATUS "Build with rocblas")
if(NOT TARGET MIOpen)
message(SEND_ERROR "Can't find MIOpen")
endif()
add_library(migraph_device
hip_contiguous.cpp
)
rocm_clang_tidy_check(migraph_device)
target_link_libraries(migraph_device migraph hip::device)
target_include_directories(migraph_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
add_library(migraph_gpu
hip.cpp
target.cpp
lowering.cpp
write_literals.cpp
rocblas.cpp
)
rocm_clang_tidy_check(migraph_gpu)
target_link_libraries(migraph_gpu migraph MIOpen migraph_device roc::rocblas)
target_include_directories(migraph_gpu PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
#include <migraph/gpu/hip.hpp>
#include <migraph/manage_ptr.hpp>
#include <miopen/miopen.h>
#include <vector>
namespace migraph {
namespace gpu {
using hip_ptr = MIGRAPH_MANAGE_PTR(void, hipFree);
std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
hip_ptr allocate_gpu(std::size_t sz)
{
void* result;
auto status = hipMalloc(&result, sz);
if(status != hipSuccess)
MIGRAPH_THROW("Gpu allocation failed: " + hip_error(status));
return hip_ptr{result};
}
// Forward declaration of the raw-pointer overload defined below, so the templated
// overload can resolve the call when it is instantiated.
hip_ptr write_to_gpu(const void* x, std::size_t sz);
template <class T>
hip_ptr write_to_gpu(const T& x)
{
using type = typename T::value_type;
auto size = x.size() * sizeof(type);
return write_to_gpu(x.data(), size);
}
template <class T>
std::vector<T> read_from_gpu(const void* x, std::size_t sz)
{
std::vector<T> result(sz);
auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
if(status != hipSuccess)
MIGRAPH_THROW("Copy from gpu failed: " + hip_error(status));
return result;
}
hip_ptr write_to_gpu(const void* x, std::size_t sz)
{
auto result = allocate_gpu(sz);
auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
if(status != hipSuccess)
MIGRAPH_THROW("Copy to gpu failed: " + hip_error(status));
return result;
}
argument allocate_gpu(shape s)
{
auto p = share(allocate_gpu(s.bytes() + 1));
return {s, [p]() mutable { return reinterpret_cast<char*>(p.get()); }};
}
argument to_gpu(argument arg)
{
auto p = share(write_to_gpu(arg.data(), arg.get_shape().bytes()));
return {arg.get_shape(), [p]() mutable { return reinterpret_cast<char*>(p.get()); }};
}
argument from_gpu(argument arg)
{
argument result;
arg.visit([&](auto x) {
using type = typename decltype(x)::value_type;
auto v = read_from_gpu<type>(arg.data(), x.get_shape().bytes() / sizeof(type));
result = {x.get_shape(), [v]() mutable { return reinterpret_cast<char*>(v.data()); }};
});
return result;
}
} // namespace gpu
} // namespace migraph
#include <hip/hip_runtime.h>
#include <migraph/operators.hpp>
namespace migraph {
namespace gpu {
struct index
{
std::size_t global;
std::size_t local;
std::size_t group;
};
template <class F>
__global__ void launcher(F f)
{
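// Each thread builds its flat global id plus the local (within-block) and block indices.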
index idx{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x};
f(idx);
}
auto launch(std::size_t global, std::size_t local)
{
return [=](auto f) {
assert(local > 0);
assert(global > 0);
using f_type = decltype(f);
dim3 nblocks(global / local);
dim3 nthreads(local);
hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, nullptr, f);
};
}
template <class F>
void visit_tensor_size(std::size_t n, F f)
{
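// Dispatch a runtime tensor rank (1-5) to a compile-time integral_constant so the
// caller can instantiate templates per dimension.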
switch(n)
{
case 1:
{
f(std::integral_constant<std::size_t, 1>{});
break;
}
case 2:
{
f(std::integral_constant<std::size_t, 2>{});
break;
}
case 3:
{
f(std::integral_constant<std::size_t, 3>{});
break;
}
case 4:
{
f(std::integral_constant<std::size_t, 4>{});
break;
}
case 5:
{
f(std::integral_constant<std::size_t, 5>{});
break;
}
default: throw std::runtime_error("Unknown tensor size");
}
}
template <size_t NDim>
struct hip_index
{
size_t d[NDim];
__device__ __host__ size_t& operator[](size_t i) { return d[i]; }
__device__ __host__ size_t operator[](size_t i) const { return d[i]; }
};
template <size_t NDim>
struct hip_tensor_descriptor
{
__device__ __host__ hip_tensor_descriptor() = default;
template <typename T, typename V>
__device__ __host__ hip_tensor_descriptor(const T& lens_ext, const V& strides_ext)
{
for(size_t i = 0; i < NDim; i++)
lens[i] = lens_ext[i];
for(size_t i = 0; i < NDim; i++)
strides[i] = strides_ext[i];
}
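// Convert a linear element index into a multi-dimensional index by successive
// division/modulo with the strides.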
__device__ __host__ hip_index<NDim> multi(size_t idx)
{
hip_index<NDim> result{};
size_t tidx = idx;
for(size_t is = 0; is < NDim; is++)
{
result[is] = tidx / strides[is];
tidx = tidx % strides[is];
}
return result;
}
__device__ __host__ size_t linear(hip_index<NDim> s)
{
size_t idx = 0;
for(size_t i = 0; i < NDim; i++)
idx += s[i] * strides[i];
return idx;
}
size_t lens[NDim] = {};
size_t strides[NDim] = {};
};
void hip_contiguous(migraph::shape output_shape, migraph::argument arg, migraph::argument result)
{
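// Copy an arbitrarily strided tensor into a densely packed output: each GPU thread
// walks the elements in a grid-stride loop, mapping the packed index back through
// the input's strides to gather the source element.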
visit_all(result, arg)([&](auto output, auto input) {
visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
const auto& s = arg.get_shape();
hip_tensor_descriptor<ndim> a_desc(s.lens(), s.strides());
hip_tensor_descriptor<ndim> at_desc(output_shape.lens(), output_shape.strides());
auto* a = input.data();
auto* at = output.data();
auto nelements = s.elements();
std::size_t nlocal = 512;
std::size_t nglobal = 512 * nlocal;
launch(nglobal, nlocal)([=](auto idx) mutable {
for(size_t i = idx.global; i < nelements; i += nglobal)
{
size_t lidx = a_desc.linear(at_desc.multi(i));
at[i] = a[lidx];
}
});
});
});
}
} // namespace gpu
} // namespace migraph
#ifndef MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP
#define MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP
#include <migraph/gpu/miopen.hpp>
#include <migraph/gpu/rocblas.hpp>
namespace migraph {
namespace gpu {
struct context
{
shared<miopen_handle> handle;
shared<rocblas_handle_ptr> rbhandle;
};
} // namespace gpu
} // namespace migraph
#endif