Commit 038a4c52 authored by wsttiger

Merged from master; still debugging resnet.

parents 06cc4f8f 905d4ab0
@@ -6,6 +6,7 @@
 #include <unordered_map>
 #include <functional>
 #include <array>
+#include <utility>
 #include <vector>
 #include <migraph/fallthrough.hpp>
@@ -27,7 +28,7 @@ struct unknown
         else
             return input.front();
     }
-    argument compute(context&, shape, std::vector<argument>) const
+    argument compute(context&, const shape&, const std::vector<argument>&) const
     {
         MIGRAPH_THROW("not computable");
     }
@@ -103,7 +104,7 @@ struct onnx_parser
     }

     instruction_ref
-    parse_conv(std::string, attribute_map attributes, std::vector<instruction_ref> args)
+    parse_conv(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
     {
         convolution op;
         if(contains(attributes, "pads"))
@@ -128,8 +129,9 @@ struct onnx_parser
         return prog.add_instruction(op, args);
     }

-    instruction_ref
-    parse_pooling(std::string name, attribute_map attributes, std::vector<instruction_ref> args)
+    instruction_ref parse_pooling(const std::string& name,
+                                  attribute_map attributes,
+                                  std::vector<instruction_ref> args)
     {
         pooling op{name == "MaxPool" ? "max" : "average"};
         if(contains(attributes, "pads"))
@@ -144,30 +146,11 @@ struct onnx_parser
         {
             copy(attributes["kernel_shape"].ints(), op.lengths.begin());
         }
-        return prog.add_instruction(op, args);
+        return prog.add_instruction(op, std::move(args));
     }

     instruction_ref
-    parse_average_pooling(std::string, attribute_map attributes, std::vector<instruction_ref> args)
-    {
-        pooling op{"average"};
-        if(contains(attributes, "pads"))
-        {
-            copy(attributes["pads"].ints(), op.padding.begin());
-        }
-        if(contains(attributes, "strides"))
-        {
-            copy(attributes["strides"].ints(), op.stride.begin());
-        }
-        if(contains(attributes, "kernel_shape"))
-        {
-            copy(attributes["kernel_shape"].ints(), op.lengths.begin());
-        }
-        return prog.add_instruction(op, args);
-    }
-
-    instruction_ref
-    parse_reshape(std::string, attribute_map attributes, std::vector<instruction_ref> args)
+    parse_reshape(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
     {
         reshape op;
         if(args.size() == 1)
@@ -184,7 +167,7 @@ struct onnx_parser
     }

     instruction_ref
-    parse_flatten(std::string, attribute_map attributes, std::vector<instruction_ref> args)
+    parse_flatten(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
     {
         uint64_t axis = 0;
         if(contains(attributes, "axis"))
@@ -194,15 +177,16 @@ struct onnx_parser
         return prog.add_instruction(flatten{axis}, args[0]);
     }

-    instruction_ref
-    parse_constant(std::string, attribute_map attributes, std::vector<instruction_ref>)
+    instruction_ref parse_constant(const std::string&,
+                                   attribute_map attributes,
+                                   const std::vector<instruction_ref>&)
     {
         literal v = parse_value(attributes.at("value"));
         return prog.add_literal(v);
     }

     instruction_ref
-    parse_gemm(std::string, attribute_map attributes, std::vector<instruction_ref> args)
+    parse_gemm(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
     {
         float alpha = 1.0f;
         float beta  = 0.0f;
@@ -238,7 +222,7 @@ struct onnx_parser
     }

     instruction_ref
-    parse_batchnorm(std::string, attribute_map attributes, std::vector<instruction_ref> args)
+    parse_batchnorm(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
     {
         float epsilon  = 1e-5f;
         float momentum = 0.9f;
@@ -263,7 +247,7 @@ struct onnx_parser
                           : batch_norm_inference::per_activation;
         }
         batch_norm_inference op{epsilon, momentum, bn_mode, is_test};
-        return prog.add_instruction(op, args);
+        return prog.add_instruction(op, std::move(args));
     }

     void parse_from(std::istream& is)
@@ -312,7 +296,7 @@ struct onnx_parser
         }
     }

-    void parse_node(std::string name)
+    void parse_node(const std::string& name)
     {
         if(name.empty())
             MIGRAPH_THROW("Onnx node must have a name");
...
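The pattern across these parser changes is uniform: read-only string parameters become const references, and argument vectors that the callee consumes are handed to prog.add_instruction with std::move. A minimal standalone sketch of the two conventions (the names below are illustrative, not from the repo):

#include <string>
#include <utility>
#include <vector>

// Illustrative only: a callee that merely reads `name` takes const&,
// while a vector it stores or forwards is taken by value and moved.
void consume(const std::string& name, std::vector<int> args)
{
    static std::vector<int> stored;
    (void)name;
    stored = std::move(args); // no copy: the buffer is stolen from `args`
}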
#include <migraph/onnx.hpp>
#include <migraph/gpu/target.hpp>
#include <migraph/gpu/hip.hpp>
#include <migraph/generate.hpp>
#include <migraph/verify.hpp>
migraph::program::parameter_map create_param_map(const migraph::program& p, bool gpu = true)
{
    migraph::program::parameter_map m;
    for(auto&& x : p.get_parameter_shapes())
    {
        if(gpu)
            m[x.first] = migraph::gpu::to_gpu(migraph::generate_argument(x.second));
        else
            m[x.first] = migraph::generate_argument(x.second);
    }
    return m;
}

int main(int argc, char const* argv[])
{
    if(argc > 1)
    {
        std::string file = argv[1];
        std::size_t n    = argc > 2 ? std::stoul(argv[2]) : 50;
        auto p           = migraph::parse_onnx(file);
        std::cout << "Compiling ... " << std::endl;
        p.compile(migraph::gpu::target{});
        std::cout << "Allocating params ... " << std::endl;
        auto m = create_param_map(p);
        std::cout << "Running performance report ... " << std::endl;
        p.perf_report(std::cout, n, m);
    }
}
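The gpu flag on create_param_map also allows a host-side run of the same model. A sketch under the assumption that the CPU target header is included ("model.onnx" is a placeholder path):

// Host-side variant: gpu=false yields plain host arguments, so the program
// can be evaluated without copying anything to the device.
auto p = migraph::parse_onnx("model.onnx");
p.compile(migraph::cpu::cpu_target{});
auto m      = create_param_map(p, false);
auto result = p.eval(m);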
@@ -61,52 +61,49 @@ int main(int argc, char const* argv[])
     std::cout << prog << std::endl;
     auto imageset = read_cifar10_images(datafile);

-    // GPU target
-    prog.compile(migraph::gpu::target{});
-    migraph::program::parameter_map m;
-    auto s = migraph::shape{migraph::shape::float_type, {1, 3, 32, 32}};
-    m["output"] =
-        migraph::gpu::to_gpu(migraph::generate_argument(prog.get_parameter_shape("output")));
-    auto labels = imageset.first;
-    auto input  = imageset.second;
-    auto ptr    = input.data();
-    for(int i = 0; i < 10; i++)
-    {
-        std::cout << "label: " << (uint32_t)labels[i] << " ----> ";
-        m["0"]      = migraph::gpu::to_gpu(migraph::argument{s, &ptr[3072 * i]});
-        auto result = migraph::gpu::from_gpu(prog.eval(m));
-        std::vector<float> logits;
-        result.visit([&](auto output) { logits.assign(output.begin(), output.end()); });
-        std::vector<float> probs = softmax(logits);
-        for(auto x : probs)
-            std::cout << x << " ";
-        std::cout << std::endl;
-        std::cout << std::endl;
-        for (int j = 0; j < 10; j++) {
-            std::cout << 255.0*input[i*3072+j] << " ";
-        }
-        std::cout << std::endl;
-        std::cout << std::endl;
-        std::cout << std::endl;
-    }
-
-    // // CPU target
-    // prog.compile(migraph::cpu::cpu_target{});
+    // // GPU target
+    // prog.compile(migraph::gpu::target{});
+    // migraph::program::parameter_map m;
     // auto s = migraph::shape{migraph::shape::float_type, {1, 3, 32, 32}};
+    // for(auto&& x : prog.get_parameter_shapes())
+    // {
+    //     m[x.first] = migraph::gpu::to_gpu(migraph::generate_argument(x.second));
+    // }
     // auto labels = imageset.first;
     // auto input  = imageset.second;
     // auto ptr    = input.data();
     // for(int i = 0; i < 10; i++)
     // {
     //     std::cout << "label: " << (uint32_t)labels[i] << " ----> ";
-    //     auto input3 = migraph::argument{s, &ptr[3072 * i]};
-    //     auto result = prog.eval({{"0", input3}});
+    //     m["0"]      = migraph::gpu::to_gpu(migraph::argument{s, &ptr[3072 * i]});
+    //     auto result = migraph::gpu::from_gpu(prog.eval(m));
     //     std::vector<float> logits;
     //     result.visit([&](auto output) { logits.assign(output.begin(), output.end()); });
     //     std::vector<float> probs = softmax(logits);
     //     for(auto x : logits)
-    //         std::cout << x << " ";
+    //         //std::cout << x << " ";
+    //         //std::cout << x << std::endl;
+    //         printf("%10.5e ", x);
     //     std::cout << std::endl;
     // }
+
+    // CPU target
+    prog.compile(migraph::cpu::cpu_target{});
+    auto s      = migraph::shape{migraph::shape::float_type, {1, 3, 32, 32}};
+    auto labels = imageset.first;
+    auto input  = imageset.second;
+    auto ptr    = input.data();
+    for(int i = 0; i < 10; i++)
+    {
+        std::cout << "label: " << (uint32_t)labels[i] << " ----> ";
+        auto input3 = migraph::argument{s, &ptr[3072 * i]};
+        auto result = prog.eval({{"0", input3}});
+        std::vector<float> logits;
+        result.visit([&](auto output) { logits.assign(output.begin(), output.end()); });
+        std::vector<float> probs = softmax(logits);
+        for(auto x : logits)
+            printf("%10.5e ", x);
+        std::cout << std::endl;
+    }
 }
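The softmax helper called above is defined elsewhere in the repo; presumably it is the usual exp-normalize over the logits. A minimal, numerically stable sketch consistent with how it is used here:

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax(const std::vector<float>& logits)
{
    std::vector<float> probs(logits.size());
    float m   = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for(std::size_t i = 0; i < logits.size(); i++)
    {
        probs[i] = std::exp(logits[i] - m); // subtract the max to avoid overflow
        sum += probs[i];
    }
    for(auto& p : probs)
        p /= sum;
    return probs;
}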
@@ -7,7 +7,7 @@
 #include <migraph/generate.hpp>
 #include <migraph/verify.hpp>

-migraph::argument run_cpu(std::string file)
+migraph::argument run_cpu(const std::string& file)
 {
     auto p = migraph::parse_onnx(file);
     p.compile(migraph::cpu::cpu_target{});
@@ -21,7 +21,7 @@ migraph::argument run_cpu(std::string file)
     return out;
 }

-migraph::argument run_gpu(std::string file)
+migraph::argument run_gpu(const std::string& file)
 {
     auto p = migraph::parse_onnx(file);
     p.compile(migraph::gpu::target{});
...
@@ -2,9 +2,12 @@
 #include <migraph/stringutils.hpp>
 #include <migraph/instruction.hpp>
 #include <migraph/env.hpp>
+#include <migraph/time.hpp>
+#include <migraph/iterator_for.hpp>
 #include <iostream>
 #include <sstream>
 #include <algorithm>
+#include <utility>

 namespace migraph {
@@ -19,18 +22,68 @@ struct program_impl

 const operation& get_operation(instruction_ref ins) { return ins->op; }

+template <class F>
+static void print_program(std::ostream& os, const program& p, F annonate)
+{
+    std::unordered_map<instruction_ref, std::string> names;
+    int count = 0;
+
+    for(auto ins : iterator_for(p))
+    {
+        std::string var_name = "@" + std::to_string(count);
+        if(ins->op.name() == "@param")
+        {
+            var_name = any_cast<builtin::param>(ins->op).parameter;
+        }
+
+        os << var_name << " = ";
+
+        os << ins->op;
+
+        if(ins->op.name() == "@literal")
+        {
+            if(ins->lit.get_shape().elements() > 10)
+                os << "{ ... }";
+            else
+                os << "{" << ins->lit << "}";
+        }
+
+        if(!ins->arguments.empty())
+        {
+            char delim = '(';
+            for(auto&& arg : ins->arguments)
+            {
+                assert(p.has_instruction(arg) && "Instruction not found");
+                os << delim << names.at(arg);
+                delim = ',';
+            }
+            os << ")";
+        }
+
+        os << " -> " << ins->result;
+        annonate(ins, names);
+        os << std::endl;
+
+        names.emplace(ins, var_name);
+        count++;
+    }
+}
+
 program::program() : impl(std::make_unique<program_impl>()) {}

 program::program(program&&) noexcept = default;
 program& program::operator=(program&&) noexcept = default;
 program::~program() noexcept = default;

-instruction_ref program::add_instruction(operation op, std::vector<instruction_ref> args)
+instruction_ref program::add_instruction(const operation& op, std::vector<instruction_ref> args)
 {
-    return insert_instruction(impl->instructions.end(), std::move(op), std::move(args));
+    return insert_instruction(impl->instructions.end(), op, std::move(args));
 }

-instruction_ref
-program::insert_instruction(instruction_ref ins, operation op, std::vector<instruction_ref> args)
+instruction_ref program::insert_instruction(instruction_ref ins,
+                                            const operation& op,
+                                            std::vector<instruction_ref> args)
 {
     assert(std::all_of(
                args.begin(), args.end(), [&](instruction_ref x) { return has_instruction(x); }) &&
@@ -45,8 +98,9 @@ program::insert_instruction(instruction_ref ins, operation op, std::vector<instruction_ref> args)
     return result;
 }

-instruction_ref
-program::replace_instruction(instruction_ref ins, operation op, std::vector<instruction_ref> args)
+instruction_ref program::replace_instruction(instruction_ref ins,
+                                             const operation& op,
+                                             std::vector<instruction_ref> args)
 {
     assert(std::all_of(
                args.begin(), args.end(), [&](instruction_ref x) { return has_instruction(x); }) &&
@@ -117,7 +171,7 @@ instruction_ref program::add_literal(literal l)
     return impl->instructions.begin();
 }

-instruction_ref program::add_outline(shape s)
+instruction_ref program::add_outline(const shape& s)
 {
     impl->instructions.push_front({builtin::outline{s}, s, {}});
     return impl->instructions.begin();
@@ -125,7 +179,7 @@ instruction_ref program::add_outline(shape s)

 instruction_ref program::add_parameter(std::string name, shape s)
 {
-    impl->instructions.push_front({builtin::param{std::move(name)}, s, {}});
+    impl->instructions.push_front({builtin::param{std::move(name)}, std::move(s), {}});
     return impl->instructions.begin();
 }
@@ -170,6 +224,7 @@ bool program::has_instruction(instruction_ref ins) const
            }) != impl->instructions.end();
 }

+std::size_t program::size() const { return impl->instructions.size(); }
 instruction_ref program::begin() const { return impl->instructions.begin(); }
 instruction_ref program::end() const { return impl->instructions.end(); }
@@ -195,8 +250,10 @@ void program::compile(const target& t)
             std::cout << "Pass: " << p.name() << std::endl;
         p.apply(*this);
         if(enabled(MIGRAPH_TRACE_COMPILE{}))
-            std::cout << *this << std::endl << std::endl;
+            std::cout << *this << std::endl;
 #ifndef NDEBUG
+        if(enabled(MIGRAPH_TRACE_COMPILE{}))
+            std::cout << "Validate ..." << std::endl;
         auto invalid = this->validate();
         if(invalid != impl->instructions.end())
         {
@@ -204,6 +261,8 @@ void program::compile(const target& t)
             MIGRAPH_THROW(p.name() + " pass produces invalid program at instruction " +
                           std::to_string(index) + ": " + invalid->op.name());
         }
+        if(enabled(MIGRAPH_TRACE_COMPILE{}))
+            std::cout << std::endl;
 #endif
     }
     auto invalid = this->validate();
@@ -214,86 +273,132 @@ void program::compile(const target& t)
     }
 }

-argument program::eval(std::unordered_map<std::string, argument> params) const
+template <class F>
+argument generic_eval(const program& p,
+                      context& ctx,
+                      std::unordered_map<std::string, argument> params,
+                      F trace)
 {
-    assert(this->validate() == impl->instructions.end());
-    std::unordered_map<const instruction*, argument> results;
-    argument result;
-    for(auto& ins : impl->instructions)
+    assert(p.validate() == p.end());
+    std::unordered_map<instruction_ref, argument> results;
+    results.reserve(p.size() * 2);
+    std::vector<argument> values;
+    values.reserve(16);
+    for(auto ins : iterator_for(p))
     {
-        if(ins.op.name() == "@literal")
+        if(ins->op.name() == "@literal")
         {
-            result = ins.lit.get_argument();
+            results.emplace(ins, trace(ins, [&] { return ins->lit.get_argument(); }));
         }
-        else if(ins.op.name() == "@param")
+        else if(ins->op.name() == "@param")
         {
-            result = params.at(any_cast<builtin::param>(ins.op).parameter);
+            std::cout << ins->op.name() << std::endl;
+            results.emplace(ins, trace(ins, [&] {
+                                return params.at(any_cast<builtin::param>(ins->op).parameter);
+                            }));
         }
-        else if(ins.op.name() == "@outline")
+        else if(ins->op.name() == "@outline")
         {
-            result = argument{ins.result, nullptr};
+            results.emplace(ins, trace(ins, [&] { return argument{ins->result, nullptr}; }));
         }
         else
         {
-            std::vector<argument> values(ins.arguments.size());
-            std::transform(ins.arguments.begin(),
-                           ins.arguments.end(),
+            values.resize(ins->arguments.size());
+            std::transform(ins->arguments.begin(),
+                           ins->arguments.end(),
                            values.begin(),
-                           [&](instruction_ref i) { return results.at(std::addressof(*i)); });
-            // std::cout << "Compute: " << ins.op.name() << std::endl;
-            result = ins.op.compute(this->impl->ctx, ins.result, values);
+                           [&](instruction_ref i) {
+                               assert(results.find(i) != results.end());
+                               return results[i];
+                           });
+            results.emplace(ins,
+                            trace(ins, [&] { return ins->op.compute(ctx, ins->result, values); }));
         }
-        results.emplace(std::addressof(ins), result);
+        assert(results.find(ins) != results.end());
     }
-    return result;
+    return results.at(std::prev(p.end()));
+}
+
+argument program::eval(std::unordered_map<std::string, argument> params) const
+{
+    return generic_eval(
+        *this, this->impl->ctx, std::move(params), [](auto&, auto f) { return f(); });
+}
+
+double common_average(const std::vector<double>& v)
+{
+    std::size_t n = v.size() / 4;
+    double total  = std::accumulate(v.begin() + n, v.end() - n, 0.0);
+    return total / std::distance(v.begin() + n, v.end() - n);
+}
+
+void program::perf_report(std::ostream& os, std::size_t n, parameter_map params) const
+{
+    using milliseconds = std::chrono::duration<double, std::milli>;
+    // Run once by itself
+    eval(params);

+    // Run and time entire program
+    std::vector<double> total_vec;
+    total_vec.reserve(n);
+    for(std::size_t i = 0; i < n; i++)
+    {
+        total_vec.push_back(time<milliseconds>([&] { eval(params); }));
+    }
+    std::sort(total_vec.begin(), total_vec.end());
+    std::unordered_map<instruction_ref, std::vector<double>> ins_vec;
+    // Fill the map
+    generic_eval(*this, this->impl->ctx, params, [&](auto ins, auto) {
+        ins_vec[ins].reserve(n);
+        return argument{};
+    });
+    // Run and time each instruction
+    for(std::size_t i = 0; i < n; i++)
+    {
+        generic_eval(*this, this->impl->ctx, params, [&](auto ins, auto f) {
+            argument result;
+            ins_vec[ins].push_back(time<milliseconds>([&] { result = f(); }));
+            return result;
+        });
+    }
+    for(auto&& p : ins_vec)
+        std::sort(p.second.begin(), p.second.end());
+    // Run and time implicit overhead
+    std::vector<double> overhead_vec;
+    overhead_vec.reserve(n);
+    for(std::size_t i = 0; i < n; i++)
+    {
+        overhead_vec.push_back(time<milliseconds>([&] {
+            generic_eval(*this, this->impl->ctx, params, [](auto...) { return argument{}; });
+        }));
+    }
+
+    double total_time             = common_average(total_vec);
+    double rate                   = std::ceil(1000.0 / total_time);
+    double overhead_time          = common_average(overhead_vec);
+    double overhead_percent       = overhead_time * 100.0 / total_time;
+    double total_instruction_time = 0.0;
+    for(auto&& p : ins_vec)
+        total_instruction_time += common_average(p.second);
+    double calculate_overhead_time    = total_time - total_instruction_time;
+    double calculate_overhead_percent = calculate_overhead_time * 100.0 / total_time;
+
+    print_program(
+        os, *this, [&](auto ins, auto&&) { os << ": " << common_average(ins_vec[ins]) << "ms"; });
+    os << "Rate: " << rate << "/sec" << std::endl;
+    os << "Total time: " << total_time << "ms" << std::endl;
+    os << "Total instructions time: " << total_instruction_time << "ms" << std::endl;
+    os << "Overhead time: " << overhead_time << "ms"
+       << ", " << calculate_overhead_time << "ms" << std::endl;
+    os << "Overhead: " << std::round(overhead_percent) << "%"
+       << ", " << std::round(calculate_overhead_percent) << "%" << std::endl;
+}

 bool operator==(const program& x, const program& y) { return to_string(x) == to_string(y); }

 std::ostream& operator<<(std::ostream& os, const program& p)
 {
-    std::unordered_map<const instruction*, std::string> names;
-    int count = 0;
-
-    for(auto& ins : p.impl->instructions)
-    {
-        std::string var_name = "@" + std::to_string(count);
-        if(ins.op.name() == "@param")
-        {
-            var_name = any_cast<builtin::param>(ins.op).parameter;
-        }
-
-        os << var_name << " = ";
-
-        os << ins.op;
-
-        if(ins.op.name() == "@literal")
-        {
-            if(ins.lit.get_shape().elements() > 10)
-                os << "{ ... }";
-            else
-                os << "{" << ins.lit << "}";
-        }
-
-        if(!ins.arguments.empty())
-        {
-            char delim = '(';
-            for(auto&& arg : ins.arguments)
-            {
-                assert(p.has_instruction(arg) && "Instruction not found");
-                os << delim << names.at(std::addressof(*arg));
-                delim = ',';
-            }
-            os << ")";
-        }
-
-        os << " -> " << ins.result;
-
-        os << std::endl;
-
-        names.emplace(std::addressof(ins), var_name);
-        count++;
-    }
+    print_program(os, p, [](auto&&...) {});
     return os;
 }
...
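Two design points in this file are worth spelling out. First, generic_eval threads a trace callback through every instruction: eval passes an identity callback, while perf_report passes callbacks that time each compute, or that return an empty argument to measure the walk itself (the "implicit overhead"). Second, common_average is a trimmed mean: after sorting, it drops the fastest and slowest quarter of the samples and averages the middle half, which makes the report robust to warm-up effects and scheduler spikes. A standalone worked example of the trimmed mean:

// With 8 sorted samples, n = 8 / 4 = 2, so the two fastest and two slowest
// runs are dropped and the middle four are averaged: (3 + 3 + 4 + 4) / 4 = 3.5.
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    std::vector<double> v = {1, 2, 3, 3, 4, 4, 9, 20}; // already sorted
    std::size_t n = v.size() / 4;
    double total  = std::accumulate(v.begin() + n, v.end() - n, 0.0);
    std::cout << total / std::distance(v.begin() + n, v.end() - n) << "ms\n"; // 3.5ms
    return 0;
}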
@@ -8,46 +8,90 @@

 namespace migraph {

-shape::shape() : m_type(float_type), m_standard(false) {}
-
-shape::shape(type_t t) : m_type(t), m_lens({1}), m_strides({1}), m_standard(true) {}
-shape::shape(type_t t, std::vector<std::size_t> l)
-    : m_type(t), m_lens(std::move(l)), m_standard(true)
-{
-    this->calculate_strides();
-    assert(m_lens.size() == m_strides.size());
-}
-shape::shape(type_t t, std::vector<std::size_t> l, std::vector<std::size_t> s)
-    : m_type(t), m_lens(std::move(l)), m_strides(std::move(s))
-{
-    assert(m_lens.size() == m_strides.size());
-    assert(std::any_of(m_strides.begin(), m_strides.end(), [](auto x) { return x > 0; }) and
-           "At least one stride must be non-zero");
-    m_standard = this->packed() and not this->transposed();
-}
-
-void shape::calculate_strides()
-{
-    m_strides.clear();
-    m_strides.resize(m_lens.size(), 0);
-    if(m_strides.empty())
-        return;
-    m_strides.back() = 1;
-    std::partial_sum(
-        m_lens.rbegin(), m_lens.rend() - 1, m_strides.rbegin() + 1, std::multiplies<std::size_t>());
-}
-
-shape::type_t shape::type() const { return this->m_type; }
-const std::vector<std::size_t>& shape::lens() const { return this->m_lens; }
-const std::vector<std::size_t>& shape::strides() const { return this->m_strides; }
-std::size_t shape::elements() const
-{
-    assert(this->lens().size() == this->strides().size());
-    if(this->lens().empty())
-        return 0;
-    return std::accumulate(
-        this->lens().begin(), this->lens().end(), std::size_t{1}, std::multiplies<std::size_t>());
-}
+struct shape_impl
+{
+    static std::shared_ptr<shape_impl> default_shape()
+    {
+        static std::shared_ptr<shape_impl> result = std::make_shared<shape_impl>();
+        return result;
+    }
+
+    shape_impl() : m_type(shape::float_type), m_standard(false) {}
+
+    shape_impl(shape::type_t t) : m_type(t), m_lens({1}), m_strides({1}), m_standard(true) {}
+    shape_impl(shape::type_t t, std::vector<std::size_t> l)
+        : m_type(t), m_lens(std::move(l)), m_standard(true)
+    {
+        this->calculate_strides();
+        assert(m_lens.size() == m_strides.size());
+    }
+    shape_impl(shape::type_t t, std::vector<std::size_t> l, std::vector<std::size_t> s)
+        : m_type(t), m_lens(std::move(l)), m_strides(std::move(s))
+    {
+        assert(m_lens.size() == m_strides.size());
+        assert(std::any_of(m_strides.begin(), m_strides.end(), [](auto x) { return x > 0; }) and
+               "At least one stride must be non-zero");
+        m_standard = this->elements() == this->element_space() and
+                     std::is_sorted(m_strides.rbegin(), m_strides.rend());
+    }
+
+    shape::type_t m_type;
+    std::vector<std::size_t> m_lens;
+    std::vector<std::size_t> m_strides;
+    bool m_standard;
+
+    void calculate_strides()
+    {
+        m_strides.clear();
+        m_strides.resize(m_lens.size(), 0);
+        if(m_strides.empty())
+            return;
+        m_strides.back() = 1;
+        std::partial_sum(m_lens.rbegin(),
+                         m_lens.rend() - 1,
+                         m_strides.rbegin() + 1,
+                         std::multiplies<std::size_t>());
+    }
+
+    std::size_t element_space() const
+    {
+        assert(m_lens.size() == m_strides.size());
+        if(m_lens.empty())
+            return 0;
+        return std::inner_product(m_lens.begin(),
+                                  m_lens.end(),
+                                  m_strides.begin(),
+                                  std::size_t{0},
+                                  std::plus<std::size_t>{},
+                                  [](std::size_t l, std::size_t s) { return (l - 1) * s; }) +
+               1;
+    }
+
+    std::size_t elements() const
+    {
+        assert(m_lens.size() == m_strides.size());
+        if(m_lens.empty())
+            return 0;
+        return std::accumulate(
+            m_lens.begin(), m_lens.end(), std::size_t{1}, std::multiplies<std::size_t>());
+    }
+};
+
+shape::shape() : impl(shape_impl::default_shape()) {}
+
+shape::shape(type_t t) : impl(std::make_shared<shape_impl>(t)) {}
+shape::shape(type_t t, std::vector<std::size_t> l)
+    : impl(std::make_shared<shape_impl>(t, std::move(l)))
+{
+}
+shape::shape(type_t t, std::vector<std::size_t> l, std::vector<std::size_t> s)
+    : impl(std::make_shared<shape_impl>(t, std::move(l), std::move(s)))
+{
+}
+
+shape::type_t shape::type() const { return impl->m_type; }
+const std::vector<std::size_t>& shape::lens() const { return impl->m_lens; }
+const std::vector<std::size_t>& shape::strides() const { return impl->m_strides; }
+std::size_t shape::elements() const { return impl->elements(); }

 std::size_t shape::bytes() const
 {
     std::size_t n = 0;
@@ -98,25 +142,13 @@ bool shape::broadcasted() const
                               std::multiplies<std::size_t>()) == 0;
 }

-bool shape::standard() const { return this->m_standard; }
+bool shape::standard() const { return impl->m_standard; }

-std::size_t shape::element_space() const
-{
-    assert(this->lens().size() == this->strides().size());
-    if(this->lens().empty())
-        return 0;
-    return std::inner_product(this->lens().begin(),
-                              this->lens().end(),
-                              this->strides().begin(),
-                              std::size_t{0},
-                              std::plus<std::size_t>{},
-                              [](std::size_t l, std::size_t s) { return (l - 1) * s; }) +
-           1;
-}
+std::size_t shape::element_space() const { return impl->element_space(); }

 std::string shape::type_string() const
 {
-    switch(this->m_type)
+    switch(this->type())
     {
 #define MIGRAPH_SHAPE_TYPE_STRING_CASE(x, t) \
     case x: return #x;
...
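The new standard() definition deserves a note: element_space() is the index of the last addressable element plus one, i.e. 1 + sum over dimensions of (len - 1) * stride, and a shape is standard when that equals elements() (no gaps) and the strides are in descending order (no transpose). A small sketch of the consequences, assuming the shape header is <migraph/shape.hpp>:

#include <cassert>
#include <migraph/shape.hpp>

int main()
{
    migraph::shape packed{migraph::shape::float_type, {2, 3}};       // strides computed as {3, 1}
    migraph::shape view{migraph::shape::float_type, {2, 3}, {1, 2}}; // a transposed view
    assert(packed.elements() == 6 && packed.element_space() == 6);   // (2-1)*3 + (3-1)*1 + 1 = 6
    assert(packed.standard());                                       // packed, strides descending
    assert(view.element_space() == 6);                               // (2-1)*1 + (3-1)*2 + 1 = 6
    assert(!view.standard());                                        // strides {1, 2} not descending
    return 0;
}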
@@ -7,6 +7,7 @@
 #include <migraph/iterator_for.hpp>
 #include <migraph/cpu/gemm.hpp>
 #include <unordered_map>
+#include <utility>

 namespace migraph {
 namespace cpu {
@@ -39,9 +40,9 @@ struct cpu_batch_norm_inference

     std::string name() const { return "cpu::batch_norm_inference"; }

-    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument output{output_shape};
@@ -95,7 +96,7 @@ struct cpu_convolution
     convolution op;

     std::string name() const { return "cpu::convolution"; }
-    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
     argument compute(context&, shape output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
@@ -161,8 +162,8 @@ struct cpu_pooling
     pooling op;

     std::string name() const { return "cpu::pooling_" + Op::name(); }
-    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         visit_all(result, args[0])([&](auto output, auto input) {
@@ -208,8 +209,8 @@ struct cpu_contiguous
 {
     contiguous op;
     std::string name() const { return "cpu::contiguous"; }
-    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         visit_all(result, args[0])([&](auto output, auto input) {
@@ -225,9 +226,9 @@ struct cpu_gemm
 {
     gemm op;
     std::string name() const { return "cpu::gemm"; }
-    shape compute_shape(std::vector<shape> inputs) const { return op.compute_shape(inputs); }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         migemm(result, args[0], args[1], op.alpha, op.beta);
@@ -357,8 +358,8 @@ struct cpu_unary
 {
     Op op;
     std::string name() const { return op.name(); }
-    shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         result.visit([&](auto output) {
@@ -373,8 +374,8 @@ struct cpu_unary
 struct softmax2d
 {
     std::string name() const { return "cpu::softmax2d"; }
-    shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         visit_all(result, args[0])([&](auto output, auto input) {
@@ -449,8 +450,8 @@ struct cpu_binary
 {
     Op op;
     std::string name() const { return op.name(); }
-    shape compute_shape(std::vector<shape> inputs) const { return inputs.front(); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
...
@@ -18,6 +18,7 @@ target_link_libraries(migraph_device migraph hip::device)
 target_include_directories(migraph_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)

 add_library(migraph_gpu
+    eliminate_workspace.cpp
     hip.cpp
     target.cpp
     lowering.cpp
...
#include <migraph/gpu/eliminate_workspace.hpp>
#include <migraph/gpu/hip.hpp>
#include <migraph/program.hpp>
#include <migraph/instruction.hpp>
#include <migraph/operators.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/ranges.hpp>
#include <migraph/stringutils.hpp>

namespace migraph {
namespace gpu {

void eliminate_workspace::apply(program& p) const
{
    std::size_t n = 0;
    std::vector<instruction_ref> allocs;
    for(auto ins : iterator_for(p))
    {
        if(ins->output.size() != 1)
            continue;
        if(ins->op.name() != "hip::allocate")
            continue;
        auto&& a = any_cast<hip_allocate>(ins->op);
        if(a.tag == "workspace")
        {
            n = std::max(n, ins->get_shape().bytes());
            allocs.push_back(ins);
        }
    }
    auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
    for(auto&& a : allocs)
    {
        p.replace_instruction(a, ws);
        p.remove_instruction(a);
    }
}

} // namespace gpu
} // namespace migraph
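This new pass folds every "workspace"-tagged hip::allocate into a single "workspace" parameter sized to the largest request, so all convolutions share one scratch buffer instead of each owning its own. A sketch of invoking the pass in isolation (build_program_with_two_convs is a hypothetical helper; the sizes are illustrative):

// After apply(), both workspace allocations refer to one shared parameter
// whose size is the maximum of the individual workspace requests.
migraph::program p = build_program_with_two_convs(); // hypothetical helper
migraph::gpu::eliminate_workspace{}.apply(p);
auto ws_shape = p.get_parameter_shape("workspace");  // int8_type, {max request}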
@@ -13,12 +13,28 @@ using hip_ptr = MIGRAPH_MANAGE_PTR(void, hipFree);

 std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }

-hip_ptr allocate_gpu(std::size_t sz)
+std::size_t get_available_gpu_memory()
+{
+    size_t free;
+    size_t total;
+    auto status = hipMemGetInfo(&free, &total);
+    if(status != hipSuccess)
+        MIGRAPH_THROW("Failed getting available memory: " + hip_error(status));
+    return free;
+}
+
+hip_ptr allocate_gpu(std::size_t sz, bool host = false)
 {
+    if(sz > get_available_gpu_memory())
+        MIGRAPH_THROW("Memory not available to allocate buffer: " + std::to_string(sz));
     void* result;
-    auto status = hipMalloc(&result, sz);
+    auto status = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
     if(status != hipSuccess)
+    {
+        if(host)
             MIGRAPH_THROW("Gpu allocation failed: " + hip_error(status));
+        else
+            return allocate_gpu(sz, true);
+    }
     return hip_ptr{result};
 }
@@ -40,24 +56,24 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
     return result;
 }

-hip_ptr write_to_gpu(const void* x, std::size_t sz)
+hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
 {
-    auto result = allocate_gpu(sz);
+    auto result = allocate_gpu(sz, host);
     auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
     if(status != hipSuccess)
         MIGRAPH_THROW("Copy to gpu failed: " + hip_error(status));
     return result;
 }

-argument allocate_gpu(shape s)
+argument allocate_gpu(const shape& s, bool host)
 {
-    auto p = share(allocate_gpu(s.bytes() + 1));
+    auto p = share(allocate_gpu(s.bytes() + 1, host));
     return {s, [p]() mutable { return reinterpret_cast<char*>(p.get()); }};
 }

-argument to_gpu(argument arg)
+argument to_gpu(argument arg, bool host)
 {
-    auto p = share(write_to_gpu(arg.data(), arg.get_shape().bytes()));
+    auto p = share(write_to_gpu(arg.data(), arg.get_shape().bytes(), host));
     return {arg.get_shape(), [p]() mutable { return reinterpret_cast<char*>(p.get()); }};
 }
...
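allocate_gpu now checks free device memory up front and, when a device allocation fails anyway, retries once with pinned host memory (hipHostMalloc), which the GPU can still address over the bus; the host branch throws, so the retry cannot recurse. (The retry as committed discarded its result; the `return` has been added above.) From the caller's point of view, a sketch with an illustrative oversized buffer:

// An allocation larger than device memory can now land in pinned host
// memory instead of throwing immediately (size is illustrative only).
migraph::shape big{migraph::shape::int8_type, {1ull << 34}}; // 16 GiB
auto host_backed = migraph::gpu::allocate_gpu(big, true);    // explicit pinned-host buffer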
#ifndef MIGRAPH_GUARD_RTGLIB_ELIMINATE_WORKSPACE_HPP
#define MIGRAPH_GUARD_RTGLIB_ELIMINATE_WORKSPACE_HPP

#include <string>
#include <migraph/instruction_ref.hpp>

namespace migraph {
struct program;

namespace gpu {

struct eliminate_workspace
{
    std::string name() const { return "eliminate_workspace"; }
    void apply(program& p) const;
};

} // namespace gpu
} // namespace migraph

#endif
@@ -2,25 +2,27 @@
 #define MIGRAPH_GUARD_MIGRAPHLIB_HIP_HPP

 #include <migraph/operators.hpp>
+#include <utility>

 namespace migraph {
 namespace gpu {

-migraph::argument allocate_gpu(migraph::shape s);
+migraph::argument allocate_gpu(const migraph::shape& s, bool host = false);

-migraph::argument to_gpu(migraph::argument arg);
+migraph::argument to_gpu(migraph::argument arg, bool host = false);

 migraph::argument from_gpu(migraph::argument arg);

 struct hip_allocate
 {
+    std::string tag{};
     std::string name() const { return "hip::allocate"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs}.has(1);
         return inputs.front();
     }
-    argument compute(context&, shape output_shape, std::vector<argument>) const
+    argument compute(context&, const shape& output_shape, const std::vector<argument>&) const
     {
         return allocate_gpu(output_shape);
     }
@@ -29,12 +31,12 @@ struct hip_allocate

 struct hip_write
 {
     std::string name() const { return "hip::write"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs}.has(1);
         return inputs.front();
     }
-    argument compute(context&, shape, std::vector<argument> args) const
+    argument compute(context&, const shape&, const std::vector<argument>& args) const
     {
         return to_gpu(args.front());
     }
...
@@ -12,6 +12,7 @@
 #include <migraph/iterator_for.hpp>
 #include <migraph/gpu/rocblas.hpp>
 #include <migraph/gpu/context.hpp>
+#include <utility>

 namespace migraph {
 namespace gpu {
@@ -22,14 +23,15 @@ struct miopen_batch_norm_inference

     std::string name() const { return "gpu::batch_norm_inference"; }

-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(6);
         return op.compute_shape(
             {inputs.at(0), inputs.at(1), inputs.at(2), inputs.at(3), inputs.at(4)});
     }

-    argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         auto x_desc = make_tensor(args[0].get_shape());
         auto y_desc = make_tensor(output_shape);
@@ -63,12 +65,13 @@ struct miopen_convolution
     miopenConvFwdAlgorithm_t algo{};

     std::string name() const { return "gpu::convolution"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(4).standard();
         return op.compute_shape({inputs.at(0), inputs.at(1)});
     }
-    argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         auto x_desc = make_tensor(args[0].get_shape());
         auto w_desc = make_tensor(args[1].get_shape());
@@ -91,7 +94,7 @@ struct miopen_convolution
         return args[3];
     }

-    shape compile(context& ctx, shape output_shape, std::vector<instruction_ref> inputs)
+    shape compile(context& ctx, const shape& output_shape, std::vector<instruction_ref> inputs)
     {
         shape workspace_shape{};
         auto x_desc = make_tensor(inputs[0]->get_shape());
@@ -100,7 +103,7 @@ struct miopen_convolution
         std::size_t workspace_size = 0;
         miopenConvolutionForwardGetWorkSpaceSize(
-            ctx.handle.get(), x_desc.get(), w_desc.get(), cd.get(), y_desc.get(), &workspace_size);
+            ctx.handle.get(), w_desc.get(), x_desc.get(), cd.get(), y_desc.get(), &workspace_size);
         workspace_shape = shape{shape::int8_type, {workspace_size}};

         auto x         = to_gpu(generate_argument(inputs[0]->get_shape()));
@@ -108,7 +111,7 @@ struct miopen_convolution
         auto y         = to_gpu(generate_argument(output_shape));
         auto workspace = allocate_gpu(workspace_shape);

-        int algo_count;
+        int algo_count = 1;
         miopenConvAlgoPerf_t perf;
         miopenFindConvolutionForwardAlgorithm(ctx.handle.get(),
                                               x_desc.get(),
@@ -125,7 +128,8 @@ struct miopen_convolution
                                               workspace_size,
                                               false);
         algo = perf.fwd_algo;
-        return workspace_shape;
+        return algo == miopenConvolutionFwdAlgoWinograd ? shape{shape::int8_type, {0}}
+                                                        : workspace_shape;
     }
 };
@@ -135,12 +139,13 @@ struct miopen_pooling
     shared<pooling_descriptor> pd;

     std::string name() const { return "gpu::pooling"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(2).standard();
         return op.compute_shape({inputs.at(0)});
     }
-    argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         auto x_desc = make_tensor(args[0].get_shape());
         auto y_desc = make_tensor(output_shape);
@@ -166,13 +171,14 @@ struct miopen_pooling
 struct miopen_add
 {
     std::string name() const { return "gpu::add"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(3).not_broadcasted();
         return inputs.at(0);
     }

-    argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         if(args[1].get_shape().broadcasted())
         {
@@ -213,12 +219,13 @@ struct miopen_gemm
 {
     gemm op;
     std::string name() const { return "gpu::convolution"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(3);
         return op.compute_shape({inputs.at(0), inputs.at(1)});
     }

-    argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         float alpha = 1.0f;
         float beta  = 0.0f;
@@ -252,14 +259,14 @@ struct miopen_contiguous
 {
     contiguous op;
     std::string name() const { return "gpu::contiguous"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(2);
         return op.compute_shape({inputs.at(0)});
     }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    argument compute(context&, shape output_shape, const std::vector<argument>& args) const
     {
-        hip_contiguous(output_shape, args.at(0), args.at(1));
+        hip_contiguous(std::move(output_shape), args.at(0), args.at(1));
         return args.at(1);
     }
 };
@@ -268,13 +275,14 @@ struct miopen_relu
 {
     shared<activation_descriptor> ad;
     std::string name() const { return "gpu::relu"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(2).not_broadcasted();
         return inputs.at(1);
     }

-    argument compute(context& ctx, shape output_shape, std::vector<argument> args) const
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         float alpha = 1, beta = 0;
         auto x_desc = make_tensor(args[0].get_shape());
@@ -297,42 +305,50 @@ struct miopen_apply
     program* prog = nullptr;
     context ctx{};

+    void check_shape(shape x, instruction_ref i)
+    {
+        assert(x == i->get_shape());
+        (void)x;
+        (void)i;
+    }
+
     void apply()
     {
         for(auto it = prog->begin(); it != prog->end(); it++)
         {
+            auto s = it->get_shape();
             if(it->op.name() == "convolution")
             {
-                apply_convolution(it);
+                check_shape(s, apply_convolution(it));
             }
             else if(it->op.name() == "activation")
             {
-                apply_activation(it);
+                check_shape(s, apply_activation(it));
             }
             else if(it->op.name() == "pooling")
             {
-                apply_pooling(it);
+                check_shape(s, apply_pooling(it));
             }
             else if(it->op.name() == "add")
             {
-                apply_add(it);
+                check_shape(s, apply_add(it));
             }
             else if(it->op.name() == "gemm")
             {
-                apply_gemm(it);
+                check_shape(s, apply_gemm(it));
             }
             else if(it->op.name() == "contiguous")
             {
-                apply_contiguous(it);
+                check_shape(s, apply_contiguous(it));
             }
             else if(it->op.name() == "batch_norm_inference")
             {
-                apply_batch_norm_inference(it);
+                check_shape(s, apply_batch_norm_inference(it));
             }
         }
     }

-    instruction_ref insert_allocation(instruction_ref ins, const shape& s)
+    instruction_ref insert_allocation(instruction_ref ins, const shape& s, std::string tag = "")
     {
         if(ins == --prog->end())
         {
@@ -341,70 +357,71 @@ struct miopen_apply
         else
         {
             auto is     = prog->add_outline(s);
-            auto result = prog->insert_instruction(ins, hip_allocate{}, is);
+            auto result = prog->insert_instruction(ins, hip_allocate{std::move(tag)}, is);
             return result;
         }
     }

-    void apply_convolution(instruction_ref ins)
+    instruction_ref apply_convolution(instruction_ref ins)
     {
         auto&& op = any_cast<convolution>(ins->op);

         auto conv      = miopen_convolution{op, make_conv(op)};
         auto ws        = conv.compile(ctx, ins->result, ins->arguments);
-        auto workspace = insert_allocation(ins, ws);
+        auto workspace = insert_allocation(ins, ws, "workspace");
         auto output    = insert_allocation(ins, ins->result);

-        prog->replace_instruction(
+        return prog->replace_instruction(
             ins, conv, ins->arguments.at(0), ins->arguments.at(1), workspace, output);
     }

-    void apply_pooling(instruction_ref ins)
+    instruction_ref apply_pooling(instruction_ref ins)
     {
         auto&& op   = any_cast<pooling>(ins->op);
         auto pd     = make_pooling(op);
         auto output = insert_allocation(ins, ins->result);

-        prog->replace_instruction(
+        return prog->replace_instruction(
             ins, miopen_pooling{op, std::move(pd)}, ins->arguments.at(0), output);
     }

-    void apply_activation(instruction_ref ins)
+    instruction_ref apply_activation(instruction_ref ins)
     {
         auto&& op = any_cast<activation>(ins->op);
         auto ad   = make_relu();
         if(op.mode == "relu")
         {
             auto output = insert_allocation(ins, ins->result);
-            prog->replace_instruction(
+            return prog->replace_instruction(
                 ins, miopen_relu{std::move(ad)}, ins->arguments.at(0), output);
         }
+        return ins;
     }

-    void apply_add(instruction_ref ins)
+    instruction_ref apply_add(instruction_ref ins)
     {
         auto output = insert_allocation(ins, ins->result);
-        prog->replace_instruction(
+        return prog->replace_instruction(
             ins, miopen_add{}, ins->arguments.at(0), ins->arguments.at(1), output);
     }

-    void apply_gemm(instruction_ref ins)
+    instruction_ref apply_gemm(instruction_ref ins)
     {
         auto&& op   = any_cast<gemm>(ins->op);
         auto output = insert_allocation(ins, ins->result);
-        prog->replace_instruction(
+        return prog->replace_instruction(
             ins, miopen_gemm{op}, ins->arguments.at(0), ins->arguments.at(1), output);
     }

-    void apply_contiguous(instruction_ref ins)
+    instruction_ref apply_contiguous(instruction_ref ins)
     {
         auto&& op   = any_cast<contiguous>(ins->op);
         auto output = insert_allocation(ins, ins->result);
-        prog->replace_instruction(ins, miopen_contiguous{op}, ins->arguments.at(0), output);
+        return prog->replace_instruction(ins, miopen_contiguous{op}, ins->arguments.at(0), output);
     }

-    void apply_batch_norm_inference(instruction_ref ins)
+    instruction_ref apply_batch_norm_inference(instruction_ref ins)
     {
         auto&& op   = any_cast<batch_norm_inference>(ins->op);
         auto output = insert_allocation(ins, ins->result);
@@ -416,7 +433,7 @@ struct miopen_apply
                        ins->arguments.end(),
                        std::back_inserter(reshapes),
                        [&](auto i) { return prog->insert_instruction(ins, reshape_op, i); });
-        prog->replace_instruction(ins,
-                                  miopen_batch_norm_inference{op},
-                                  ins->arguments.at(0),
-                                  reshapes[0],
+        return prog->replace_instruction(ins,
+                                         miopen_batch_norm_inference{op},
+                                         ins->arguments.at(0),
+                                         reshapes[0],
...
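The refactoring in apply() is a debug aid: each apply_* now returns the instruction that replaced the original, and check_shape asserts that lowering preserved the result shape, which would catch mistakes like the transposed GetWorkSpaceSize descriptor arguments fixed above. The invariant in isolation, as a fragment assuming the API shown in this file:

// Replacing an instruction must not change the shape its consumers see.
auto s   = it->get_shape();       // shape before lowering
auto rep = apply_convolution(it); // returns the replacement instruction
assert(s == rep->get_shape());    // lowering must be shape-preserving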
@@ -2,6 +2,7 @@
 #include <migraph/gpu/lowering.hpp>
 #include <migraph/gpu/write_literals.hpp>
 #include <migraph/gpu/context.hpp>
+#include <migraph/gpu/eliminate_workspace.hpp>
 #include <migraph/check_context.hpp>
 #include <migraph/auto_contiguous.hpp>
 #include <migraph/dead_code_elimination.hpp>
@@ -22,9 +23,9 @@ std::vector<pass> target::get_passes(migraph::context& gctx) const
         //simplify_reshapes{},
         //dead_code_elimination{},
         lowering{ctx},
-        //eliminate_contiguous{},
-        //dead_code_elimination{},
-        //write_literals{},
+        eliminate_workspace{},
+        eliminate_contiguous{},
+        dead_code_elimination{},
         write_literals{},
         //check_context<context>{},
         dead_code_elimination{}
...
@@ -605,9 +605,7 @@ void transpose_test()
         result.visit([&](auto output) {
             std::vector<size_t> new_lens = {1, 3, 2, 2};
-            std::vector<size_t> new_strides = {12, 1, 6, 3};
             EXPECT(bool{output.get_shape().lens() == new_lens});
-            EXPECT(bool{output.get_shape().strides() == new_strides});
         });
     }
     {
...
...@@ -104,6 +104,7 @@ void verify_program() ...@@ -104,6 +104,7 @@ void verify_program()
visit_all(cpu_arg_f.get(), gpu_arg)([](auto cpu, auto gpu) { visit_all(cpu_arg_f.get(), gpu_arg)([](auto cpu, auto gpu) {
if(not migraph::verify_range(cpu, gpu)) if(not migraph::verify_range(cpu, gpu))
{ {
// TODO: Check for nans
std::cout << "FAILED: " << migraph::get_type_name<V>() << std::endl; std::cout << "FAILED: " << migraph::get_type_name<V>() << std::endl;
} }
}); });
@@ -272,6 +273,29 @@ struct test_transpose
     }
 };
+struct test_batchnorm_inference_2
+{
+    const size_t width = 14;
+    const size_t height = 14;
+    const size_t channels = 256;
+    const size_t batches = 1;
+    migraph::program create_program() const
+    {
+        migraph::program p;
+        migraph::shape s{migraph::shape::float_type, {batches, channels, height, width}};
+        migraph::shape vars{migraph::shape::float_type, {channels}};
+        auto x = p.add_parameter("x", s);
+        auto mean = p.add_parameter("mean", vars);
+        auto variance = p.add_parameter("variance", vars);
+        auto scale = p.add_parameter("scale", vars);
+        auto bias = p.add_parameter("bias", vars);
+        p.add_instruction(migraph::batch_norm_inference{}, x, mean, variance, scale, bias);
+        return p;
+    }
+};
 struct test_batchnorm_inference
 {
     const size_t width = 3;
@@ -309,4 +333,5 @@ int main()
     verify_program<test_contiguous>();
     verify_program<test_transpose>();
     verify_program<test_batchnorm_inference>();
+    verify_program<test_batchnorm_inference_2>();
 }
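test_batchnorm_inference_2 exercises a larger, resnet-scale activation (1x256x14x14) against per-channel statistics, rather than the tiny shapes of the existing test. For reference, batch norm in inference mode applies, per channel c (epsilon is a small constant whose default is not visible in this diff):

    y_{n,c,h,w} = scale_c * (x_{n,c,h,w} - mean_c) / sqrt(variance_c + epsilon) + bias_c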
@@ -6,7 +6,7 @@ struct sum_op
 {
     std::string name() const { return "sum"; }
     migraph::argument
-    compute(migraph::context&, migraph::shape, std::vector<migraph::argument> args) const
+    compute(migraph::context&, const migraph::shape&, std::vector<migraph::argument> args) const
     {
         migraph::argument result;
         if(args.size() != 2)
@@ -36,7 +36,7 @@ struct minus_op
 {
     std::string name() const { return "minus"; }
     migraph::argument
-    compute(migraph::context&, migraph::shape, std::vector<migraph::argument> args) const
+    compute(migraph::context&, const migraph::shape&, std::vector<migraph::argument> args) const
     {
         migraph::argument result;
         if(args.size() != 2)
@@ -66,7 +66,7 @@ struct pass_op
 {
     std::string name() const { return "pass"; }
     migraph::argument
-    compute(migraph::context&, migraph::shape, std::vector<migraph::argument> args) const
+    compute(migraph::context&, const migraph::shape&, std::vector<migraph::argument> args) const
     {
         if(args.empty())
             return {};
@@ -85,7 +85,7 @@ struct pass_standard_op
 {
     std::string name() const { return "pass"; }
     migraph::argument
-    compute(migraph::context&, migraph::shape, std::vector<migraph::argument> args) const
+    compute(migraph::context&, const migraph::shape&, std::vector<migraph::argument> args) const
     {
         if(args.empty())
             return {};
@@ -109,12 +109,12 @@ struct nop
 {
     std::string name() const { return "nop"; }
     migraph::argument
-    compute(migraph::context&, migraph::shape, std::vector<migraph::argument>) const
+    compute(migraph::context&, const migraph::shape&, const std::vector<migraph::argument>&) const
     {
         return {};
     }
-    migraph::shape compute_shape(std::vector<migraph::shape>) const { return {}; }
+    migraph::shape compute_shape(const std::vector<migraph::shape>&) const { return {}; }
 };
 
 inline migraph::literal get_2x2()
...
@@ -141,7 +141,7 @@ bool throws(F f)
 }
 
 template <class F, class Exception>
-bool throws(F f, std::string msg = "")
+bool throws(F f, const std::string& msg = "")
 {
     try
     {
...
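The signature changes in this file all follow one pattern: parameters that are only read (the shape, and the argument or shape vectors where no copy is needed) are now taken by const reference instead of by value, avoiding a copy on every call. A minimal illustration of the difference (not migraph code):

    #include <vector>

    // Pass by value: the caller's vector is copied on every call.
    int sum_by_value(std::vector<int> v)
    {
        int s = 0;
        for(int x : v)
            s += x;
        return s;
    }

    // Pass by const reference: read-only access, no copy.
    int sum_by_cref(const std::vector<int>& v)
    {
        int s = 0;
        for(int x : v)
            s += x;
        return s;
    }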
#include <migraph/program.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/instruction.hpp>
#include <migraph/operators.hpp>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <type_traits>
#include "test.hpp"
template <class... Ts>
void expect_shape(const migraph::shape& expected, const migraph::operation& op, Ts... xs)
{
migraph::program p;
std::vector<migraph::shape> shapes{xs...};
std::vector<migraph::instruction_ref> args(shapes.size());
std::transform(
shapes.begin(), shapes.end(), args.begin(), [&](auto&& s) { return p.add_outline(s); });
p.add_instruction(op, args);
if(p.get_shape() != expected)
{
std::cout << "FAILED: Incorrect shape for " << op.name() << ": ";
std::cout << expected << " != " << p.get_shape() << std::endl;
for(auto&& s : shapes)
std::cout << " " << s << std::endl;
}
}
template <class... Ts>
void throws_shape(const migraph::operation& op, Ts... xs)
{
migraph::program p;
std::vector<migraph::shape> shapes{xs...};
std::vector<migraph::instruction_ref> args(shapes.size());
std::transform(
shapes.begin(), shapes.end(), args.begin(), [&](auto&& s) { return p.add_outline(s); });
bool thrown = test::throws([&] { p.add_instruction(op, args); });
if(not thrown)
{
std::cout << "FAILED: No error found for " << op.name() << ": ";
for(auto&& s : shapes)
std::cout << " " << s << std::endl;
}
}
template <class...>
struct always_false : std::false_type
{
};
template <class... Ts>
void throws_shape(const migraph::shape&, Ts...)
{
static_assert(always_false<Ts...>{},
"An expected shape should not be passed to throws_shape function");
}
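The catch-all throws_shape overload above guards against passing an expected shape by mistake. The assert must depend on the template parameters: static_assert(false, ...) would fire even when the overload is never called, while a pack-dependent always_false<Ts...> only fires if this overload is actually instantiated. A reduced sketch of the idiom:

    #include <type_traits>

    template <class...>
    struct always_false : std::false_type
    {
    };

    template <class... Ts>
    void reject(Ts...)
    {
        // Fires only when this overload is chosen by overload resolution.
        static_assert(always_false<Ts...>{}, "this overload must not be used");
    }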
void batch_norm_inference_shape()
{
const size_t channels = 3;
migraph::shape s{migraph::shape::float_type, {4, channels, 3, 3}};
migraph::shape vars{migraph::shape::float_type, {channels}};
expect_shape(s, migraph::batch_norm_inference{}, s, vars, vars, vars, vars);
throws_shape(migraph::batch_norm_inference{}, s);
throws_shape(migraph::batch_norm_inference{}, s, vars, vars, vars, vars, vars);
}
void convolution_shape()
{
migraph::shape output{migraph::shape::float_type, {4, 4, 1, 1}};
migraph::shape input{migraph::shape::float_type, {4, 3, 3, 3}};
migraph::shape weights{migraph::shape::float_type, {4, 3, 3, 3}};
expect_shape(output, migraph::convolution{}, input, weights);
throws_shape(migraph::convolution{}, input);
migraph::shape input2{migraph::shape::float_type, {3, 3}};
migraph::shape weights2{migraph::shape::float_type, {3, 3}};
throws_shape(migraph::convolution{}, input2, weights2);
throws_shape(migraph::convolution{}, input2, weights);
}
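The expected shapes follow the standard convolution size formula per spatial dimension,

    out = floor((in + 2 * pad - kernel) / stride) + 1

so with in = 3, pad = 0, kernel = 3, stride = 1 each spatial dimension collapses to 1; the batch (4) comes from the input and the output channels (4) from the first dimension of the weights, giving {4, 4, 1, 1}. The 2-D input2/weights2 cases throw because the operator expects 4-dimensional tensors.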
void transpose_shape()
{
migraph::shape input{migraph::shape::float_type, {2, 2}};
migraph::shape output{migraph::shape::float_type, {2, 2}, {1, 2}};
expect_shape(input, migraph::transpose{{0, 1}}, input);
expect_shape(output, migraph::transpose{{1, 0}}, input);
throws_shape(migraph::transpose{{1, 2}}, input);
}
void contiguous_shape()
{
migraph::shape output{migraph::shape::float_type, {2, 2}};
migraph::shape input{migraph::shape::float_type, {2, 2}, {1, 2}};
expect_shape(output, migraph::contiguous{}, input);
throws_shape(migraph::contiguous{}, input, input);
migraph::shape single{migraph::shape::float_type, {2}};
throws_shape(migraph::contiguous{}, single);
}
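contiguous takes a non-packed (here, transposed) shape and produces the same lengths with standard row-major strides, which is why {2, 2} with strides {1, 2} becomes {2, 2} with strides {2, 1} above; the 1-D {2} case is rejected too, suggesting the operator requires a non-standard input. A sketch of the stride recomputation (assumed semantics):

    #include <cstddef>
    #include <vector>

    // Row-major ("standard") strides: the last dimension moves fastest.
    // For lens {2, 2} this returns {2, 1}.
    std::vector<std::size_t> standard_strides(const std::vector<std::size_t>& lens)
    {
        std::vector<std::size_t> strides(lens.size(), 1);
        for(std::size_t i = lens.size(); i > 1; i--)
            strides[i - 2] = strides[i - 1] * lens[i - 1];
        return strides;
    }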
void reshape_shape()
{
migraph::shape input{migraph::shape::float_type, {24, 1, 1, 1}};
for(auto&& new_shape :
std::vector<std::vector<int64_t>>{{8, 3, 1, 1}, {1, 3, 4, 2}, {1, 3, 4, 2}})
{
std::vector<std::size_t> lens(new_shape.size());
std::copy(new_shape.begin(), new_shape.end(), lens.begin());
migraph::shape output{migraph::shape::float_type, lens};
expect_shape(output, migraph::reshape{new_shape}, input);
}
for(auto&& new_shape : std::vector<std::vector<int64_t>>{{8, 3, 2, 2}, {1, 3, -1, -1}})
{
throws_shape(migraph::reshape{new_shape}, input);
}
}
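reshape preserves the total element count, 24 here. The throwing cases are a count mismatch (8 * 3 * 2 * 2 = 96 != 24) and more than one -1 placeholder; a single -1 could be inferred from the remaining dimensions. A sketch of the validity check (assumed semantics):

    #include <cstdint>
    #include <vector>

    // At most one -1 may be inferred, and the known dimensions must
    // divide the total element count evenly.
    bool valid_reshape(std::int64_t elements, const std::vector<std::int64_t>& new_dims)
    {
        std::int64_t known = 1;
        int wildcards = 0;
        for(auto d : new_dims)
        {
            if(d == -1)
                wildcards++;
            else
                known *= d;
        }
        if(wildcards > 1)
            return false; // {1, 3, -1, -1} is rejected above
        if(wildcards == 1)
            return known != 0 and elements % known == 0;
        return elements == known; // {8, 3, 2, 2} = 96 != 24, also rejected
    }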
void flatten_shape()
{
migraph::shape input{migraph::shape::float_type, {2, 4, 6, 8}};
expect_shape(
migraph::shape{migraph::shape::float_type, {1, 2 * 4 * 6 * 8}}, migraph::flatten{0}, input);
expect_shape(
migraph::shape{migraph::shape::float_type, {2, 4 * 6 * 8}}, migraph::flatten{1}, input);
expect_shape(
migraph::shape{migraph::shape::float_type, {2 * 4, 6 * 8}}, migraph::flatten{2}, input);
expect_shape(
migraph::shape{migraph::shape::float_type, {2 * 4 * 6, 8}}, migraph::flatten{3}, input);
expect_shape(
migraph::shape{migraph::shape::float_type, {2 * 4 * 6 * 8, 1}}, migraph::flatten{4}, input);
throws_shape(migraph::flatten{5}, input);
}
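flatten{axis} collapses every dimension before axis into the first output dimension and the rest into the second, so {2, 4, 6, 8} yields the five shapes checked above, from {1, 384} at axis 0 to {384, 1} at axis 4; axis 5 is out of range for a 4-D input. A sketch of the length computation:

    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // flatten{axis}: {d0, ..., dn-1} -> {d0*...*d(axis-1), d(axis)*...*d(n-1)}.
    // For {2, 4, 6, 8} and axis 2 this gives {8, 48}, as checked above.
    std::vector<std::size_t> flatten_dims(const std::vector<std::size_t>& lens, std::size_t axis)
    {
        auto mid = lens.begin() + axis;
        std::size_t a = std::accumulate(lens.begin(), mid, std::size_t{1}, std::multiplies<>{});
        std::size_t b = std::accumulate(mid, lens.end(), std::size_t{1}, std::multiplies<>{});
        return {a, b};
    }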
int main()
{
batch_norm_inference_shape();
convolution_shape();
transpose_shape();
contiguous_shape();
reshape_shape();
flatten_shape();
}
@@ -8,12 +8,12 @@ struct simple_operation
 {
     int data = 1;
     std::string name() const { return "simple"; }
-    migraph::shape compute_shape(std::vector<migraph::shape>) const
+    migraph::shape compute_shape(const std::vector<migraph::shape>&) const
     {
         MIGRAPH_THROW("not computable");
     }
     migraph::argument
-    compute(migraph::context&, migraph::shape, std::vector<migraph::argument>) const
+    compute(migraph::context&, const migraph::shape&, const std::vector<migraph::argument>&) const
     {
         MIGRAPH_THROW("not computable");
     }
@@ -27,12 +27,12 @@ struct simple_operation
 struct simple_operation_no_print
 {
     std::string name() const { return "simple"; }
-    migraph::shape compute_shape(std::vector<migraph::shape>) const
+    migraph::shape compute_shape(const std::vector<migraph::shape>&) const
     {
         MIGRAPH_THROW("not computable");
     }
     migraph::argument
-    compute(migraph::context&, migraph::shape, std::vector<migraph::argument>) const
+    compute(migraph::context&, const migraph::shape&, const std::vector<migraph::argument>&) const
     {
         MIGRAPH_THROW("not computable");
     }
...