Commit d22bab64 authored by wsttiger

Added op namespace on operators

parents ad0ab357 3d264140
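The change itself is mechanical: operator structs such as convolution, gemm, reshape, and broadcast are now referenced through the op namespace, so every call site qualifies them explicitly. A minimal illustration of the pattern (hypothetical input/weights instructions, not taken from the diff below):

// before this commit
auto c = prog.add_instruction(convolution{}, input, weights);
// after: operators are qualified with the op namespace
auto c = prog.add_instruction(op::convolution{}, input, weights);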
#ifndef MIGRAPH_GUARD_PASS_CONFIG_HPP
#define MIGRAPH_GUARD_PASS_CONFIG_HPP
#include <migraph/env.hpp>
namespace migraph {
MIGRAPH_DECLARE_ENV_VAR(MIGRAPH_DISABLE_MEMORY_COLORING)
} // namespace migraph
#endif // MIGRAPH_GUARD_PASS_CONFIG_HPP
@@ -75,6 +75,8 @@ struct program
shape get_parameter_shape(std::string name) const;
+instruction_ref get_parameter(std::string name) const;
std::unordered_map<std::string, shape> get_parameter_shapes() const;
argument eval(parameter_map params) const;
@@ -100,7 +102,6 @@ struct program
private:
std::unique_ptr<program_impl> impl;
};
} // namespace migraph
#endif
@@ -63,6 +63,7 @@ struct shape
const std::vector<std::size_t>& strides() const;
std::size_t elements() const;
std::size_t bytes() const;
+std::size_t type_size() const;
/// Map multiple indices to space index
std::size_t index(std::initializer_list<std::size_t> l) const;
...
@@ -48,13 +48,13 @@ struct onnx_parser
onnx_parser()
{
-add_generic_op("Add", add{});
-add_generic_op("Div", div{});
-add_generic_op("MatMul", gemm{});
-add_generic_op("Mul", mul{});
-add_generic_op("Relu", activation{"relu"});
-add_generic_op("Sub", sub{});
-add_generic_op("Sum", add{});
+add_generic_op("Add", op::add{});
+add_generic_op("Div", op::div{});
+add_generic_op("MatMul", op::gemm{});
+add_generic_op("Mul", op::mul{});
+add_generic_op("Relu", op::activation{"relu"});
+add_generic_op("Sub", op::sub{});
+add_generic_op("Sum", op::add{});
add_mem_op("Constant", &onnx_parser::parse_constant);
add_mem_op("Conv", &onnx_parser::parse_conv);
@@ -93,7 +93,7 @@ struct onnx_parser
uint64_t axis = (contains(attributes, "axis"))
? parse_value(attributes.at("axis")).at<uint64_t>()
: 0;
-auto l = prog.add_instruction(broadcast{axis}, args);
+auto l = prog.add_instruction(op::broadcast{axis}, args);
return prog.add_instruction(x, args[0], l);
}
}
@@ -105,15 +105,16 @@ struct onnx_parser
parse_softmax(const std::string&, const attribute_map&, std::vector<instruction_ref> args)
{
auto dims = args.front()->get_shape().lens();
-auto r = prog.add_instruction(reshape{{long(dims[0]), long(dims[1]), 1, 1}}, args.front());
-auto s = prog.add_instruction(softmax{}, r);
-return prog.add_instruction(reshape{{long(dims[0]), long(dims[1])}}, s);
+auto r =
+    prog.add_instruction(op::reshape{{long(dims[0]), long(dims[1]), 1, 1}}, args.front());
+auto s = prog.add_instruction(op::softmax{}, r);
+return prog.add_instruction(op::reshape{{long(dims[0]), long(dims[1])}}, s);
}
instruction_ref
parse_conv(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
-convolution op;
+op::convolution op;
if(contains(attributes, "pads"))
{
copy(attributes["pads"].ints(), op.padding.begin());
@@ -130,8 +131,8 @@ struct onnx_parser
{
uint64_t axis = 1;
auto l1 = prog.add_instruction(op, args[0], args[1]);
-auto l2 = prog.add_instruction(broadcast{axis}, l1, args[2]);
-return prog.add_instruction(add{}, l1, l2);
+auto l2 = prog.add_instruction(op::broadcast{axis}, l1, args[2]);
+return prog.add_instruction(op::add{}, l1, l2);
}
return prog.add_instruction(op, args);
}
@@ -140,7 +141,7 @@ struct onnx_parser
attribute_map attributes,
std::vector<instruction_ref> args)
{
-pooling op{name == "MaxPool" ? "max" : "average"};
+op::pooling op{name == "MaxPool" ? "max" : "average"};
if(contains(attributes, "pads"))
{
copy(attributes["pads"].ints(), op.padding.begin());
@@ -159,7 +160,7 @@ struct onnx_parser
instruction_ref
parse_reshape(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
-reshape op;
+op::reshape op;
if(args.size() == 1)
{
literal s = parse_value(attributes.at("shape"));
@@ -181,7 +182,7 @@ struct onnx_parser
{
axis = parse_value(attributes.at("axis")).at<int>();
}
-return prog.add_instruction(flatten{axis}, args[0]);
+return prog.add_instruction(op::flatten{axis}, args[0]);
}
instruction_ref parse_constant(const std::string&,
@@ -216,25 +217,25 @@ struct onnx_parser
transb = parse_value(attributes.at("transB")).at<bool>();
}
std::vector<int64_t> perm = {1, 0};
-auto l1 = (transa) ? prog.add_instruction(transpose{perm}, args[0]) : args[0];
-auto l2 = (transb) ? prog.add_instruction(transpose{perm}, args[1]) : args[1];
+auto l1 = (transa) ? prog.add_instruction(op::transpose{perm}, args[0]) : args[0];
+auto l2 = (transb) ? prog.add_instruction(op::transpose{perm}, args[1]) : args[1];
if(args.size() == 3)
{
uint64_t axis = 1;
-auto l3 = prog.add_instruction(gemm{alpha, beta}, l1, l2);
-auto l4 = prog.add_instruction(broadcast{axis}, l3, args[2]);
-return prog.add_instruction(add{}, l3, l4);
+auto l3 = prog.add_instruction(op::gemm{alpha, beta}, l1, l2);
+auto l4 = prog.add_instruction(op::broadcast{axis}, l3, args[2]);
+return prog.add_instruction(op::add{}, l3, l4);
}
-return prog.add_instruction(gemm{alpha, beta}, l1, l2);
+return prog.add_instruction(op::gemm{alpha, beta}, l1, l2);
}
instruction_ref
parse_batchnorm(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
float epsilon = 1e-5f;
float momentum = 0.9f;
-batch_norm_inference::bn_infer_mode_t bn_mode = batch_norm_inference::spatial;
+op::batch_norm_inference::bn_infer_mode_t bn_mode = op::batch_norm_inference::spatial;
bool is_test = false;
if(contains(attributes, "epsilon"))
{
epsilon = parse_value(attributes.at("epsilon")).at<float>();
@@ -250,10 +251,10 @@ struct onnx_parser
if(contains(attributes, "spatial"))
{
bn_mode = (parse_value(attributes.at("spatial")).at<uint64_t>() > 0)
-? batch_norm_inference::spatial
-: batch_norm_inference::per_activation;
+? op::batch_norm_inference::spatial
+: op::batch_norm_inference::per_activation;
}
-batch_norm_inference op{epsilon, momentum, bn_mode, is_test};
+op::batch_norm_inference op{epsilon, momentum, bn_mode, is_test};
return prog.add_instruction(op, std::move(args));
}
...
#ifndef MIGRAPH_GUARD_RTGLIB_COMMON_HEADER_HPP
#define MIGRAPH_GUARD_RTGLIB_COMMON_HEADER_HPP
#include <migraph/program.hpp>
#include <migraph/stringutils.hpp>
#include <migraph/instruction.hpp>
#include <migraph/operators.hpp>
#include <migraph/iterator_for.hpp>
#include <migraph/pass_config.hpp>
#include <set>
#include <list>
#include <vector>
#include <queue>
namespace migraph {
//#define MIGRAPH_DEBUG_OPT
#ifdef MIGRAPH_DEBUG_OPT
#define MIGRAPH_DEBUG(s) s
#else
#define MIGRAPH_DEBUG(s)
#endif // MIGRAPH_DEBUG_OPT
} // namespace migraph
#endif // MIGRAPH_GUARD_RTGLIB_COMMON_HEADER_HPP
#include <migraph/memory_coloring.hpp>
#include "memory_coloring_impl.hpp"
namespace migraph {
void memory_coloring::apply(program& p) const
{
if(!enabled(MIGRAPH_DISABLE_MEMORY_COLORING{}))
{
memory_coloring_impl opt(&p, allocation_op);
opt.run();
}
}
} // namespace migraph
#include "memory_coloring_impl.hpp"
namespace migraph {
void memory_coloring_impl::run()
{
MIGRAPH_DEBUG(dump("---Before memory coloring---"));
MIGRAPH_DEBUG(dump_program());
register_operand_alias();
build();
if(num_of_lives != 0)
{
MIGRAPH_DEBUG(dump_intervals());
// Coloring
while(!alloc_queue.empty())
{
interval_ptr interval = alloc_queue.top();
allocate(interval);
alloc_queue.pop();
}
rewrite();
MIGRAPH_DEBUG(verify());
}
}
bool memory_coloring_impl::allocate(interval_ptr interval)
{
shape s = interval->result;
std::size_t size = s.bytes();
if(size == 0)
return false;
std::size_t element_size = size / s.elements();
live_range& segment = interval->segment;
int vn = segment.vn;
std::priority_queue<live_range*, std::vector<live_range*>, ordering> conflict_queue;
std::unordered_map<long long, live_range*> offset2_live;
offset2_live.clear();
if(conflict_table.find(vn) != conflict_table.end())
{
std::set<int>& vn_set = conflict_table[vn];
for(auto& iter : vn_set)
{
live_range* range = live_ranges[iter];
long long offset = range->offset;
if(offset != invalid_offset)
{
conflict_queue.push(range);
if(offset2_live.find(offset) == offset2_live.end())
{
offset2_live[offset] = range;
}
else
{
live_range* prev = offset2_live[offset];
assert(prev->offset == offset);
if(prev->size < range->size)
offset2_live[offset] = range;
}
}
}
}
long long offset = 0;
while(!conflict_queue.empty())
{
live_range* range = conflict_queue.top();
long long iter_offset = range->offset;
if(offset > iter_offset)
{
offset = std::max(offset, iter_offset + range->size);
}
else if(offset2_live[iter_offset] == range)
{
if((iter_offset > offset) && (iter_offset - offset) >= size)
{
break;
}
offset = iter_offset + range->size;
}
// alignment
if((offset % element_size) != 0)
offset += (element_size - (offset % element_size));
conflict_queue.pop();
}
segment.offset = offset;
MIGRAPH_DEBUG(segment.dump());
required_bytes = std::max(required_bytes, offset + segment.size);
return true;
}
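allocate() scans the already-placed live ranges that conflict with this interval in increasing offset order and takes the first gap large enough for the request, keeping the candidate offset aligned to the element size. A simplified, self-contained sketch of that first-fit scan (plain (offset, size) pairs, hypothetical first_fit/placed_range names, and none of the duplicate-offset bookkeeping of the real function):

#include <algorithm>
#include <vector>

// Hypothetical stand-in for the already-placed, conflicting live ranges.
struct placed_range
{
    long long offset;
    long long size;
};

long long first_fit(std::vector<placed_range> ranges, long long size, long long align)
{
    // Visit conflicting ranges in increasing offset order (the real code uses a
    // priority queue keyed on offset for the same effect).
    std::sort(ranges.begin(), ranges.end(), [](const placed_range& a, const placed_range& b) {
        return a.offset < b.offset;
    });
    long long offset = 0;
    for(const placed_range& r : ranges)
    {
        // A hole before the next placed range that is large enough: take it.
        if(r.offset > offset && (r.offset - offset) >= size)
            break;
        // Otherwise skip past this range...
        offset = std::max(offset, r.offset + r.size);
        // ...and keep the candidate aligned, as allocate() does with element_size.
        if(offset % align != 0)
            offset += align - (offset % align);
    }
    return offset;
}

// e.g. ranges {0, 256} and {512, 512}: a 128-byte request with 4-byte alignment
// lands at offset 256, in the gap between the two placed ranges.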
void memory_coloring_impl::build()
{
std::size_t num_of_instrs = p_program->size();
if(num_of_instrs == 0)
return;
int cur_points = num_of_instrs * 2;
instruction_ref iter = p_program->end();
instruction_ref begin = p_program->begin();
std::vector<instruction_ref> dead_instrs;
std::set<int> live_set;
// Build live intervals.
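// The program is walked backwards; each instruction is assigned an even
// liveness point (2, 4, ..., 2 * num_of_instrs), which get_ins_enum() maps
// back to an instruction index when dumping debug output.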
live_intervals.resize(num_of_instrs);
do
{
iter = std::prev(iter);
const instruction* p_iter = &(*iter);
interval_ptr def_interval = nullptr;
bool is_dead = false;
if(instr2_live.find(p_iter) != instr2_live.end())
{
def_interval = instr2_live[p_iter];
bool is_lit = is_literal(iter);
if(is_allocate(iter) || is_lit)
{
live_range& range = def_interval->segment;
def_interval->result = iter->get_shape();
def_interval->is_literal = is_lit;
if(!is_lit || unify_literals)
alloc_queue.push(def_interval);
range.begin = cur_points;
def_interval->def_point = cur_points;
range.size = (iter->get_shape()).bytes();
live_set.erase(range.vn);
}
}
else if(!is_param(iter) && !is_outline(iter) && !is_check_context(iter))
{
is_dead = true;
}
int tie_ndx = get_input_tie_ndx(iter);
int cnt = -1;
for(auto&& arg : iter->inputs())
{
cnt++;
if(is_param(arg) || is_outline(arg))
{
if(is_output_param(arg))
is_dead = false;
if(def_interval != nullptr)
{
def_interval->is_live_on_entry = true;
}
continue;
}
const instruction* p_arg = &(*arg);
if(cnt == tie_ndx && (def_interval != nullptr))
{
// input memory is used as this instruction's output.
// def is considered as use. Coalesce the live intervals.
def_interval->add_use(cur_points);
instr2_live[p_arg] = def_interval;
}
else if(instr2_live.find(p_arg) == instr2_live.end())
{
// First time see a use, create a live interval.
int id = num_of_lives++;
interval_ptr interval = &(live_intervals[id]);
interval->id = id;
interval->segment.end = cur_points;
interval->segment.vn = ++max_value_number;
interval->add_use(cur_points);
instr2_live[p_arg] = interval;
add_conflicts(live_set, max_value_number);
live_set.insert(max_value_number);
live_ranges[max_value_number] = &(interval->segment);
earliest_end_point = cur_points;
if(latest_end_point == -1)
latest_end_point = cur_points;
}
else
{
interval_ptr interval = instr2_live[p_arg];
interval->add_use(cur_points);
assert(live_set.find(interval->id) != live_set.end());
}
}
if(is_dead)
dead_instrs.push_back(iter);
cur_points -= 2;
} while(iter != begin);
}
void memory_coloring_impl::register_operand_alias()
{
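// A value of -1 means the instruction defines (or owns) its own output buffer;
// a non-negative value is the index of the input operand whose memory the
// output aliases (consumed by get_input_tie_ndx()).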
operand_alias["hip::allocate"] = -1;
operand_alias["@outline"] = -1;
operand_alias["check_context"] = -1;
operand_alias["@literal"] = -1;
operand_alias["@param"] = -1;
operand_alias["transpose"] = 0;
operand_alias["flatten"] = 0;
operand_alias["broadcast"] = 1;
operand_alias["reshape"] = 0;
operand_alias["pass"] = 0;
}
void memory_coloring_impl::rewrite()
{
instruction_ref end = p_program->end();
instruction_ref scratch_param = end;
std::vector<std::size_t> dims;
dims.push_back(required_bytes / sizeof(float));
shape s = {shape::float_type, dims};
scratch_param = p_program->add_parameter("scratch", s);
for(auto ins : iterator_for(*p_program))
{
const instruction* p_iter = &(*ins);
if(instr2_live.find(p_iter) != instr2_live.end())
{
interval_ptr interval = instr2_live[p_iter];
if(interval->get_begin() == invalid_offset)
continue;
if(!unify_literals && interval->is_literal)
continue;
std::size_t offset = 0;
if(interval->get_offset() == invalid_offset)
{
assert(interval->result.bytes() == 0);
}
else
{
offset = interval->get_offset();
}
if(is_allocate(ins))
{
assert(!ins->inputs().empty());
p_program->replace_instruction(
ins, op::load{ins->inputs().at(0)->get_shape(), offset}, scratch_param);
}
else if(is_literal(ins))
{
#if 0
auto pre = p_program->add_literal(ins->lit);
bool pre_copy = (interval->get_begin() < earliest_end_point);
p_program->replace_instruction(
ins, write_literal{offset, pre_copy}, scratch_param, pre);
#endif
}
}
}
MIGRAPH_DEBUG(dump("---After rewrite---"));
MIGRAPH_DEBUG(dump_program());
}
#ifdef MIGRAPH_DEBUG_OPT
void memory_coloring_impl::dump(const std::string& str) { std::cout << str << std::endl; }
void memory_coloring_impl::dump_program() { std::cout << *p_program << std::endl; }
void memory_coloring_impl::dump_intervals()
{
if(num_of_lives > 0)
{
std::cout << "---live intervals ---" << std::endl;
for(int i = 0; i < num_of_lives; ++i)
{
live_interval& interval = live_intervals[i];
interval.dump();
}
std::cout << "---conflict table---" << std::endl;
for(int i = 0; i <= max_value_number; ++i)
{
std::cout << " segment:" << i;
std::cout << " =>";
std::set<int>& table = conflict_table[i];
for(auto& iter : table)
{
std::cout << (iter) << ",";
}
}
std::cout << std::endl;
}
}
void memory_coloring_impl::verify()
{
if(num_of_lives > 0)
{
for(int i = 0; i < num_of_lives; ++i)
{
live_interval& interval = live_intervals[i];
live_range& segment = interval.segment;
if(segment.begin == invalid_offset)
{
assert(interval.is_live_on_entry);
continue;
}
if(segment.offset == invalid_offset)
{
continue;
}
int vn = segment.vn;
if(conflict_table.find(vn) != conflict_table.end())
{
std::set<int>& vn_set = conflict_table[vn];
for(auto& iter : vn_set)
{
live_range* range = live_ranges[iter];
if(range->offset == invalid_offset)
continue;
if(!is_disjoin(*range, segment))
assert(false);
}
}
}
}
}
// map liveness tracking point to instruction enum.
static int get_ins_enum(int x)
{
if(x > 0)
{
return (x / 2) - 1;
}
else
return invalid_offset;
}
void live_range::dump()
{
std::cout << " segment:" << vn;
std::cout << " [" << get_ins_enum(begin) << ", " << get_ins_enum(end) << "]";
if(offset != invalid_offset)
{
std::cout << " mem:";
std::cout << " [" << offset << "," << offset + size - 1 << "]";
}
std::cout << std::endl;
}
void live_interval::dump()
{
std::cout << "id:" << id;
segment.dump();
std::cout << " uses:";
for(auto& iter : use_points)
{
std::cout << " " << get_ins_enum(iter) << ",";
}
std::cout << " def:";
std::cout << " " << get_ins_enum(def_point);
if(is_literal)
std::cout << " literal";
std::cout << " " << result;
std::cout << std::endl;
}
#endif
} // namespace migraph
#ifndef MIGRAPH_GUARD_RTGLIB_MEMORY_COLORING_IMPL_HPP
#define MIGRAPH_GUARD_RTGLIB_MEMORY_COLORING_IMPL_HPP
#include "common_header.hpp"
namespace migraph {
static const int invalid_offset = -1;
struct live_range
{
int begin; // begin point in the instruction stream.
int end; // end point in the instruction stream.
long long offset; // offset to base pointer of allocated memory trunk.
int vn; // value number that identifies this live_range.
long long size; // size of required memory in bytes
#ifdef MIGRAPH_DEBUG_OPT
void dump();
#endif
};
struct live_interval
{
live_interval() : segment({invalid_offset, invalid_offset, invalid_offset, invalid_offset, 0})
{
id = invalid_offset;
def_point = invalid_offset;
is_literal = false;
is_live_on_entry = false;
}
void add_use(int use) { use_points.push_front(use); }
int get_begin() const { return segment.begin; }
int get_end() const { return segment.end; }
long long get_offset() const { return segment.offset; }
#ifdef MIGRAPH_DEBUG_OPT
void dump();
#endif
live_range segment;
int id;
std::list<int> use_points;
int def_point;
shape result;
bool is_literal;
bool is_live_on_entry;
};
using interval_ptr = live_interval*;
struct memory_coloring_impl
{
memory_coloring_impl(program* p, std::string alloc_op)
: p_program(p), allocation_op(std::move(alloc_op))
{
instr2_live.clear();
live_ranges.clear();
conflict_table.clear();
num_of_lives = 0;
max_value_number = -1;
required_bytes = 0;
operand_alias.clear();
earliest_end_point = -1;
latest_end_point = -1;
unify_literals = false;
}
bool allocate(interval_ptr);
void add_conflicts(std::set<int>& live_set, int val)
{
for(auto& iter : live_set)
{
conflict_table[iter].insert(val);
conflict_table[val].insert(iter);
}
}
void build();
void run();
void register_operand_alias();
void rewrite();
private:
static bool is_param(const instruction_ref ins) { return ins->name() == "@param"; }
static bool is_output_param(const instruction_ref ins)
{
return is_param(ins) && any_cast<builtin::param>(ins->get_operator()).parameter == "output";
}
bool is_allocate(const instruction_ref ins) { return ins->name() == allocation_op; }
static bool is_outline(const instruction_ref ins) { return ins->name() == "@outline"; }
static bool is_literal(const instruction_ref ins) { return ins->name() == "@literal"; }
static bool is_check_context(const instruction_ref ins)
{
return ins->name() == "check_context";
}
// get operand alias info. This is a temporary workaround.
int get_input_tie_ndx(const instruction_ref ins)
{
std::string name = ins->name();
if(operand_alias.find(name) != operand_alias.end())
return operand_alias[name];
if(is_allocate(ins))
{
// This happens to custom allocators.
operand_alias[name] = -1;
return -1;
}
int cnt = -1;
int last_allocate = -1;
for(auto&& arg : ins->inputs())
{
cnt++;
if(is_allocate(arg) || is_output_param(arg))
last_allocate = cnt;
}
assert(last_allocate != -1);
operand_alias[name] = last_allocate;
return last_allocate;
}
#ifdef MIGRAPH_DEBUG_OPT
static bool is_disjoin(live_range& range1, live_range& range2)
{
if((range1.size == 0) || (range2.size == 0))
return false;
long long end1 = range1.offset + range1.size - 1;
long long end2 = range2.offset + range2.size - 1;
return ((end1 < range2.offset) || (end2 < range1.offset));
}
void dump(const std::string&);
void dump_program();
void dump_intervals();
void verify();
#endif
struct ordering
{
bool operator()(const interval_ptr i1, const interval_ptr i2) const
{
int len1 = i1->get_end() - i1->get_begin();
int len2 = i2->get_end() - i2->get_begin();
if(len1 != len2)
{
return (len1 < len2);
}
else if(i1->result.bytes() != i2->result.bytes())
{
return (i1->result.bytes() < i2->result.bytes());
}
else
{
return i1->id > i2->id;
}
}
bool operator()(const live_range* i1, const live_range* i2) const
{
return (i1->offset > i2->offset);
}
};
program* p_program;
std::unordered_map<const instruction*, interval_ptr> instr2_live;
// universe of live intervals.
std::vector<live_interval> live_intervals;
// Map live range value number to live range.
std::unordered_map<int, live_range*> live_ranges;
// Map live range value number to a set of conflicting live ranges' value numbers.
std::unordered_map<int, std::set<int>> conflict_table;
// Priority queue for coloring.
std::priority_queue<interval_ptr, std::vector<interval_ptr>, ordering> alloc_queue;
std::unordered_map<std::string, int> operand_alias;
int num_of_lives;
int max_value_number;
long long required_bytes;
// The earliest program point where an live interval ends.
int earliest_end_point;
// The latest program point where an live interval ends.
int latest_end_point;
// Whether to unify literals into coloring.
bool unify_literals;
std::string allocation_op{};
};
} // namespace migraph
#endif
@@ -203,6 +203,25 @@ shape program::get_parameter_shape(std::string name) const
return {};
}
instruction_ref program::get_parameter(std::string name) const
{
auto ins = std::find_if(
impl->instructions.begin(), impl->instructions.end(), [&](const instruction& x) {
if(x.name() == "@param")
{
return any_cast<builtin::param>(x.get_operator()).parameter == name;
}
else
{
return false;
}
});
if(ins != this->end())
return ins;
else
return this->end();
}
std::unordered_map<std::string, shape> program::get_parameter_shapes() const
{
std::unordered_map<std::string, shape> result;
@@ -437,5 +456,4 @@ std::ostream& operator<<(std::ostream& os, const program& p)
print_program(os, p, [](auto&&...) {});
return os;
}
} // namespace migraph
@@ -98,6 +98,12 @@ std::size_t shape::bytes() const
this->visit_type([&](auto as) { n = as.size(); });
return n * this->element_space();
}
std::size_t shape::type_size() const
{
std::size_t n = 0;
this->visit_type([&](auto as) { n = as.size(); });
return n;
}
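type_size() reports the size in bytes of a single element of the shape's type, so bytes() above is simply type_size() scaled by element_space(). An illustrative example (values assumed for a packed float shape):

shape s{shape::float_type, {2, 3}};
// s.type_size() == 4   bytes per float element
// s.elements()  == 6
// s.bytes()     == 24  i.e. type_size() * element_space() for this packed shape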
std::size_t shape::index(std::initializer_list<std::size_t> l) const
{
assert(l.size() <= this->lens().size());
...
@@ -13,3 +13,8 @@ target_link_libraries(migraph_cpu migraph Threads::Threads)
target_include_directories(migraph_cpu PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(migraph_cpu PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraph_cpu PRIVATE -DBLAZE_USE_CPP_THREADS)
#install (TARGETS migraph_cpu
# LIBRARY DESTINATION /opt/rocm/lib)
#install (DIRECTORY include/migraph DESTINATION /opt/rocm/include)
@@ -36,7 +36,7 @@ T zero(const T&)
//
struct cpu_batch_norm_inference
{
-batch_norm_inference op;
+op::batch_norm_inference op;
std::string name() const { return "cpu::batch_norm_inference"; }
@@ -58,7 +58,7 @@ struct cpu_batch_norm_inference
auto image_height = output_shape.lens()[2];
auto image_width = output_shape.lens()[3];
-if(op.bn_mode == batch_norm_inference::spatial)
+if(op.bn_mode == op::batch_norm_inference::spatial)
{
visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
[&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
@@ -73,7 +73,7 @@ struct cpu_batch_norm_inference
});
}
-if(op.bn_mode == batch_norm_inference::per_activation)
+if(op.bn_mode == op::batch_norm_inference::per_activation)
{
visit_all(output, input, mini_batch_mean, mini_batch_mean, arg_gamma, arg_bias)(
[&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
@@ -95,7 +95,7 @@ struct cpu_batch_norm_inference
struct cpu_convolution
{
-convolution op;
+op::convolution op;
std::string name() const { return "cpu::convolution"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
@@ -136,7 +136,7 @@ struct cpu_convolution
struct cpu_im2col
{
-im2col op;
+op::im2col op;
static std::string name() { return "cpu::im2col"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
@@ -218,7 +218,7 @@ struct avg_pool
template <class Op>
struct cpu_pooling
{
-pooling op;
+op::pooling op;
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
@@ -266,7 +266,7 @@ struct cpu_pooling
struct cpu_contiguous
{
-contiguous op;
+op::contiguous op;
std::string name() const { return "cpu::contiguous"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
@@ -284,7 +284,7 @@ struct cpu_contiguous
struct cpu_gemm
{
-gemm op;
+op::gemm op;
std::string name() const { return "cpu::gemm"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
@@ -551,12 +551,12 @@ struct cpu_apply
void init()
{
-apply_map["im2col"] = extend_op<cpu_im2col, im2col>();
-apply_map["convolution"] = extend_op<cpu_convolution, convolution>();
-apply_map["gemm"] = extend_op<cpu_gemm, gemm>();
+apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
+apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
+apply_map["gemm"] = extend_op<cpu_gemm, op::gemm>();
apply_map["batch_norm_inference"] =
-extend_op<cpu_batch_norm_inference, batch_norm_inference>();
-apply_map["contiguous"] = extend_op<cpu_contiguous, contiguous>();
+extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
+apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
apply_map["tanh"] = simple_op<cpu_unary<tanh_op>>();
@@ -609,14 +609,14 @@ struct cpu_apply
void apply_activation(instruction_ref ins)
{
-auto&& op = any_cast<activation>(ins->get_operator());
+auto&& op = any_cast<op::activation>(ins->get_operator());
if(op.mode == "relu")
prog->replace_instruction(ins, cpu_unary<relu_op>{}, ins->inputs());
}
void apply_pooling(instruction_ref ins)
{
-auto&& op = any_cast<pooling>(ins->get_operator());
+auto&& op = any_cast<op::pooling>(ins->get_operator());
if(op.mode == "max")
prog->replace_instruction(ins, cpu_pooling<max_pool>{op}, ins->inputs());
else if(op.mode == "average")
...
@@ -40,3 +40,12 @@ add_library(migraph_gpu
rocm_clang_tidy_check(migraph_gpu)
target_link_libraries(migraph_gpu migraph MIOpen migraph_device roc::rocblas)
target_include_directories(migraph_gpu PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
#install (TARGETS migraph_gpu
# LIBRARY DESTINATION /opt/rocm/lib)
#install (DIRECTORY include/migraph DESTINATION /opt/rocm/include)
#install (TARGETS migraph_device
# LIBRARY DESTINATION /opt/rocm/lib)
#install (DIRECTORY include/migraph DESTINATION /opt/rocm/include)
@@ -6,12 +6,16 @@
#include <migraph/iterator_for.hpp>
#include <migraph/ranges.hpp>
#include <migraph/stringutils.hpp>
+#include <migraph/pass_config.hpp>
namespace migraph {
namespace gpu {
void eliminate_workspace::apply(program& p) const
{
+if(!enabled(MIGRAPH_DISABLE_MEMORY_COLORING{}))
+return;
std::size_t n = 0;
std::vector<instruction_ref> allocs;
for(auto ins : iterator_for(p))
...
@@ -90,6 +90,9 @@ argument from_gpu(argument arg)
void gpu_sync() { hipDeviceSynchronize(); }
void copy_to_gpu(char* dst, const char* src, std::size_t size)
{
hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
}
} // namespace gpu
} // namespace migraph
@@ -22,7 +22,7 @@ namespace gpu {
struct miopen_batch_norm_inference
{
-batch_norm_inference op;
+op::batch_norm_inference op;
std::string name() const { return "gpu::batch_norm_inference"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
...
@@ -12,12 +12,11 @@ struct context
{
shared<miopen_handle> handle;
shared<rocblas_handle_ptr> rbhandle;
-argument scratch;
std::vector<argument> literals{};
void finish() const { gpu_sync(); }
};
} // namespace gpu
} // namespace migraph
#endif
@@ -22,7 +22,7 @@ namespace gpu {
struct miopen_contiguous
{
-contiguous op;
+op::contiguous op;
std::string name() const { return "gpu::contiguous"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument compute(context&, shape output_shape, const std::vector<argument>& args) const;
...
@@ -22,7 +22,7 @@ namespace gpu {
struct miopen_convolution
{
-convolution op;
+op::convolution op;
shared<convolution_descriptor> cd;
miopenConvFwdAlgorithm_t algo{};
...
@@ -22,7 +22,7 @@ namespace gpu {
struct miopen_gemm
{
-gemm op;
+op::gemm op;
std::string name() const { return "gpu::gemm"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
...