Unverified commit add6fb3b authored by kahmed10, committed by GitHub

Create allocate op and replace_allocate pass (#1183)



* add allocate op header

* formatting

* add replace_allocate pass

* formatting

* move output param to remove_allocate pass

* formatting

* fix bugs in replace_allocate pass

* formatting

* fix verify if tests

* formatting

* move if op logic

* formatting

* cleanup lowering

* cleanup lowering

* formatting

* fix tidy

* formatting

* fix tidy

* add cpu allocate check

* formatting

* change cpu allocate in pass

* formatting

* add some tests for replace_allocate pass

* formatting

* pass by ref

* fix run_pass

* formatting

* update variable name for module

* update dce to use contains() and fix tidy

* formatting

* update cppcheck

* add if test

* formatting

* add if test

* rename var to mod_output_names

* formatting

* remove conditional

* update allocate op and tests

* formatting

* update replace_allocate tests

* update create_output_names() and conditional in replace_allocate

* formatting

* remove extra variable in replace_allocate

* update tools script for allocation_model
Co-authored-by: Umang Yadav <29876643+umangyadav@users.noreply.github.com>
Co-authored-by: Chris Austen <causten@users.noreply.github.com>
Co-authored-by: Paul Fultz II <pfultz2@yahoo.com>
parent f5980619
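
In short: lowering now emits a target-agnostic "allocate" instruction, and the new replace_allocate pass specializes it afterwards, either into the allocation model's op or, when the model reports needs_out_params() and offload copy is disabled, into a module output parameter. A minimal sketch of driving the pass on a module, mirroring the GPU test updates further down in this diff:

#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/gpu/allocation_model.hpp>

// Specialize generic "allocate" instructions for the GPU target.
// gpu_allocation_model reports needs_out_params() == true, so allocations
// aliased to the module outputs become output parameters; the remaining
// ones become the model's allocation op.
void specialize_allocations(migraphx::module& m)
{
    migraphx::run_passes(m,
                         {migraphx::replace_allocate{migraphx::gpu::gpu_allocation_model{}},
                          migraphx::dead_code_elimination{}});
}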
@@ -54,6 +54,7 @@ add_library(migraphx
     reduce_dims.cpp
     register_op.cpp
     register_target.cpp
+    replace_allocate.cpp
     simplify_qdq.cpp
     rewrite_batchnorm.cpp
     rewrite_pooling.cpp
@@ -80,6 +81,7 @@ register_migraphx_ops(
     acosh
     acos
     add
+    allocate
     argmax
     argmin
     asinh
@@ -4,6 +4,7 @@
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/functional.hpp>
 #include <migraphx/ranges.hpp>
+#include <migraphx/stringutils.hpp>
 #include <unordered_set>

 namespace migraphx {
@@ -24,9 +25,10 @@ void dead_code_elimination::apply(module& m) const
         // Skip the last instruction
         if(i == last)
             break;
-        // Skip instruction with empty shape as output unless its a builtin or undefined or identity
+        // Skip instruction with empty shape as output unless it's a builtin, undefined, identity,
+        // or allocate
         if(i->get_shape().elements() == 0 and i->name().front() != '@' and
-           i->name() != "undefined" and i->name() != "identity")
+           not contains({"undefined", "identity", "allocate"}, i->name()))
             continue;
         assert(std::distance(m.begin(), i) <= std::distance(m.begin(), last));
         std::unordered_set<instruction_ref> visited;
@@ -28,6 +28,8 @@ struct allocation_model
     operation allocate(const shape& s) const;
     /// Create a preallocated operator for the given shape
     operation preallocate(const shape& s, const std::string& id) const;
+    /// Check if outputs are to be inserted
+    bool needs_out_params() const;
 };

 #else
@@ -45,6 +47,8 @@ struct allocation_model
     operation allocate(const shape& s) const;
     //
     operation preallocate(const shape& s, std::string id) const;
+    //
+    bool needs_out_params() const;
 };

 #else
@@ -136,6 +140,12 @@ struct allocation_model
         return (*this).private_detail_te_get_handle().preallocate(s, std::move(id));
     }

+    bool needs_out_params() const
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        return (*this).private_detail_te_get_handle().needs_out_params();
+    }
+
     friend bool is_shared(const allocation_model& private_detail_x,
                           const allocation_model& private_detail_y)
     {
@@ -154,6 +164,7 @@ struct allocation_model
         virtual std::string copy() const = 0;
         virtual operation allocate(const shape& s) const = 0;
         virtual operation preallocate(const shape& s, std::string id) const = 0;
+        virtual bool needs_out_params() const = 0;
     };

     template <typename PrivateDetailTypeErasedT>
@@ -200,6 +211,12 @@ struct allocation_model
             return private_detail_te_value.preallocate(s, std::move(id));
         }

+        bool needs_out_params() const override
+        {
+            return private_detail_te_value.needs_out_params();
+        }
+
         PrivateDetailTypeErasedT private_detail_te_value;
     };
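
Because allocation_model is type-erased, any target can supply its own model, and the interface now also requires needs_out_params(). A minimal conforming model as a sketch, patterned on the test models near the end of this diff; "my::allocate" is a hypothetical op name used only for illustration and would have to be a registered operator for make_op to succeed:

#include <string>
#include <migraphx/make_op.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/serialize.hpp>

struct my_allocation_model
{
    std::string name() const { return "my::allocate"; } // hypothetical op name
    std::string copy() const { return {}; }             // no copy op in this sketch
    migraphx::operation allocate(const migraphx::shape& s) const
    {
        // Same serialized-shape attribute encoding used throughout this diff
        return migraphx::make_op(name(), {{"shape", migraphx::to_value(s)}});
    }
    migraphx::operation preallocate(const migraphx::shape&, const std::string&) const
    {
        return {};
    }
    // false: keep explicit allocations; true: let replace_allocate turn
    // output allocations into module output parameters
    bool needs_out_params() const { return false; }
};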
#ifndef MIGRAPHX_GUARD_OPERATORS_ALLOCATE_HPP
#define MIGRAPHX_GUARD_OPERATORS_ALLOCATE_HPP
#include <array>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <cmath>
#include <utility>
#include <migraphx/shape.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct allocate
{
shape s{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.s, "shape"));
}
std::string name() const { return "allocate"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
migraphx::check_shapes{inputs, *this}.has(0);
return s;
}
argument compute(const shape& output_shape, const std::vector<argument>&) const
{
return {output_shape};
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
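
The op carries its output shape as an attribute, accepts no inputs, and computes an uninitialized buffer of that shape. A small usage sketch of the definitions above:

#include <cassert>
#include <migraphx/make_op.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/serialize.hpp>

void allocate_op_sketch()
{
    migraphx::shape s{migraphx::shape::float_type, {5}};
    // The shape is passed as a serialized attribute and reflected into op::allocate::s
    auto op = migraphx::make_op("allocate", {{"shape", migraphx::to_value(s)}});
    assert(op.compute_shape({}) == s); // has(0): the op accepts no inputs
}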
#ifndef MIGRAPHX_GUARD_RTGLIB_REPLACE_ALLOCATE_HPP
#define MIGRAPHX_GUARD_RTGLIB_REPLACE_ALLOCATE_HPP
#include <migraphx/config.hpp>
#include <migraphx/allocation_model.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
struct replace_allocate
{
allocation_model model;
bool offload_copy = false;
std::string name() const { return "replace_allocate"; }
void apply(module& m) const;
};
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#include <migraphx/replace_allocate.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/op/allocate.hpp>
#include <map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
std::unordered_map<instruction_ref, std::string> create_output_names(const module& mod)
{
std::unordered_map<instruction_ref, std::string> mod_output_names{};
auto last = std::prev(mod.end());
if(last->name() == "@return")
{
const auto& prog_outputs = last->inputs();
std::vector<instruction_ref> outputs_alias(prog_outputs.size());
std::transform(prog_outputs.begin(),
prog_outputs.end(),
outputs_alias.begin(),
[](const auto& i) { return instruction::get_output_alias(i); });
std::size_t index = 0;
for(auto ins : outputs_alias)
{
mod_output_names[ins] = mod.name() + ":#output_" + std::to_string(index++);
}
}
else
{
auto ins = instruction::get_output_alias(last);
mod_output_names[ins] = "output";
}
return mod_output_names;
}
void insert_submod_allocations(instruction_ref ins, module& mod, const allocation_model& model)
{
std::vector<instruction_ref> inputs = ins->inputs();
std::vector<module_ref> mod_args = ins->module_inputs();
std::map<std::string, shape> name_shapes;
for(const auto& smod : mod_args)
{
auto ps = smod->get_parameter_shapes();
name_shapes.insert(ps.begin(), ps.end());
}
for(auto& pn : name_shapes)
{
const auto& s = pn.second;
instruction_ref output{};
output = mod.insert_instruction(ins, model.allocate(s));
inputs.push_back(output);
}
mod.replace_instruction(ins, ins->get_operator(), inputs, mod_args);
}
void replace_allocate::apply(module& m) const
{
auto mod_output_names = create_output_names(m);
bool main_offload_copy = m.name() == "main" ? this->offload_copy : false;
for(auto ins : iterator_for(m))
{
auto op = ins->get_operator();
auto op_name = op.name();
// check if allocations from submodules need to be inserted
// for now, only the "if" operator is affected
if(op_name == "if")
{
insert_submod_allocations(ins, m, model);
continue;
}
if(op_name != "allocate")
continue;
auto s = ins->get_shape();
if(not main_offload_copy and model.needs_out_params() and contains(mod_output_names, ins))
{
auto out_param = m.add_parameter(mod_output_names[ins], s);
m.replace_instruction(ins, out_param);
continue;
}
m.replace_instruction(
ins,
m.insert_instruction(ins,
make_op(model.name(), migraphx::value{{"shape", to_value(s)}})));
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
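
Illustrative before/after for the output-parameter path (a hand-written IR sketch, not exact printer output; some_op stands in for any op writing into its allocated input): with a model where needs_out_params() is true and offload copy disabled, an allocation aliased to the module result becomes a parameter, so results are written directly into the caller's buffer.

// Before the pass (main module):
//   x = @param:x             -> float_type, {5}
//   a = allocate[shape=...]  -> float_type, {5}
//   r = some_op(x, a)        -> float_type, {5}   // last instruction
//
// After replace_allocate (needs_out_params() == true, offload_copy == false):
//   x      = @param:x        -> float_type, {5}
//   output = @param:output   -> float_type, {5}   // new out parameter
//   r      = some_op(x, output)
//
// With an @return of several outputs, the parameters are instead named
// "main:#output_0", "main:#output_1", ... (see create_output_names above).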
@@ -15,6 +15,7 @@ struct cpu_allocation_model
     std::string copy() const;
     operation allocate(const shape& s) const;
     operation preallocate(const shape& s, const std::string& id) const;
+    bool needs_out_params() const { return false; }
 };

 } // namespace cpu
@@ -291,30 +291,8 @@ struct cpu_apply
 {
     module* modl;
     std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
-    std::unordered_map<instruction_ref, std::string> prog_output_names{};
     instruction_ref last{};

-    void create_output_names()
-    {
-        this->last = instruction::get_output_alias(std::prev(modl->end()));
-        if(this->last->name() == "@return")
-        {
-            const auto& prog_outputs = last->inputs();
-            std::vector<instruction_ref> outputs_alias(prog_outputs.size());
-
-            std::transform(prog_outputs.begin(),
-                           prog_outputs.end(),
-                           outputs_alias.begin(),
-                           [](const auto& i) { return instruction::get_output_alias(i); });
-
-            std::size_t index = 0;
-            for(auto ins : outputs_alias)
-            {
-                prog_output_names[ins] = modl->name() + ":#output_" + std::to_string(index++);
-            }
-        }
-    }
-
     void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = true)
     {
         apply_map.emplace(op_name, [=](instruction_ref ins) {
@@ -360,7 +338,6 @@ struct cpu_apply
     void init()
     {
-        create_output_names();
         extend_dnnl_algos("dnnl::binary",
                           {
                               {"add", "binary_add"},
@@ -490,7 +467,7 @@ struct cpu_apply
     instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
     {
-        return modl->insert_instruction(ins, make_op("cpu::allocate", {{"shape", to_value(s)}}));
+        return modl->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
     }
 };
@@ -13,6 +13,7 @@
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
+#include <migraphx/replace_allocate.hpp>
 #include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
@@ -70,6 +71,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         lowering{},
         eliminate_contiguous{"dnnl::reorder"},
         dead_code_elimination{},
+        replace_allocate{cpu_allocation_model{}},
+        dead_code_elimination{},
         adjust_allocation{cpu_allocation_model{}},
         dead_code_elimination{},
         fuse_ops{&ctx},
@@ -16,6 +16,7 @@ struct gpu_allocation_model
     std::string copy() const;
     operation allocate(const shape& s) const;
     operation preallocate(const shape& s, const std::string& id) const;
+    bool needs_out_params() const { return true; }
 };

 } // namespace gpu
@@ -58,7 +58,6 @@ struct miopen_apply
     const lowering* pass = nullptr;
     std::unordered_map<std::string, std::function<instruction_ref(instruction_ref)>> apply_map{};
     instruction_ref last{};
-    std::unordered_map<instruction_ref, std::string> prog_output_names{};
     bool offload_copy = false;
     bool int8_x4_format = true;
     bool compute_fp32 = false;
@@ -77,27 +76,6 @@
         (void)i;
     }

-    void create_output_names()
-    {
-        this->last = instruction::get_output_alias(std::prev(mod->end()));
-        if(this->last->name() == "@return")
-        {
-            const auto& prog_outputs = last->inputs();
-            std::vector<instruction_ref> outputs_alias(prog_outputs.size());
-
-            std::transform(prog_outputs.begin(),
-                           prog_outputs.end(),
-                           outputs_alias.begin(),
-                           [](const auto& i) { return instruction::get_output_alias(i); });
-
-            std::size_t index = 0;
-            for(auto ins : outputs_alias)
-            {
-                prog_output_names[ins] = mod->name() + ":#output_" + std::to_string(index++);
-            }
-        }
-    }
-
     const std::unordered_set<std::string>& get_rocblas_fp32_archs()
     {
         static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
@@ -120,7 +98,6 @@
 #endif
         offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
-        create_output_names();

         add_generic_op("acos");
         add_generic_op("acosh");
@@ -201,7 +178,7 @@
         add_quant_convolution_op();
     }

-    void copy_params()
+    void copy_params() const
     {
         if(not offload_copy)
             return;
@@ -261,7 +238,7 @@
         copy_params();
     }

-    instruction_ref insert_precompile_op(instruction_ref ins)
+    instruction_ref insert_precompile_op(instruction_ref ins) const
     {
         auto output = insert_allocation(ins, ins->get_shape());
         std::vector<instruction_ref> refs = ins->inputs();
@@ -274,28 +251,9 @@
                                         ins->module_inputs());
     }

-    instruction_ref insert_allocation(instruction_ref ins, const shape& s, std::string tag = "")
+    instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
     {
-        // Instruction's output is an input of the ret instruction
-        if(offload_copy)
-        {
-            auto result = mod->insert_instruction(
-                ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
-            return result;
-        }
-
-        auto ins_alias = instruction::get_output_alias(ins);
-        if(last->name() == "@return" and tag.empty() and prog_output_names.count(ins_alias) > 0)
-        {
-            return mod->add_parameter(prog_output_names[ins_alias], s);
-        }
-        else if(ins == last and tag.empty())
-        {
-            return mod->add_parameter("output", s);
-        }
-
-        return mod->insert_instruction(
-            ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
+        return mod->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
     }

     void add_convolution_op()
@@ -306,7 +264,7 @@
             auto conv = miopen_convolution{op, make_conv(op)};
             auto ws = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));

-            auto workspace = insert_allocation(ins, ws, "workspace");
+            auto workspace = insert_allocation(ins, ws);
             auto output = insert_allocation(ins, ins->get_shape());

             return mod->replace_instruction(
@@ -322,7 +280,7 @@
             auto conv = miopen_deconvolution{op, make_deconv(op)};
             auto ws = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));

-            auto workspace = insert_allocation(ins, ws, "workspace");
+            auto workspace = insert_allocation(ins, ws);
             auto output = insert_allocation(ins, ins->get_shape());

             return mod->replace_instruction(
@@ -383,7 +341,7 @@
             }
             auto args = ins->inputs();

-            auto workspace = insert_allocation(ins, ws, "workspace");
+            auto workspace = insert_allocation(ins, ws);
             auto output = insert_allocation(ins, ins->get_shape());

             return mod->replace_instruction(ins, conv, args[0], args[1], workspace, output);
@@ -480,33 +438,7 @@
             auto sync_cond = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_cond);
             inputs.front() = sync_cond;

-            std::vector<module_ref> mod_args = ins->module_inputs();
-            std::map<std::string, shape> name_shapes;
-            for(const auto& smod : mod_args)
-            {
-                auto ps = smod->get_parameter_shapes();
-                name_shapes.insert(ps.begin(), ps.end());
-            }
-            bool ins_output_allocated = false;
-            for(auto& pn : name_shapes)
-            {
-                const auto& s = pn.second;
-                instruction_ref output{};
-                if(s == ins->get_shape() and not ins_output_allocated)
-                {
-                    output = insert_allocation(ins, s);
-                    ins_output_allocated = true;
-                }
-                else
-                {
-                    output = mod->insert_instruction(
-                        ins, make_op("hip::allocate", {{"shape", to_value(s)}}));
-                }
-                inputs.push_back(output);
-            }
-            return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args);
+            return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
         });
     }

@@ -525,20 +457,17 @@
             inputs.at(0) = synced_max_iter;
             inputs.at(1) = cpu_cond;
             auto copy_inputs = inputs;
-            std::transform(
-                copy_inputs.begin(), copy_inputs.end(), std::back_inserter(inputs), [&](auto in) {
-                    return mod->insert_instruction(
-                        ins, make_op("hip::allocate", {{"shape", to_value(in->get_shape())}}));
-                });
+            std::transform(copy_inputs.begin(),
+                           copy_inputs.end(),
+                           std::back_inserter(inputs),
+                           [&](auto in) { return insert_allocation(ins, in->get_shape()); });

             auto mod_args = ins->module_inputs();
             auto output = insert_allocation(ins, ins->get_shape());

             const auto* sub_mod = mod_args.front();
-            auto cond_out = mod->insert_instruction(
-                ins,
-                make_op("hip::allocate",
-                        {{"shape", to_value(sub_mod->get_output_shapes().front())}}));
+            auto cond_out = insert_allocation(ins, sub_mod->get_output_shapes().front());

             // add cond and mod outputs to the argument list
             inputs.push_back(cond_out);
             inputs.push_back(output);
@@ -17,6 +17,7 @@
 #include <migraphx/preallocate_param.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
+#include <migraphx/replace_allocate.hpp>
 #include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
@@ -109,6 +110,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         lowering{&ctx, options.offload_copy},
         eliminate_contiguous{"gpu::contiguous"},
         dead_code_elimination{},
+        replace_allocate{gpu_allocation_model{}, options.offload_copy},
+        dead_code_elimination{},
         eliminate_concat{concat_gpu_optimization{}},
         dead_code_elimination{},
         pack_int8_args{},
@@ -6,6 +6,7 @@
 #include <migraphx/auto_contiguous.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/eliminate_contiguous.hpp>
+#include <migraphx/replace_allocate.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/op/add.hpp>
@@ -20,12 +21,15 @@
 void run_lowering(migraphx::program& p, bool offload_copy = false)
 {
     auto ctx = migraphx::gpu::context{};
-    migraphx::run_passes(*p.get_main_module(),
-                         {migraphx::auto_contiguous{},
-                          migraphx::gpu::lowering{&ctx, offload_copy},
-                          migraphx::dead_code_elimination{},
-                          migraphx::eliminate_contiguous{"gpu::contiguous"},
-                          migraphx::dead_code_elimination{}});
+    migraphx::run_passes(
+        *p.get_main_module(),
+        {migraphx::auto_contiguous{},
+         migraphx::gpu::lowering{&ctx, offload_copy},
+         migraphx::dead_code_elimination{},
+         migraphx::eliminate_contiguous{"gpu::contiguous"},
+         migraphx::dead_code_elimination{},
+         migraphx::replace_allocate{migraphx::gpu::gpu_allocation_model{}, offload_copy},
+         migraphx::dead_code_elimination{}});
 }

 TEST_CASE(tanh_shape)
@@ -2,13 +2,14 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/target.hpp>
+#include <migraphx/gpu/allocation_model.hpp>
 #include <migraphx/apply_alpha_beta.hpp>
 #include <migraphx/adjust_allocation.hpp>
 #include <migraphx/gpu/pack_int8_args.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/auto_contiguous.hpp>
 #include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_contiguous.hpp>
+#include <migraphx/replace_allocate.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/pass_manager.hpp>
@@ -22,6 +23,8 @@ void run_passes(migraphx::module& m)
         {migraphx::auto_contiguous{},
          migraphx::gpu::lowering{&ctx, false},
          migraphx::dead_code_elimination{},
+         migraphx::replace_allocate{migraphx::gpu::gpu_allocation_model{}},
+         migraphx::dead_code_elimination{},
          migraphx::gpu::pack_int8_args{},
          migraphx::dead_code_elimination{}});
 }
#include <migraphx/allocation_model.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/register_op.hpp>
#include <basic_ops.hpp>
#include <test.hpp>
struct allocate_no_out : migraphx::auto_register_op<allocate_no_out>
{
migraphx::shape s{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::pack(f(self.s, "shape"));
}
std::string name() const { return "allocate_no_out"; }
migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
{
migraphx::check_shapes{inputs, *this}.has(0);
return s;
}
migraphx::argument compute(migraphx::context&,
const migraphx::shape& output_shape,
const std::vector<migraphx::argument>&) const
{
return {output_shape};
}
};
struct allocate_with_out : migraphx::auto_register_op<allocate_with_out>
{
migraphx::shape s{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::pack(f(self.s, "shape"));
}
std::string name() const { return "allocate_with_out"; }
migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
{
migraphx::check_shapes{inputs, *this}.has(0);
return s;
}
migraphx::argument compute(migraphx::context&,
const migraphx::shape& output_shape,
const std::vector<migraphx::argument>&) const
{
return {output_shape};
}
};
// allocation model that has no out params
struct allocation_no_out_model
{
std::string name() const { return "allocate_no_out"; }
migraphx::operation allocate(const migraphx::shape& s) const
{
return migraphx::make_op(name(), {{"shape", to_value(s)}});
}
migraphx::operation preallocate(const migraphx::shape&, const std::string&) const { return {}; }
std::string copy() const { return {}; }
bool needs_out_params() const { return false; }
};
// allocation model with out params
struct allocation_with_out_model
{
std::string name() const { return "allocate_with_out"; }
migraphx::operation allocate(const migraphx::shape& s) const
{
return migraphx::make_op(name(), {{"shape", to_value(s)}});
}
migraphx::operation preallocate(const migraphx::shape&, const std::string&) const { return {}; }
std::string copy() const { return {}; }
bool needs_out_params() const { return true; }
};
void run_pass(migraphx::module& m, migraphx::allocation_model model, bool offload_copy = false)
{
migraphx::run_passes(m,
{migraphx::replace_allocate{std::move(model), offload_copy},
migraphx::dead_code_elimination{}});
}
void run_pass(migraphx::program& p, migraphx::allocation_model model, bool offload_copy = false)
{
migraphx::run_passes(p,
{migraphx::replace_allocate{std::move(model), offload_copy},
migraphx::dead_code_elimination{}});
}
migraphx::module create_simple_program()
{
migraphx::module m;
migraphx::shape s{migraphx::shape::float_type, {5}};
auto x = m.add_parameter("x", s);
auto y = m.add_parameter("y", s);
auto alloc =
m.add_instruction(migraphx::make_op("allocate", {{"shape", migraphx::to_value(s)}}));
m.add_instruction(pass_op{}, alloc, x, y);
return m;
}
TEST_CASE(allocate_no_out)
{
migraphx::module m = create_simple_program();
run_pass(m, allocation_no_out_model{});
EXPECT(std::any_of(m.begin(), m.end(), [](const migraphx::instruction& ins) {
return migraphx::contains(ins.name(), "allocate_no_out");
}));
}
TEST_CASE(allocate_with_out_param)
{
migraphx::module m = create_simple_program();
run_pass(m, allocation_with_out_model{});
EXPECT(std::none_of(m.begin(), m.end(), [](const migraphx::instruction& ins) {
return migraphx::contains(ins.name(), "allocate");
}));
}
TEST_CASE(allocate_with_out_return)
{
migraphx::module m = create_simple_program();
m.add_return({std::prev(m.end())});
run_pass(m, allocation_with_out_model{});
EXPECT(std::none_of(m.begin(), m.end(), [](const migraphx::instruction& ins) {
return migraphx::contains(ins.name(), "allocate");
}));
}
TEST_CASE(allocate_with_out_no_params)
{
migraphx::module m;
migraphx::shape s{migraphx::shape::float_type, {5}};
auto x = m.add_parameter("x", s);
auto y = m.add_parameter("y", s);
auto z = m.add_parameter("z", s);
auto alloc =
m.add_instruction(migraphx::make_op("allocate", {{"shape", migraphx::to_value(s)}}));
auto pass1 = m.add_instruction(pass_op{}, alloc, x, y);
auto alloc2 =
m.add_instruction(migraphx::make_op("allocate", {{"shape", migraphx::to_value(s)}}));
m.add_instruction(pass_op{}, alloc2, z, pass1);
run_pass(m, allocation_with_out_model{});
EXPECT(std::any_of(m.begin(), m.end(), [](const migraphx::instruction& ins) {
return migraphx::contains(ins.name(), "allocate_with_out");
}));
}
TEST_CASE(if_allocate)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape cond_s{migraphx::shape::bool_type};
auto cond = mm->add_parameter("cond", cond_s);
migraphx::shape s{migraphx::shape::float_type, {5}};
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
auto* then_mod = p.create_module("If_0_if");
auto alloc = then_mod->add_instruction(
migraphx::make_op("allocate", {{"shape", migraphx::to_value(s)}}));
auto a1 = then_mod->add_instruction(pass_op{}, alloc, x);
then_mod->add_return({a1});
auto* else_mod = p.create_module("If_0_else");
auto alloc1 = else_mod->add_instruction(
migraphx::make_op("allocate", {{"shape", migraphx::to_value(s)}}));
auto a2 = else_mod->add_instruction(pass_op{}, alloc1, y);
else_mod->add_return({a2});
mm->add_instruction(migraphx::make_op("if"), {cond}, {then_mod, else_mod});
run_pass(p, allocation_with_out_model{});
EXPECT(std::any_of(mm->begin(), mm->end(), [](const migraphx::instruction& ins) {
return migraphx::contains(ins.name(), "allocate_with_out");
}));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
@@ -28,6 +28,8 @@ struct allocation_model
     operation allocate(const shape& s) const;
     /// Create a preallocated operator for the given shape
     operation preallocate(const shape& s, const std::string& id) const;
+    /// Check if outputs are to be inserted
+    bool needs_out_params() const;
 };

 #else
@@ -37,7 +39,8 @@ interface('allocation_model',
     virtual('name', returns='std::string', const=True),
     virtual('copy', returns='std::string', const=True),
     virtual('allocate', s='const shape&', returns='operation', const=True),
-    virtual('preallocate', s='const shape&', id='std::string', returns='operation', const=True)
+    virtual('preallocate', s='const shape&', id='std::string', returns='operation', const=True),
+    virtual('needs_out_params', returns='bool', const=True)
 )
 %>