Unverified commit 9fee7233 authored by Chris Austen, committed by GitHub

Merge pull request #2019 from ROCmSoftwarePlatform/rel57_workitems

parents 0bc60894 97cc1dfc
......@@ -29,6 +29,7 @@
#include <migraphx/gpu/config.hpp>
#include <migraphx/gpu/code_object_op.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/gpu/tuning_config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -36,16 +37,19 @@ struct module;
namespace gpu {
MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m);
MIGRAPHX_GPU_EXPORT code_object_op compile_mlir(const context& ctx,
module m,
const std::vector<instruction_ref>& inputs);
const std::vector<instruction_ref>& inputs,
const value& solution);
MIGRAPHX_GPU_EXPORT instruction_ref insert_mlir(module& m,
instruction_ref ins,
code_object_op co,
const std::vector<instruction_ref>& inputs);
MIGRAPHX_GPU_EXPORT tuning_config get_tuning_config_mlir(module m,
const std::vector<shape>& inputs);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
#define MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct tuning_config
{
value problem;
std::vector<value> solutions;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
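
Note: tuning_config is deliberately minimal: problem is an opaque key identifying the kernel being tuned, and solutions carries the serialized candidate configs. An illustration with invented values (real keys and perf configs come from the MLIR rock tuning API, not from this header):

migraphx::gpu::tuning_config tc;
tc.problem = std::string{"conv2d nchw 1x64x56x56"};             // hypothetical problem key
tc.solutions.push_back(std::string{"v2:64,64,8,16,16,8,1,1"});  // hypothetical perf configs
tc.solutions.push_back(std::string{"v2:128,64,8,16,16,4,1,1"});
// a tuner benchmarks each solution and caches the winner under tc.problem
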
......@@ -36,11 +36,12 @@ struct mlir_compiler : compiler<mlir_compiler>
operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
compiler_replace compile(context& ctx, instruction_ref ins, const operation&) const
compiler_replace
compile(context& ctx, instruction_ref ins, const operation&, const value& solution) const
{
auto* smod = ins->module_inputs().front();
assert(smod->get_parameter_names().size() == ins->inputs().size() - 1);
return insert(compile_mlir(ctx, *smod, ins->inputs()));
return insert(compile_mlir(ctx, *smod, ins->inputs(), solution));
}
compiler_replace insert(code_object_op co) const
......@@ -50,6 +51,16 @@ struct mlir_compiler : compiler<mlir_compiler>
m.replace_instruction(ins, mlir);
}};
}
optional<tuning_config>
get_tuning_config(context&, instruction_ref ins, const operation&, bool exhaustive) const
{
if(not exhaustive)
return nullopt;
auto shapes = to_shapes(ins->inputs());
auto* smod = ins->module_inputs().front();
return get_tuning_config_mlir(*smod, shapes);
}
};
} // namespace gpu
......
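
Note: get_tuning_config only reports a tuning space when exhaustive tuning was requested, so the default compile path never pays for candidate enumeration. A hedged sketch of the caller side (c, ctx, ins, op, and exhaustive are assumed to be in scope; this is not code from the PR):

if(auto tc = c.get_tuning_config(ctx, ins, op, exhaustive))
{
    // benchmark tc->solutions and pass the winner as `solution` to compile()
}
else
{
    // no tuning space (or exhaustive == false): compile with a null solution
}
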
......@@ -72,7 +72,7 @@ struct pointwise_compiler : compiler<pointwise_compiler>
hip_compile_options options;
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
options.virtual_inputs = reduce_dims(normalize_permutation(inputs));
options.params = "-Wno-float-equal";
auto axis = find_fast_axis(options.virtual_inputs);
auto vec = vectorize::elements(ctx, axis, options.virtual_inputs);
......
......@@ -84,7 +84,7 @@ static shape get_reduced_shape(const shape& s, const std::vector<T>& axes)
std::fill(lens.begin(), lens.end(), 1);
for(const auto& axis : axes)
lens[axis] = s.lens()[axis];
return shape{s.type(), lens};
return s.with_lens(lens);
}
template <class T>
......@@ -93,7 +93,7 @@ static shape get_output_shape(const shape& s, const std::vector<T>& axes)
auto lens = s.lens();
for(const auto& axis : axes)
lens[axis] = 1;
return shape{s.type(), lens};
return s.with_lens(lens);
}
template <class ReduceLens>
......@@ -228,7 +228,7 @@ struct fused_reduce_compiler : compiler<fused_reduce_compiler>
auto virtual_inputs = inputs;
virtual_inputs.push_back(get_reduced_shape(inputs.front(), axes));
virtual_inputs.push_back(get_output_shape(inputs.front(), axes));
virtual_inputs = reduce_dims(virtual_inputs);
virtual_inputs = reduce_dims(normalize_permutation(virtual_inputs));
auto reduce_output_shape = virtual_inputs.back();
virtual_inputs.pop_back();
auto reduction_shape = virtual_inputs.back();
......
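
Note: the pointwise and fused-reduce hunks above are both about layout. shape{s.type(), lens} would silently reset a permuted (e.g. NHWC) tensor to standard row-major strides, while s.with_lens(lens) re-applies the original permutation to the new lens; likewise normalize_permutation rewrites all inputs into one consistent ordering before reduce_dims collapses dimensions. A hedged sketch of the difference (shapes invented; reorder_shape is from migraphx/permutation.hpp):

#include <migraphx/shape.hpp>
#include <migraphx/permutation.hpp>

void layout_demo()
{
    migraphx::shape x{migraphx::shape::float_type, {2, 3, 4, 5}}; // standard NCHW
    auto nhwc = migraphx::reorder_shape(x, {0, 2, 3, 1});         // transposed view
    // shape{type, lens} recomputes row-major strides and loses the layout:
    migraphx::shape reset{nhwc.type(), nhwc.lens()};              // reset.standard() == true
    // with_lens keeps nhwc's permutation while substituting the new lens:
    auto kept = nhwc.with_lens(nhwc.lens());                      // kept.standard() == false
}
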
......@@ -52,6 +52,7 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/perfdb.hpp>
#include <migraphx/gpu/tuning_config.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/permutation.hpp>
#include <deque>
......@@ -134,6 +135,10 @@ using mlir_block = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockD
using mlir_pass_manager = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy);
using mlir_tuning_table = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningTable,
mlirRockTuningTableDestroy);
using mlir_tuning_space = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningSpace,
mlirRockTuningSpaceDestroy);
using mlir_tuning_param = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningParam,
mlirRockTuningParamDestroy);
std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; }
......@@ -616,18 +621,30 @@ struct mlir_program
}
}
code_object_op compile() MIGRAPHX_TIDY_CONST
void run_high_level_pipeline() MIGRAPHX_TIDY_CONST
{
mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())};
mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
// 1st pipeline to call
mlirMIGraphXAddHighLevelPipeline(pm_front.get());
mlirPassManagerRunOnOp(pm_front.get(), mlirModuleGetOperation(mmodule.get()));
}
// 2nd pipeline to call
get_module_tuned();
void run_backend_pipeline() MIGRAPHX_TIDY_CONST
{
mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str());
mlirPassManagerRunOnOp(pm_back.get(), mlirModuleGetOperation(mmodule.get()));
}
code_object_op compile(const value& solution) MIGRAPHX_TIDY_CONST
{
// 1st pipeline to call
run_high_level_pipeline();
if(solution.is_null())
get_module_tuned();
else
set_tuning(solution);
// 2nd pipeline to call
run_backend_pipeline();
code_object_op op{};
op.symbol_name = sym_name;
......@@ -658,6 +675,33 @@ struct mlir_program
MIGRAPHX_THROW("Failed to compile mlir program");
}
void set_tuning(const value& v)
{
auto str = v.to<std::string>();
// We need to make a copy of the buffer since mlirRockTuningSetFromStr may modify the string
std::vector<char> buffer(str.begin(), str.end());
buffer.push_back(0);
if(not mlirRockTuningSetFromStr(mmodule.get(), buffer.data()))
MIGRAPHX_THROW("Failed setting tuning key: " + str);
}
tuning_config get_tuning_config() MIGRAPHX_TIDY_CONST
{
tuning_config tc;
run_high_level_pipeline();
mlir_tuning_space params{mlirRockTuningSpaceCreate(mmodule.get())};
for(auto i : range(mlirRockTuningGetNumParamsFull(params.get())))
{
mlir_tuning_param param{mlirRockTuningParamCreate()};
if(not mlirRockTuningParamGet(params.get(), i, param.get()))
MIGRAPHX_THROW("Incorrect mlir tuning parameter: " + std::to_string(i));
tc.solutions.push_back(std::string{mlirRockTuningGetParamStr(param.get())});
}
mlir_tuning_table tuning_table{mlirRockTuningTableCreate()};
tc.problem = std::string{mlirRockTuningGetKey(tuning_table.get(), mmodule.get())};
return tc;
}
std::string get_tune_params(bool xdlops) const { return get_mlir_perf_for_conv(pp, xdlops); }
// This function appends to tuning cfg file that could be
......@@ -749,14 +793,14 @@ std::string dump_mlir(const module& m)
return mlir_print(&mlirOperationPrint, mod_op);
}
void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
void adjust_param_shapes(module& m, const std::vector<shape>& inputs)
{
auto names = m.get_parameter_names();
std::sort(names.begin(), names.end());
for(auto i : range(names.size()))
{
const auto& name = names[i];
const auto& input = inputs[i]->get_shape();
const auto& input = inputs[i];
auto param = m.get_parameter(name);
if(input.standard())
continue;
......@@ -794,9 +838,12 @@ void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
}
}
code_object_op compile_mlir(const context&, module m, const std::vector<instruction_ref>& inputs)
code_object_op compile_mlir(const context&,
module m,
const std::vector<instruction_ref>& inputs,
const value& solution)
{
adjust_param_shapes(m, inputs);
adjust_param_shapes(m, to_shapes(inputs));
const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
if(trace)
......@@ -808,8 +855,9 @@ code_object_op compile_mlir(const context&, module m, const std::vector<instruct
auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
if(trace)
std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl;
auto co = mp.compile();
co.output = m.get_output_shapes().front();
auto co = mp.compile(solution);
co.expected_inputs = to_shapes(inputs);
co.output = m.get_output_shapes().front();
return co;
}
......@@ -829,6 +877,16 @@ instruction_ref insert_mlir(module& m,
return m.insert_instruction(ins, co, refs);
}
tuning_config get_tuning_config_mlir(module m, const std::vector<shape>& inputs)
{
adjust_param_shapes(m, inputs);
mlir_program mp;
mp.find_target();
mp.parse(m);
return mp.get_tuning_config();
}
#else
std::string dump_mlir(const module&) { return {}; }
......@@ -840,11 +898,11 @@ void use(T&)
// Disabling clang-tidy warning on non-real usage.
// NOLINTBEGIN(performance-unnecessary-value-param)
code_object_op compile_mlir(const context&, module, const std::vector<instruction_ref>&)
code_object_op
compile_mlir(const context&, module, const std::vector<instruction_ref>&, const value&)
{
return {};
}
// NOLINTEND(performance-unnecessary-value-param)
instruction_ref
// cppcheck-suppress funcArgNamesDifferent
......@@ -854,6 +912,9 @@ insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector<ins
return m.end();
}
tuning_config get_tuning_config_mlir(module, const std::vector<shape>&) { return {}; }
// NOLINTEND(performance-unnecessary-value-param)
#endif
} // namespace gpu
......
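
Note: taken together, the new entry points compose into a find-then-replay flow: get_tuning_config_mlir runs only the high-level pipeline and enumerates candidates, an external tuner benchmarks them, and compile_mlir replays the winner through set_tuning (a null solution falls back to get_module_tuned). A hedged driver sketch, where time_code_object is a hypothetical benchmark helper and everything else uses the signatures introduced in this PR:

value find_best_solution(const context& ctx,
                         const module& m,
                         const std::vector<instruction_ref>& inputs)
{
    auto tc = get_tuning_config_mlir(m, to_shapes(inputs));
    value best{}; // null -> compile_mlir falls back to perf-db defaults
    double best_time = std::numeric_limits<double>::max();
    for(const auto& solution : tc.solutions)
    {
        auto co = compile_mlir(ctx, m, inputs, solution);
        double t = time_code_object(ctx, co); // hypothetical
        if(t < best_time)
        {
            best_time = t;
            best      = solution;
        }
    }
    return best; // a real tuner would cache this under tc.problem
}
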
......@@ -75,7 +75,9 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
#ifndef _WIN32
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
#endif
struct id_pass
{
......@@ -136,7 +138,9 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
dead_code_elimination{},
#ifndef _WIN32
enable_pass(enabled(MIGRAPHX_ENABLE_CK{}), fuse_ck{}),
#endif
dead_code_elimination{},
enable_pass(mlir_enabled(), fuse_mlir{&ctx}),
dead_code_elimination{},
......
......@@ -52,7 +52,6 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
auto x_type = args[0]->get_shape().type();
// unsqueeze tensors of shape (C) to broadcast correctly
auto rt = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
auto scale_unsqueeze =
......@@ -64,11 +63,11 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
auto var_unsqueeze =
info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[4]);
auto numer = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
auto denom = info.add_broadcastable_binary_op("pow", var_eps, rt);
auto div0 = info.add_broadcastable_binary_op("div", numer, denom);
auto r0 = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
auto x_sub_mean = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
auto rsqrt = info.add_instruction(make_op("rsqrt"), var_eps);
auto mul0 = info.add_broadcastable_binary_op("mul", scale_unsqueeze, rsqrt);
auto r0 = info.add_broadcastable_binary_op("mul", x_sub_mean, mul0);
return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
}
};
......
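
Note: the parser change is a pure algebraic rewrite of batch-norm inference, trading a broadcast pow and div for a per-channel rsqrt and mul:

\[
y = \gamma\,\frac{x-\mu}{\sqrt{\sigma^2+\epsilon}} + \beta
  = (x-\mu)\cdot\bigl(\gamma\cdot\operatorname{rsqrt}(\sigma^2+\epsilon)\bigr) + \beta,
\qquad \operatorname{rsqrt}(t) = 1/\sqrt{t}.
\]

Since \(\gamma\), \(\mu\), and \(\sigma^2\) are per-channel tensors of shape (C), the factor \(\gamma\cdot\operatorname{rsqrt}(\sigma^2+\epsilon)\) is computed over C elements once and then broadcast, instead of evaluating pow and div across the whole input.
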
......@@ -36,7 +36,7 @@ endfunction()
function(add_c_api_test TEST_NAME TEST_SRC TEST_DIR)
set(NAME test_api_${TEST_NAME})
add_executable(${NAME} EXCLUDE_FROM_ALL ${TEST_SRC})
target_link_libraries(${NAME} migraphx_c migraphx)
target_link_libraries(${NAME} migraphx_c)
target_include_directories(${NAME} PUBLIC ../include)
add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${NAME}> WORKING_DIRECTORY ${TEST_DIR})
add_dependencies(tests ${NAME})
......
......@@ -99,7 +99,7 @@ TEST_CASE(run_sigmoid_custom_op)
EXPECT(bool{result == migraphx::argument(s, expected_result.data())});
}
extern "C" void migraphx_test_private_disable_exception_catch(bool b);
extern "C" MIGRAPHX_C_EXPORT void migraphx_test_private_disable_exception_catch(bool);
TEST_CASE(run_sigmoid_with_incorrect_shape)
{
......
......@@ -34,7 +34,6 @@ TEST_CASE(load_and_run)
auto shapes_before = p.get_output_shapes();
migraphx::compile_options options;
options.set_offload_copy();
options.set_exhaustive_tune_flag();
p.compile(migraphx::target("gpu"), options);
auto shapes_after = p.get_output_shapes();
CHECK(shapes_before.size() == 1);
......
......@@ -84,7 +84,7 @@ migraphx::program create_program_from_mlir(const migraphx::module& mmlir)
inputs.push_back(mm->add_parameter("output", mmlir.get_output_shapes().front()));
migraphx::gpu::context ctx;
migraphx::gpu::insert_mlir(*mm, mm->end(), compile_mlir(ctx, mmlir, inputs), inputs);
migraphx::gpu::insert_mlir(*mm, mm->end(), compile_mlir(ctx, mmlir, inputs, {}), inputs);
return p;
}
......
......@@ -384,7 +384,7 @@ bool throws(F f, const std::string& msg = "")
}
template <class T, class U>
auto near(T px, U py, double ptol = 1e-6f)
auto within_abs(T px, U py, double ptol = 1e-6f)
{
return make_function("within_abs", [](auto x, auto y, auto tol) { return std::abs(x - y) < tol; })(
px, py, ptol);
......
......@@ -82,9 +82,9 @@ TEST_CASE(generate_module)
auto f = compile_module<float(float, float)>(m);
EXPECT(test::near(f(2, 2), 2));
EXPECT(test::near(f(10, 6), 4));
EXPECT(test::near(f(1, 2), std::sqrt(3)));
EXPECT(test::within_abs(f(2, 2), 2));
EXPECT(test::within_abs(f(10, 6), 4));
EXPECT(test::within_abs(f(1, 2), std::sqrt(3)));
}
TEST_CASE(generate_module_with_literals)
......@@ -99,9 +99,9 @@ TEST_CASE(generate_module_with_literals)
auto f = compile_module<float(float, float)>(m);
EXPECT(test::near(f(1, 2), 2));
EXPECT(test::near(f(9, 6), 4));
EXPECT(test::near(f(0, 2), std::sqrt(3)));
EXPECT(test::within_abs(f(1, 2), 2));
EXPECT(test::within_abs(f(9, 6), 4));
EXPECT(test::within_abs(f(0, 2), std::sqrt(3)));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -34,13 +34,13 @@
#include <migraphx/literal.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/verify.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/functional.hpp>
#include <basic_ops.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/generate.hpp>
#include "test.hpp"
// check if it is a custom_op or run_on_module operator
......@@ -180,38 +180,74 @@ TEST_CASE(multitarget_compile_cpu_gpu)
auto z_param = mm->add_parameter("z", s);
auto cpu_ins = mm->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 1}}), {x_param, y_param}, {cpu_mod});
auto cpu_ins_0 =
mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), cpu_ins);
auto gpu_ins = mm->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 0}}), {cpu_ins, z_param}, {gpu_mod});
mm->add_return({gpu_ins});
p.compile({migraphx::make_target("gpu"), migraphx::make_target("cpu")});
migraphx::make_op("run_on_target", {{"target_id", 0}}), {cpu_ins_0, z_param}, {gpu_mod});
auto gpu_ins_0 =
mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), gpu_ins);
mm->add_return({gpu_ins_0});
migraphx::compile_options gpu_opts;
gpu_opts.offload_copy = true;
p.compile({migraphx::make_target("gpu"), migraphx::make_target("cpu")}, {gpu_opts});
EXPECT(check_compiled_program(p, {migraphx::make_target("gpu"), migraphx::make_target("cpu")}));
migraphx::parameter_map params;
params["x"] = migraphx::fill_argument(s, 1);
params["y"] = migraphx::fill_argument(s, 2);
params["z"] = migraphx::fill_argument(s, 3);
auto result = p.eval(params).back();
auto gold = migraphx::fill_argument(s, 6);
EXPECT(gold == result);
}
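
Note: the updated tests all follow one idiom, restated below with placeholder names (arg and submod are not from the test): run_on_target now returns a tuple, so every call is followed by a get_tuple_elem before the value is used or returned.

auto tuple_ins = mm->add_instruction(
    migraphx::make_op("run_on_target", {{"target_id", 0}}), {arg}, {submod});
auto out = mm->add_instruction(
    migraphx::make_op("get_tuple_elem", {{"index", 0}}), tuple_ins);
mm->add_return({out});
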
TEST_CASE(single_target_compile)
TEST_CASE(single_target_multi_compile)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
auto* mm = p.get_main_module();
auto boxes_param = mm->add_parameter("boxes", boxes_s);
auto* gpu_mod = p.create_module("gpu_mod");
auto boxes_param_gpu = gpu_mod->add_parameter("boxes_param_gpu", boxes_s);
migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
auto boxes_l = mm->add_parameter("boxes", boxes_s);
auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec));
auto max_out_l = mm->add_literal(int64_t{4});
auto iou_threshold = mm->add_literal(0.5f);
auto score_threshold = mm->add_literal(0.0f);
auto r = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
boxes_l,
scores_l,
max_out_l,
iou_threshold,
score_threshold);
mm->add_return({r});
p.compile(migraphx::make_target("gpu"));
EXPECT(is_compiled_gpu_module(*p.get_main_module()));
auto scores_l = gpu_mod->add_literal(migraphx::literal(scores_s, scores_vec));
auto max_out_l = gpu_mod->add_literal(int64_t{4});
auto iou_threshold = gpu_mod->add_literal(0.5f);
auto score_threshold = gpu_mod->add_literal(0.0f);
auto r = gpu_mod->add_instruction(
migraphx::make_op("nonmaxsuppression",
{{"center_point_box", true}, {"use_dyn_output", true}}),
boxes_param_gpu,
scores_l,
max_out_l,
iou_threshold,
score_threshold);
gpu_mod->add_return({r});
auto run_on_gpu = mm->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 0}}), {boxes_param}, {gpu_mod});
auto run_on_gpu_0 =
mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_on_gpu);
mm->add_return({run_on_gpu_0});
// compile using multi-target compilation path
migraphx::compile_options gpu_opts;
gpu_opts.offload_copy = true;
// need to add "ref" to avoid ambigious call to "compile()"
p.compile({migraphx::make_target("gpu"), migraphx::make_target("ref")}, {gpu_opts});
EXPECT(check_compiled_program(p, {migraphx::make_target("gpu"), migraphx::make_target("ref")}));
// eval
migraphx::parameter_map params;
std::vector<float> boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0,
0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data());
auto output = p.eval(params).back();
std::vector<int64_t> gold_vec = {0, 0, 3, 0, 0, 0, 0, 0, 5};
auto gold =
migraphx::argument(migraphx::shape{migraphx::shape::int64_type, {3, 3}}, gold_vec.data());
EXPECT(output == gold);
}
TEST_CASE(multitarget_compile_if_then_else)
......@@ -224,54 +260,65 @@ TEST_CASE(multitarget_compile_if_then_else)
auto x = mm->add_parameter("x", ds);
auto y = mm->add_parameter("y", ds);
auto* then_mod = p.create_module("if_gpu_mod");
std::vector<float> data1 = {0.384804, -1.77948, -0.453775, 0.477438, -1.06333, -1.12893};
auto l1 = then_mod->add_literal(migraphx::literal(ds, data1));
auto a1 = then_mod->add_instruction(migraphx::make_op("add"), x, l1);
auto* then_mod = p.create_module("if_gpu_mod");
std::vector<float> data1(ds.elements(), 1);
auto l1 = then_mod->add_literal(migraphx::literal(ds, data1));
auto gpu_x = then_mod->add_parameter("gpu_x", ds);
auto a1 = then_mod->add_instruction(migraphx::make_op("add"), gpu_x, l1);
then_mod->add_return({a1});
auto* else_mod = p.create_module("else_cpu_mod");
std::vector<float> data2 = {-0.258047, 0.360394, 0.536804, -0.577762, 1.0217, 1.02442};
auto l2 = else_mod->add_literal(migraphx::literal(ds, data2));
auto a2 = else_mod->add_instruction(migraphx::make_op("mul"), y, l2);
auto* else_mod = p.create_module("else_cpu_mod");
std::vector<float> data2(ds.elements(), 2);
auto l2 = else_mod->add_literal(migraphx::literal(ds, data2));
auto cpu_y = else_mod->add_parameter("cpu_y", ds);
auto a2 = else_mod->add_instruction(migraphx::make_op("mul"), cpu_y, l2);
else_mod->add_return({a2});
auto* run_on_cpu_mod = p.create_module("run_on_cpu");
auto run_cpu_ins = run_on_cpu_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 1}}), {}, {else_mod});
run_on_cpu_mod->add_return({run_cpu_ins});
migraphx::make_op("run_on_target", {{"target_id", 1}}), {y}, {else_mod});
auto run_cpu_ins_0 = run_on_cpu_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_cpu_ins);
run_on_cpu_mod->add_return({run_cpu_ins_0});
auto* run_on_gpu_mod = p.create_module("run_on_gpu");
auto run_gpu_ins = run_on_gpu_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 0}}), {}, {then_mod});
run_on_gpu_mod->add_return({run_gpu_ins});
migraphx::make_op("run_on_target", {{"target_id", 0}}), {x}, {then_mod});
auto run_gpu_ins_0 = run_on_gpu_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_gpu_ins);
run_on_gpu_mod->add_return({run_gpu_ins_0});
auto ret =
mm->add_instruction(migraphx::make_op("if"), {cond}, {run_on_gpu_mod, run_on_cpu_mod});
auto r = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), ret);
mm->add_return({r});
// compile
p.compile({migraphx::make_target("gpu"), migraphx::make_target("cpu")});
migraphx::compile_options gpu_opts;
gpu_opts.offload_copy = true;
p.compile({migraphx::make_target("gpu"), migraphx::make_target("cpu")}, {gpu_opts});
EXPECT(check_compiled_program(p, {migraphx::make_target("gpu"), migraphx::make_target("cpu")}));
migraphx::parameter_map params;
params["x"] = migraphx::fill_argument(ds, 2);
params["y"] = migraphx::fill_argument(ds, 3);
for(bool cond_val : {true, false})
{
params["cond"] = migraphx::argument(cond_s, &cond_val);
auto result = p.eval(params).back();
auto gold = migraphx::fill_argument(ds, (cond_val ? 3 : 6));
EXPECT(gold == result);
}
}
// TODO: FPGA compilation is broken right now; the test below mentions fpga but doesn't compile for it
TEST_CASE(multitarget_compile_nested_if_then_else)
{
float seed = 0.0f;
std::mt19937 gen(seed);
std::uniform_real_distribution<> dis(0.0, 1.0);
auto get_random_values = [&](size_t elements) {
std::vector<float> rand_samples(elements);
std::generate(rand_samples.begin(), rand_samples.end(), [&]() { return dis(gen); });
return rand_samples;
};
std::unordered_map<std::size_t, std::size_t> counter_map = {{0, 0}, {1, 0}};
migraphx::shape ds{migraphx::shape::float_type, {2, 3}};
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape cond_s{migraphx::shape::bool_type};
auto cond = mm->add_parameter("cond", cond_s);
auto cond_0 = mm->add_parameter("cond_0", cond_s);
auto cond_1 = mm->add_parameter("cond_1", cond_s);
auto x = mm->add_parameter("x", ds);
auto y = mm->add_parameter("y", ds);
auto z = mm->add_parameter("z", ds);
......@@ -280,20 +327,22 @@ TEST_CASE(multitarget_compile_nested_if_then_else)
std::size_t tid) {
std::string mod_name =
"target_" + std::to_string(tid) + "_" + std::to_string(counter_map[tid]++);
auto* test_mod = prog.create_module(mod_name);
std::vector<float> data = get_random_values(ds.elements());
auto l1 = test_mod->add_literal(migraphx::literal(ds, data));
auto test_mod_param = test_mod->add_parameter(mod_name, ds);
// instruction with local literal and main_mod param as inputs
auto ins1 = test_mod->add_instruction(migraphx::make_op("add"), x, l1);
// instruction with local param and local ins as inputs
auto ins2 = test_mod->add_instruction(migraphx::make_op("mul"), ins1, test_mod_param);
// instruction with local ins and parent ins as inputs
auto ins3 = test_mod->add_instruction(migraphx::make_op("sub"), ins2, inputs.front());
auto* test_mod = prog.create_module(mod_name);
std::vector<float> data(ds.elements(), -1);
auto l1 = test_mod->add_literal(migraphx::literal(ds, data));
auto test_mod_param_0 = test_mod->add_parameter(mod_name + "_param_0", ds);
auto test_mod_param_1 = test_mod->add_parameter(mod_name + "_param_1", ds);
auto test_mod_param_2 = test_mod->add_parameter(mod_name + "_param_2", ds);
auto ins1 = test_mod->add_instruction(migraphx::make_op("add"), test_mod_param_0, l1);
auto ins2 = test_mod->add_instruction(migraphx::make_op("mul"), ins1, test_mod_param_1);
auto ins3 = test_mod->add_instruction(migraphx::make_op("sub"), ins2, test_mod_param_2);
test_mod->add_return({ins3});
auto* run_on_target_mod = prog.create_module("run_on_" + mod_name);
run_on_target_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", tid}}), {inputs.front()}, {test_mod});
auto run_ins = run_on_target_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", tid}}), inputs, {test_mod});
auto run_ins_0 = run_on_target_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_ins);
run_on_target_mod->add_return({run_ins_0});
return run_on_target_mod;
};
......@@ -307,15 +356,30 @@ TEST_CASE(multitarget_compile_nested_if_then_else)
ref_mod->add_return({ref_add});
auto* then_mod = p.create_module("then_mod");
auto then_mod_param = then_mod->add_parameter("then_mod_param", ds);
auto then_mod_ref_ins = then_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 3}}), {then_mod_param, y}, {ref_mod});
auto then_mod_cond = then_mod->add_parameter("then_mod_cond", cond_s);
auto then_mod_param_0 = then_mod->add_parameter("then_mod_param_0", ds);
auto then_mod_param_1 = then_mod->add_parameter("then_mod_param_1", ds);
auto then_mod_param_2 = then_mod->add_parameter("then_mod_param_2", ds);
auto then_mod_ref_ins =
then_mod->add_instruction(migraphx::make_op("run_on_target", {{"target_id", 3}}),
{then_mod_param_0, then_mod_param_1},
{ref_mod});
auto then_mod_ref_ins_0 = then_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), then_mod_ref_ins);
then_mod->add_instruction(
auto then_mod_if = then_mod->add_instruction(
migraphx::make_op("if"),
{cond},
{create_test_module(p, {z}, 1), create_test_module(p, {then_mod_ref_ins_0}, 0)});
{then_mod_cond,
then_mod_param_0,
then_mod_param_1,
then_mod_param_2,
then_mod_ref_ins_0,
then_mod_param_1,
then_mod_param_2},
{create_test_module(p, {then_mod_param_0, then_mod_param_1, then_mod_param_2}, 1),
create_test_module(p, {then_mod_ref_ins_0, then_mod_param_1, then_mod_param_2}, 0)});
auto then_mod_if_0 =
then_mod->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), then_mod_if);
then_mod->add_return({then_mod_if_0});
// create nested else_mod with multiple targets.
// else_mod has one instruction that runs a module on "fpga" and another instruction that
......@@ -326,53 +390,105 @@ TEST_CASE(multitarget_compile_nested_if_then_else)
auto fpga_add = fpga_mod->add_instruction(migraphx::make_op("add"), fpga_x, fpga_y);
fpga_mod->add_return({fpga_add});
auto* else_mod = p.create_module("else_mod");
auto else_mod_param = else_mod->add_parameter("else_mod_param", ds);
auto else_mod_fpga_ins = else_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 2}}), {else_mod_param, y}, {fpga_mod});
auto* else_mod = p.create_module("else_mod");
auto else_mod_cond = else_mod->add_parameter("else_mod_cond", cond_s);
auto else_mod_param_0 = else_mod->add_parameter("else_mod_param_0", ds);
auto else_mod_param_1 = else_mod->add_parameter("else_mod_param_1", ds);
auto else_mod_param_2 = else_mod->add_parameter("else_mod_param_2", ds);
auto else_mod_fpga_ins =
else_mod->add_instruction(migraphx::make_op("run_on_target", {{"target_id", 2}}),
{else_mod_param_0, else_mod_param_2},
{fpga_mod});
auto else_mod_fpga_ins_0 = else_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), else_mod_fpga_ins);
else_mod->add_instruction(migraphx::make_op("if"),
{cond},
{create_test_module(p, {else_mod_fpga_ins_0}, 0),
create_test_module(p, {else_mod_param}, 1)});
auto else_mod_if = else_mod->add_instruction(
migraphx::make_op("if"),
{else_mod_cond,
else_mod_fpga_ins_0,
else_mod_param_0,
else_mod_param_1,
else_mod_param_2,
else_mod_param_1,
else_mod_param_0},
{create_test_module(p, {else_mod_fpga_ins_0, else_mod_param_0, else_mod_param_1}, 0),
create_test_module(p, {else_mod_param_2, else_mod_param_1, else_mod_param_0}, 1)});
auto else_mod_if_0 =
else_mod->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), else_mod_if);
else_mod->add_return({else_mod_if_0});
// Create nested and multi-target main module using "If"
auto main_if_ins =
mm->add_instruction(migraphx::make_op("if"), {cond, x}, {then_mod, else_mod});
auto main_if_ins = mm->add_instruction(
migraphx::make_op("if"), {cond_0, cond_1, x, y, z, cond_1, x, y, z}, {then_mod, else_mod});
auto r = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), main_if_ins);
mm->add_return({r});
// compile
migraphx::compile_options gpu_opts;
gpu_opts.offload_copy = true;
p.compile({migraphx::make_target("gpu"),
migraphx::make_target("cpu"),
migraphx::make_target("fpga"),
migraphx::make_target("ref")});
migraphx::make_target("ref"),
migraphx::make_target("ref")},
{gpu_opts});
EXPECT(check_compiled_program(p,
{migraphx::make_target("gpu"),
migraphx::make_target("cpu"),
migraphx::make_target("fpga"),
migraphx::make_target("ref"),
migraphx::make_target("ref")}));
// do evaluation using different conditions
// TODO: make two conditionals to cover all the paths
migraphx::parameter_map params;
float x_i = 2.0;
float y_i = 3.0;
float z_i = 4.0;
params["x"] = migraphx::fill_argument(ds, x_i);
params["y"] = migraphx::fill_argument(ds, y_i);
params["z"] = migraphx::fill_argument(ds, z_i);
// cover all paths with different combinations of conditions
std::vector<std::pair<bool, bool>> test_conds = {
{true, true}, {true, false}, {false, true}, {false, false}};
for(auto [cond_val_0, cond_val_1] : test_conds)
{
params["cond_0"] = migraphx::argument(cond_s, &cond_val_0);
params["cond_1"] = migraphx::argument(cond_s, &cond_val_1);
auto result = p.eval(params).back();
// main has one instruction: if_then_else
// then_mod computes: {tmp = x+y; (cond) ? (((x-1)*y)-z) : (((tmp-1)*y)-z);}
// else_mod computes: {tmp = x+z; (cond) ? (((tmp-1)*x)-y) : (((z-1)*y)-x);}
float gold_i = -1.0;
if(cond_val_0)
{
float tmp_i = x_i + y_i;
gold_i = (cond_val_1) ? (((x_i - 1) * y_i) - z_i) : (((tmp_i - 1) * y_i) - z_i);
}
else
{
float tmp_i = x_i + z_i;
gold_i = (cond_val_1) ? (((tmp_i - 1) * x_i) - y_i) : (((z_i - 1) * y_i) - x_i);
}
auto gold = migraphx::fill_argument(ds, gold_i);
EXPECT(gold == result);
}
}
// TODO: FPGA compilation is broken right now; the test below mentions fpga but doesn't compile for it
TEST_CASE(multitarget_select_module)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape lit_s{migraphx::shape{migraphx::shape::float_type, {1}}};
auto literal_ins = mm->add_literal(migraphx::literal{lit_s, {6}});
// create batch submodules
auto create_submodule = [&](std::size_t batch_size, const std::string& module_name) {
auto* submod = p.create_module(module_name);
migraphx::shape sm_shape{migraphx::shape::float_type, {batch_size, 4}};
auto sm_input = submod->add_parameter("data", sm_shape);
migraphx::shape lit_s{migraphx::shape{migraphx::shape::float_type, {1}}};
auto literal_ins = submod->add_literal(migraphx::literal{lit_s, {6}});
auto broadcast_lit =
submod->add_instruction(migraphx::make_op("multibroadcast"), literal_ins, sm_input);
auto add_ins0 = submod->add_instruction(migraphx::make_op("add"), sm_input, broadcast_lit);
auto add_ins1 = submod->add_instruction(migraphx::make_op("add"), add_ins0, broadcast_lit);
submod->add_return({add_ins0, add_ins1});
submod->add_return({add_ins1});
return submod;
};
auto* batch1 = create_submodule(1, "batch_1");
......@@ -380,36 +496,45 @@ TEST_CASE(multitarget_select_module)
auto* batch3 = create_submodule(3, "batch_3");
auto* batch4 = create_submodule(4, "batch_4");
migraphx::shape s{migraphx::shape::float_type, {{1, 4}, {4, 4}}};
auto input = mm->add_parameter("data", s);
auto* run_cpu_mod = p.create_module("cpu_mod");
auto cpu_param = run_cpu_mod->add_parameter(
"cpu_data", migraphx::shape{migraphx::shape::float_type, {1, 4}});
auto cpu_param =
run_cpu_mod->add_parameter("data", migraphx::shape{migraphx::shape::float_type, {1, 4}});
auto run_cpu_ins = run_cpu_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 1}}), {cpu_param}, {batch1});
run_cpu_mod->add_return({run_cpu_ins});
auto run_cpu_ins_0 = run_cpu_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_cpu_ins);
run_cpu_mod->add_return({run_cpu_ins_0});
auto* run_gpu_mod = p.create_module("gpu_mod");
auto gpu_param = run_gpu_mod->add_parameter(
"gpu_data", migraphx::shape{migraphx::shape::float_type, {2, 4}});
auto gpu_param =
run_gpu_mod->add_parameter("data", migraphx::shape{migraphx::shape::float_type, {2, 4}});
auto run_gpu_ins = run_gpu_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 0}}), {gpu_param}, {batch2});
run_gpu_mod->add_return({run_gpu_ins});
auto run_gpu_ins_0 = run_gpu_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_gpu_ins);
run_gpu_mod->add_return({run_gpu_ins_0});
auto* run_fpga_mod = p.create_module("fpga_mod");
auto fpga_param = run_fpga_mod->add_parameter(
"fpga_data", migraphx::shape{migraphx::shape::float_type, {3, 4}});
auto fpga_param =
run_fpga_mod->add_parameter("data", migraphx::shape{migraphx::shape::float_type, {3, 4}});
auto run_fpga_ins = run_fpga_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 2}}), {fpga_param}, {batch3});
run_fpga_mod->add_return({run_fpga_ins});
auto run_fpga_ins_0 = run_fpga_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_fpga_ins);
run_fpga_mod->add_return({run_fpga_ins_0});
auto* run_ref_mod = p.create_module("ref_mod");
auto ref_param = run_fpga_mod->add_parameter(
"ref_data", migraphx::shape{migraphx::shape::float_type, {4, 4}});
auto ref_param =
run_ref_mod->add_parameter("data", migraphx::shape{migraphx::shape::float_type, {4, 4}});
auto run_ref_ins = run_ref_mod->add_instruction(
migraphx::make_op("run_on_target", {{"target_id", 3}}), {ref_param}, {batch4});
run_ref_mod->add_return({run_ref_ins});
auto run_ref_ins_0 = run_ref_mod->add_instruction(
migraphx::make_op("get_tuple_elem", {{"index", 0}}), run_ref_ins);
run_ref_mod->add_return({run_ref_ins_0});
auto* mm = p.get_main_module();
migraphx::shape dyn_s{migraphx::shape::float_type, {{1, 4}, {4, 4}}};
auto input = mm->add_parameter("data", dyn_s);
std::vector<migraphx::shape> sub_shapes = {};
sub_shapes.push_back(migraphx::shape{migraphx::shape::float_type, {{1, 4}, {4, 4}}});
sub_shapes.push_back(migraphx::shape{migraphx::shape::float_type, {{1, 4}, {4, 4}}});
......@@ -419,18 +544,34 @@ TEST_CASE(multitarget_select_module)
{input},
{run_cpu_mod, run_gpu_mod, run_fpga_mod, run_ref_mod});
auto ret0 = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), sm_ins);
auto ret1 = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), sm_ins);
mm->add_return({ret0, ret1});
mm->add_return({ret0});
// compile
migraphx::compile_options gpu_opts;
gpu_opts.offload_copy = true;
p.compile({migraphx::make_target("gpu"),
migraphx::make_target("cpu"),
migraphx::make_target("fpga"),
migraphx::make_target("ref")});
migraphx::make_target("ref"),
migraphx::make_target("ref")},
{gpu_opts});
EXPECT(check_compiled_program(p,
{migraphx::make_target("gpu"),
migraphx::make_target("cpu"),
migraphx::make_target("fpga"),
migraphx::make_target("ref"),
migraphx::make_target("ref")}));
// the program computes 12 + x, where x has dynamic shape {{1, 4}, {4, 4}}
for(const size_t bs : {1, 2, 3, 4})
{
migraphx::shape arg_shape{migraphx::shape::float_type, {bs, 4}};
migraphx::parameter_map params;
params["data"] = migraphx::generate_argument(arg_shape, arg_shape.elements());
std::vector<float> input_data;
params["data"].visit([&](const auto& vec) { input_data.assign(vec.begin(), vec.end()); });
std::transform(input_data.begin(), input_data.end(), input_data.begin(), [](const auto& i) {
return i + 12.0;
});
auto result = p.eval(params).back();
EXPECT(migraphx::argument(arg_shape, input_data.data()) == result);
}
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
3be6eb53c8b359703cb645ed2cb1cdf106924b7c
21a71d52bd2074b770807b209939ec11e2c64fa7
......@@ -6165,6 +6165,101 @@ def shape_test():
return ([node], [x], [y])
@onnx_test()
def shape_dyn_test0():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT,
[None, 4, None, None])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [4])
node = onnx.helper.make_node(
'Shape',
inputs=['x'],
outputs=['y'],
)
return ([node], [x], [y])
@onnx_test()
def shape_dyn_test1():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT,
[None, 4, None, None])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [2])
node = onnx.helper.make_node('Shape', inputs=['x'], outputs=['y'], start=2)
return ([node], [x], [y])
@onnx_test()
def shape_dyn_test2():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT,
[None, 4, None, None])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [2])
node = onnx.helper.make_node('Shape',
inputs=['x'],
outputs=['y'],
start=-2)
return ([node], [x], [y])
@onnx_test()
def shape_dyn_test3():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT,
[None, 4, None, None])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [2])
node = onnx.helper.make_node('Shape',
inputs=['x'],
outputs=['y'],
start=1,
end=2)
return ([node], [x], [y])
@onnx_test()
def shape_end_oob_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT,
[None, 4, None, None])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [2])
node = onnx.helper.make_node('Shape', inputs=['x'], outputs=['y'], end=5)
return ([node], [x], [y])
@onnx_test()
def shape_start_oob_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT,
[None, 4, None, None])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [2])
node = onnx.helper.make_node('Shape',
inputs=['x'],
outputs=['y'],
start=-6)
return ([node], [x], [y])
@onnx_test()
def shape_end_less_start_error():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT,
[None, 4, None, None])
y = helper.make_tensor_value_info('y', TensorProto.INT64, [2])
node = onnx.helper.make_node('Shape',
inputs=['x'],
outputs=['y'],
start=3,
end=1)
return ([node], [x], [y])
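
Note: the oob and error tests above pin down attribute handling that the (C++) Shape parser presumably implements: negative start/end are offset by the rank, both are clamped to [0, rank], and start greater than end after clamping is a parse error. A sketch inferred from these tests, not taken from the parser itself:

std::pair<int64_t, int64_t>
normalize_start_end(int64_t start, int64_t end, int64_t rank)
{
    if(start < 0)
        start += rank; // start=-2, rank=4 -> 2
    if(end < 0)
        end += rank;
    start = std::clamp<int64_t>(start, 0, rank); // start=-6 -> -2 -> 0
    end   = std::clamp<int64_t>(end, 0, rank);   // end=5 -> 4
    if(start > end)                              // start=3, end=1
        throw std::runtime_error("Shape: start cannot be greater than end");
    return {start, end};
}
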
@onnx_test()
def shape_gather_test():
values = np.array([1])
......
......@@ -440,14 +440,13 @@ TEST_CASE(batch_norm_flat_test)
auto mean = mm->add_parameter("mean", {migraphx::shape::float_type, {1}});
auto var = mm->add_parameter("variance", {migraphx::shape::float_type, {1}});
auto rt = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {0.5}});
auto eps = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {1e-6f}});
auto numer = add_common_op(*mm, migraphx::make_op("sub"), {x, mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {var, eps});
auto denom = add_common_op(*mm, migraphx::make_op("pow"), {var_eps, rt});
auto div0 = add_common_op(*mm, migraphx::make_op("div"), {numer, denom});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {div0, scale});
auto x_sub_mean = add_common_op(*mm, migraphx::make_op("sub"), {x, mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {var, eps});
auto rsqrt = mm->add_instruction(migraphx::make_op("rsqrt"), {var_eps});
auto mul0 = add_common_op(*mm, migraphx::make_op("mul"), {scale, rsqrt});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {x_sub_mean, mul0});
add_common_op(*mm, migraphx::make_op("add"), {r0, bias});
auto prog = optimize_onnx("batch_norm_flat_test.onnx");
......@@ -465,14 +464,13 @@ TEST_CASE(batch_norm_rank_2_test)
auto mean = mm->add_parameter("mean", {migraphx::shape::float_type, {5}});
auto var = mm->add_parameter("variance", {migraphx::shape::float_type, {5}});
auto rt = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {0.5}});
auto eps = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {1e-6f}});
auto numer = add_common_op(*mm, migraphx::make_op("sub"), {x, mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {var, eps});
auto denom = add_common_op(*mm, migraphx::make_op("pow"), {var_eps, rt});
auto div0 = add_common_op(*mm, migraphx::make_op("div"), {numer, denom});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {div0, scale});
auto x_sub_mean = add_common_op(*mm, migraphx::make_op("sub"), {x, mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {var, eps});
auto rsqrt = mm->add_instruction(migraphx::make_op("rsqrt"), {var_eps});
auto mul0 = add_common_op(*mm, migraphx::make_op("mul"), {scale, rsqrt});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {x_sub_mean, mul0});
add_common_op(*mm, migraphx::make_op("add"), {r0, bias});
auto prog = optimize_onnx("batch_norm_rank_2_test.onnx");
......@@ -490,7 +488,6 @@ TEST_CASE(batch_norm_1d_test)
auto mean = mm->add_parameter("mean", {migraphx::shape::float_type, {3}});
auto var = mm->add_parameter("variance", {migraphx::shape::float_type, {3}});
auto rt = mm->add_literal(migraphx::literal{migraphx::shape::half_type, {0.5}});
auto eps = mm->add_literal(migraphx::literal{migraphx::shape::half_type, {1e-5f}});
auto usq_scale = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1}}}), scale);
......@@ -498,11 +495,11 @@ TEST_CASE(batch_norm_1d_test)
auto usq_mean = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1}}}), mean);
auto usq_var = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1}}}), var);
auto numer = add_common_op(*mm, migraphx::make_op("sub"), {x, usq_mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {usq_var, eps});
auto denom = add_common_op(*mm, migraphx::make_op("pow"), {var_eps, rt});
auto div0 = add_common_op(*mm, migraphx::make_op("div"), {numer, denom});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {div0, usq_scale});
auto x_sub_mean = add_common_op(*mm, migraphx::make_op("sub"), {x, usq_mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {usq_var, eps});
auto rsqrt = mm->add_instruction(migraphx::make_op("rsqrt"), var_eps);
auto mul0 = add_common_op(*mm, migraphx::make_op("mul"), {usq_scale, rsqrt});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {x_sub_mean, mul0});
add_common_op(*mm, migraphx::make_op("add"), {r0, usq_bias});
auto prog = optimize_onnx("batch_norm_1d_test.onnx");
......@@ -520,7 +517,6 @@ TEST_CASE(batch_norm_2d_test)
auto mean = mm->add_parameter("mean", {migraphx::shape::float_type, {3}});
auto var = mm->add_parameter("variance", {migraphx::shape::float_type, {3}});
auto rt = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {0.5}});
auto eps = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {1e-5f}});
auto usq_scale = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), scale);
......@@ -528,11 +524,11 @@ TEST_CASE(batch_norm_2d_test)
auto usq_mean = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), mean);
auto usq_var = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), var);
auto numer = add_common_op(*mm, migraphx::make_op("sub"), {x, usq_mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {usq_var, eps});
auto denom = add_common_op(*mm, migraphx::make_op("pow"), {var_eps, rt});
auto div0 = add_common_op(*mm, migraphx::make_op("div"), {numer, denom});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {div0, usq_scale});
auto x_sub_mean = add_common_op(*mm, migraphx::make_op("sub"), {x, usq_mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {usq_var, eps});
auto rsqrt = mm->add_instruction(migraphx::make_op("rsqrt"), var_eps);
auto mul0 = add_common_op(*mm, migraphx::make_op("mul"), {usq_scale, rsqrt});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {x_sub_mean, mul0});
add_common_op(*mm, migraphx::make_op("add"), {r0, usq_bias});
auto prog = optimize_onnx("batch_norm_2d_test.onnx");
......@@ -550,7 +546,6 @@ TEST_CASE(batch_norm_3d_test)
auto mean = mm->add_parameter("mean", {migraphx::shape::half_type, {2}});
auto var = mm->add_parameter("variance", {migraphx::shape::half_type, {2}});
auto rt = mm->add_literal(migraphx::literal{migraphx::shape::half_type, {0.5}});
auto eps = mm->add_literal(migraphx::literal{migraphx::shape::half_type, {1e-6f}});
auto usq_scale =
......@@ -561,12 +556,13 @@ TEST_CASE(batch_norm_3d_test)
mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2, 3}}}), mean);
auto usq_var = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2, 3}}}), var);
auto numer = add_common_op(*mm, migraphx::make_op("sub"), {x, usq_mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {usq_var, eps});
auto denom = add_common_op(*mm, migraphx::make_op("pow"), {var_eps, rt});
auto div0 = add_common_op(*mm, migraphx::make_op("div"), {numer, denom});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {div0, usq_scale});
auto x_sub_mean = add_common_op(*mm, migraphx::make_op("sub"), {x, usq_mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {usq_var, eps});
auto rsqrt = mm->add_instruction(migraphx::make_op("rsqrt"), var_eps);
auto mul0 = add_common_op(*mm, migraphx::make_op("mul"), {usq_scale, rsqrt});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {x_sub_mean, mul0});
add_common_op(*mm, migraphx::make_op("add"), {r0, usq_bias});
auto prog = optimize_onnx("batch_norm_3d_test.onnx");
EXPECT(p == prog);
......@@ -908,7 +904,6 @@ TEST_CASE(constant_test)
TEST_CASE(constant_fill_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {2, 3}};
......@@ -1105,7 +1100,6 @@ TEST_CASE(conv_bn_relu_maxpool_test)
auto p5 = mm->add_parameter("5", {migraphx::shape::float_type, {1}});
auto p6 = mm->add_parameter("6", {migraphx::shape::float_type, {1}});
auto rt = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {0.5}});
auto eps = mm->add_literal(migraphx::literal{migraphx::shape::float_type, {1e-5f}});
uint64_t axis = 1;
......@@ -1120,25 +1114,12 @@ TEST_CASE(conv_bn_relu_maxpool_test)
auto usq_mean = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), p5);
auto usq_var = mm->add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), p6);
auto mb_mean = mm->add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", {1, 1, 28, 28}}}), usq_mean);
auto numer = mm->add_instruction(migraphx::make_op("sub"), l5, mb_mean);
auto mb_eps =
mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {1, 1, 1}}}), eps);
auto var_eps = mm->add_instruction(migraphx::make_op("add"), usq_var, mb_eps);
auto mb_rt =
mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {1, 1, 1}}}), rt);
auto denom = mm->add_instruction(migraphx::make_op("pow"), var_eps, mb_rt);
auto mb_denom = mm->add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", {1, 1, 28, 28}}}), denom);
auto div0 = mm->add_instruction(migraphx::make_op("div"), numer, mb_denom);
auto mb_scale = mm->add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", {1, 1, 28, 28}}}), usq_scale);
auto r0 = mm->add_instruction(migraphx::make_op("mul"), div0, mb_scale);
auto mb_bias = mm->add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", {1, 1, 28, 28}}}), usq_bias);
auto l6 = mm->add_instruction(migraphx::make_op("add"), r0, mb_bias);
auto x_sub_mean = add_common_op(*mm, migraphx::make_op("sub"), {l5, usq_mean});
auto var_eps = add_common_op(*mm, migraphx::make_op("add"), {usq_var, eps});
auto rsqrt = mm->add_instruction(migraphx::make_op("rsqrt"), var_eps);
auto mul0 = add_common_op(*mm, migraphx::make_op("mul"), {usq_scale, rsqrt});
auto r0 = add_common_op(*mm, migraphx::make_op("mul"), {x_sub_mean, mul0});
auto l6 = add_common_op(*mm, migraphx::make_op("add"), {r0, usq_bias});
auto l7 = mm->add_instruction(migraphx::make_op("relu"), l6);
mm->add_instruction(migraphx::make_op("pooling",
......@@ -6079,6 +6060,118 @@ TEST_CASE(shape_test)
EXPECT(p == prog);
}
TEST_CASE(shape_dyn_test0)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}}};
auto p0 = mm->add_parameter("x", s);
migraphx::shape s_shape{migraphx::shape::int64_type, {4}};
auto ret = mm->add_instruction(migraphx::make_op("dimensions_of", {{"end", 4}}), p0);
mm->add_return({ret});
migraphx::onnx_options options;
options.map_dyn_input_dims["x"] = {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}};
auto prog = parse_onnx("shape_dyn_test0.onnx", options);
EXPECT(p == prog);
}
TEST_CASE(shape_dyn_test1)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}}};
auto p0 = mm->add_parameter("x", s);
migraphx::shape s_shape{migraphx::shape::int64_type, {4}};
auto ret =
mm->add_instruction(migraphx::make_op("dimensions_of", {{"start", 2}, {"end", 4}}), p0);
mm->add_return({ret});
migraphx::onnx_options options;
options.map_dyn_input_dims["x"] = {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}};
auto prog = parse_onnx("shape_dyn_test1.onnx", options);
EXPECT(p == prog);
}
TEST_CASE(shape_dyn_test2)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}}};
auto p0 = mm->add_parameter("x", s);
migraphx::shape s_shape{migraphx::shape::int64_type, {4}};
auto ret =
mm->add_instruction(migraphx::make_op("dimensions_of", {{"start", 2}, {"end", 4}}), p0);
mm->add_return({ret});
migraphx::onnx_options options;
options.map_dyn_input_dims["x"] = {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}};
auto prog = parse_onnx("shape_dyn_test2.onnx", options);
EXPECT(p == prog);
}
TEST_CASE(shape_dyn_test3)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}}};
auto p0 = mm->add_parameter("x", s);
migraphx::shape s_shape{migraphx::shape::int64_type, {4}};
auto ret =
mm->add_instruction(migraphx::make_op("dimensions_of", {{"start", 1}, {"end", 2}}), p0);
mm->add_return({ret});
migraphx::onnx_options options;
options.map_dyn_input_dims["x"] = {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}};
auto prog = parse_onnx("shape_dyn_test3.onnx", options);
EXPECT(p == prog);
}
TEST_CASE(shape_end_oob_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}}};
auto p0 = mm->add_parameter("x", s);
migraphx::shape s_shape{migraphx::shape::int64_type, {4}};
auto ret = mm->add_instruction(migraphx::make_op("dimensions_of", {{"end", 4}}), p0);
mm->add_return({ret});
migraphx::onnx_options options;
options.map_dyn_input_dims["x"] = {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}};
auto prog = migraphx::parse_onnx("shape_end_oob_test.onnx", options);
EXPECT(p == prog);
}
TEST_CASE(shape_start_oob_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}}};
auto p0 = mm->add_parameter("x", s);
migraphx::shape s_shape{migraphx::shape::int64_type, {4}};
auto ret = mm->add_instruction(migraphx::make_op("dimensions_of", {{"end", 4}}), p0);
mm->add_return({ret});
migraphx::onnx_options options;
options.map_dyn_input_dims["x"] = {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}};
auto prog = migraphx::parse_onnx("shape_start_oob_test.onnx", options);
EXPECT(p == prog);
}
TEST_CASE(shape_end_less_start_error)
{
migraphx::onnx_options options;
options.map_dyn_input_dims["x"] = {{1, 4, {1, 4}}, {4, 4}, {2, 4}, {2, 4}};
EXPECT(test::throws([&] { migraphx::parse_onnx("shape_end_less_start_error.onnx", options); }));
}
TEST_CASE(shape_gather_test)
{
migraphx::program p;
......@@ -7150,7 +7243,8 @@ TEST_CASE(variable_batch_user_input_test6)
TEST_CASE(variable_batch_user_input_test7)
{
// if entry in map_dyn_input_dims is all fixed dynamic_dimensions, convert it to a static shape
// if entry in map_dyn_input_dims is all fixed dynamic_dimensions, convert it to a static
// shape
migraphx::program p;
auto* mm = p.get_main_module();
auto l0 = mm->add_parameter("0", migraphx::shape{migraphx::shape::float_type, {2, 3, 16, 16}});
......