Commit baac1dab authored by Alan Turner

Merge remote-tracking branch 'origin/develop' into ck-host-lib

parents 830dff7a 77042e30
......@@ -244,13 +244,13 @@ __device__ void print_once(Ts... xs)
template <class... Ts>
__device__ void println(Ts... xs)
{
print_each(&coutln, xs...);
print_each(&cout, xs..., '\n');
}
template <class... Ts>
__device__ void println_once(Ts... xs)
{
print_each_once(&coutln, xs...);
print_each_once(&cout, xs..., '\n');
}
} // namespace migraphx
......
......@@ -79,20 +79,21 @@ __device__ void dpp_reduce(T& in, Op op)
#endif
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE(op, prefix) \
#define MIGRAPHX_DPP_REDUCE(op, prefix, sign) \
__device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); } \
__device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); } \
__device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); } \
__device__ inline void dpp_reduce(int32_t& x, op) \
{ \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##sign##32); \
} \
__device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }
MIGRAPHX_DPP_REDUCE(op::sum, v_add)
MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul)
// Note: for max and min on int32_t, the signed version of the instruction must be used.
MIGRAPHX_DPP_REDUCE(op::sum, v_add, _u)
MIGRAPHX_DPP_REDUCE(op::product, v_mul, _u)
MIGRAPHX_DPP_REDUCE(op::max, v_max, _i)
MIGRAPHX_DPP_REDUCE(op::min, v_min, _i)
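As an aside, here is a minimal host-side sketch (plain C++, not part of this commit) of why the signed instruction matters for int32_t max/min: a negative int32_t reinterpreted as uint32_t compares as a very large value, so an unsigned max would select the wrong element.

#include <cstdint>
#include <cstdio>

int main()
{
    int32_t a = -1;
    int32_t b = 5;
    // Signed comparison picks 5; the same bits compared as unsigned pick -1,
    // because 0xFFFFFFFF is the largest uint32_t value.
    uint32_t ua = static_cast<uint32_t>(a);
    uint32_t ub = static_cast<uint32_t>(b);
    std::printf("signed max:   %d\n", a > b ? a : b);      // prints 5
    std::printf("unsigned max: %u\n", ua > ub ? ua : ub);  // prints 4294967295
    return 0;
}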
template <class Op, class T, class Index, class F>
__device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
......@@ -174,6 +175,25 @@ struct inner_storage_tag
template <class T>
using is_inner_storage = is_base_of<inner_storage_tag, remove_cv_t<remove_reference_t<T>>>;
template <class Size, class F>
struct lazy_inner_storage : inner_storage_tag
{
using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
F f;
constexpr Size rsize() const { return {}; }
template <class U, class V>
constexpr auto operator()(U j, V d) const
{
return f(j, d);
}
};
template <class Size, class F>
constexpr lazy_inner_storage<Size, F> make_lazy_inner_storage(Size, F f)
{
return {{}, f};
}
template <class R, class F>
struct storage_access : F
{
......@@ -278,6 +298,14 @@ struct reducer_base
});
}
template <class F>
__device__ auto lazy_inner(F f) const
{
return this->inner_sliced([=](auto n, auto&&... xs) {
return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
});
}
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
......@@ -396,25 +424,6 @@ struct block_large
index idx;
Slicer slice;
template <class Size, class F>
struct inner_storage : inner_storage_tag
{
using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
F f;
constexpr Size rsize() const { return {}; }
template <class U, class V>
constexpr auto operator()(U j, V d) const
{
return f(j, d);
}
};
template <class Size, class F>
constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
{
return {f};
}
template <class Op, class T, class Read, class N, class... Ts>
__device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const
{
......@@ -439,7 +448,7 @@ struct block_large
template <class R, class F, class N, class... Ts>
__device__ auto inner_impl(F f, N n, Ts&&... xs) const
{
return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
}
};
......@@ -469,25 +478,6 @@ struct lane
index idx;
Slicer slice;
template <class Size, class F>
struct inner_storage : inner_storage_tag
{
using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
F f;
constexpr Size rsize() const { return {}; }
template <class U, class V>
constexpr auto operator()(U j, V d) const
{
return f(j, d);
}
};
template <class Size, class F>
constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
{
return {f};
}
template <class Op, class T, class Read, class N, class U, class... Us>
__device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const
{
......@@ -518,7 +508,7 @@ struct lane
template <class R, class F, class N, class... Ts>
__device__ auto inner_impl(F f, N n, Ts&&... xs) const
{
return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
return make_lazy_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
}
};
template <class Slicer>
......@@ -577,5 +567,21 @@ simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOu
});
}
template <class Algo, class Reduced, class Output, class F>
__device__ void fused_reduce(Output output, F f)
{
Algo::template run<Reduced>([&](auto out_idx, auto r) {
auto result = f(r, out_idx);
if constexpr(reduce::is_inner_storage<decltype(result)>{})
{
r.inner([&](auto& y, auto x) { y = x; })(output, result);
}
else
{
r.outer([&] { output[out_idx] = implicit_conversion(result); });
}
});
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
......@@ -218,7 +218,15 @@ using common_type_t = typename common_type<Ts...>::type;
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
constexpr unsigned long int_max(unsigned long n) { return (1u << (n * 8)) - 1; }
constexpr unsigned long int_max(unsigned long n)
{
// Note: left shift cannot be used to get the maximum value of int64_type or
// uint64_type because left shifting by 64 bits is undefined behavior for
// these types
if(n == sizeof(int64_t))
return -1;
return (1ul << (n * 8)) - 1;
}
template <class T,
MIGRAPHX_REQUIRES(is_integral<T>{} or is_floating_point<T>{} or
......@@ -228,9 +236,9 @@ constexpr T numeric_max()
if constexpr(is_integral<T>{})
{
if constexpr(is_unsigned<T>{})
return int_max(sizeof(T)) * 2;
else
return int_max(sizeof(T));
else
return int_max(sizeof(T)) / 2;
}
else if constexpr(is_same<T, double>{})
return __DBL_MAX__;
......
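For reference, a standalone sketch (plain C++17, assuming only the standard library and not the MIGraphX headers) of how the int_max/numeric_max logic above behaves; the special case for 8-byte types avoids the undefined 64-bit shift by letting -1 convert to an all-ones unsigned long.

#include <cstdint>
#include <cstdio>

constexpr unsigned long int_max(unsigned long n)
{
    // Shifting a 64-bit value by 64 bits is undefined behavior, so the
    // 8-byte case returns -1, which converts to all ones (the max value).
    if(n == sizeof(int64_t))
        return -1;
    return (1ul << (n * 8)) - 1;
}

int main()
{
    std::printf("%lu\n", int_max(sizeof(uint32_t)));    // 4294967295
    std::printf("%lu\n", int_max(sizeof(int64_t)));     // 18446744073709551615
    std::printf("%lu\n", int_max(sizeof(int32_t)) / 2); // 2147483647 == INT32_MAX
    return 0;
}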
......@@ -135,7 +135,7 @@ constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
return vec<T, N>{x};
else
{
MIGRAPHX_ASSERT((i + N) < vec_size<T>());
MIGRAPHX_ASSERT((i + N) <= vec_size<T>());
vec<vec_type<T>, N> result = {0};
for(int j = 0; j < N; j++)
{
......
......@@ -83,8 +83,7 @@ struct miopen_apply
auto& ctx = get_context();
int8_x4_format = get_int8_x4_format(ctx);
compute_fp32 = get_compute_fp32_flag();
offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
offload_copy = (mod->name() == "main") ? pass->offload_copy : false;
add_generic_op("contiguous");
......@@ -112,6 +111,7 @@ struct miopen_apply
add_loop_op();
add_neg_op();
add_nms_op();
add_select_module_op();
}
void copy_params() const
......@@ -359,6 +359,20 @@ struct miopen_apply
return mod->replace_instruction(ins, gpu_out);
});
}
/**
* Adds a dynamic allocation for the submodule's output parameter.
*/
void add_select_module_op()
{
apply_map.emplace("select_module", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> inputs = ins->inputs();
inputs.push_back(output);
return mod->replace_instruction(ins, ins->get_operator(), inputs, ins->module_inputs());
});
}
};
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
......
......@@ -30,6 +30,7 @@
#include <mlir-c/BuiltinTypes.h>
#include <mlir-c/Diagnostics.h>
#include <mlir-c/Dialect/MIGraphX.h>
#include <mlir-c/Dialect/Rock.h>
#include <mlir-c/IntegerSet.h>
#include <mlir-c/Pass.h>
#include <mutex>
......@@ -55,12 +56,16 @@
#include <migraphx/permutation.hpp>
#include <deque>
#include <variant>
#include <fstream>
#include <sstream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MLIR);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_DB);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_MLIR_TUNING_CFG);
#ifdef MIGRAPHX_MLIR
template <class T, class F, F f> // NOLINT
......@@ -124,6 +129,8 @@ using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags,
using mlir_region = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRegion, mlirRegionDestroy);
using mlir_block = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockDestroy);
using mlir_pass_manager = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy);
using mlir_tuning_table = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRockTuningTable,
mlirRockTuningTableDestroy);
std::string_view to_string_view(MlirStringRef s) { return {s.data, s.length}; }
......@@ -157,10 +164,10 @@ std::string mlir_print(F f, T x)
return ss.str();
}
const std::unordered_set<std::string>& get_xdlops_archs()
bool has_xdlops(const std::string& target_arch)
{
static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
return supported_archs;
const auto device_name = trim(split_string(target_arch, ':').front());
return (starts_with(device_name, "gfx9") and device_name >= "gfx908");
}
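A quick illustration (assumed values, not taken from this commit) of how has_xdlops interprets a full target string: the device name is the part before the first ':', and the lexicographic comparison admits gfx908, gfx90a, and later gfx9 parts while rejecting earlier ones.

// Hypothetical standalone check mirroring the logic above (plain C++,
// using std::string instead of the MIGraphX string helpers).
#include <cassert>
#include <string>

bool has_xdlops_sketch(const std::string& target_arch)
{
    const std::string device_name = target_arch.substr(0, target_arch.find(':'));
    return device_name.rfind("gfx9", 0) == 0 and device_name >= "gfx908";
}

int main()
{
    assert(has_xdlops_sketch("gfx90a:sramecc+:xnack-")); // "gfx90a" >= "gfx908"
    assert(not has_xdlops_sketch("gfx906"));             // below gfx908
    assert(not has_xdlops_sketch("gfx1030"));            // not a gfx9 part
    return 0;
}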
struct mlir_program
......@@ -190,10 +197,14 @@ struct mlir_program
result = mlirF64TypeGet(ctx.get());
else if(as.is_integral())
{
if(as.is_signed())
result = mlirIntegerTypeSignedGet(ctx.get(), as.size() * 8);
else
result = mlirIntegerTypeGet(ctx.get(), as.size() * 8);
// Note: rocMLIR uses signless integer types for tensor types. This
// translates to a signed implementation for the currently supported
// operations.
if(as.is_unsigned())
{
MIGRAPHX_THROW("Unsupported type: " + std::to_string(as.type_enum()));
}
result = mlirIntegerTypeGet(ctx.get(), as.size() * 8);
}
else
MIGRAPHX_THROW("Unsupported type: " + std::to_string(as.type_enum()));
......@@ -313,7 +324,8 @@ struct mlir_program
std::string,
value,
std::vector<value>,
MlirType>;
MlirType,
MlirAttribute>;
using named_attribute_t = std::pair<std::string_view, attribute_t>;
MlirNamedAttribute name_attribute(const named_attribute_t& na) const
......@@ -455,7 +467,7 @@ struct mlir_program
auto ops = create_operation_state("func.func");
ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
{"sym_name", std::string("main")},
{"sym_name", sym_name},
{"kernel", std::string("mixr")},
{"arch", target_arch}});
ops.add_region(std::move(region));
......@@ -470,13 +482,17 @@ struct mlir_program
{
if(ins->name() == "@return")
return "func.return";
if(ins->name() == "@literal")
{
return "tosa.const";
}
return "migraphx." + ins->name();
}
static value get_operator_value(const operation& op)
{
auto v = op.to_value();
if(op.name() == "convolution")
if(op.name() == "convolution" or op.name() == "quant_convolution")
{
// Adjust symmetrical padding
if(v.at("padding").size() == v.at("stride").size())
......@@ -498,31 +514,53 @@ struct mlir_program
return ins->get_shape();
}
static std::string get_symbol_name(const module& m)
{
for(auto ins : iterator_for(m))
{
if(ins->name() == "convolution" or ins->name() == "dot")
{
return "mlir_" + ins->name();
}
}
return "main";
}
void parse(const module& m)
{
sym_name = get_symbol_name(m);
auto mbody = mlirModuleGetBody(mmodule.get());
std::unordered_map<instruction_ref, MlirValue> ins_map;
auto fbody = insert(mbody, m, ins_map);
for(auto ins : iterator_for(m))
{
if(ins->name() == "@param")
continue;
if(ins->name() == "contiguous")
{
ins_map[ins] = ins_map[ins->inputs().at(0)];
continue;
}
auto name = get_name(ins);
auto ops = create_operation_state(name);
ops.add_attribute_value(get_operator_value(ins->get_operator()));
if(ins->name() != "@return")
ops.add_results({get_shape(ins)});
if(ins->name() == "convolution")
if(ins->name() == "@literal")
{
literal r = ins->get_literal();
MlirType tensor_type = make_tensor(ins->get_shape());
MlirAttribute mlir_value_attr =
mlirDenseElementsAttrRawBufferGet(tensor_type, r.get_shape().bytes(), r.data());
ops.add_attributes({{"value", mlir_value_attr}});
}
if(ins->name() == "convolution" or ins->name() == "dot")
{
pp =
problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
// check if HW supports xdlops
auto target_chip = trim(split_string(target_arch, ':').front());
bool xdlops = contains(get_xdlops_archs(), target_chip);
std::string tuned = get_tune_params(xdlops);
if(not tuned.empty())
ops.add_attributes({{"perf_config", tuned}});
if(xdlops)
if(has_xdlops(target_arch))
ops.add_attributes({{"xdlopsV2", true}});
}
......@@ -542,15 +580,19 @@ struct mlir_program
code_object_op compile() MIGRAPHX_TIDY_CONST
{
mlir_pass_manager pm{mlirPassManagerCreate(ctx.get())};
mlir_pass_manager pm_front{mlirPassManagerCreate(ctx.get())};
mlir_pass_manager pm_back{mlirPassManagerCreate(ctx.get())};
// 1st pipeline to call
mlirMIGraphXAddHighLevelPipeline(pm.get());
mlirMIGraphXAddHighLevelPipeline(pm_front.get());
mlirPassManagerRun(pm_front.get(), mmodule.get());
// 2nd pipeline to call
mlirMIGraphXAddBackendPipeline(pm.get(), target_arch.c_str());
mlirPassManagerRun(pm.get(), mmodule.get());
get_module_tuned();
mlirMIGraphXAddBackendPipeline(pm_back.get(), target_arch.c_str());
mlirPassManagerRun(pm_back.get(), mmodule.get());
code_object_op op{};
op.symbol_name = "main";
op.symbol_name = sym_name;
op.code_object = get_binary();
std::tie(op.global, op.local) = get_launch_params();
return op;
......@@ -578,7 +620,74 @@ struct mlir_program
MIGRAPHX_THROW("Failed to compile mlir program");
}
std::string get_tune_params(bool xdlops) { return get_mlir_perf_for_conv(pp, xdlops); }
std::string get_tune_params(bool xdlops) const { return get_mlir_perf_for_conv(pp, xdlops); }
// This function appends to a tuning cfg file that can be
// used with the rocMLIR tuning scripts.
void dump_tuning_cfg(const char* prob_config) const
{
std::string tuning_cfg_path = string_value_of(MIGRAPHX_MLIR_TUNING_CFG{});
if(!tuning_cfg_path.empty())
{
std::vector<std::string> tokens = split_string(prob_config, '\t');
std::string prob = tokens[1];
if(starts_with(prob, "conv"))
{
tuning_cfg_path += ".conv";
}
else
{
tuning_cfg_path += ".gemm";
}
std::ofstream tuning_cfg(tuning_cfg_path, std::ios::app);
tuning_cfg << prob << std::endl;
}
}
static mlir_tuning_table create_tuning_table()
{
mlir_tuning_table tuning_table{mlirRockTuningTableCreate()};
std::string tuning_db_path = string_value_of(MIGRAPHX_MLIR_TUNING_DB{});
if(!tuning_db_path.empty())
{
std::ifstream tuning_db_tsv(tuning_db_path);
if(tuning_db_tsv)
{
std::string line;
while(std::getline(tuning_db_tsv, line))
{
std::vector<std::string> tokens = split_string(line, '\t');
std::string arch = tokens[0];
std::string prob = tokens[1];
std::string perf = tokens[2];
std::string key = arch.append("\t").append(prob);
mlirRockTuningUpdateTable(tuning_table.get(), key.c_str(), perf.c_str(), 1.0);
}
}
}
else
{
std::cerr
<< "WARNING: MLIR tuning db not found. Please set MIGRAPHX_MLIR_TUNING_DB for "
"optimal performance."
<< std::endl;
}
return tuning_table;
}
bool get_module_tuned() const
{
static mlir_tuning_table tuning_table = create_tuning_table();
if(!mlirRockTuningSetFromTable(tuning_table.get(), mmodule.get()))
{
const char* prob_config = mlirRockTuningGetKey(tuning_table.get(), mmodule.get());
std::stringstream key(prob_config);
std::cerr << "fails to set param on" << prob_config << std::endl;
dump_tuning_cfg(prob_config);
return false;
}
return true;
}
mlir_context ctx;
MlirLocation location;
......@@ -586,6 +695,7 @@ struct mlir_program
problem_params pp;
std::deque<std::string> strings{};
std::string target_arch;
std::string sym_name;
};
std::string dump_mlir(const module& m)
......@@ -645,12 +755,13 @@ code_object_op compile_mlir(const context&, module m, const std::vector<instruct
{
adjust_param_shapes(m, inputs);
const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
if(trace)
std::cout << m << std::endl;
// Use a mutex to serialize compilation while LLVM thread support is disabled.
static std::mutex g_mlirc_mutex; // NOLINT
const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
if(trace)
std::cout << m << std::endl;
mlir_program mp;
mp.find_target();
mp.parse(m);
......
......@@ -47,32 +47,17 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s)
return rb;
}
const std::unordered_set<std::string>& get_rocblas_fp32_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
return supported_archs;
}
bool get_compute_fp32_flag()
{
bool compute_fp32 = false;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
const auto device_name = trim(split_string(get_device_name(), ':').front());
if(contains(get_rocblas_fp32_archs(), device_name))
compute_fp32 = true;
#endif
return compute_fp32;
return (starts_with(device_name, "gfx9") and device_name >= "gfx908");
}
bool get_int8_x4_format(context& ctx)
{
bool int8_x4_format = true;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags flag;
rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
#endif
return int8_x4_format;
return flag == rocblas_gemm_flags_pack_int8x4;
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -26,13 +26,13 @@
#include <migraphx/check_context.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
#include <migraphx/eliminate_contiguous.hpp>
#include <migraphx/eliminate_data_type.hpp>
#include <migraphx/eliminate_identity.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/fuse_pointwise.hpp>
#include <migraphx/fuse_reduce.hpp>
#include <migraphx/inline_module.hpp>
#include <migraphx/insert_pad.hpp>
#include <migraphx/layout_nhwc.hpp>
......@@ -40,7 +40,7 @@
#include <migraphx/normalize_ops.hpp>
#include <migraphx/optimize_module.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/promote_literals.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/rewrite_gelu.hpp>
......@@ -48,9 +48,9 @@
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/split_single_dyn_dim.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/gpu/compile_ops.hpp>
......@@ -74,9 +74,8 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
struct id_pass
{
std::string name() const { return "id"; }
......@@ -93,20 +92,24 @@ pass enable_pass(bool enabled, pass p)
std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
{
auto& ctx = any_cast<context>(gctx);
ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
unsupported_types.erase(shape::type_t::float_type);
unsupported_types.erase(shape::type_t::half_type);
unsupported_types.erase(shape::type_t::bool_type);
unsupported_types.erase(shape::type_t::int8_type);
unsupported_types.erase(shape::type_t::uint8_type);
unsupported_types.erase(shape::type_t::int32_type);
unsupported_types.erase(shape::type_t::tuple_type);
// clang-format off
return
{
split_single_dyn_dim{},
dead_code_elimination{},
normalize_ops{},
dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{},
enable_pass(not mlir_enabled(), rewrite_quantization{}),
dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type},
simplify_reshapes{},
......@@ -130,9 +133,11 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
fuse_ck_gemm_softmax_gemm{&ctx},
dead_code_elimination{},
optimize_module{},
enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
fuse_pointwise{},
dead_code_elimination{},
fuse_mlir{&ctx},
enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
dead_code_elimination{},
enable_pass(mlir_enabled(), fuse_mlir{&ctx}),
dead_code_elimination{},
fuse_ck{&ctx},
dead_code_elimination{},
......@@ -153,6 +158,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
compile_ops{&ctx},
dead_code_elimination{},
promote_literals{},
dead_code_elimination{},
write_literals{&ctx},
schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})},
memory_coloring{"hip::allocate"},
......
......@@ -31,10 +31,9 @@ set_target_properties(migraphx_ref PROPERTIES EXPORT_NAME ref)
rocm_set_soversion(migraphx_ref ${MIGRAPHX_SO_VERSION})
find_path(BLAZE_INCLUDE blaze/Blaze.h)
find_package(Threads)
rocm_clang_tidy_check(migraphx_ref)
target_link_libraries(migraphx_ref migraphx Threads::Threads)
target_link_libraries(migraphx_ref PUBLIC migraphx)
target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)
......
......@@ -46,8 +46,6 @@ struct target
argument allocate(const shape& s) const;
};
MIGRAPHX_REGISTER_TARGET(target);
} // namespace ref
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -46,6 +46,7 @@ std::vector<std::string> get_op_parsers()
op_parser_map().end(),
std::back_inserter(result),
[&](auto&& p) { return p.first; });
std::sort(result.begin(), result.end());
return result;
}
......
......@@ -22,6 +22,7 @@
* THE SOFTWARE.
*/
#include <migraphx/tf/tf_parser.hpp>
#include <migraphx/tf/op_parser.hpp>
#include <iostream>
#include <fstream>
#include <unordered_map>
......@@ -62,5 +63,7 @@ program parse_tf(const std::string& name, const tf_options& options)
return std::move(parser.prog);
}
std::vector<std::string> get_tf_operators() { return tf::get_op_parsers(); }
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -24,4 +24,6 @@
// clang-format off
#define MIGRAPHX_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define MIGRAPHX_VERSION_MINOR @PROJECT_VERSION_MINOR@
#define MIGRAPHX_VERSION_PATCH @PROJECT_VERSION_PATCH@
#define MIGRAPHX_VERSION_TWEAK @PROJECT_VERSION_TWEAK@
// clang-format on
......@@ -106,19 +106,11 @@ function(add_test_executable TEST_NAME)
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS -pthread LINK_FLAGS -pthread)
endif()
separate_arguments(MIOPEN_TEST_FLAGS_ARGS UNIX_COMMAND ${MIOPEN_TEST_FLAGS})
if(MIOPEN_TEST_ALL)
set(TEST_COMMAND ${TEST_NAME} ${MIOPEN_TEST_FLOAT_ARG} --all ${MIOPEN_TEST_FLAGS_ARGS})
else()
set(TEST_COMMAND ${TEST_NAME} ${MIOPEN_TEST_FLOAT_ARG} ${MIOPEN_TEST_FLAGS_ARGS})
endif()
set(TEST_COMMAND ${TEST_NAME})
add_test_command(${TEST_NAME} ${TEST_COMMAND})
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx migraphx_ref migraphx_onnx)
target_link_libraries(${TEST_NAME} migraphx migraphx_onnx migraphx_ref)
target_include_directories(${TEST_NAME} PUBLIC include)
endfunction(add_test_executable)
......@@ -142,6 +134,9 @@ if(MIGRAPHX_ENABLE_GPU)
COST 10
RESOURCE_LOCK gpu
)
if(MIGRAPHX_USE_HIPRTC)
target_compile_definitions(test_gpu_${BASE_NAME} PUBLIC -DMIGRAPHX_USE_HIPRTC)
endif()
target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu migraphx_kernels)
endforeach()
endif()
......@@ -182,7 +177,7 @@ endforeach()
set(TEST_TF_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tf)
add_executable(test_tf tf/tf_test.cpp)
rocm_clang_tidy_check(test_tf)
target_link_libraries(test_tf migraphx_tf migraphx_ref)
target_link_libraries(test_tf migraphx_tf)
target_include_directories(test_tf PUBLIC include)
add_test(NAME test_tf COMMAND $<TARGET_FILE:test_tf> WORKING_DIRECTORY ${TEST_TF_DIR})
add_dependencies(tests test_tf)
......@@ -216,10 +211,7 @@ function(test_headers PREFIX)
string(MAKE_C_IDENTIFIER ${HEADER_REL} TEST_NAME)
get_filename_component(BASE_NAME ${HEADER} NAME_WE)
test_header(header_${TEST_NAME} ${PREFIX}/${BASE_NAME}.hpp)
if(MIGRAPHX_ENABLE_GPU)
target_link_libraries(header_${TEST_NAME} migraphx_gpu)
endif()
target_link_libraries(header_${TEST_NAME} migraphx_all_targets)
endforeach()
endfunction()
......@@ -229,3 +221,10 @@ test_headers(migraphx/ref ${CMAKE_SOURCE_DIR}/src/targets/ref/include/migraphx/r
if(MIGRAPHX_ENABLE_GPU)
test_headers(migraphx/gpu ${CMAKE_SOURCE_DIR}/src/targets/gpu/include/migraphx/gpu/*.hpp)
endif()
if(MIGRAPHX_ENABLE_CPU)
test_headers(migraphx/cpu ${CMAKE_SOURCE_DIR}/src/targets/cpu/include/migraphx/cpu/*.hpp)
endif()
if(MIGRAPHX_ENABLE_FPGA)
test_headers(migraphx/fpga ${CMAKE_SOURCE_DIR}/src/targets/fpga/include/migraphx/fpga/*.hpp)
endif()
......@@ -25,7 +25,7 @@ function(add_api_test TEST_NAME TEST_SRC TEST_DIR)
set(NAME test_api_${TEST_NAME})
add_executable(${NAME} EXCLUDE_FROM_ALL ${TEST_SRC})
rocm_clang_tidy_check(${NAME})
target_link_libraries(${NAME} migraphx_c migraphx)
target_link_libraries(${NAME} migraphx_c migraphx migraphx_all_targets)
target_include_directories(${NAME} PUBLIC ../include)
add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${NAME}> WORKING_DIRECTORY ${TEST_DIR})
add_dependencies(tests ${NAME})
......@@ -48,6 +48,7 @@ add_api_test(assign test_assign.cpp ${TEST_ONNX_DIR})
add_api_test(compile_options test_compile_options.cpp ${TEST_ONNX_DIR})
add_api_test(lookup test_lookup.cpp ${TEST_ONNX_DIR})
add_api_test(module_construct test_module_construct.cpp ${TEST_ONNX_DIR})
add_api_test(dynamic_shape test_dynamic_shape.cpp ${TEST_ONNX_DIR})
add_api_test(ref test_cpu.cpp ${TEST_ONNX_DIR})
add_api_test(save_load test_save_load.cpp ${TEST_ONNX_DIR})
add_api_test(op test_op_construct.cpp ${TEST_ONNX_DIR})
......@@ -56,8 +57,10 @@ add_api_test(custom_op test_custom_op.cpp ${TEST_ONNX_DIR})
add_api_test(tf_parser test_tf_parser.cpp ${TEST_TF_DIR})
# GPU-based tests
if(MIGRAPHX_ENABLE_GPU)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
find_package(hip)
add_api_test(gpu test_gpu.cpp ${TEST_ONNX_DIR})
target_link_libraries(test_api_gpu migraphx_gpu)
target_link_libraries(test_api_gpu)
add_api_test(custom_op_gpu test_custom_op_gpu.cpp ${TEST_ONNX_DIR})
target_link_libraries(test_api_custom_op_gpu migraphx_gpu)
target_link_libraries(test_api_custom_op_gpu)
endif()
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/migraphx.h>
#include <migraphx/migraphx.hpp>
#include "test.hpp"
TEST_CASE(create_dynamic_dimensions)
{
migraphx::dynamic_dimension dd0{1, 4};
EXPECT(not dd0.is_fixed());
migraphx::dynamic_dimension dd1{4, 4};
EXPECT(dd1.is_fixed());
migraphx::optimals opts{1, 2, 4};
migraphx::dynamic_dimension dd2{1, 4, opts};
migraphx::dynamic_dimensions dyn_dims0{dd0, dd1, dd2};
CHECK(bool{dyn_dims0[0] == dd0});
CHECK(bool{dyn_dims0[1] == dd1});
CHECK(bool{dyn_dims0[2] == dd2});
CHECK(bool{dyn_dims0[2] != dd0});
EXPECT(dyn_dims0.size() == 3);
}
TEST_CASE(create_dynamic_shape)
{
migraphx::dynamic_dimensions dyn_dims(migraphx::dynamic_dimension{1, 4},
migraphx::dynamic_dimension{78, 92},
migraphx::dynamic_dimension{1, 4, {1, 4}});
migraphx::shape dyn_shape{migraphx_shape_float_type, dyn_dims};
CHECK(bool{dyn_shape.dynamic()});
CHECK(bool{dyn_shape.dyn_dims()[0] == migraphx::dynamic_dimension{1, 4}});
migraphx::shape static_shape{migraphx_shape_float_type, {3, 8}};
EXPECT(not static_shape.dynamic());
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -25,7 +25,6 @@
#include <hip/hip_runtime_api.h>
#include <migraphx/migraphx.h>
#include <migraphx/migraphx.hpp>
#include <migraphx/manage_ptr.hpp>
#include "test.hpp"
......@@ -35,6 +34,7 @@ TEST_CASE(load_and_run)
auto shapes_before = p.get_output_shapes();
migraphx::compile_options options;
options.set_offload_copy();
options.set_exhaustive_tune_flag();
p.compile(migraphx::target("gpu"), options);
auto shapes_after = p.get_output_shapes();
CHECK(shapes_before.size() == 1);
......@@ -71,6 +71,105 @@ hip_ptr get_hip_buffer(size_t size)
return hip_ptr{ptr};
}
// TODO: placeholder until we have a way to copy tuple arguments to/from device through c++ api
// TEST_CASE(dynamic_batch_load_and_run)
//{
// migraphx::onnx_options o_options;
// migraphx::dynamic_dimensions dyn_dims = {{1, 4, {2, 4}}, {3, 3}, {4, 4}, {4, 4}};
// o_options.set_dyn_input_parameter_shape("0", dyn_dims);
// dyn_dims = {{2, 2}, {3, 3}, {3, 3}, {3, 3}};
// o_options.set_dyn_input_parameter_shape("1", dyn_dims);
// auto p = migraphx::parse_onnx("conv_dynamic_batch_test.onnx", o_options);
// migraphx::compile_options c_options;
// c_options.set_split_single_dyn_dim();
// p.compile(migraphx::target("gpu"), c_options);
// auto out_shapes = p.get_output_shapes();
// CHECK(out_shapes.size() == 1);
// EXPECT(out_shapes[0].dynamic());
//
// std::vector<float> a(2 * 3 * 4 * 4, 0.12);
// std::vector<float> c(2 * 3 * 3 * 3, 0.75);
//
// auto param_shapes = p.get_parameter_shapes();
// int batch_size = 2;
// std::unordered_map<std::string, migraphx::argument> arg_map;
//
// arg_map["0"] = migraphx::argument(param_shapes["0"].to_static(batch_size), a.data());
// arg_map["1"] = migraphx::argument(param_shapes["1"].to_static(batch_size), c.data());
//
// migraphx::program_parameters pp;
// std::vector<hip_ptr> buffs;
// std::vector<migraphx::argument> args;
//
// // copy to GPU and create parameter map
// for(auto&& name : param_shapes.names())
// {
// if(arg_map.find(name) != arg_map.end())
// {
// args.push_back(arg_map.at(name));
// }
// else
// {
// migraphx::shape static_shape = param_shapes[name].to_static(batch_size);
// auto output_arg = migraphx::argument(static_shape);
// args.push_back(output_arg);
// }
// buffs.push_back(get_hip_buffer(args.rbegin()->get_shape().bytes()));
// auto err = hipMemcpy(buffs.rbegin()->get(),
// args.rbegin()->data(),
// args.rbegin()->get_shape().bytes(),
// hipMemcpyHostToDevice);
// EXPECT(err == hipSuccess);
// pp.add(name, migraphx::argument(args.rbegin()->get_shape(), buffs.rbegin()->get()));
// }
//
// auto output = p.eval(pp)[0];
//
// // copy output back to host
// auto host_arg = migraphx::argument(output.get_shape());
// auto err = hipMemcpy(
// host_arg.data(), output.data(), output.get_shape().bytes(), hipMemcpyDeviceToHost);
// EXPECT(err == hipSuccess);
//}
TEST_CASE(dynamic_batch_load_and_run_offload)
{
migraphx::onnx_options o_options;
migraphx::dynamic_dimensions dyn_dims = {migraphx::dynamic_dimension{1, 4, {2, 4}},
migraphx::dynamic_dimension{3, 3},
migraphx::dynamic_dimension{4, 4},
migraphx::dynamic_dimension{4, 4}};
o_options.set_dyn_input_parameter_shape("0", dyn_dims);
dyn_dims = {migraphx::dynamic_dimension{2, 2},
migraphx::dynamic_dimension{3, 3},
migraphx::dynamic_dimension{3, 3},
migraphx::dynamic_dimension{3, 3}};
o_options.set_dyn_input_parameter_shape("1", dyn_dims);
auto p = migraphx::parse_onnx("conv_dynamic_batch_test.onnx", o_options);
auto shapes_before = p.get_output_shapes();
migraphx::compile_options c_options;
c_options.set_offload_copy();
p.compile(migraphx::target("gpu"), c_options);
auto out_shapes = p.get_output_shapes();
CHECK(out_shapes.size() == 1);
EXPECT(out_shapes[0].dynamic());
// batch size = 2
std::vector<float> a(2 * 3 * 4 * 4, 0.12);
std::vector<float> c(2 * 3 * 3 * 3, 0.75);
migraphx::program_parameters pp;
auto param_shapes = p.get_parameter_shapes();
pp.add("0",
migraphx::argument(migraphx::shape(migraphx_shape_float_type, {2, 3, 4, 4}), a.data()));
pp.add("1",
migraphx::argument(migraphx::shape(migraphx_shape_float_type, {2, 3, 3, 3}), c.data()));
auto outputs = p.eval(pp);
CHECK(shapes_before.size() == outputs.size());
CHECK(bool{outputs.front().get_shape() ==
migraphx::shape(migraphx_shape_float_type, {2, 1, 3, 3})});
}
TEST_CASE(load_and_run_async)
{
auto p = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
......
......@@ -36,7 +36,7 @@ bool create_shapes(bool dynamic_allowed)
try
{
shape a{shape::int64_type, {3}};
shape b{shape::float_type, {{3, 6, 0}, {4, 4, 0}}};
shape b{shape::float_type, {{3, 6}, {4, 4}}};
auto op = migraphx::make_op("add");
migraphx::check_shapes{{a, b}, op, dynamic_allowed}.has(2);
return true;
......
......@@ -26,7 +26,6 @@
#include <migraphx/make_op.hpp>
#include <migraphx/program.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/fpga/target.hpp>
#include <migraphx/target_assignments.hpp>
#include <migraphx/iterator_for.hpp>
......
......@@ -329,4 +329,36 @@ TEST_CASE(all_scalar_input)
EXPECT(p1 == p2);
}
TEST_CASE(no_input)
{
migraphx::program p;
{
auto* mm = p.get_main_module();
migraphx::shape g_shape{migraphx::shape::int64_type, {1}, {0}};
migraphx::shape s_indices{migraphx::shape::int32_type, {3}};
std::vector<int> indices{3, 800, 800};
auto a0 = mm->add_literal(migraphx::literal{s_indices, indices});
auto a1 = mm->add_literal(migraphx::literal{g_shape, {1}});
int axis = 0;
auto out = mm->add_instruction(migraphx::make_op("gather", {{"axis", axis}}), a0, a1);
mm->add_return({out});
}
run_pass(p);
// This should NOT create a pointwise module if there are no inputs here.
migraphx::program p2;
{
auto* mm = p2.get_main_module();
migraphx::shape g_shape{migraphx::shape::int64_type, {1}, {0}};
migraphx::shape s_indices{migraphx::shape::int32_type, {3}};
std::vector<int> indices{3, 800, 800};
auto a0 = mm->add_literal(migraphx::literal{s_indices, indices});
auto a1 = mm->add_literal(migraphx::literal{g_shape, {1}});
int axis = 0;
auto out = mm->add_instruction(migraphx::make_op("gather", {{"axis", axis}}), a0, a1);
mm->add_return({out});
}
EXPECT(p == p2);
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }