Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test

3a4d36cf · charlie · 6bec381f · e19f78ae · 3a4d36cf · 3a4d36cf
Commit 3a4d36cf authored Sep 30, 2022 by charlie
20 changed files
--- a/test/ref_ops_test.cpp
+++ b/test/ref_ops_test.cpp
@@ -3591,7 +3591,7 @@ TEST_CASE(multinomial_test)
    result.visit([&](auto output) { result_vec.assign(output.begin(), output.end()); });

    std::vector<int> res_dist(5, 0);
-    for(auto& r : result_vec)
+    for(const auto& r : result_vec)
        res_dist[r]++;
    auto dist_sum     = std::accumulate(dist.begin(), dist.end(), 0);
    auto res_dist_sum = std::accumulate(res_dist.begin(), res_dist.end(), 0);
@@ -3624,6 +3624,174 @@ TEST_CASE(neg_test)
    EXPECT(migraphx::verify_range(result_vector, gold));
 }

+TEST_CASE(nms_dynamic_out_test) // nonmaxsuppression with use_dyn_output enabled, fed only literal (static-shape) inputs
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; // NOTE(review): presumably {batch, boxes, 4 coords} -- confirm
+    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; // NOTE(review): presumably {batch, classes, boxes} -- confirm
+    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{4}); // third op input; presumably the max number of boxes kept per class
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto r = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression",
+                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+        boxes_l,
+        scores_l,
+        max_out_l,
+        iou_threshold,
+        score_threshold);
+    mm->add_return({r});
+
+    p.compile(migraphx::ref::target{}); // evaluated on the reference target
+    auto output = p.eval({}).back(); // no parameters: every input is a literal
+    std::vector<int64_t> result;
+    output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; // 9 values; NOTE(review): presumably three (batch, class, box) triples -- confirm
+    EXPECT(migraphx::verify_range(result, gold));
+}
+
+TEST_CASE(nms_dynamic_batch_test) // nonmaxsuppression where the first dimension of both parameters is declared dynamic
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {{1, 3, 0}, {6, 6, 0}, {4, 4, 0}}}; // NOTE(review): triples presumably {min, max, opt}; first dim ranges 1..3 -- confirm
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {{1, 3, 0}, {1, 1, 0}, {6, 6, 0}}};
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto r = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression",
+                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+        boxes_p,
+        scores_p,
+        max_out_l,
+        iou_threshold,
+        score_threshold);
+    mm->add_return({r});
+
+    p.compile(migraphx::ref::target{});
+
+    std::vector<float> boxes_vec  = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0,
+                                    0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; // the same 6 boxes repeated twice
+    std::vector<float> scores_vec = {
+        0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; // the same 6 scores repeated twice
+
+    migraphx::shape input_fixed_shape0{migraphx::shape::float_type, {2, 6, 4}}; // concrete shapes used at eval time (first dim = 2)
+    migraphx::shape input_fixed_shape1{migraphx::shape::float_type, {2, 1, 6}};
+    migraphx::parameter_map params0;
+    params0["boxes"]  = migraphx::argument(input_fixed_shape0, boxes_vec.data()); // argument is built from the vector's data pointer
+    params0["scores"] = migraphx::argument(input_fixed_shape1, scores_vec.data());
+    auto output       = p.eval(params0).back();
+
+    std::vector<int64_t> result;
+    output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 1, 0, 3, 1, 0, 0, 1, 0, 5}; // same triples as the single-batch case, then again with leading index 1
+    EXPECT(migraphx::verify_range(result, gold));
+}
+
+TEST_CASE(nms_dynamic_boxes_test) // nonmaxsuppression where the box-count dimension is declared dynamic
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {{1, 1, 0}, {4, 20, 0}, {4, 4, 0}}}; // NOTE(review): triples presumably {min, max, opt}; second dim ranges 4..20 -- confirm
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {{1, 1, 0}, {1, 1, 0}, {4, 20, 0}}}; // last dim carries the same 4..20 range
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto r = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression",
+                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+        boxes_p,
+        scores_p,
+        max_out_l,
+        iou_threshold,
+        score_threshold);
+    mm->add_return({r});
+
+    p.compile(migraphx::ref::target{});
+
+    std::vector<float> boxes_vec  = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    migraphx::shape input_fixed_shape0{migraphx::shape::float_type, {1, 6, 4}}; // concrete shapes used at eval time (6 boxes)
+    migraphx::shape input_fixed_shape1{migraphx::shape::float_type, {1, 1, 6}};
+    migraphx::parameter_map params0;
+    params0["boxes"]  = migraphx::argument(input_fixed_shape0, boxes_vec.data());
+    params0["scores"] = migraphx::argument(input_fixed_shape1, scores_vec.data());
+    auto output       = p.eval(params0).back();
+
+    std::vector<int64_t> result;
+    output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; // identical to the gold of nms_dynamic_out_test (same data)
+    EXPECT(migraphx::verify_range(result, gold));
+}
+
+TEST_CASE(nms_dynamic_classes_test) // nonmaxsuppression where the second dimension of the scores parameter is declared dynamic
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {{1, 1, 0}, {6, 6, 0}, {4, 4, 0}}}; // boxes: every dim has min == max
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {{1, 1, 0}, {1, 3, 0}, {6, 6, 0}}}; // NOTE(review): triples presumably {min, max, opt}; second dim ranges 1..3 -- confirm
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{2}); // 2 here, unlike the other dynamic NMS tests which pass 4
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto r = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression",
+                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+        boxes_p,
+        scores_p,
+        max_out_l,
+        iou_threshold,
+        score_threshold);
+    mm->add_return({r});
+
+    p.compile(migraphx::ref::target{});
+
+    std::vector<float> boxes_vec  = {0.0, 0.0,  1.0, 1.0,  0.0, 0.1,   1.0, 1.1,
+                                    0.0, -0.1, 1.0, 0.9,  0.0, 10.0,  1.0, 11.0,
+                                    0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; // NOTE(review): values resemble corner coordinates although center_point_box is true -- confirm intended
+    std::vector<float> scores_vec = {
+        0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; // the same 6 scores repeated twice
+    migraphx::shape input_fixed_shape0{migraphx::shape::float_type, {1, 6, 4}};
+    migraphx::shape input_fixed_shape1{migraphx::shape::float_type, {1, 2, 6}}; // concrete scores shape used at eval time (second dim = 2)
+    migraphx::parameter_map params0;
+    params0["boxes"]  = migraphx::argument(input_fixed_shape0, boxes_vec.data());
+    params0["scores"] = migraphx::argument(input_fixed_shape1, scores_vec.data());
+    auto output       = p.eval(params0).back();
+
+    std::vector<int64_t> result;
+    output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 1, 3, 0, 1, 0}; // 12 values; NOTE(review): presumably two triples per class index (0 and 1) -- confirm
+    EXPECT(migraphx::verify_range(result, gold));
+}
+
 TEST_CASE(nms_not_center_test)
 {
    migraphx::program p;
@@ -3642,7 +3810,9 @@ TEST_CASE(nms_not_center_test)
    auto iou_threshold   = mm->add_literal(0.5f);
    auto score_threshold = mm->add_literal(0.0f);

-    auto r = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+    // set use_dyn_output back to false in operator map
+    auto r =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", false}}),
                            boxes_l,
                            scores_l,
                            max_out_l,
@@ -3675,7 +3845,8 @@ TEST_CASE(nms_test)
    auto iou_threshold   = mm->add_literal(0.5f);
    auto score_threshold = mm->add_literal(0.0f);

-    auto r = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
+    auto r =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                            boxes_l,
                            scores_l,
                            max_out_l,
@@ -3712,7 +3883,8 @@ TEST_CASE(nms_transpose1_test)

    auto transpose_boxes = mm->add_instruction(
        migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l);
-    auto r = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
+    auto r =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                            transpose_boxes,
                            scores_l,
                            max_out_l,
@@ -3749,7 +3921,8 @@ TEST_CASE(nms_transpose2_test)

    auto transpose_boxes = mm->add_instruction(
        migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l);
-    auto r = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
+    auto r =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                            transpose_boxes,
                            scores_l,
                            max_out_l,
@@ -3815,7 +3988,8 @@ TEST_CASE(not_test)
        std::vector<char> results_vector;
        result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
        std::vector<bool> gold(data.size());
-        std::transform(data.begin(), data.end(), gold.begin(), [](bool n) -> bool { return !n; });
+        std::transform(
+            data.begin(), data.end(), gold.begin(), [](bool n) -> bool { return not n; });
        EXPECT(migraphx::verify_range(results_vector, gold));
    }
 }

--- a/test/rewrite_gelu_test.cpp
+++ b/test/rewrite_gelu_test.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/rewrite_gelu.hpp>
+#include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/ref/target.hpp>
+#include <migraphx/op/convolution.hpp>
+#include <migraphx/op/reshape.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/ranges.hpp>
+#include <test.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/common.hpp>
+
+#include <migraphx/serialize.hpp>
+
+#include <migraphx/verify.hpp>
+
+TEST_CASE(bias_gelu) // rewrite_gelu on an erf-based GELU whose input is an add: expects the exp-based form built in m2
+{
+    migraphx::shape s1{migraphx::shape::half_type, {2, 4, 8}};
+    migraphx::shape s2{migraphx::shape::half_type}; // half-type shape with no lens, used for the constant literals
+    migraphx::module m1;
+    {
+        // m1 computes: x * (erf(x / 1.4140625) + 1) * 0.5, with x = a + b
+        auto a    = m1.add_parameter("a", s1);
+        auto b    = m1.add_parameter("b", s1);
+        auto add1 = m1.add_instruction(migraphx::make_op("add"), a, b);
+        auto l1   = m1.add_literal(migraphx::literal{s2, {1.4140625f}}); // approx. sqrt(2)
+        auto div  = add_common_op(m1, migraphx::make_op("div"), {add1, l1}); // NOTE(review): add_common_op presumably brings both operands to a common shape -- confirm
+        auto erf  = m1.add_instruction(migraphx::make_op("erf"), div);
+        auto l2   = m1.add_literal(migraphx::literal{s2, {1.0f}});
+        auto add2 = add_common_op(m1, migraphx::make_op("add"), {erf, l2});
+        auto mul  = m1.add_instruction(migraphx::make_op("mul"), add1, add2);
+        auto l3   = m1.add_literal(migraphx::literal{s2, {0.5f}});
+        mul       = add_common_op(m1, migraphx::make_op("mul"), {mul, l3});
+        m1.add_return({mul});
+    }
+    migraphx::rewrite_gelu pass;
+    pass.apply(m1);
+    migraphx::dead_code_elimination dce; // DCE runs after the rewrite and before m1 is compared with m2
+    dce.apply(m1);
+
+    migraphx::module m2;
+    {
+        // m2 (expected): x / (exp(-(1.702 * x)) + 1), with x = a + b
+        auto a   = m2.add_parameter("a", s1);
+        auto b   = m2.add_parameter("b", s1);
+        auto add = m2.add_instruction(migraphx::make_op("add"), a, b);
+        auto l1  = m2.add_literal(migraphx::literal{s2, {1.702f}});
+        auto mul = add_common_op(m2, migraphx::make_op("mul"), {add, l1});
+        auto sig = m2.add_instruction(migraphx::make_op("neg"), mul);
+        sig      = m2.add_instruction(migraphx::make_op("exp"), sig);
+        auto l2  = m2.add_literal(migraphx::literal{s2, {1.0f}});
+        sig      = add_common_op(m2, migraphx::make_op("add"), {sig, l2});
+        sig      = m2.add_instruction(migraphx::make_op("div"), add, sig);
+        m2.add_return({sig});
+    }
+
+    EXPECT(m1 == m2);
+}
+
+TEST_CASE(non_bias_gelu) // same graphs as bias_gelu, but the GELU input is a sub instead of an add
+{
+    migraphx::shape s1{migraphx::shape::half_type, {2, 4, 8}};
+    migraphx::shape s2{migraphx::shape::half_type}; // half-type shape with no lens, used for the constant literals
+    migraphx::module m1;
+    {
+        // m1 computes: x * (erf(x / 1.4140625) + 1) * 0.5, with x = a - b
+        auto a    = m1.add_parameter("a", s1);
+        auto b    = m1.add_parameter("b", s1);
+        auto sub  = m1.add_instruction(migraphx::make_op("sub"), a, b);
+        auto l1   = m1.add_literal(migraphx::literal{s2, {1.4140625f}}); // approx. sqrt(2)
+        auto div  = add_common_op(m1, migraphx::make_op("div"), {sub, l1});
+        auto erf  = m1.add_instruction(migraphx::make_op("erf"), div);
+        auto l2   = m1.add_literal(migraphx::literal{s2, {1.0f}});
+        auto add2 = add_common_op(m1, migraphx::make_op("add"), {erf, l2});
+        auto mul  = m1.add_instruction(migraphx::make_op("mul"), sub, add2);
+        auto l3   = m1.add_literal(migraphx::literal{s2, {0.5f}});
+        mul       = add_common_op(m1, migraphx::make_op("mul"), {mul, l3});
+        m1.add_return({mul});
+    }
+    migraphx::rewrite_gelu pass;
+    pass.apply(m1);
+    migraphx::dead_code_elimination dce; // DCE runs after the rewrite and before m1 is compared with m2
+    dce.apply(m1);
+
+    migraphx::module m2;
+    {
+        // m2 (expected): x / (exp(-(1.702 * x)) + 1), with x = a - b
+        auto a   = m2.add_parameter("a", s1);
+        auto b   = m2.add_parameter("b", s1);
+        auto sub = m2.add_instruction(migraphx::make_op("sub"), a, b);
+        auto l1  = m2.add_literal(migraphx::literal{s2, {1.702f}});
+        auto mul = add_common_op(m2, migraphx::make_op("mul"), {sub, l1});
+        auto sig = m2.add_instruction(migraphx::make_op("neg"), mul);
+        sig      = m2.add_instruction(migraphx::make_op("exp"), sig);
+        auto l2  = m2.add_literal(migraphx::literal{s2, {1.0f}});
+        sig      = add_common_op(m2, migraphx::make_op("add"), {sig, l2});
+        sig      = m2.add_instruction(migraphx::make_op("div"), sub, sig);
+        m2.add_return({sig});
+    }
+
+    EXPECT(m1 == m2);
+}
+
+int main(int argc, const char* argv[]) { test::run(argc, argv); } // entry point: forwards the command-line arguments to test::run
--- a/test/shape_test.cpp
+++ b/test/shape_test.cpp
@@ -43,7 +43,7 @@ TEST_CASE(test_shape_assign)
    migraphx::shape s1{migraphx::shape::float_type, {100, 32, 8, 8}};
    migraphx::shape s2 = s1; // NOLINT
    EXPECT(s1 == s2);
-    EXPECT(!(s1 != s2));
+    EXPECT(not(s1 != s2));
 }

 TEST_CASE(test_shape_packed_default)
@@ -325,7 +325,7 @@ TEST_CASE(test_shape_default_copy)
    migraphx::shape s1{};
    migraphx::shape s2{};
    EXPECT(s1 == s2);
-    EXPECT(!(s1 != s2));
+    EXPECT(not(s1 != s2));
 }

 TEST_CASE(test_shape_normalize_standard1)

--- a/test/simplify_algebra_test.cpp
+++ b/test/simplify_algebra_test.cpp
@@ -30,7 +30,6 @@
 #include <migraphx/instruction.hpp>
 #include <basic_ops.hpp>
 #include <migraphx/make_op.hpp>
-
 #include <test.hpp>

 void run_pass(migraphx::module& m)
@@ -237,6 +236,105 @@ TEST_CASE(simplify_mul_conv1)
    EXPECT(new_conv->outputs().front()->name() != "mul");
 }

+TEST_CASE(simplify_mul_conv2) // conv * (literal -> unsqueeze -> multibroadcast): after the pass the convolution must not feed a mul
+{
+    migraphx::module m;
+    auto x = m.add_parameter("x", {migraphx::shape::int32_type, {1, 128, 28, 28}});
+    auto w =
+        m.add_literal(migraphx::generate_literal({migraphx::shape::int32_type, {256, 128, 3, 3}}));
+    auto conv = m.add_instruction(
+        migraphx::make_op("convolution",
+                          {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+        x,
+        w);
+    auto a      = m.add_literal(migraphx::generate_literal({migraphx::shape::int32_type, {256}})); // 1-D literal with 256 elements
+    auto unsq_a = m.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), a);
+    auto b      = m.add_instruction(
+        migraphx::make_op("multibroadcast", {{"out_lens", {1, 256, 14, 14}}}), unsq_a);
+    auto mul = m.add_instruction(migraphx::make_op("mul"), conv, b);
+    m.add_instruction(pass_op{}, mul);
+    EXPECT(conv->outputs().front()->name() == "mul"); // precondition: conv feeds the mul before the pass
+    run_pass(m);
+    auto new_conv =
+        std::find_if(m.begin(), m.end(), [](auto&& ins) { return ins.name() == "convolution"; }); // first convolution in the module after the pass
+    EXPECT(new_conv->outputs().front()->name() != "mul");
+}
+
+// Like simplify_mul_conv2, but the literal already has lens {256, 1, 1} (trailing dims of length 1) and is multibroadcast directly
+TEST_CASE(simplify_mul_conv3)
+{
+    migraphx::module m;
+    auto x = m.add_parameter("x", {migraphx::shape::int32_type, {1, 128, 28, 28}});
+    auto w =
+        m.add_literal(migraphx::generate_literal({migraphx::shape::int32_type, {256, 128, 3, 3}}));
+    auto conv = m.add_instruction(
+        migraphx::make_op("convolution",
+                          {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+        x,
+        w);
+    auto a = m.add_literal(
+        migraphx::generate_literal({migraphx::shape::int32_type, {256, 1, 1}, {1, 18, 1}})); // NOTE(review): strides {1, 18, 1} look unusual for lens {256, 1, 1} -- confirm intended
+    auto b =
+        m.add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {1, 256, 14, 14}}}), a);
+    auto mul = m.add_instruction(migraphx::make_op("mul"), conv, b);
+    m.add_instruction(pass_op{}, mul);
+    EXPECT(conv->outputs().front()->name() == "mul"); // precondition: conv feeds the mul before the pass
+    run_pass(m);
+    auto new_conv =
+        std::find_if(m.begin(), m.end(), [](auto&& ins) { return ins.name() == "convolution"; });
+    EXPECT(new_conv->outputs().front()->name() != "mul"); // the pass is expected to apply here
+}
+
+// The literal already carries broadcast strides {1, 0, 0} and goes through "broadcast"; the pass should skip it
+TEST_CASE(simplify_mul_conv_skip1)
+{
+    migraphx::module m;
+    auto x = m.add_parameter("x", {migraphx::shape::int32_type, {1, 128, 28, 28}});
+    auto w =
+        m.add_literal(migraphx::generate_literal({migraphx::shape::int32_type, {256, 128, 3, 3}}));
+    auto conv = m.add_instruction(
+        migraphx::make_op("convolution",
+                          {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+        x,
+        w);
+    auto a = m.add_literal(
+        migraphx::generate_literal({migraphx::shape::int32_type, {256, 14, 14}, {1, 0, 0}})); // lens {256, 14, 14} with zero strides on the last two dims
+    auto b = m.add_instruction(
+        migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 256, 14, 14}}}), a);
+    auto mul = m.add_instruction(migraphx::make_op("mul"), conv, b);
+    m.add_instruction(pass_op{}, mul);
+    EXPECT(conv->outputs().front()->name() == "mul");
+    run_pass(m);
+    auto new_conv =
+        std::find_if(m.begin(), m.end(), [](auto&& ins) { return ins.name() == "convolution"; });
+    EXPECT(new_conv->outputs().front()->name() == "mul"); // unchanged: the convolution still feeds the mul
+}
+
+// Same already-broadcasted literal as simplify_mul_conv_skip1, but fed through "multibroadcast"; the pass should skip it
+TEST_CASE(simplify_mul_conv_skip2)
+{
+    migraphx::module m;
+    auto x = m.add_parameter("x", {migraphx::shape::int32_type, {1, 128, 28, 28}});
+    auto w =
+        m.add_literal(migraphx::generate_literal({migraphx::shape::int32_type, {256, 128, 3, 3}}));
+    auto conv = m.add_instruction(
+        migraphx::make_op("convolution",
+                          {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+        x,
+        w);
+    auto a = m.add_literal(
+        migraphx::generate_literal({migraphx::shape::int32_type, {256, 14, 14}, {1, 0, 0}})); // lens {256, 14, 14} with zero strides on the last two dims
+    auto b =
+        m.add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {1, 256, 14, 14}}}), a);
+    auto mul = m.add_instruction(migraphx::make_op("mul"), conv, b);
+    m.add_instruction(pass_op{}, mul);
+    EXPECT(conv->outputs().front()->name() == "mul");
+    run_pass(m);
+    auto new_conv =
+        std::find_if(m.begin(), m.end(), [](auto&& ins) { return ins.name() == "convolution"; });
+    EXPECT(new_conv->outputs().front()->name() == "mul"); // unchanged: the convolution still feeds the mul
+}
+
 TEST_CASE(simplify_mul_slice_conv1)
 {
    migraphx::module m1;
@@ -358,7 +456,33 @@ TEST_CASE(simplify_mul_add)
    EXPECT(m1 == m2);
 }

-TEST_CASE(simplify_inner_broadcast)
+TEST_CASE(simplify_dot_add) // dot(one + x, two) is expected to become dot(x, two) + dot(one, two)
+{
+    migraphx::module m1;
+    {
+        auto x   = m1.add_parameter("x", {migraphx::shape::float_type, {2, 2}});
+        auto one = m1.add_literal(get_2x2()); // get_2x2 is a helper defined elsewhere in this test file
+        auto two = m1.add_literal(get_2x2(1));
+        auto sum = m1.add_instruction(migraphx::make_op("add"), one, x);
+        auto dot = m1.add_instruction(migraphx::make_op("dot"), sum, two);
+        m1.add_instruction(pass_op{}, dot);
+    }
+    run_pass(m1);
+
+    migraphx::module m2;
+    {
+        // expected graph: the dot is applied to each add operand separately, then the results are added
+        auto x    = m2.add_parameter("x", {migraphx::shape::float_type, {2, 2}});
+        auto one  = m2.add_literal(get_2x2());
+        auto two  = m2.add_literal(get_2x2(1));
+        auto dot1 = m2.add_instruction(migraphx::make_op("dot"), x, two);
+        auto dot2 = m2.add_instruction(migraphx::make_op("dot"), one, two);
+        auto sum  = m2.add_instruction(migraphx::make_op("add"), dot1, dot2);
+        m2.add_instruction(pass_op{}, sum);
+    }
+    EXPECT(m1 == m2);
+}
+
+TEST_CASE(simplify_inner_broadcast1)
 {
    auto b = migraphx::op::broadcast{1, {2, 1, 4, 5}};
    migraphx::module m1;
@@ -383,6 +507,31 @@ TEST_CASE(simplify_inner_broadcast)
    EXPECT(m1 == m2);
 }

+TEST_CASE(simplify_inner_broadcast2) // add of two multibroadcasts is expected to become one multibroadcast of the add
+{
+    auto b = migraphx::op::multibroadcast{{2, 1, 4, 5}}; // the same op object is reused for both modules
+    migraphx::module m1;
+    {
+        auto x   = m1.add_parameter("x", {migraphx::shape::int32_type, {1, 1, 1, 1}});
+        auto y   = m1.add_parameter("y", {migraphx::shape::int32_type, {1, 1, 1, 1}});
+        auto xb  = m1.add_instruction(b, x);
+        auto yb  = m1.add_instruction(b, y);
+        auto sum = m1.add_instruction(migraphx::make_op("add"), xb, yb);
+        m1.add_instruction(pass_op{}, sum);
+    }
+    run_pass(m1);
+
+    migraphx::module m2;
+    {
+        // expected graph: add the un-broadcast inputs first, then broadcast the sum
+        auto x    = m2.add_parameter("x", {migraphx::shape::int32_type, {1, 1, 1, 1}});
+        auto y    = m2.add_parameter("y", {migraphx::shape::int32_type, {1, 1, 1, 1}});
+        auto sum  = m2.add_instruction(migraphx::make_op("add"), x, y);
+        auto sumb = m2.add_instruction(b, sum);
+        m2.add_instruction(pass_op{}, sumb);
+    }
+    EXPECT(m1 == m2);
+}
+
 TEST_CASE(simplify_add_conv1)
 {
    migraphx::module m;
@@ -1477,6 +1626,48 @@ TEST_CASE(simplify_dot_horiz_flipped)
    EXPECT(m1.sort() == m2.sort());
 }

+// Checks that a contiguous is inserted before a reshape that consumes a slice of the horizontally fused dot
+TEST_CASE(simplify_dot_horiz_reshape)
+{
+    auto s = migraphx::shape{migraphx::shape::int32_type, {3, 4, 4}};
+    migraphx::module m1;
+    {
+        // two dots share the same left operand; one result is reshaped, the other unsqueezed
+        auto input = m1.add_parameter("input", s);
+        auto a     = m1.add_literal(migraphx::generate_literal(s, 0));
+        auto b     = m1.add_literal(migraphx::generate_literal(s, 1));
+        auto x     = m1.add_instruction(migraphx::make_op("dot"), input, a);
+        auto y     = m1.add_instruction(migraphx::make_op("dot"), input, b);
+        auto x_rsp = m1.add_instruction(migraphx::make_op("reshape", {{"dims", {3, 4, 2, 2}}}), x);
+        auto y_rsp =
+            m1.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {2}}, {"steps", {2}}}), y);
+        auto sum = m1.add_instruction(migraphx::make_op("add"), {x_rsp, y_rsp});
+        m1.add_instruction(pass_op{}, sum);
+    }
+    run_pass(m1);
+
+    migraphx::module m2;
+    {
+        // expected graph: literals concatenated on axis 2, a single dot, then slices [0, 4) and [4, 8) on axis 2
+        auto input  = m2.add_parameter("input", s);
+        auto a      = m2.add_literal(migraphx::generate_literal(s, 0));
+        auto b      = m2.add_literal(migraphx::generate_literal(s, 1));
+        auto concat = m2.add_instruction(migraphx::make_op("concat", {{"axis", 2}}), a, b);
+        auto dot    = m2.add_instruction(migraphx::make_op("dot"), input, concat);
+        auto x      = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {0}}, {"ends", {4}}}), dot);
+        auto y = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {4}}, {"ends", {8}}}), dot);
+        auto x_cont = m2.add_instruction(migraphx::make_op("contiguous"), x); // only the reshape path gets a contiguous
+        auto x_rsp =
+            m2.add_instruction(migraphx::make_op("reshape", {{"dims", {3, 4, 2, 2}}}), x_cont);
+        auto y_rsp =
+            m2.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {2}}, {"steps", {2}}}), y); // unsqueeze consumes the slice directly
+        auto sum = m2.add_instruction(migraphx::make_op("add"), {x_rsp, y_rsp});
+        m2.add_instruction(pass_op{}, sum);
+    }
+
+    EXPECT(m1.sort() == m2.sort()); // both modules are sorted before comparison
+}
+
 TEST_CASE(simplify_conv_horiz)
 {
    auto s  = migraphx::shape{migraphx::shape::int32_type, {8, 3, 64, 64}};
@@ -1782,13 +1973,19 @@ TEST_CASE(simplify_mul_slice_conv_horiz_fusion)
    }
    EXPECT(m1.sort() == m2.sort());
 }
-TEST_CASE(reorder_reshape_slice)
+
+template <std::size_t BS, bool TransposeInput>
+void reorder_reshape_slice()
 {
    std::vector<int64_t> perm0 = {0, 2, 1, 3};
    std::vector<int64_t> perm1 = {0, 2, 3, 1};
-    auto create_m1             = [&](std::size_t batch_size) {
    migraphx::module m1;
-        auto s     = migraphx::shape{migraphx::shape::float_type, {batch_size, 128, 1920}};
+    {
+        auto s = migraphx::shape{migraphx::shape::float_type, {BS, 128, 1920}};
+        if(TransposeInput)
+        {
+            s = migraphx::shape{migraphx::shape::float_type, {BS, 128, 1920}, {165120, 1, 128}};
+        }
        auto input = m1.add_parameter("input", s);
        auto slc0  = m1.add_instruction(
            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {0}}, {"ends", {640}}}), input);
@@ -1803,7 +2000,7 @@ TEST_CASE(reorder_reshape_slice)
        auto c1 = m1.add_instruction(migraphx::make_op("contiguous"), slc1);
        auto c2 = m1.add_instruction(migraphx::make_op("contiguous"), slc2);

-        std::vector<int64_t> lens = {static_cast<int64_t>(batch_size), 128, 10, 64};
+        std::vector<int64_t> lens = {static_cast<int64_t>(BS), 128, 10, 64};
        auto r0 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c0);
        auto r1 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c1);
        auto r2 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c2);
@@ -1815,16 +2012,23 @@ TEST_CASE(reorder_reshape_slice)
        auto sum = m1.add_instruction(migraphx::make_op("add"), t0, t1);
        auto ret = m1.add_instruction(migraphx::make_op("dot"), sum, t2);
        m1.add_return({ret});
-
-        return m1;
    };

-    auto create_m2 = [&](std::size_t batch_size) {
    migraphx::module m2;
-        auto s     = migraphx::shape{migraphx::shape::float_type, {batch_size, 128, 1920}};
+    {
+        auto s = migraphx::shape{migraphx::shape::float_type, {BS, 128, 1920}};
+        if(TransposeInput)
+        {
+            s = migraphx::shape{migraphx::shape::float_type, {BS, 128, 1920}, {165120, 1, 128}};
+        }
        auto input     = m2.add_parameter("input", s);
-        std::vector<int64_t> lens = {static_cast<int64_t>(batch_size), 128, 30, 64};
-        auto r = m2.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), input);
+        auto rsp_input = input;
+        if(TransposeInput)
+        {
+            rsp_input = m2.add_instruction(migraphx::make_op("contiguous"), {input});
+        }
+        std::vector<int64_t> lens = {static_cast<int64_t>(BS), 128, 30, 64};
+        auto r = m2.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), rsp_input);

        auto slc0 = m2.add_instruction(
            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {0}}, {"ends", {10}}}), r);
@@ -1843,27 +2047,25 @@ TEST_CASE(reorder_reshape_slice)
        auto sum = m2.add_instruction(migraphx::make_op("add"), t0, t1);
        auto ret = m2.add_instruction(migraphx::make_op("dot"), sum, t2);
        m2.add_return({ret});
-
-        return m2;
    };
-
-    auto test = [&](std::size_t batch_size) {
-        auto m1 = create_m1(batch_size);
    run_pass(m1);
-        auto m2 = create_m2(batch_size);
    EXPECT(m1.sort() == m2.sort());
-    };
-
-    test(1);
-    test(4);
-    test(8);
 }

-TEST_CASE(reorder_reshape_slice_move_axis1)
+TEST_CASE_REGISTER(reorder_reshape_slice<1, true>); // TransposeInput = true: checks that a contiguous
+                                                    // is added before the reshape of the strided input
+TEST_CASE_REGISTER(reorder_reshape_slice<4, true>);
+TEST_CASE_REGISTER(reorder_reshape_slice<8, true>);
+TEST_CASE_REGISTER(reorder_reshape_slice<1, false>); // TransposeInput = false: input shape has no explicit strides
+TEST_CASE_REGISTER(reorder_reshape_slice<4, false>);
+TEST_CASE_REGISTER(reorder_reshape_slice<8, false>);
+
+template <std::size_t BS>
+void reorder_reshape_slice_move_axis1()
 {
-    auto create_m1 = [](std::size_t batch_size) {
    migraphx::module m1;
-        auto s = migraphx::shape{migraphx::shape::float_type, {batch_size, 256, 96}};
+    {
+        auto s                     = migraphx::shape{migraphx::shape::float_type, {BS, 256, 96}};
        std::vector<int64_t> perm0 = {0, 2, 1, 3};
        std::vector<int64_t> perm1 = {0, 2, 3, 1};
        auto input                 = m1.add_parameter("input", s);
@@ -1878,7 +2080,7 @@ TEST_CASE(reorder_reshape_slice_move_axis1)
        auto c1 = m1.add_instruction(migraphx::make_op("contiguous"), slc1);
        auto c2 = m1.add_instruction(migraphx::make_op("contiguous"), slc2);

-        std::vector<int64_t> lens = {static_cast<int64_t>(batch_size), 64, 4, 32};
+        std::vector<int64_t> lens = {static_cast<int64_t>(BS), 64, 4, 32};
        auto r0 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c0);
        auto r1 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c1);
        auto r2 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c2);
@@ -1890,50 +2092,45 @@ TEST_CASE(reorder_reshape_slice_move_axis1)
        auto sum = m1.add_instruction(migraphx::make_op("add"), t0, t1);
        auto ret = m1.add_instruction(migraphx::make_op("dot"), sum, t2);
        m1.add_return({ret});
-
-        return m1;
    };

-    auto create_m2 = [](std::size_t batch_size) {
-        migraphx::module m;
-        auto s = migraphx::shape{migraphx::shape::float_type, {batch_size, 256, 96}};
+    migraphx::module m2;
+    {
+        auto s                     = migraphx::shape{migraphx::shape::float_type, {BS, 256, 96}};
        std::vector<int64_t> perm0 = {0, 2, 1, 3};
        std::vector<int64_t> perm1 = {0, 2, 3, 1};
-        auto input                 = m.add_parameter("input", s);
-        std::vector<int64_t> lens  = {static_cast<int64_t>(batch_size), 64, 4, 96};
-        auto rsp  = m.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), input);
-        auto slc0 = m.add_instruction(
+        auto input                 = m2.add_parameter("input", s);
+        std::vector<int64_t> lens  = {static_cast<int64_t>(BS), 64, 4, 96};
+        auto rsp  = m2.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), input);
+        auto slc0 = m2.add_instruction(
            migraphx::make_op("slice", {{"axes", {3}}, {"starts", {0}}, {"ends", {32}}}), rsp);
-        auto t0 = m.add_instruction(migraphx::make_op("transpose", {{"permutation", perm0}}), slc0);
-        auto slc1 = m.add_instruction(
+        auto t0 =
+            m2.add_instruction(migraphx::make_op("transpose", {{"permutation", perm0}}), slc0);
+        auto slc1 = m2.add_instruction(
            migraphx::make_op("slice", {{"axes", {3}}, {"starts", {32}}, {"ends", {64}}}), rsp);
-        auto t1 = m.add_instruction(migraphx::make_op("transpose", {{"permutation", perm0}}), slc1);
-        auto slc2 = m.add_instruction(
+        auto t1 =
+            m2.add_instruction(migraphx::make_op("transpose", {{"permutation", perm0}}), slc1);
+        auto slc2 = m2.add_instruction(
            migraphx::make_op("slice", {{"axes", {3}}, {"starts", {64}}, {"ends", {96}}}), rsp);
-        auto t2 = m.add_instruction(migraphx::make_op("transpose", {{"permutation", perm1}}), slc2);
-
-        auto sum = m.add_instruction(migraphx::make_op("add"), t0, t1);
-        auto ret = m.add_instruction(migraphx::make_op("dot"), sum, t2);
-        m.add_return({ret});
+        auto t2 =
+            m2.add_instruction(migraphx::make_op("transpose", {{"permutation", perm1}}), slc2);

-        return m;
+        auto sum = m2.add_instruction(migraphx::make_op("add"), t0, t1);
+        auto ret = m2.add_instruction(migraphx::make_op("dot"), sum, t2);
+        m2.add_return({ret});
    };

-    auto test = [&](std::size_t batch_size) {
-        auto m1 = create_m1(batch_size);
-        auto m2 = create_m2(batch_size);
    run_pass(m1);
    EXPECT(m1.sort() == m2.sort());
-    };
-
-    test(4);
-    test(8);
 }

+TEST_CASE_REGISTER(reorder_reshape_slice_move_axis1<4>);
+TEST_CASE_REGISTER(reorder_reshape_slice_move_axis1<8>);
+
 TEST_CASE(reorder_reshape_slice_move_axis2)
 {
-    auto create_m1 = [] {
    migraphx::module m1;
+    {
        migraphx::shape s{migraphx::shape::float_type, {128, 96}};
        auto input = m1.add_parameter("input", s);
        auto slc0  = m1.add_instruction(
@@ -1955,32 +2152,75 @@ TEST_CASE(reorder_reshape_slice_move_axis2)
        auto sum = m1.add_instruction(migraphx::make_op("add"), r0, r1);
        auto ret = m1.add_instruction(migraphx::make_op("mul"), sum, r2);
        m1.add_return({ret});
-
-        return m1;
    };

-    auto create_m2 = [] {
-        migraphx::module m;
+    migraphx::module m2;
+    {
        auto s                    = migraphx::shape{migraphx::shape::float_type, {128, 96}};
-        auto input                = m.add_parameter("input", s);
+        auto input                = m2.add_parameter("input", s);
        std::vector<int64_t> lens = {1, 16, 8, 96};
-        auto rsp  = m.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), input);
-        auto slc0 = m.add_instruction(
+        auto rsp  = m2.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), input);
+        auto slc0 = m2.add_instruction(
            migraphx::make_op("slice", {{"axes", {3}}, {"starts", {0}}, {"ends", {32}}}), rsp);
-        auto slc1 = m.add_instruction(
+        auto slc1 = m2.add_instruction(
            migraphx::make_op("slice", {{"axes", {3}}, {"starts", {32}}, {"ends", {64}}}), rsp);
-        auto slc2 = m.add_instruction(
+        auto slc2 = m2.add_instruction(
            migraphx::make_op("slice", {{"axes", {3}}, {"starts", {64}}, {"ends", {96}}}), rsp);

-        auto sum = m.add_instruction(migraphx::make_op("add"), slc0, slc1);
-        auto ret = m.add_instruction(migraphx::make_op("mul"), sum, slc2);
-        m.add_return({ret});
+        auto sum = m2.add_instruction(migraphx::make_op("add"), slc0, slc1);
+        auto ret = m2.add_instruction(migraphx::make_op("mul"), sum, slc2);
+        m2.add_return({ret});
+    };

-        return m;
+    run_pass(m1);
+    EXPECT(m1.sort() == m2.sort());
+}
+
+TEST_CASE(reorder_reshape_slice_len_1) // three length-1 slices on axis 2, each reshaped to {1, 128}
+{
+    migraphx::module m1; // graph that is handed to run_pass
+    {
+        migraphx::shape s{migraphx::shape::float_type, {1, 128, 3}};
+        auto input = m1.add_parameter("input", s);
+        auto slc0  = m1.add_instruction(
+            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {0}}, {"ends", {1}}}), input);
+        auto slc1 = m1.add_instruction(
+            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {1}}, {"ends", {2}}}), input);
+        auto slc2 = m1.add_instruction(
+            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {2}}, {"ends", {3}}}), input);
+
+        auto c0 = m1.add_instruction(migraphx::make_op("contiguous"), slc0);
+        auto c1 = m1.add_instruction(migraphx::make_op("contiguous"), slc1);
+        auto c2 = m1.add_instruction(migraphx::make_op("contiguous"), slc2);
+
+        std::vector<int64_t> lens = {1, 128}; // target dims for every reshaped slice
+        auto r0 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c0);
+        auto r1 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c1);
+        auto r2 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c2);
+
+        auto sum = m1.add_instruction(migraphx::make_op("add"), r0, r1);
+        auto ret = m1.add_instruction(migraphx::make_op("mul"), sum, r2);
+        m1.add_return({ret});
+    };
+
+    migraphx::module m2; // expected graph: one reshape of the whole input, then slices on axis 1
+    {
+        auto s                    = migraphx::shape{migraphx::shape::float_type, {1, 128, 3}};
+        auto input                = m2.add_parameter("input", s);
+        std::vector<int64_t> lens = {1, 384}; // 128 * 3 = 384 elements in the reshaped axis
+        auto rsp  = m2.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), input);
+        auto slc0 = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {1}}, {"starts", {0}}, {"ends", {128}}}), rsp);
+        auto slc1 = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {1}}, {"starts", {128}}, {"ends", {256}}}), rsp);
+        auto slc2 = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {1}}, {"starts", {256}}, {"ends", {384}}}), rsp);
+
+        auto sum = m2.add_instruction(migraphx::make_op("add"), slc0, slc1);
+        auto ret = m2.add_instruction(migraphx::make_op("mul"), sum, slc2);
+        m2.add_return({ret});
    };

-    auto m1 = create_m1();
-    auto m2 = create_m2();
    run_pass(m1);
    EXPECT(m1.sort() == m2.sort()); // both modules are passed through sort() before comparing
 }
@@ -2020,13 +2260,12 @@ TEST_CASE(reorder_reshape_slice_not_apply)
    EXPECT(m1.sort() == m2.sort());
 }

-TEST_CASE(reorder_reshape_slice_diff_dims)
+template <std::size_t BS>
+void reorder_reshape_slice_diff_dims()
 {
-    auto create_m1 = [](std::size_t batch_size) {
    migraphx::module m1;
-        auto s = migraphx::shape{migraphx::shape::float_type, {batch_size, 96, 96}};
-        std::vector<int64_t> perm0 = {0, 2, 1, 3};
-        std::vector<int64_t> perm1 = {0, 2, 3, 1};
+    {
+        auto s     = migraphx::shape{migraphx::shape::float_type, {BS, 96, 96}};
        auto input = m1.add_parameter("input", s);
        auto slc0  = m1.add_instruction(
            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {0}}, {"ends", {32}}}), input);
@@ -2039,34 +2278,31 @@ TEST_CASE(reorder_reshape_slice_diff_dims)
        auto c1 = m1.add_instruction(migraphx::make_op("contiguous"), slc1);
        auto c2 = m1.add_instruction(migraphx::make_op("contiguous"), slc2);

-        std::vector<int64_t> lens  = {static_cast<int64_t>(batch_size), 32, 3, 32};
-        std::vector<int64_t> lens1 = {static_cast<int64_t>(batch_size), 48, 2, 32};
+        std::vector<int64_t> lens  = {static_cast<int64_t>(BS), 32, 3, 32};
+        std::vector<int64_t> lens1 = {static_cast<int64_t>(BS), 48, 2, 32};
        auto r0 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c0);
        auto r1 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens}}), c1);
        auto r2 = m1.add_instruction(migraphx::make_op("reshape", {{"dims", lens1}}), c2);

        m1.add_return({r0, r1, r2});
-
-        return m1;
    };

-    auto test = [&](std::size_t batch_size) {
-        auto m1 = create_m1(batch_size);
    auto m2 = m1;
    run_pass(m1);
    EXPECT(m1.sort() == m2.sort());
-    };
-
-    test(4);
-    test(8);
 }

-TEST_CASE(reorder_slice_trans)
+TEST_CASE_REGISTER(reorder_reshape_slice_diff_dims<4>);
+TEST_CASE_REGISTER(reorder_reshape_slice_diff_dims<8>);
+
+template <std::size_t BS>
+void reorder_slice_trans()
 {
    std::vector<int64_t> perm = {0, 2, 1};
-    auto create_m1            = [&](std::size_t batch_size) {
+
    migraphx::module m1;
-        auto s     = migraphx::shape{migraphx::shape::float_type, {batch_size, 128, 1920}};
+    {
+        auto s     = migraphx::shape{migraphx::shape::float_type, {BS, 128, 1920}};
        auto input = m1.add_parameter("input", s);
        auto slc0  = m1.add_instruction(
            migraphx::make_op("slice", {{"axes", {2}}, {"starts", {0}}, {"ends", {640}}}), input);
@@ -2084,13 +2320,11 @@ TEST_CASE(reorder_slice_trans)
        auto sum = m1.add_instruction(migraphx::make_op("add"), t0, t1);
        auto ret = m1.add_instruction(migraphx::make_op("mul"), sum, t2);
        m1.add_return({ret});
-
-        return m1;
    };

-    auto create_m2 = [&](std::size_t batch_size) {
    migraphx::module m2;
-        auto s     = migraphx::shape{migraphx::shape::float_type, {batch_size, 128, 1920}};
+    {
+        auto s     = migraphx::shape{migraphx::shape::float_type, {BS, 128, 1920}};
        auto input = m2.add_parameter("input", s);
        auto r = m2.add_instruction(migraphx::make_op("transpose", {{"permutation", perm}}), input);

@@ -2104,26 +2338,21 @@ TEST_CASE(reorder_slice_trans)
        auto sum = m2.add_instruction(migraphx::make_op("add"), slc0, slc1);
        auto ret = m2.add_instruction(migraphx::make_op("mul"), sum, slc2);
        m2.add_return({ret});
-
-        return m2;
    };

-    auto test = [&](std::size_t batch_size) {
-        auto m1 = create_m1(batch_size);
    run_pass(m1);
-        auto m2 = create_m2(batch_size);
    EXPECT(m1.sort() == m2.sort());
-    };
-
-    test(1);
-    test(8);
 }

-TEST_CASE(reorder_slice_trans_diff_perm)
+TEST_CASE_REGISTER(reorder_slice_trans<1>);
+TEST_CASE_REGISTER(reorder_slice_trans<8>);
+
+template <std::size_t BS>
+void reorder_slice_trans_diff_perm()
 {
-    auto create_m1 = [](std::size_t batch_size) {
    migraphx::module m1;
-        auto s = migraphx::shape{migraphx::shape::float_type, {batch_size, 128, 1920}};
+    {
+        auto s                     = migraphx::shape{migraphx::shape::float_type, {BS, 128, 1920}};
        std::vector<int64_t> perm0 = {0, 2, 1};
        std::vector<int64_t> perm1 = {0, 1, 2};
        auto input                 = m1.add_parameter("input", s);
@@ -2146,21 +2375,16 @@ TEST_CASE(reorder_slice_trans_diff_perm)
        auto sum = m1.add_instruction(migraphx::make_op("add"), t0, t1);
        auto ret = m1.add_instruction(migraphx::make_op("dot"), sum, t2);
        m1.add_return({ret});
-
-        return m1;
    };

-    auto test = [&](std::size_t batch_size) {
-        auto m1 = create_m1(batch_size);
    run_pass(m1); // NOTE(review): the pass runs before the snapshot on the next line is taken
    auto m2 = m1; // m2 is therefore a copy of the already-transformed m1
    EXPECT(m1.sort() == m2.sort()); // compares m1 with its own copy; reorder_reshape_slice_diff_dims copies before run_pass - confirm which order is intended
-    };
-
-    test(1);
-    test(4);
 }

+TEST_CASE_REGISTER(reorder_slice_trans_diff_perm<1>);
+TEST_CASE_REGISTER(reorder_slice_trans_diff_perm<4>);
+
 TEST_CASE(reorder_slice_ins_deps)
 {
    auto create_module = [] {

--- a/test/simplify_reshapes_test.cpp
+++ b/test/simplify_reshapes_test.cpp
@@ -39,6 +39,35 @@ void run_pass(migraphx::module& m)
    migraphx::run_passes(m, {migraphx::simplify_reshapes{}, migraphx::dead_code_elimination{}});
 }

+// Collect shape::lens() of every shape in `shapes`, preserving their order.
+inline std::vector<std::vector<std::size_t>> to_lens(const std::vector<migraphx::shape>& shapes)
+{
+    std::vector<std::vector<std::size_t>> result;
+    // The final size is known, so allocate once instead of growing per element.
+    result.reserve(shapes.size());
+    // The lambda reads nothing from the enclosing scope, so it captures nothing.
+    std::transform(shapes.begin(), shapes.end(), std::back_inserter(result), [](const auto& s) {
+        return s.lens();
+    });
+    return result;
+}
+
+// Build a module with three float parameters "x", "y" and "z" of lens `in_lens`,
+// multibroadcast each of them to `mbcast_lens`, and return the concat of the
+// three broadcasts along `axis`.
+migraphx::module make_concat_multibroadcast(const std::vector<size_t>& in_lens,
+                                            const std::vector<size_t>& mbcast_lens,
+                                            const int axis)
+{
+    migraphx::module mod;
+    const migraphx::shape param_shape{migraphx::shape::float_type, in_lens};
+    auto x = mod.add_parameter("x", param_shape);
+    auto y = mod.add_parameter("y", param_shape);
+    auto z = mod.add_parameter("z", param_shape);
+
+    // Every parameter goes through the same multibroadcast op.
+    auto broadcast = [&](auto input) {
+        return mod.add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", mbcast_lens}}), input);
+    };
+    auto x_bcast = broadcast(x);
+    auto y_bcast = broadcast(y);
+    auto z_bcast = broadcast(z);
+
+    auto joined = mod.add_instruction(
+        migraphx::make_op("concat", {{"axis", axis}}), x_bcast, y_bcast, z_bcast);
+    mod.add_return({joined});
+    return mod;
+}
+
 TEST_CASE(double_contig)
 {
    migraphx::program p;
@@ -328,6 +357,87 @@ TEST_CASE(nop_convert)
    EXPECT(std::distance(m.begin(), m.end()) == n - 1);
 }

+TEST_CASE(concat_multibroadcasts1)
+{
+    // Broadcasted batch dim, new axis < old axis
+    std::vector<std::size_t> in_lens     = {3, 4};
+    std::vector<std::size_t> mbcast_lens = {2, 3, 4};
+    const int axis                       = 2;
+    auto m                               = make_concat_multibroadcast(in_lens, mbcast_lens, axis);
+    auto out_shape                       = m.get_output_shapes().back();
+    auto n                               = std::distance(m.begin(), m.end());
+    run_pass(m);
+    // The output lens must be unchanged and the module two instructions shorter
+    EXPECT(m.get_output_shapes().back().lens() == out_shape.lens());
+    EXPECT(std::distance(m.begin(), m.end()) == n - 2);
+    // Predicates take the instruction by const reference; by value copies each one
+    auto new_concat = std::find_if(
+        m.begin(), m.end(), [](const auto& ins) { return ins.name() == "concat"; });
+    EXPECT(bool{new_concat != m.end()});
+    auto cd     = std::distance(m.begin(), new_concat);
+    auto new_mb = std::find_if(
+        m.begin(), m.end(), [](const auto& ins) { return ins.name() == "multibroadcast"; });
+    // Fail explicitly if no multibroadcast is left instead of measuring up to end()
+    EXPECT(bool{new_mb != m.end()});
+    auto md = std::distance(m.begin(), new_mb);
+    // The concat sits directly before the first multibroadcast
+    EXPECT(cd == md - 1);
+    EXPECT(migraphx::any_cast<migraphx::op::concat>(new_concat->get_operator()).axis == 1);
+}
+
+TEST_CASE(concat_multibroadcasts2)
+{
+    // Broadcasted middle dim, concat axis (0) before it, new axis == old axis
+    std::vector<std::size_t> in_lens     = {3, 1, 4};
+    std::vector<std::size_t> mbcast_lens = {3, 2, 4};
+    const int axis                       = 0;
+    auto m                               = make_concat_multibroadcast(in_lens, mbcast_lens, axis);
+    auto out_shape                       = m.get_output_shapes().back();
+    auto n                               = std::distance(m.begin(), m.end());
+    run_pass(m);
+    // The output lens must be unchanged and the module two instructions shorter
+    EXPECT(m.get_output_shapes().back().lens() == out_shape.lens());
+    EXPECT(std::distance(m.begin(), m.end()) == n - 2);
+    // Predicates take the instruction by const reference; by value copies each one
+    auto new_concat = std::find_if(
+        m.begin(), m.end(), [](const auto& ins) { return ins.name() == "concat"; });
+    EXPECT(bool{new_concat != m.end()});
+    auto cd     = std::distance(m.begin(), new_concat);
+    auto new_mb = std::find_if(
+        m.begin(), m.end(), [](const auto& ins) { return ins.name() == "multibroadcast"; });
+    // Fail explicitly if no multibroadcast is left instead of measuring up to end()
+    EXPECT(bool{new_mb != m.end()});
+    auto md = std::distance(m.begin(), new_mb);
+    // The concat sits directly before the first multibroadcast
+    EXPECT(cd == md - 1);
+    EXPECT(migraphx::any_cast<migraphx::op::concat>(new_concat->get_operator()).axis == 0);
+}
+
+TEST_CASE(concat_multibroadcasts3)
+{
+    // Broadcasted middle dim, concat axis (2) after it, new axis == old axis
+    std::vector<std::size_t> in_lens     = {3, 1, 4};
+    std::vector<std::size_t> mbcast_lens = {3, 2, 4};
+    const int axis                       = 2;
+    auto m                               = make_concat_multibroadcast(in_lens, mbcast_lens, axis);
+    auto out_shape                       = m.get_output_shapes().back();
+    auto n                               = std::distance(m.begin(), m.end());
+    run_pass(m);
+    // The output lens must be unchanged and the module two instructions shorter
+    EXPECT(m.get_output_shapes().back().lens() == out_shape.lens());
+    EXPECT(std::distance(m.begin(), m.end()) == n - 2);
+    // Predicates take the instruction by const reference; by value copies each one
+    auto new_concat = std::find_if(
+        m.begin(), m.end(), [](const auto& ins) { return ins.name() == "concat"; });
+    EXPECT(bool{new_concat != m.end()});
+    auto cd     = std::distance(m.begin(), new_concat);
+    auto new_mb = std::find_if(
+        m.begin(), m.end(), [](const auto& ins) { return ins.name() == "multibroadcast"; });
+    // Fail explicitly if no multibroadcast is left instead of measuring up to end()
+    EXPECT(bool{new_mb != m.end()});
+    auto md = std::distance(m.begin(), new_mb);
+    // The concat sits directly before the first multibroadcast
+    EXPECT(cd == md - 1);
+    EXPECT(migraphx::any_cast<migraphx::op::concat>(new_concat->get_operator()).axis == 2);
+}
+
+TEST_CASE(concat_multibroadcasts4)
+{
+    // The concat axis (0) is the broadcasted batch dim itself, so the module
+    // after the pass is expected to equal the module before it.
+    const std::vector<std::size_t> input_dims     = {3, 4};
+    const std::vector<std::size_t> broadcast_dims = {2, 3, 4};
+    const int concat_axis                         = 0;
+
+    auto m = make_concat_multibroadcast(input_dims, broadcast_dims, concat_axis);
+    // Snapshot taken before the pass runs
+    auto before = m;
+    run_pass(m);
+    EXPECT(before == m);
+}
+
 TEST_CASE(concat_transpose1)
 {
    migraphx::module m;
@@ -1275,4 +1385,82 @@ TEST_CASE(transpose_slice_single_transpose)
    EXPECT(m1 == m2);
 }

+TEST_CASE(transpose_slice_non_packed_axis) // transpose + slice on axis 1 becomes unsqueeze/transpose/slice/squeeze
+{
+    migraphx::module m1; // graph handed to run_pass
+    {
+        auto x = m1.add_parameter("x", {migraphx::shape::float_type, {2, 384, 36, 64}});
+        auto transpose =
+            m1.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 2, 1, 3}}}), x);
+        auto slice = m1.add_instruction(
+            migraphx::make_op("slice", {{"axes", {1}}, {"starts", {0}}, {"ends", {12}}}),
+            transpose);
+        auto sqrt = m1.add_instruction(migraphx::make_op("sqrt"), slice);
+        m1.add_return({sqrt});
+    }
+    auto output_shapes = m1.get_output_shapes(); // snapshot taken before the pass
+    run_pass(m1);
+    EXPECT(m1.get_output_shapes() == output_shapes); // full output shapes must be unchanged
+    migraphx::module m2; // expected graph after the pass
+    {
+        auto x = m2.add_parameter("x", {migraphx::shape::float_type, {2, 384, 36, 64}});
+        auto unsqueeze =
+            m2.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {2}}, {"steps", {12}}}), x);
+        auto transpose = m2.add_instruction(
+            migraphx::make_op("transpose", {{"permutation", {3, 0, 2, 1, 4}}}), unsqueeze);
+        auto slice = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), transpose);
+        auto squeeze = m2.add_instruction(migraphx::make_op("squeeze", {{"axes", {0}}}), slice);
+        auto sqrt    = m2.add_instruction(migraphx::make_op("sqrt"), squeeze);
+        m2.add_return({sqrt});
+    }
+    EXPECT(m1 == m2); // compared directly, without sort()
+}
+
+TEST_CASE(transpose_slice_non_packed_multi_axis) // three slices of one transpose, the middle one transposed again
+{
+    migraphx::module m1; // graph handed to run_pass
+    {
+        auto x = m1.add_parameter("x", {migraphx::shape::float_type, {2, 384, 36, 64}});
+        auto transpose =
+            m1.add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 2, 1, 3}}}), x);
+        auto slice1 = m1.add_instruction(
+            migraphx::make_op("slice", {{"axes", {1}}, {"starts", {0}}, {"ends", {12}}}),
+            transpose);
+        auto slice2 = m1.add_instruction(
+            migraphx::make_op("slice", {{"axes", {1}}, {"starts", {12}}, {"ends", {24}}}),
+            transpose);
+        auto transpose2 = m1.add_instruction(
+            migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), slice2);
+        auto slice3 = m1.add_instruction(
+            migraphx::make_op("slice", {{"axes", {1}}, {"starts", {24}}, {"ends", {36}}}),
+            transpose);
+        m1.add_return({slice1, transpose2, slice3});
+    }
+    auto output_shapes = m1.get_output_shapes(); // snapshot taken before the pass
+    run_pass(m1);
+    EXPECT(to_lens(m1.get_output_shapes()) == to_lens(output_shapes)); // only lens are compared here, via to_lens
+    migraphx::module m2; // expected graph after the pass
+    {
+        auto x = m2.add_parameter("x", {migraphx::shape::float_type, {2, 384, 36, 64}});
+        auto unsqueeze =
+            m2.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {2}}, {"steps", {12}}}), x);
+        auto transpose = m2.add_instruction(
+            migraphx::make_op("transpose", {{"permutation", {3, 0, 2, 1, 4}}}), unsqueeze);
+        auto slice1 = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), transpose);
+        auto squeeze1 = m2.add_instruction(migraphx::make_op("squeeze", {{"axes", {0}}}), slice1);
+        auto slice2   = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {0}}, {"starts", {1}}, {"ends", {2}}}), transpose);
+        auto squeeze2   = m2.add_instruction(migraphx::make_op("squeeze", {{"axes", {0}}}), slice2);
+        auto transpose2 = m2.add_instruction(
+            migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), squeeze2);
+        auto slice3 = m2.add_instruction(
+            migraphx::make_op("slice", {{"axes", {0}}, {"starts", {2}}, {"ends", {3}}}), transpose);
+        auto squeeze3 = m2.add_instruction(migraphx::make_op("squeeze", {{"axes", {0}}}), slice3);
+        m2.add_return({squeeze1, transpose2, squeeze3});
+    }
+    EXPECT(m1.sort() == m2.sort()); // both modules are passed through sort() before comparing
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); } // forwards the command line to test::run
--- a/test/tf/gen_tf_pb.py
+++ b/test/tf/gen_tf_pb.py
@@ -495,10 +495,10 @@ def relu6_test(g1):


 @tf_test
-def relu6_mismatch_test(g1):
+def relu6_half_test(g1):  # relu6 over a float16 placeholder named '0'
    with g1.as_default():
        g1_input = tf.compat.v1.placeholder(tf.float16,
-                                            shape=(1, 3, 13, 37),
+                                            shape=(1, 3, 16, 16),
                                            name='0')
        tf.nn.relu6(g1_input, 'relu6')

@@ -708,7 +708,7 @@ if __name__ == '__main__':
    pow_test()
    relu_test()
    relu6_test()
-    relu6_mismatch_test()
+    relu6_half_test()
    reshape_test()
    rsqrt_test()
    shape_test()

--- a/test/tf/relu6_mismatch_test.pb
+++ b/test/tf/relu6_mismatch_test.pb
@@ -2,7 +2,7 @@
 :
 0Placeholder*
 dtype0*
-shape:
%
+shape:

 relu6Relu60*
 T0"
\ No newline at end of file
--- a/test/tf/tf_test.cpp
+++ b/test/tf/tf_test.cpp
@@ -729,27 +729,23 @@ TEST_CASE(relu6_test)
    EXPECT(p == prog);
 }

-TEST_CASE(relu6_mismatch_test)
+TEST_CASE(relu6_half_test) // relu6 on a half input is expected to parse to clip with half-typed bounds
 {
    migraphx::program p;

    auto* mm = p.get_main_module();
-    std::vector<size_t> input_lens{1, 3, 13, 37};
+    std::vector<size_t> input_lens{1, 3, 16, 16};
    auto l0 = mm->add_parameter("0", migraphx::shape{migraphx::shape::half_type, input_lens});
-    auto min_val = mm->add_literal(0.0f);
-    auto max_val = mm->add_literal(6.0f);
-
-    auto l0_convert = mm->add_instruction(
-        migraphx::make_op("convert", {{"target_type", migraphx::shape::float_type}}), l0);
-
+    auto min_val =
+        mm->add_literal(migraphx::literal{migraphx::shape{migraphx::shape::half_type}, {0.0f}}); // lower clip bound, half_type like the input
+    auto max_val =
+        mm->add_literal(migraphx::literal{migraphx::shape{migraphx::shape::half_type}, {6.0f}}); // upper clip bound, half_type like the input
    min_val = mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
                                  min_val);
    max_val = mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
                                  max_val);
-
-    mm->add_instruction(migraphx::make_op("clip"), l0_convert, min_val, max_val);
-
-    auto prog = optimize_tf("relu6_mismatch_test.pb", false);
+    mm->add_instruction(migraphx::make_op("clip"), l0, min_val, max_val); // clip takes the parameter directly, no convert
+    auto prog = optimize_tf("relu6_half_test.pb", false); // program built from the protobuf file

    EXPECT(p == prog);
 }

--- a/test/verify/gemm_add_broadcast1.cpp
+++ b/test/verify/gemm_add_broadcast1.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
+// Verify program: dot of a {1, 2, 3} and a {1, 3, 4} float parameter, added to
+// a {1, 1, 4} float parameter that is multibroadcast to {1, 2, 4}.
+struct gemm_add_broadcast1 : verify_program<gemm_add_broadcast1>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program prog;
+        auto* main_mod = prog.get_main_module();
+
+        // The parameters are named "1", "2" and "3"
+        const migraphx::shape lhs_shape{migraphx::shape::float_type, {1, 2, 3}};
+        const migraphx::shape rhs_shape{migraphx::shape::float_type, {1, 3, 4}};
+        const migraphx::shape addend_shape{migraphx::shape::float_type, {1, 1, 4}};
+        auto lhs    = main_mod->add_parameter("1", lhs_shape);
+        auto rhs    = main_mod->add_parameter("2", rhs_shape);
+        auto addend = main_mod->add_parameter("3", addend_shape);
+
+        // The broadcast is inserted ahead of the dot
+        auto addend_bcast = main_mod->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", {1, 2, 4}}}), addend);
+        auto product = main_mod->add_instruction(migraphx::make_op("dot"), lhs, rhs);
+        main_mod->add_instruction(migraphx::make_op("add"), product, addend_bcast);
+        return prog;
+    }
+};
--- a/test/verify/gemm_add_broadcast2.cpp
+++ b/test/verify/gemm_add_broadcast2.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
+// Verify program: dot of a {1, 2, 3} and a {1, 3, 4} float parameter, added to
+// a {1, 2, 1} float parameter that is multibroadcast to {1, 2, 4}.
+struct gemm_add_broadcast2 : verify_program<gemm_add_broadcast2>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program prog;
+        auto* main_mod = prog.get_main_module();
+
+        // The parameters are named "1", "2" and "3"
+        const migraphx::shape lhs_shape{migraphx::shape::float_type, {1, 2, 3}};
+        const migraphx::shape rhs_shape{migraphx::shape::float_type, {1, 3, 4}};
+        const migraphx::shape addend_shape{migraphx::shape::float_type, {1, 2, 1}};
+        auto lhs    = main_mod->add_parameter("1", lhs_shape);
+        auto rhs    = main_mod->add_parameter("2", rhs_shape);
+        auto addend = main_mod->add_parameter("3", addend_shape);
+
+        // The broadcast is inserted ahead of the dot
+        auto addend_bcast = main_mod->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", {1, 2, 4}}}), addend);
+        auto product = main_mod->add_instruction(migraphx::make_op("dot"), lhs, rhs);
+        main_mod->add_instruction(migraphx::make_op("add"), product, addend_bcast);
+        return prog;
+    }
+};
--- a/test/verify/run_verify.cpp
+++ b/test/verify/run_verify.cpp
@@ -169,7 +169,7 @@ void run_verify::verify(const std::string& name, const migraphx::program& p) con
    for(const auto& tname : migraphx::get_targets())
    {
        // TODO(varunsh): once verify tests can run, remove fpga
-        if(tname == "ref" || tname == "fpga")
+        if(tname == "ref" or tname == "fpga")
            continue;

        // if tests disabled, skip running it

--- a/test/verify/test_add_gelu_half.cpp
+++ b/test/verify/test_add_gelu_half.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <cmath>
+
+// Verify program: erf-based GELU of (x + y) on half-precision inputs, built as
+//   ((x + y) * 0.5) * (erf((x + y) / sqrt(2)) + 1).
+struct test_add_gelu_half : verify_program<test_add_gelu_half>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        std::vector<size_t> input_lens{1, 1, 5};
+        auto x    = mm->add_parameter("x", {migraphx::shape::half_type, input_lens});
+        auto y    = mm->add_parameter("y", {migraphx::shape::half_type, input_lens});
+        auto half = mm->add_literal(migraphx::literal{{migraphx::shape::half_type}, {0.5f}});
+        auto one  = mm->add_literal(migraphx::literal{{migraphx::shape::half_type}, {1.0f}});
+        // std::sqrt(2.0) replaces the M_SQRT2 macro: the macro is a POSIX extension
+        // rather than ISO C++, so it is not available on every toolchain. The
+        // argument stays a double, as the macro was.
+        auto sqrt2 =
+            mm->add_literal(migraphx::literal{{migraphx::shape::half_type}, {std::sqrt(2.0)}});
+        auto add         = mm->add_instruction(migraphx::make_op("add"), x, y);
+        auto half_mbcast = mm->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}), half);
+        auto mul_half     = mm->add_instruction(migraphx::make_op("mul"), add, half_mbcast);
+        auto sqrt2_mbcast = mm->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}), sqrt2);
+        auto div        = mm->add_instruction(migraphx::make_op("div"), add, sqrt2_mbcast);
+        auto erf        = mm->add_instruction(migraphx::make_op("erf"), div);
+        auto one_mbcast = mm->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}), one);
+        auto add_one = mm->add_instruction(migraphx::make_op("add"), erf, one_mbcast);
+        mm->add_instruction(migraphx::make_op("mul"), mul_half, add_one);
+        return p;
+    }
+};
--- a/src/targets/gpu/include/migraphx/gpu/add.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/add.hpp
@@ -21,22 +21,25 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ADD_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ADD_HPP

-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/add.hpp>
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>

-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_add : binary_device<hip_add, device::add>
+struct test_concat_axis_2 : verify_program<test_concat_axis_2> // concat of three int32 {3, 2, 1} parameters along axis 2
 {
+    migraphx::program create_program() const // returns a program whose only instruction is the concat
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s0{migraphx::shape::int32_type, {3, 2, 1}}; // s0, s1 and s2 are identical shapes
+        migraphx::shape s1{migraphx::shape::int32_type, {3, 2, 1}};
+        migraphx::shape s2{migraphx::shape::int32_type, {3, 2, 1}};
+        auto l0 = mm->add_parameter("x", s0);
+        auto l1 = mm->add_parameter("y", s1);
+        auto l2 = mm->add_parameter("z", s2);
+        mm->add_instruction(migraphx::make_op("concat", {{"axis", 2}}), l0, l1, l2); // axis 2 is the last of the three dims
+        return p;
+    }
 };
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/concat.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/concat.hpp
@@ -21,41 +21,26 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_CONCAT_HPP
-#define MIGRAPHX_GUARD_RTGLIB_CONCAT_HPP

-#include <migraphx/argument.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/op/concat.hpp>
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>

-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct context;
-
-struct hip_concat
+struct test_conv_group_add : verify_program<test_conv_group_add> // convolution with group = 4, then add of a broadcast b
 {
-    op::concat op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "gpu::concat"; }
-    shape compute_shape(std::vector<shape> inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    migraphx::program create_program() const // parameters: x {1, 68, 28, 28}, w {68, 17, 1, 1}, b {68}
    {
-        return shapes.size() - 1;
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s{migraphx::shape::float_type, {1, 68, 28, 28}};
+        auto x    = mm->add_parameter("x", s);
+        auto w    = mm->add_parameter("w", {migraphx::shape::float_type, {68, 17, 1, 1}});
+        auto b    = mm->add_parameter("b", {migraphx::shape::float_type, {68}});
+        auto conv = mm->add_instruction(migraphx::make_op("convolution", {{"group", 4}}), x, w); // note w's second dim is 17 = 68 / 4
+        auto bb   = mm->add_instruction( // broadcast b along axis 1 to the same lens as x
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 68, 28, 28}}}), b);
+        mm->add_instruction(migraphx::make_op("add"), conv, bb); // conv + broadcast b
+        return p;
    }
 };
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/softmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/softmax.hpp
@@ -21,54 +21,53 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_SOFTMAX_HPP
-#define MIGRAPHX_GUARD_RTGLIB_SOFTMAX_HPP

-#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/op/softmax.hpp>
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
 #include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/config.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <utility>
+#include <migraphx/make_op.hpp>
+#include <migraphx/common.hpp>

-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct context;
+/*
+    Checking for y == 0 ? eps : y

-struct hip_softmax
+    Adding this because HIP fmod sign changes when y = 0 resulting in nan and -nan not being
+   consistent between ref and gpu implementations.
+*/
+migraphx::instruction_ref add_epsilon(migraphx::module& m, migraphx::instruction_ref y) // adds y == 0 ? 1e-3 : y to m and returns that instruction
 {
-    op::softmax op;
+    auto zero = m.add_literal(0.0f); // value y is compared against
+    auto eps  = m.add_literal(1e-3f); // substitute used where y equals zero
+    auto op_y = add_common_op(m, migraphx::make_op("equal"), {y, zero}); // mask: y == 0
+    return add_common_op(m, migraphx::make_op("where"), {op_y, eps, y}); // eps where the mask holds, y elsewhere
+}

-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
+struct test_fmod : verify_program<test_fmod> // fmod(x, add_epsilon(y)) on two float parameters of 64 elements
+{
+    migraphx::program create_program() const // the divisor goes through add_epsilon before reaching fmod
    {
-        return migraphx::reflect(self.op, f);
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s{migraphx::shape::float_type, {64}}; // shape shared by x and y
+        auto x        = mm->add_parameter("x", s);
+        auto y        = mm->add_parameter("y", s);
+        auto op_where = add_epsilon(*mm, y); // y with zeros swapped for add_epsilon's eps literal
+        mm->add_instruction(migraphx::make_op("fmod"), x, op_where);
+        return p;
    }
+};

-    std::string name() const { return "gpu::softmax"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+struct test_mod : verify_program<test_mod> // mod(x, add_epsilon(y)) on two float parameters of 64 elements
+{
+    migraphx::program create_program() const // same structure as the fmod test, with the "mod" op instead
    {
-        return shapes.size() - 1;
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s{migraphx::shape::float_type, {64}}; // shape shared by x and y
+        auto x        = mm->add_parameter("x", s);
+        auto y        = mm->add_parameter("y", s);
+        auto op_where = add_epsilon(*mm, y); // y with zeros swapped for add_epsilon's eps literal
+        mm->add_instruction(migraphx::make_op("mod"), x, op_where);
+        return p;
    }
 };
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/test/verify/test_layernorm.cpp
+++ b/test/verify/test_layernorm.cpp
@@ -29,14 +29,16 @@

 #include <migraphx/op/reduce_mean.hpp>

-migraphx::instruction_ref
-add_layernorm(migraphx::module& m, migraphx::instruction_ref x, std::vector<size_t> dims)
+migraphx::instruction_ref add_layernorm(migraphx::module& m,
+                                        migraphx::instruction_ref x,
+                                        std::vector<size_t> dims,
+                                        float eps = 1e-12f)
 {
    auto scale =
        m.add_parameter("scale", migraphx::shape{migraphx::shape::float_type, {dims.back()}});
    auto bias =
        m.add_parameter("bias", migraphx::shape{migraphx::shape::float_type, {dims.back()}});
-    auto epsilon  = m.add_literal(1e-12f);
+    auto epsilon  = m.add_literal(eps);
    auto exponent = m.add_literal(2.0f);

    auto mean = m.add_instruction(migraphx::op::reduce_mean({2}), x);
@@ -68,7 +70,7 @@ struct test_layernorm : verify_program<test_layernorm>
    {
        migraphx::program p;
        auto* mm                 = p.get_main_module();
-        std::vector<size_t> dims = {1, 1, 5};
+        std::vector<size_t> dims = {1, 2, 5};
        auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
        add_layernorm(*mm, x, dims);
        return p;
@@ -88,6 +90,19 @@ struct test_layernorm2 : verify_program<test_layernorm2>
    }
 };

+struct test_layernorm_eps : verify_program<test_layernorm_eps> // add_layernorm with an explicit, non-default eps
+{
+    migraphx::program create_program() const // float input of lens {1, 2, 5} fed to add_layernorm
+    {
+        migraphx::program p;
+        auto* mm                 = p.get_main_module();
+        std::vector<size_t> dims = {1, 2, 5};
+        auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
+        add_layernorm(*mm, x, dims, 1e-5f); // overrides add_layernorm's default eps of 1e-12f
+        return p;
+    }
+};
+
 struct test_layernorm_triadd : verify_program<test_layernorm_triadd>
 {
    migraphx::program create_program() const

--- a/src/targets/gpu/include/migraphx/gpu/clip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/clip.hpp
@@ -21,41 +21,27 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
-#define MIGRAPHX_GUARD_RTGLIB_CLIP_HPP

-#include <migraphx/argument.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/op/clip.hpp>
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>

-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct context;
-
-struct hip_clip
+struct test_slice2 : verify_program<test_slice2> // slices x with axes given out of order, then adds the slice to y
 {
-    op::clip op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "gpu::clip"; }
-    shape compute_shape(std::vector<shape> inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    migraphx::program create_program() const // parameters: x {1, 44, 57, 57}, y {1, 44, 56, 56}
    {
-        return shapes.size() - 1;
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s{migraphx::shape::float_type, {1, 44, 57, 57}}; // NOTE(review): s is not referenced again in this function
+        auto x      = mm->add_parameter("x", {migraphx::shape::float_type, {1, 44, 57, 57}});
+        auto y      = mm->add_parameter("y", {migraphx::shape::float_type, {1, 44, 56, 56}});
+        auto slice0 = mm->add_instruction( // axes are listed as 0, 2, 3, 1 rather than in ascending order
+            migraphx::make_op(
+                "slice",
+                {{"axes", {0, 2, 3, 1}}, {"starts", {0, 1, 1, 0}}, {"ends", {1, 57, 57, 44}}}), // axes 2 and 3 span 1..57: 56 elements if ends are exclusive, matching y - TODO confirm
+            x);
+        mm->add_instruction(migraphx::make_op("add"), y, slice0); // y + slice of x
+        return p;
    }
 };
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
@@ -21,23 +21,23 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SOFTMAX_HPP
-#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SOFTMAX_HPP

-#include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
-#include <hip/hip_runtime_api.h>
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/common.hpp>

-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-
-void softmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);
-
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
+struct test_softmax_large1 : verify_program<test_softmax_large1> // softmax of (x + 10000); presumably probes large positive inputs
+{
+    migraphx::program create_program() const // float x of lens {2, 4}, offset by a literal, then softmax on axis -1
+    {
+        migraphx::program p;
+        auto* mm   = p.get_main_module();
+        auto x     = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {2, 4}});
+        auto large = mm->add_literal({migraphx::shape{migraphx::shape::float_type}, {10000}}); // single-value float literal
+        auto add   = migraphx::add_common_op(*mm, migraphx::make_op("add"), {x, large}); // x + 10000 via add_common_op
+        mm->add_instruction(migraphx::make_op("softmax", {{"axis", -1}}), add);
+        return p;
+    }
+};
--- a/src/targets/gpu/include/migraphx/gpu/acosh.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/acosh.hpp
@@ -21,22 +21,23 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ACOSH_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ACOSH_HPP

-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/acosh.hpp>
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/common.hpp>

-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct hip_acosh : unary_device<hip_acosh, device::acosh>
+struct test_softmax_large2 : verify_program<test_softmax_large2> // softmax of (x - 10000); presumably probes large negative inputs
 {
+    migraphx::program create_program() const // float x of lens {2, 4}, offset by a literal, then softmax on axis -1
+    {
+        migraphx::program p;
+        auto* mm   = p.get_main_module();
+        auto x     = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {2, 4}});
+        auto large = mm->add_literal({migraphx::shape{migraphx::shape::float_type}, {-10000}}); // single-value float literal, negative here
+        auto add   = migraphx::add_common_op(*mm, migraphx::make_op("add"), {x, large}); // x + (-10000) via add_common_op
+        mm->add_instruction(migraphx::make_op("softmax", {{"axis", -1}}), add);
+        return p;
+    }
 };
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif
--- a/test/verify/test_unbatched_gemm_1.cpp
+++ b/test/verify/test_unbatched_gemm_1.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
+struct test_unbatched_gemm_1 : verify_program<test_unbatched_gemm_1> // dot of a {2, 32, 64} parameter with a concat of broadcast 2-D literals
+{
+    migraphx::program create_program() const // right-hand operand is built from three {64, 64} literals
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape m1_shape{migraphx::shape::float_type, {2, 32, 64}};
+        migraphx::shape m2_shape{migraphx::shape::float_type, {64, 64}};
+        migraphx::shape m3_shape{migraphx::shape::float_type, {2, 32, 192}}; // last dim 192 = 3 * 64
+        auto l1 = mm->add_parameter("1", m1_shape);
+        auto l2 = mm->add_literal(migraphx::generate_literal(m2_shape)); // l2, l3, l4: generated {64, 64} literals
+        l2 = mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {2, 64, 64}}}), // each is multibroadcast to {2, 64, 64}
+                                 l2);
+        auto l3 = mm->add_literal(migraphx::generate_literal(m2_shape));
+        l3 = mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {2, 64, 64}}}),
+                                 l3);
+        auto l4 = mm->add_literal(migraphx::generate_literal(m2_shape));
+        l4 = mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {2, 64, 64}}}),
+                                 l4);
+        auto concat = mm->add_instruction(migraphx::make_op("concat", {{"axis", 2}}), l2, l3, l4); // joins the three broadcasts along axis 2
+
+        auto l5     = mm->add_parameter("3", m3_shape); // third operand handed to add_apply_alpha_beta
+        float alpha = 1.0f;
+        float beta  = 1.0f;
+        migraphx::add_apply_alpha_beta( // passes {l1, concat, l5}, the "dot" op, alpha and beta; helper semantics not shown here
+            *mm, {l1, concat, l5}, migraphx::make_op("dot"), alpha, beta);
+        return p;
+    }
+};