Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into print_matmul_perf_flops

a175f14c · Shucai Xiao · 5496a55d · c7419a9c · a175f14c · a175f14c
Commit a175f14c authored Feb 09, 2022 by Shucai Xiao
20 changed files
--- a/examples/migraphx/cpp_parse_load_save/README.md
+++ b/examples/migraphx/cpp_parse_load_save/README.md
@@ -15,7 +15,7 @@ p = parse_onnx(input_file, options);
 ```

 ## Saving
-An instantiated migraphx::program object can then be serialized to MessagePack (.msgpack) format and saved so that it can be loaded for future uses. 
+An instantiated migraphx::program object can then be serialized to MessagePack (.mxr) format and saved so that it can be loaded for future uses. 

 A program can be saved with either of the following:
 ```

--- a/examples/migraphx/cpp_parse_load_save/parse_load_save.cpp
+++ b/examples/migraphx/cpp_parse_load_save/parse_load_save.cpp
@@ -77,7 +77,7 @@ int main(int argc, char** argv)
        std::cout << "Saving program..." << std::endl;
        std::string output_file;
        output_file = save_arg == nullptr ? "out" : save_arg;
-        output_file.append(".msgpack");
+        output_file.append(".mxr");

        migraphx::file_options options;
        options.set_file_format("msgpack");

--- a/examples/vision/python_yolov4/yolov4_inference.ipynb
+++ b/examples/vision/python_yolov4/yolov4_inference.ipynb
@@ -50,10 +50,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "if not os.path.exists(\"yolov4_fp16.msgpack\"):\n",
-    "    !/opt/rocm/bin/migraphx-driver compile ./utilities/yolov4.onnx --gpu --enable-offload-copy --fp16ref --binary -o yolov4_fp16.msgpack\n",
-    "if not os.path.exists(\"yolov4.msgpack\"):\n",
-    "    !/opt/rocm/bin/migraphx-driver compile ./utilities/yolov4.onnx --gpu --enable-offload-copy --binary -o yolov4.msgpack"
+    "if not os.path.exists(\"yolov4_fp16.mxr\"):\n",
+    "    !/opt/rocm/bin/migraphx-driver compile ./utilities/yolov4.onnx --gpu --enable-offload-copy --fp16ref --binary -o yolov4_fp16.mxr\n",
+    "if not os.path.exists(\"yolov4.mxr\"):\n",
+    "    !/opt/rocm/bin/migraphx-driver compile ./utilities/yolov4.onnx --gpu --enable-offload-copy --binary -o yolov4.mxr"
   ]
  },
  {
@@ -115,8 +115,8 @@
   "outputs": [],
   "source": [
    "# Load serialized model (either single- or half-precision)\n",
-    "model = migraphx.load(\"yolov4.msgpack\", format=\"msgpack\")\n",
-    "#model = migraphx.load(\"yolov4_fp16.msgpack\", format=\"msgpack\")\n",
+    "model = migraphx.load(\"yolov4.mxr\", format=\"msgpack\")\n",
+    "#model = migraphx.load(\"yolov4_fp16.mxr\", format=\"msgpack\")\n",
    "\n",
    "# Get the name of the input parameter and convert image data to an MIGraphX argument\n",
    "input_name = next(iter(model.get_parameter_shapes()))\n",

--- a/src/cpp_generator.cpp
+++ b/src/cpp_generator.cpp
@@ -88,6 +88,7 @@ struct cpp_generator_impl
    std::stringstream fs{};
    std::size_t function_count                                = 0;
    std::function<std::string(std::string)> fmap              = nullptr;
+    std::function<std::string(shape)> fresult                 = nullptr;
    std::unordered_map<std::string, std::string> point_op_map = {};
 };
 cpp_generator::cpp_generator() : impl(std::make_unique<cpp_generator_impl>()) {}
@@ -104,6 +105,8 @@ cpp_generator::~cpp_generator() noexcept = default;

 void cpp_generator::fmap(const std::function<std::string(std::string)>& f) { impl->fmap = f; }

+void cpp_generator::fresult(const std::function<std::string(shape)>& f) { impl->fresult = f; }
+
 void cpp_generator::add_point_op(const std::string& op_name, const std::string& code)
 {
    impl->point_op_map[op_name] = code;
@@ -174,7 +177,12 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
                           ins->inputs().end(),
                           std::back_inserter(args),
                           [&](auto i) { return names.at(i); });
-            return this->generate_point_op(ins->get_operator(), args);
+
+            auto s = this->generate_point_op(ins->get_operator(), args);
+            if(impl->fresult)
+                return impl->fresult(ins->get_shape()) + '(' + s + ')';
+            else
+                return s;
        });
    return f;
 }

--- a/src/include/migraphx/cpp_generator.hpp
+++ b/src/include/migraphx/cpp_generator.hpp
@@ -68,6 +68,8 @@ struct cpp_generator

    void fmap(const std::function<std::string(std::string)>& f);

+    void fresult(const std::function<std::string(shape)>& f);
+
    void add_point_op(const std::string& op_name, const std::string& code);

    std::string generate_point_op(const operation& op, const std::vector<std::string>& args);

--- a/src/include/migraphx/op/squeeze.hpp
+++ b/src/include/migraphx/op/squeeze.hpp
@@ -37,43 +37,49 @@ struct squeeze
    std::string name() const { return "squeeze"; }
    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(1).standard();
+        check_shapes{inputs, *this}.has(1);
        auto input_shape = inputs[0];
        auto type        = input_shape.type();
        auto old_lens    = input_shape.lens();
-
+        auto old_strides = input_shape.strides();
        if(std::any_of(axes.begin(), axes.end(), [&](auto axis) { return old_lens[axis] != 1; }))
        {
            MIGRAPHX_THROW("squeeze axis dimension should be equal to 1");
        }
        std::vector<std::size_t> new_lens;
+        std::vector<std::size_t> new_strides;
        if(axes.empty())
        {
-            std::copy_if(old_lens.begin(),
-                         old_lens.end(),
-                         std::back_inserter(new_lens),
-                         [](auto len) { return len != 1; });
+            for(auto i : range(old_lens.size()))
+            {
+                if(old_lens[i] != 1)
+                {
+                    new_lens.push_back(old_lens[i]);
+                    new_strides.push_back(old_strides[i]);
+                }
+            }
        }
        else
        {
-            for(std::size_t i = 0; i < old_lens.size(); i++)
+            for(auto i : range(old_lens.size()))
            {
                if(std::find(axes.begin(), axes.end(), i) == axes.end())
                {
                    new_lens.push_back(old_lens[i]);
+                    new_strides.push_back(old_strides[i]);
                }
            }
        }
-
        if(new_lens.empty())
        {
            return shape{type};
        }
        else
        {
-            return shape{type, new_lens};
+            return shape{type, new_lens, new_strides};
        }
    }
+
    argument compute(shape output_shape, std::vector<argument> args) const
    {
        return args[0].reshape(output_shape);

--- a/src/include/migraphx/par_for.hpp
+++ b/src/include/migraphx/par_for.hpp
@@ -78,8 +78,8 @@ void par_for_impl(std::size_t n, std::size_t threadsize, F f)
 template <class F>
 void par_for(std::size_t n, std::size_t min_grain, F f)
 {
-    const auto threadsize =
-        std::min<std::size_t>(std::thread::hardware_concurrency(), n / min_grain);
+    const auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(),
+                                                  n / std::max<std::size_t>(1, min_grain));
    par_for_impl(n, threadsize, f);
 }


--- a/src/targets/gpu/compile_ops.cpp
+++ b/src/targets/gpu/compile_ops.cpp
@@ -12,6 +12,8 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_COMPILE_PARALLEL);
+
 struct precompile_op
 {
    operation op = op::identity{};
@@ -70,6 +72,14 @@ struct compiled_result
    instruction_ref ins;
 };

+template <class F>
+void par_compile(std::size_t n, F f)
+{
+    if(n == 0)
+        return;
+    par_for(n, n / value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{}, n), f);
+}
+
 void compile_ops::apply(module& m) const
 {
    auto compilers = make_compilers(pointwise_compiler{});
@@ -85,7 +95,7 @@ void compile_ops::apply(module& m) const
        compiles.emplace_back([=]() -> compiled_result { return {c(*ctx, ins, preop), ins}; });
    }
    std::vector<compiled_result> results(compiles.size());
-    par_for(compiles.size(), 1, [&](auto i) { results[i] = compiles[i](); });
+    par_compile(compiles.size(), [&](auto i) { results[i] = compiles[i](); });
    for(const auto& cr : results)
    {
        m.replace_instruction(cr.ins, cr.op, cr.ins->inputs());

--- a/src/targets/gpu/compile_pointwise.cpp
+++ b/src/targets/gpu/compile_pointwise.cpp
@@ -70,6 +70,9 @@ operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, modu
    g.add_point_op("less", "migraphx::abs(${0} < ${1})");
    g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
    g.add_point_op("not", "migraphx::abs(not ${0})");
+    // Add explict conversions
+    g.fresult(
+        [](const shape& s) { return "migraphx::convert<" + shape::cpp_type(s.type()) + ">"; });
    auto name =
        g.create_function(g.generate_module(m).set_attributes({"__device__"}).set_generic_types(m));
    return compile_pointwise((ctx), inputs, "MIGRAPHX_LIFT(" + name + ")", g.str());

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -587,6 +587,11 @@ struct miopen_fusion
        return pack(f(self.ops, "ops"));
    }

+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+
    value compile(context& ctx, const shape&, std::vector<shape> inputs)
    {
        // Compensate for allocation

--- a/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
@@ -6,15 +6,32 @@

 namespace migraphx {

+template <class T>
+struct remove_vec_impl
+{
+    using type = T;
+};
+
+template <class T, index_int N>
+struct remove_vec_impl<vec<T, N>>
+{
+    using type = T;
+};
+
+template <class T>
+using remove_vec = typename remove_vec_impl<T>::type;
+
 template <class T, class... Shapes>
 constexpr auto traverse_preload(Shapes... ss)
 {
    return [=](auto f, auto... g) {
        index_int offset = 0;
        auto each        = [&](auto x) {
+            using type          = remove_vec<typename decltype(x)::type>;
            constexpr auto s    = decltype(x.get_shape()){};
            constexpr auto size = _c<s.element_space()>;
-            if constexpr(not s.broadcasted() or (s.elements() - size) < 64)
+            if constexpr(not s.broadcasted() or (s.elements() - size) < 64 or
+                         not is_same<T, type>{})
                return f(x, offset, false_type{});
            else
            {
@@ -78,23 +95,23 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        invoke);
 }

-template <class T>
-struct remove_vec
+template <class T, class Shape>
+struct shape_type : Shape
 {
    using type = T;
 };

-template <class T, index_int N>
-struct remove_vec<vec<T, N>>
+template <class T>
+constexpr auto make_shape_type(T)
 {
-    using type = T;
-};
+    return shape_type<typename T::type, typename T::shape_type>{};
+}

 template <class T, class... Ts>
 __device__ auto preload(index idx, Ts... xs)
 {
-    using type               = typename remove_vec<T>::type;
-    constexpr auto size      = decltype(compute_preload_size<type>(xs.get_shape()...)){};
+    using type               = remove_vec<T>;
+    constexpr auto size      = decltype(compute_preload_size<type>(make_shape_type(xs)...)){};
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)

--- a/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
@@ -10,6 +10,7 @@ template <class T, class Shape>
 struct tensor_view
 {
    using type       = T;
+    using shape_type = Shape;

    constexpr Shape get_shape() const { return Shape{}; }
    constexpr index_int size() const { return get_shape().elements(); }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -25,6 +25,16 @@ struct is_convertible : bool_constant<__is_convertible(From, To)>
 {
 };

+template <class T, class U>
+struct is_same : false_type
+{
+};
+
+template <class T>
+struct is_same<T, T> : true_type
+{
+};
+
 #define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>

 } // namespace migraphx

--- a/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
@@ -66,6 +66,9 @@ __device__ __host__ auto as_vec(T* x)
        return reinterpret_cast<vec<T, N>*>(x);
 }

+template <class T, index_int N>
+using safe_vec = vec<std::conditional_t<std::is_same<T, bool>{}, uint8_t, T>, N>;
+
 template <class... Ts>
 constexpr auto vec_transform(Ts... xs)
 {
@@ -74,7 +77,7 @@ constexpr auto vec_transform(Ts... xs)
        {
            using type                  = decltype(f(vec_at(xs, 0)...));
            constexpr auto size         = common_vec_size<Ts...>();
-            vec<type, size> result = {0};
+            safe_vec<type, size> result = {0};
            for(int i = 0; i < size; i++)
                result[i] = f(vec_at(xs, i)...);
            return result;

--- a/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
@@ -50,14 +50,14 @@ constexpr auto shape_step(Shape s, Axis)
    });
 }

-// Bools can not be used as a vector type so convert it to int8
+// Bools can not be used as a vector type so convert it to uint8
 template <class T>
 __device__ __host__ T* remove_bool(T* x)
 {
    return x;
 }

-inline __device__ __host__ int8_t* remove_bool(bool* x) { return reinterpret_cast<int8_t*>(x); }
+inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }

 template <index_int N, class T, class Axis>
 __device__ __host__ auto as_vec(T x, Axis axis)

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -44,7 +44,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_POINTWISE_FUSION)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)

 struct id_pass
 {
@@ -100,7 +100,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        simplify_reshapes{},
        propagate_constant{},
        dead_code_elimination{},
-        enable_pass(enabled(MIGRAPHX_ENABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
+        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
        dead_code_elimination{},
        mlir_conv{&ctx},
        lowering{&ctx, options.offload_copy},

--- a/test/api/test_save_load.cpp
+++ b/test/api/test_save_load.cpp
@@ -4,7 +4,7 @@

 TEST_CASE(load_save_default)
 {
-    std::string filename = "migraphx_api_load_save.dat";
+    std::string filename = "migraphx_api_load_save.mxr";
    auto p1              = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
    auto s1              = p1.get_output_shapes();


--- a/test/gpu/fast_math.cpp
+++ b/test/gpu/fast_math.cpp
-#include <test.hpp>
-#include <migraphx/quantization.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/op/add.hpp>
-#include <migraphx/op/mul.hpp>
-#include <migraphx/op/multibroadcast.hpp>
-#include <migraphx/op/pow.hpp>
-#include <migraphx/op/tanh.hpp>
-#include <migraphx/gpu/target.hpp>
-#include <migraphx/instruction.hpp>
-
-migraphx::program create_gelu()
-{
-    migraphx::program p;
-    auto* mm                 = p.get_main_module();
-    std::vector<float> data0 = {0.044715};
-    std::vector<float> data1 = {0.797885};
-    std::vector<float> data2 = {3};
-    std::vector<float> data3 = {0.5};
-    migraphx::shape s0{migraphx::shape::float_type, {1}};
-
-    std::vector<size_t> x_dims{1, 1, 5};
-
-    auto x         = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, x_dims});
-    auto const_val = mm->add_literal(migraphx::literal{s0, data0});
-    auto sqrt_2_pi = mm->add_literal(migraphx::literal{s0, data1});
-    auto three_val = mm->add_literal(migraphx::literal{s0, data2});
-    auto half_val  = mm->add_literal(migraphx::literal{s0, data3});
-
-    auto mbcast_3         = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, three_val);
-    auto pow_op           = mm->add_instruction(migraphx::op::pow{}, x, mbcast_3);
-    auto mbcast_const     = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, const_val);
-    auto mul_const        = mm->add_instruction(migraphx::op::mul{}, mbcast_const, pow_op);
-    auto add_x            = mm->add_instruction(migraphx::op::add{}, x, mul_const);
-    auto mbcast_sqrt_2_pi = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, sqrt_2_pi);
-    auto mul_add_x        = mm->add_instruction(migraphx::op::mul{}, mbcast_sqrt_2_pi, add_x);
-    auto tanh_op          = mm->add_instruction(migraphx::op::tanh{}, mul_add_x);
-    auto mbcast_half      = mm->add_instruction(migraphx::op::multibroadcast{x_dims}, half_val);
-    auto mul_half         = mm->add_instruction(migraphx::op::mul{}, mbcast_half, tanh_op);
-    auto add_mul_half     = mm->add_instruction(migraphx::op::add{}, mul_half, mbcast_half);
-    auto mul_x            = mm->add_instruction(migraphx::op::mul{}, x, add_mul_half);
-    mm->add_return({mul_x});
-
-    return p;
-}
-
-TEST_CASE(enable_fast_gelu)
-{
-    migraphx::program p = create_gelu();
-    p.compile(migraphx::gpu::target{});
-    CHECK(any_of(*p.get_main_module(), [&](auto&& i) { return i.name() == "gpu::gelu"; }));
-}
-
-TEST_CASE(disable_fast_gelu)
-{
-    migraphx::program p = create_gelu();
-    migraphx::compile_options options;
-    options.fast_math = false;
-    p.compile(migraphx::gpu::target{}, options);
-    CHECK(any_of(*p.get_main_module(), [&](auto&& i) { return i.name() == "gpu::gelu_new"; }));
-}
-
-int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -1446,6 +1446,27 @@ TEST_CASE(test_squeeze_all)
    expect_shape(s2, migraphx::make_op("squeeze", {{"axes", {0}}}), s1);
 }

+TEST_CASE(test_squeeze_transpose)
+{
+    migraphx::shape s1{migraphx::shape::float_type, {4, 4, 1}, {4, 1, 4}};
+    migraphx::shape s2{migraphx::shape::float_type, {4, 4}, {4, 1}};
+    expect_shape(s2, migraphx::make_op("squeeze", {{"axes", {2}}}), s1);
+}
+
+TEST_CASE(test_squeeze_multibroadcast)
+{
+    migraphx::shape s1{migraphx::shape::float_type, {2, 3, 1, 4}, {0, 1, 1, 0}};
+    migraphx::shape s2{migraphx::shape::float_type, {2, 3, 4}, {0, 1, 0}};
+    expect_shape(s2, migraphx::make_op("squeeze", {{"axes", {2}}}), s1);
+}
+
+TEST_CASE(test_squeeze_slice)
+{
+    migraphx::shape s1{migraphx::shape::float_type, {2, 3, 1, 4}, {108, 36, 6, 1}};
+    migraphx::shape s2{migraphx::shape::float_type, {2, 3, 4}, {108, 36, 1}};
+    expect_shape(s2, migraphx::make_op("squeeze", {{"axes", {2}}}), s1);
+}
+
 TEST_CASE(test_squeeze_negative_axis)
 {
    migraphx::shape s1{migraphx::shape::float_type, {4, 1, 3, 1, 3}};

--- a/test/ref_ops_nonstd_shape_test.cpp
+++ b/test/ref_ops_nonstd_shape_test.cpp
@@ -3,9 +3,9 @@
 #include <migraphx/literal.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/ref/target.hpp>
+#include <migraphx/generate.hpp>
 #include <migraphx/verify.hpp>
 #include <migraphx/make_op.hpp>
-#include <migraphx/auto_contiguous.hpp>
 #include <migraphx/pass_manager.hpp>
 #include "test.hpp"

@@ -13,11 +13,7 @@ TEST_CASE(argmax_test_nonstd_shape)
 {
    migraphx::program p;
    auto* mm = p.get_main_module();
-    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
-                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
-                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
-    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
-    auto dl = mm->add_literal(migraphx::literal{data_shape, data});
+    auto dl = mm->add_literal(migraphx::generate_literal({migraphx::shape::float_type, {2, 3, 4}}));
    auto dl_trans =
        mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), dl);
    mm->add_instruction(migraphx::make_op("argmax", {{"axis", -3}}), dl_trans);
@@ -36,11 +32,7 @@ TEST_CASE(argmin_test_nonstd_shape)
 {
    migraphx::program p;
    auto* mm = p.get_main_module();
-    std::vector<float> data = {1.2255,  1.6834,  -2.0305, -0.3221, 0.4701,  0.2583, 0.7545, 2.5758,
-                               -1.6849, 0.0928,  0.9022,  -0.8765, -0.4090, 0.9301, 2.0724, -1.5706,
-                               0.4867,  -0.1493, 0.6957,  -0.2179, 0.7142,  0.7177, 0.0183, 1.3497};
-    migraphx::shape data_shape{migraphx::shape::float_type, {2, 3, 4}};
-    auto dl = mm->add_literal(migraphx::literal{data_shape, data});
+    auto dl = mm->add_literal(migraphx::generate_literal({migraphx::shape::float_type, {2, 3, 4}}));
    auto dl_trans =
        mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), dl);
    mm->add_instruction(migraphx::make_op("argmin", {{"axis", -1}}), dl_trans);
@@ -55,4 +47,62 @@ TEST_CASE(argmin_test_nonstd_shape)
    EXPECT(migraphx::verify_range(result_vec, res_gold_vec));
 }

+TEST_CASE(squeeze_transpose_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    auto l0 =
+        mm->add_literal(migraphx::generate_literal({migraphx::shape::float_type, {4, 1, 3, 1, 3}}));
+    auto l0_trans =
+        mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {1, 2, 3, 0, 4}}}), l0);
+    mm->add_instruction(migraphx::make_op("squeeze"), l0_trans);
+    auto p_uncompiled = p;
+    p.compile(migraphx::ref::target{});
+    auto result          = p.eval({}).back();
+    auto expected_result = p_uncompiled.eval({}).back();
+    // contiguous is required to read the values in standard shaped order
+    auto tr_op               = migraphx::make_op("contiguous");
+    auto std_expected_result = tr_op.compute(result.get_shape(), {expected_result});
+    EXPECT(result.get_shape() == migraphx::shape{migraphx::shape::float_type, {3, 4, 3}});
+    EXPECT(result == std_expected_result);
+}
+
+TEST_CASE(squeeze_multibroadcast_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    auto l0 =
+        mm->add_literal(migraphx::generate_literal({migraphx::shape::float_type, {1, 3, 1, 3}}));
+    auto l0_brcst = mm->add_instruction(
+        migraphx::make_op("multibroadcast", {{"out_lens", {4, 1, 3, 4, 3}}}), l0);
+    mm->add_instruction(migraphx::make_op("squeeze"), l0_brcst);
+    auto p_uncompiled = p;
+    p.compile(migraphx::ref::target{});
+    auto result              = p.eval({}).back();
+    auto expected_result     = p_uncompiled.eval({}).back();
+    auto tr_op               = migraphx::make_op("contiguous");
+    auto std_expected_result = tr_op.compute(result.get_shape(), {expected_result});
+    EXPECT(result.get_shape() == migraphx::shape{migraphx::shape::float_type, {4, 3, 4, 3}});
+    EXPECT(result == std_expected_result);
+}
+
+TEST_CASE(squeeze_slice_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    auto l0 =
+        mm->add_literal(migraphx::generate_literal({migraphx::shape::float_type, {1, 3, 4, 3}}));
+    auto l0_slice = mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {2}}, {"starts", {2}}, {"ends", {3}}}), l0);
+    mm->add_instruction(migraphx::make_op("squeeze"), l0_slice);
+    auto p_uncompiled = p;
+    p.compile(migraphx::ref::target{});
+    auto result              = p.eval({}).back();
+    auto expected_result     = p_uncompiled.eval({}).back();
+    auto tr_op               = migraphx::make_op("contiguous");
+    auto std_expected_result = tr_op.compute(result.get_shape(), {expected_result});
+    EXPECT(result.get_shape() == migraphx::shape{migraphx::shape::float_type, {3, 3}});
+    EXPECT(result == std_expected_result);
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }