gaoqiong / MIGraphX / Commits / 80bf741a

Commit 80bf741a, authored May 19, 2023 by Alan Turner

Merge remote-tracking branch 'origin/develop' into ck-int8-fusion

Parents: 99626b4c, 0e6ee3f7
Changes: 136 files total. Showing 20 changed files with 514 additions and 53 deletions (+514 -53).
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp  +11 -3
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp  +1 -1
src/targets/gpu/mlir.cpp  +32 -9
src/targets/gpu/rocblas.cpp  +2 -11
src/targets/gpu/target.cpp  +6 -6
src/tf/op_parser.cpp  +1 -0
src/tf/tf.cpp  +3 -0
test/CMakeLists.txt  +3 -0
test/api/CMakeLists.txt  +1 -0
test/api/test_dynamic_shape.cpp (new)  +57 -0
test/api/test_gpu.cpp  +99 -1
test/fuse_pointwise.cpp  +32 -0
test/gpu/jit.cpp  +73 -0
test/gpu/mlir.cpp  +33 -0
test/gpu/quantization.cpp  +11 -1
test/onnx/.onnxrt-commit  +1 -1
test/onnx/onnx_test.cpp  +19 -2
test/op_shape_test.cpp  +39 -4
test/py/test_gpu.py  +42 -3
test/py/test_gpu_offload.py  +48 -11
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp (view file @ 80bf741a)

@@ -218,7 +218,15 @@ using common_type_t = typename common_type<Ts...>::type;
 #define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>

-constexpr unsigned long int_max(unsigned long n) { return (1u << (n * 8)) - 1; }
+constexpr unsigned long int_max(unsigned long n)
+{
+    // Note, left shift cannot be used to get the maximum value of int64_type or
+    // uint64_type because it is undefined behavior to left shift 64 bits for
+    // these types
+    if(n == sizeof(int64_t))
+        return -1;
+    return (1ul << (n * 8)) - 1;
+}

 template <class T,
           MIGRAPHX_REQUIRES(is_integral<T>{} or is_floating_point<T>{} or
...
@@ -228,9 +236,9 @@ constexpr T numeric_max()
     if constexpr(is_integral<T>{})
     {
         if constexpr(is_unsigned<T>{})
-            return int_max(sizeof(T)) * 2;
-        else
-            return int_max(sizeof(T));
+            return int_max(sizeof(T));
+        else
+            return int_max(sizeof(T)) / 2;
     }
     else if constexpr(is_same<T, double>{})
         return __DBL_MAX__;
...
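The int_max rewrite above works around a real C++ pitfall that the new code comment names: left-shifting a 64-bit integer by 64 bits is undefined behavior, so the 8-byte case has to be special-cased, and the all-ones pattern is produced by converting -1 to unsigned instead. A minimal standalone sketch of the same trick (illustrative names, not the kernel header's; assumes LP64, i.e. a 64-bit unsigned long):

    #include <cstdint>
    #include <cstdio>

    // Returns the maximum unsigned value representable in n bytes.
    // For n == 8, (1ul << 64) would be undefined behavior, so return
    // -1 converted to unsigned long, which is all bits set.
    constexpr unsigned long umax_for_bytes(unsigned long n)
    {
        if(n == sizeof(std::int64_t))
            return -1; // wraps to 0xFFFFFFFFFFFFFFFF
        return (1ul << (n * 8)) - 1;
    }

    static_assert(umax_for_bytes(1) == 255ul, "");
    static_assert(umax_for_bytes(4) == 4294967295ul, "");
    static_assert(umax_for_bytes(8) == 0xFFFFFFFFFFFFFFFFul, "");

    int main() { std::printf("%lu\n", umax_for_bytes(8)); }

The numeric_max change follows the same arithmetic: int_max now returns the full unsigned maximum, so the unsigned case uses it directly and the signed case halves it.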
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp (view file @ 80bf741a)

@@ -135,7 +135,7 @@ constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
         return vec<T, N>{x};
     else
     {
-        MIGRAPHX_ASSERT((i + N) < vec_size<T>());
+        MIGRAPHX_ASSERT((i + N) <= vec_size<T>());
         vec<vec_type<T>, N> result = {0};
         for(int j = 0; j < N; j++)
         {
...
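The assert change is an off-by-one fix: reading N lanes starting at index i touches lanes i through i + N - 1, so the valid condition is i + N <= vec_size<T>(), not strict less-than. A plain-array sketch of the same bound (illustrative names, not the kernel's types):

    #include <cassert>
    #include <cstddef>

    // Copy n lanes starting at lane i out of a buffer of size total.
    // The loop touches indices i .. i + n - 1, so i + n == total is still in bounds.
    void copy_lanes(const float* src, float* dst, std::size_t total, std::size_t i, std::size_t n)
    {
        assert(i + n <= total); // '<' would wrongly reject reading up to the last lane
        for(std::size_t j = 0; j < n; j++)
            dst[j] = src[i + j];
    }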
src/targets/gpu/mlir.cpp (view file @ 80bf741a)

@@ -197,9 +197,13 @@ struct mlir_program
             result = mlirF64TypeGet(ctx.get());
         else if(as.is_integral())
         {
-            if(as.is_signed())
-                result = mlirIntegerTypeSignedGet(ctx.get(), as.size() * 8);
-            else
+            // Note: rocMLIR use signless integer type for tensors types. This
+            // will translate to signed implementation for current supported
+            // operations.
+            if(as.is_unsigned())
+            {
+                MIGRAPHX_THROW("Unsupported type: " + std::to_string(as.type_enum()));
+            }
+            result = mlirIntegerTypeGet(ctx.get(), as.size() * 8);
         }
         else
...
@@ -320,7 +324,8 @@ struct mlir_program
-                      std::string, value, std::vector<value>, MlirType>;
+                      std::string, value, std::vector<value>, MlirType, MlirAttribute>;
     using named_attribute_t = std::pair<std::string_view, attribute_t>;

     MlirNamedAttribute name_attribute(const named_attribute_t& na) const
...
@@ -477,13 +482,17 @@ struct mlir_program
     {
         if(ins->name() == "@return")
             return "func.return";
+        if(ins->name() == "@literal")
+        {
+            return "tosa.const";
+        }
         return "migraphx." + ins->name();
     }

     static value get_operator_value(const operation& op)
     {
         auto v = op.to_value();
-        if(op.name() == "convolution")
+        if(op.name() == "convolution" or op.name() == "quant_convolution")
         {
             // Adjust symetrical padding
             if(v.at("padding").size() == v.at("stride").size())
...
@@ -528,11 +537,24 @@ struct mlir_program
     {
         if(ins->name() == "@param")
             continue;
+        if(ins->name() == "contiguous")
+        {
+            ins_map[ins] = ins_map[ins->inputs().at(0)];
+            continue;
+        }
         auto name = get_name(ins);
         auto ops  = create_operation_state(name);
         ops.add_attribute_value(get_operator_value(ins->get_operator()));
         if(ins->name() != "@return")
             ops.add_results({get_shape(ins)});
+        if(ins->name() == "@literal")
+        {
+            literal r            = ins->get_literal();
+            MlirType tensor_type = make_tensor(ins->get_shape());
+            MlirAttribute mlir_value_attr =
+                mlirDenseElementsAttrRawBufferGet(tensor_type, r.get_shape().bytes(), r.data());
+            ops.add_attributes({{"value", mlir_value_attr}});
+        }
         if(ins->name() == "convolution" or ins->name() == "dot")
         {
             pp =
...
@@ -735,12 +757,13 @@ code_object_op compile_mlir(const context&, module m, const std::vector<instruction_ref>& inputs)
 {
     adjust_param_shapes(m, inputs);
     const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
-    if(trace)
-        std::cout << m << std::endl;
+    // set mutex while llvm thread support is disabled.
+    static std::mutex g_mlirc_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
+    if(trace)
+        std::cout << m << std::endl;
     mlir_program mp;
     mp.find_target();
     mp.parse(m);
...
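The widened attribute_t variant above appears to exist so that an already-constructed MLIR attribute, such as the dense-elements buffer built for @literal instructions, can be attached verbatim instead of being rebuilt from a plain value. A hedged sketch of that pass-through pattern with stand-in types (MlirAttributeStub is illustrative, not the MLIR C API):

    #include <cstdint>
    #include <string>
    #include <type_traits>
    #include <variant>

    struct MlirAttributeStub // stand-in for a prebuilt MLIR attribute handle
    {
        std::string repr;
    };

    using attribute_t =
        std::variant<std::nullptr_t, std::uint64_t, bool, std::string, MlirAttributeStub>;

    // Convert a plain value to an attribute, but pass prebuilt attributes through untouched.
    MlirAttributeStub to_attribute(const attribute_t& attr)
    {
        return std::visit(
            [](const auto& x) -> MlirAttributeStub {
                using T = std::decay_t<decltype(x)>;
                if constexpr(std::is_same_v<T, MlirAttributeStub>)
                    return x; // e.g. a dense-elements literal built elsewhere
                else if constexpr(std::is_same_v<T, std::string>)
                    return {x};
                else if constexpr(std::is_same_v<T, std::nullptr_t>)
                    return {"unit"};
                else
                    return {std::to_string(x)};
            },
            attr);
    }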
src/targets/gpu/rocblas.cpp (view file @ 80bf741a)

@@ -55,24 +55,15 @@ const std::unordered_set<std::string>& get_rocblas_fp32_archs()
 bool get_compute_fp32_flag()
 {
-    bool compute_fp32 = false;
-#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
     const auto device_name = trim(split_string(get_device_name(), ':').front());
-    if(contains(get_rocblas_fp32_archs(), device_name))
-        compute_fp32 = true;
-#endif
-    return compute_fp32;
+    return contains(get_rocblas_fp32_archs(), device_name);
 }

 bool get_int8_x4_format(context& ctx)
 {
-    bool int8_x4_format = true;
-#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
     rocblas_gemm_flags flag;
     rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
-    int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
-#endif
-    return int8_x4_format;
+    return flag == rocblas_gemm_flags_pack_int8x4;
 }

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
...
src/targets/gpu/target.cpp (view file @ 80bf741a)

@@ -74,7 +74,6 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)

 struct id_pass
...
@@ -100,16 +99,17 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options)
     unsupported_types.erase(shape::type_t::bool_type);
     unsupported_types.erase(shape::type_t::int8_type);
     unsupported_types.erase(shape::type_t::uint8_type);
     unsupported_types.erase(shape::type_t::int32_type);
     unsupported_types.erase(shape::type_t::tuple_type);
     // clang-format off
     return
     {
-        enable_pass(options.split_single_dyn_dim, split_single_dyn_dim{}),
-        enable_pass(options.split_single_dyn_dim, dead_code_elimination{}),
+        split_single_dyn_dim{},
+        dead_code_elimination{},
         normalize_ops{},
         dead_code_elimination{},
         simplify_qdq{},
-        rewrite_quantization{},
+        enable_pass(not mlir_enabled(), rewrite_quantization{}),
         dead_code_elimination{},
         eliminate_data_type{unsupported_types, shape::type_t::float_type},
         simplify_reshapes{},
...
@@ -133,11 +133,11 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options)
         fuse_ck_gemm_softmax_gemm{&ctx},
         dead_code_elimination{},
         optimize_module{},
-        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
+        fuse_pointwise{},
         dead_code_elimination{},
         enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
         dead_code_elimination{},
-        fuse_mlir{&ctx},
+        enable_pass(mlir_enabled(), fuse_mlir{&ctx}),
         dead_code_elimination{},
         fuse_ck{&ctx},
         dead_code_elimination{},
...
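The pass list above leans on enable_pass to keep the pipeline declarative: a pass is either included or swapped for a no-op depending on a runtime condition, so the pipeline's structure stays fixed either way. The helper's implementation is not part of this diff; a plausible minimal sketch under that assumption (the pass struct and identity fallback here are illustrative, not MIGraphX's actual definitions):

    #include <functional>
    #include <string>
    #include <vector>

    // Stand-in for migraphx::pass: a name plus an action.
    struct pass
    {
        std::string name;
        std::function<void()> run; // simplified: real passes operate on a module
    };

    // If enabled, return the pass unchanged; otherwise return a do-nothing pass.
    pass enable_pass(bool enabled, pass p)
    {
        if(enabled)
            return p;
        return {"identity", [] {}};
    }

    std::vector<pass> build_pipeline(bool use_mlir)
    {
        return {
            pass{"simplify_qdq", [] {}},
            enable_pass(not use_mlir, pass{"rewrite_quantization", [] {}}),
            pass{"dead_code_elimination", [] {}},
        };
    }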
src/tf/op_parser.cpp (view file @ 80bf741a)

@@ -46,6 +46,7 @@ std::vector<std::string> get_op_parsers()
                    op_parser_map().end(),
                    std::back_inserter(result),
                    [&](auto&& p) { return p.first; });
+    std::sort(result.begin(), result.end());
     return result;
 }
...
src/tf/tf.cpp (view file @ 80bf741a)

@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include <migraphx/tf/tf_parser.hpp>
+#include <migraphx/tf/op_parser.hpp>
 #include <iostream>
 #include <fstream>
 #include <unordered_map>
...
@@ -62,5 +63,7 @@ program parse_tf(const std::string& name, const tf_options& options)
     return std::move(parser.prog);
 }

+std::vector<std::string> get_tf_operators() { return tf::get_op_parsers(); }
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
test/CMakeLists.txt (view file @ 80bf741a)

@@ -134,6 +134,9 @@ if(MIGRAPHX_ENABLE_GPU)
             COST 10
             RESOURCE_LOCK gpu
         )
+        if(MIGRAPHX_USE_HIPRTC)
+            target_compile_definitions(test_gpu_${BASE_NAME} PUBLIC -DMIGRAPHX_USE_HIPRTC)
+        endif()
         target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu migraphx_kernels)
     endforeach()
 endif()
...
test/api/CMakeLists.txt (view file @ 80bf741a)

@@ -48,6 +48,7 @@ add_api_test(assign test_assign.cpp ${TEST_ONNX_DIR})
 add_api_test(compile_options test_compile_options.cpp ${TEST_ONNX_DIR})
 add_api_test(lookup test_lookup.cpp ${TEST_ONNX_DIR})
 add_api_test(module_construct test_module_construct.cpp ${TEST_ONNX_DIR})
+add_api_test(dynamic_shape test_dynamic_shape.cpp ${TEST_ONNX_DIR})
 add_api_test(ref test_cpu.cpp ${TEST_ONNX_DIR})
 add_api_test(save_load test_save_load.cpp ${TEST_ONNX_DIR})
 add_api_test(op test_op_construct.cpp ${TEST_ONNX_DIR})
...
test/api/test_dynamic_shape.cpp (new file, 0 → 100644, view file @ 80bf741a)
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/migraphx.h>
#include <migraphx/migraphx.hpp>
#include "test.hpp"

TEST_CASE(create_dynamic_dimensions)
{
    migraphx::dynamic_dimension dd0{1, 4};
    EXPECT(not dd0.is_fixed());
    migraphx::dynamic_dimension dd1{4, 4};
    EXPECT(dd1.is_fixed());
    migraphx::optimals opts{1, 2, 4};
    migraphx::dynamic_dimension dd2{1, 4, opts};
    migraphx::dynamic_dimensions dyn_dims0{dd0, dd1, dd2};
    CHECK(bool{dyn_dims0[0] == dd0});
    CHECK(bool{dyn_dims0[1] == dd1});
    CHECK(bool{dyn_dims0[2] == dd2});
    CHECK(bool{dyn_dims0[2] != dd0});
    EXPECT(dyn_dims0.size() == 3);
}

TEST_CASE(create_dynamic_shape)
{
    migraphx::dynamic_dimensions dyn_dims(migraphx::dynamic_dimension{1, 4},
                                          migraphx::dynamic_dimension{78, 92},
                                          migraphx::dynamic_dimension{1, 4, {1, 4}});
    migraphx::shape dyn_shape{migraphx_shape_float_type, dyn_dims};
    CHECK(bool{dyn_shape.dynamic()});
    CHECK(bool{dyn_shape.dyn_dims()[0] == migraphx::dynamic_dimension{1, 4}});
    migraphx::shape static_shape{migraphx_shape_float_type, {3, 8}};
    EXPECT(not static_shape.dynamic());
}

int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/api/test_gpu.cpp (view file @ 80bf741a)

@@ -25,7 +25,6 @@
 #include <hip/hip_runtime_api.h>
 #include <migraphx/migraphx.h>
 #include <migraphx/migraphx.hpp>
-#include <migraphx/manage_ptr.hpp>
 #include "test.hpp"
...
@@ -72,6 +71,105 @@ hip_ptr get_hip_buffer(size_t size)
     return hip_ptr{ptr};
 }

// TODO: placeholder until we have a way to copy tuple arguments to/from device through c++ api
// TEST_CASE(dynamic_batch_load_and_run)
//{
// migraphx::onnx_options o_options;
// migraphx::dynamic_dimensions dyn_dims = {{1, 4, {2, 4}}, {3, 3}, {4, 4}, {4, 4}};
// o_options.set_dyn_input_parameter_shape("0", dyn_dims);
// dyn_dims = {{2, 2}, {3, 3}, {3, 3}, {3, 3}};
// o_options.set_dyn_input_parameter_shape("1", dyn_dims);
// auto p = migraphx::parse_onnx("conv_dynamic_batch_test.onnx", o_options);
// migraphx::compile_options c_options;
// c_options.set_split_single_dyn_dim();
// p.compile(migraphx::target("gpu"), c_options);
// auto out_shapes = p.get_output_shapes();
// CHECK(out_shapes.size() == 1);
// EXPECT(out_shapes[0].dynamic());
//
// std::vector<float> a(0.12, 2*3*4*4);
// std::vector<float> c(0.75, 2*3*3*3);
//
// auto param_shapes = p.get_parameter_shapes();
// int batch_size = 2;
// std::unordered_map<std::string, migraphx::argument> arg_map;
//
// arg_map["0"] = migraphx::argument(param_shapes["0"].to_static(batch_size), a.data());
// arg_map["1"] = migraphx::argument(param_shapes["1"].to_static(batch_size), c.data());
//
// migraphx::program_parameters pp;
// std::vector<hip_ptr> buffs;
// std::vector<migraphx::argument> args;
//
// // copy to GPU and create parameter map
// for(auto&& name : param_shapes.names())
// {
// if(arg_map.find(name) != arg_map.end())
// {
// args.push_back(arg_map.at(name));
// }
// else
// {
// migraphx::shape static_shape = param_shapes[name].to_static(batch_size);
// auto output_arg = migraphx::argument(static_shape);
// args.push_back(output_arg);
// }
// buffs.push_back(get_hip_buffer(args.rbegin()->get_shape().bytes()));
// auto err = hipMemcpy(buffs.rbegin()->get(),
// args.rbegin()->data(),
// args.rbegin()->get_shape().bytes(),
// hipMemcpyHostToDevice);
// EXPECT(err == hipSuccess);
// pp.add(name, migraphx::argument(args.rbegin()->get_shape(), buffs.rbegin()->get()));
// }
//
// auto output = p.eval(pp)[0];
//
// // copy output back to host
// auto host_arg = migraphx::argument(output.get_shape());
// auto err = hipMemcpy(
// host_arg.data(), output.data(), output.get_shape().bytes(), hipMemcpyDeviceToHost);
// EXPECT(err == hipSuccess);
//}
TEST_CASE(dynamic_batch_load_and_run_offload)
{
    migraphx::onnx_options o_options;
    migraphx::dynamic_dimensions dyn_dims = {migraphx::dynamic_dimension{1, 4, {2, 4}},
                                             migraphx::dynamic_dimension{3, 3},
                                             migraphx::dynamic_dimension{4, 4},
                                             migraphx::dynamic_dimension{4, 4}};
    o_options.set_dyn_input_parameter_shape("0", dyn_dims);
    dyn_dims = {migraphx::dynamic_dimension{2, 2},
                migraphx::dynamic_dimension{3, 3},
                migraphx::dynamic_dimension{3, 3},
                migraphx::dynamic_dimension{3, 3}};
    o_options.set_dyn_input_parameter_shape("1", dyn_dims);
    auto p             = migraphx::parse_onnx("conv_dynamic_batch_test.onnx", o_options);
    auto shapes_before = p.get_output_shapes();
    migraphx::compile_options c_options;
    c_options.set_offload_copy();
    p.compile(migraphx::target("gpu"), c_options);
    auto out_shapes = p.get_output_shapes();
    CHECK(out_shapes.size() == 1);
    EXPECT(out_shapes[0].dynamic());

    // batch size = 2
    std::vector<float> a(2 * 3 * 4 * 4, 0.12);
    std::vector<float> c(2 * 3 * 3 * 3, 0.75);

    migraphx::program_parameters pp;
    auto param_shapes = p.get_parameter_shapes();
    pp.add("0",
           migraphx::argument(migraphx::shape(migraphx_shape_float_type, {2, 3, 4, 4}), a.data()));
    pp.add("1",
           migraphx::argument(migraphx::shape(migraphx_shape_float_type, {2, 3, 3, 3}), c.data()));

    auto outputs = p.eval(pp);
    CHECK(shapes_before.size() == outputs.size());
    CHECK(bool{outputs.front().get_shape() ==
               migraphx::shape(migraphx_shape_float_type, {2, 1, 3, 3})});
}

TEST_CASE(load_and_run_async)
{
    auto p = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
...
test/fuse_pointwise.cpp (view file @ 80bf741a)

@@ -329,4 +329,36 @@ TEST_CASE(all_scalar_input)
     EXPECT(p1 == p2);
 }

+TEST_CASE(no_input)
+{
+    migraphx::program p;
+    {
+        auto* mm = p.get_main_module();
+        migraphx::shape g_shape{migraphx::shape::int64_type, {1}, {0}};
+        migraphx::shape s_indices{migraphx::shape::int32_type, {3}};
+        std::vector<int> indices{3, 800, 800};
+        auto a0  = mm->add_literal(migraphx::literal{s_indices, indices});
+        auto a1  = mm->add_literal(migraphx::literal{g_shape, {1}});
+        int axis = 0;
+        auto out = mm->add_instruction(migraphx::make_op("gather", {{"axis", axis}}), a0, a1);
+        mm->add_return({out});
+    }
+    run_pass(p);
+    // This should NOT create a pointwise module if there are no inputs here.
+    migraphx::program p2;
+    {
+        auto* mm = p2.get_main_module();
+        migraphx::shape g_shape{migraphx::shape::int64_type, {1}, {0}};
+        migraphx::shape s_indices{migraphx::shape::int32_type, {3}};
+        std::vector<int> indices{3, 800, 800};
+        auto a0  = mm->add_literal(migraphx::literal{s_indices, indices});
+        auto a1  = mm->add_literal(migraphx::literal{g_shape, {1}});
+        int axis = 0;
+        auto out = mm->add_instruction(migraphx::make_op("gather", {{"axis", axis}}), a0, a1);
+        mm->add_return({out});
+    }
+    EXPECT(p == p2);
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/gpu/jit.cpp (view file @ 80bf741a)

@@ -206,8 +206,16 @@ TEST_CASE(compile_warnings)
     EXPECT(not compile("").empty());
     EXPECT(not compile("-Wunused-parameter -Wno-error").empty());
     EXPECT(not compile("-Wno-unused-parameter -Werror").empty());
+#ifdef MIGRAPHX_USE_HIPRTC
+    if(not migraphx::enabled(migraphx::gpu::MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
+    {
+        EXPECT(test::throws([&] { compile("-Werror=unused-parameter"); }));
+        EXPECT(test::throws([&] { compile("-Wunused-parameter -Werror"); }));
+    }
+#else
     EXPECT(test::throws([&] { compile("-Werror=unused-parameter"); }));
     EXPECT(test::throws([&] { compile("-Wunused-parameter -Werror"); }));
+#endif
 }

 TEST_CASE(code_object_hip)
...
@@ -356,4 +364,69 @@ TEST_CASE(compile_math)
     });
 }

+// NOLINTNEXTLINE
+const std::string assert_template = R"__migraphx__(
+#include <migraphx/kernels/math.hpp>
+#include <migraphx/kernels/types.hpp>
+using namespace migraphx;
+extern "C" {
+__global__ void kernel(void*)
+{
+    static_assert(numeric_max<${type}>() == ${max}, "");
+    static_assert(numeric_lowest<${type}>() == ${min}, "");
+}
+}
+int main() {}
+)__migraphx__";
+
+TEST_CASE(assert_type_min_max)
+{
+    std::vector<std::string> data_types;
+    migraphx::gpu::hip_compile_options options;
+    for(auto&& t : migraphx::shape::types())
+    {
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
+            continue;
+        auto name = migraphx::shape::cpp_type(t);
+        if(t == migraphx::shape::half_type)
+            name.insert(0, "migraphx::");
+        migraphx::shape::visit(t, [&](auto as) {
+            std::string min = "";
+            std::string max = "";
+            // Note 9223372036854775808 is a constant literal that is outside the range of long
+            // long type For the same reason, 18446744073709551616 needs postfix ULL to be parsed
+            // correctly
+            if(t == migraphx::shape::int64_type)
+            {
+                min = "(" + std::to_string(as.min() + 1) + "LL - 1)";
+                max = std::to_string(as.max());
+            }
+            else if(t == migraphx::shape::uint64_type)
+            {
+                min = std::to_string(as.min());
+                max = std::to_string(as.max()) + "ULL";
+            }
+            else
+            {
+                min = std::to_string(as.min());
+                max = std::to_string(as.max());
+            }
+            auto src = migraphx::interpolate_string(assert_template,
+                                                    {{"type", name}, {"max", max}, {"min", min}});
+            migraphx::shape input{migraphx::shape::float_type, {5, 2}};
+            options.global = 1024;
+            options.local  = 1024;
+            options.inputs = {input};
+            options.output = input;
+            options.params = "-Wno-float-equal";
+            auto co        = migraphx::gpu::compile_hip_code_object(src, options);
+        });
+    }
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
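The assert_type_min_max test relies on migraphx::interpolate_string to splice per-type values into the ${type}, ${max}, and ${min} placeholders of the kernel template before compiling it. That helper's implementation is not shown in this diff; a minimal sketch of ${...} substitution under the assumption it behaves like a simple token replacer:

    #include <cstddef>
    #include <map>
    #include <string>

    // Replace each ${key} in tmpl with its value from vars. Unknown keys are left as-is.
    std::string interpolate(std::string tmpl, const std::map<std::string, std::string>& vars)
    {
        for(const auto& [key, value] : vars)
        {
            const std::string token = "${" + key + "}";
            for(std::size_t pos = tmpl.find(token); pos != std::string::npos;
                pos = tmpl.find(token, pos + value.size()))
                tmpl.replace(pos, token.size(), value);
        }
        return tmpl;
    }

    // e.g. interpolate("numeric_max<${type}>() == ${max}", {{"type", "int8_t"}, {"max", "127"}})
    // yields "numeric_max<int8_t>() == 127".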
test/gpu/mlir.cpp (view file @ 80bf741a)

@@ -213,4 +213,37 @@ module {
     EXPECT(verify_mlir(m));
 }

+TEST_CASE(conv_int8_dequantize_quantize)
+{
+    const std::string mlir_output = R"__migraphx__(
+module {
+  func.func @main(%arg0: tensor<2x8x3x3xi8>, %arg1: tensor<1x8x4x4xi8>, %arg2: tensor<1x2x2x2xf32>, %arg3: tensor<1x2x2x2xi32>) -> tensor<1x2x2x2xi32> attributes {arch = "", kernel = "mixr"} {
+    %0 = migraphx.quant_convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xi8>, tensor<2x8x3x3xi8>) -> tensor<1x2x2x2xi32>
+    %1 = migraphx.dequantizelinear(%0, %arg2, %arg3) : (tensor<1x2x2x2xi32>, tensor<1x2x2x2xf32>, tensor<1x2x2x2xi32>) -> tensor<1x2x2x2xf32>
+    %2 = migraphx.quantizelinear(%1, %arg2, %arg3) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>, tensor<1x2x2x2xi32>) -> tensor<1x2x2x2xi32>
+    return %2 : tensor<1x2x2x2xi32>
+  }
+}
+)__migraphx__";
+    migraphx::module m;
+    auto x    = m.add_parameter("x", {migraphx::shape::int8_type, {1, 8, 4, 4}});
+    auto w    = m.add_parameter("w", {migraphx::shape::int8_type, {2, 8, 3, 3}});
+    auto conv = m.add_instruction(migraphx::make_op("quant_convolution"), x, w);
+    migraphx::shape ss{migraphx::shape::float_type, {1, 2, 2, 2}};
+    migraphx::shape sz{migraphx::shape::int32_type, {1, 2, 2, 2}};
+    auto input2  = m.add_parameter("x_scale", ss);
+    auto input3  = m.add_parameter("x_zero_point", sz);
+    auto dequant = m.add_instruction(migraphx::make_op("dequantizelinear"), conv, input2, input3);
+    auto r       = m.add_instruction(migraphx::make_op("quantizelinear"), dequant, input2, input3);
+    m.add_return({r});
+    auto s = migraphx::gpu::dump_mlir(m);
+    // Skip test if MLIR is not enabled
+    if(s.empty())
+        return;
+    CHECK(encode(s) == encode(mlir_output));
+    EXPECT(verify_mlir(m));
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
test/gpu/quantization.cpp (view file @ 80bf741a)

@@ -23,6 +23,7 @@
  */
 #include <iostream>
 #include <vector>
+#include <migraphx/gpu/fuse_mlir.hpp>
 #include <migraphx/operators.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/quantization.hpp>
...
@@ -110,6 +111,15 @@ TEST_CASE(int8_quantization)
         migraphx::target gpu_t = migraphx::make_target("gpu");
         run_prog(p, gpu_t, m, gpu_result);

+        // Note: the tolerance for mlir_enabled result is temporarily bumped
+        // higher because the lowering pipeline between mlir fallback and
+        // regular non-mlir pipeline diverged. MLIR fallback uses the
+        // rewrite_quantization at the very end of the pipeline, whereas
+        // the regular pipeline uses the rewrite_quantization in the much
+        // earlier stage.
+        if(migraphx::gpu::mlir_enabled())
+            EXPECT(migraphx::verify_range(ref_result, gpu_result, 1e5));
+        else
             EXPECT(migraphx::verify_range(ref_result, gpu_result));
     }
 }
...
test/onnx/.onnxrt-commit (view file @ 80bf741a)

-ad4db1269972f92fdba932bb5770943291be3ca5
+5a43828b3d73028bfd33b3856f82698d9ab02cb1
test/onnx/onnx_test.cpp (view file @ 80bf741a)

@@ -4959,13 +4959,13 @@ TEST_CASE(reducemax_dyn_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
-    auto l0 = mm->add_parameter(
-        "x", migraphx::shape{migraphx::shape::float_type, {{3, 3}, {4, 4}, {5, 5}, {6, 6}}});
+    auto l0 = mm->add_parameter(
+        "x", migraphx::shape{migraphx::shape::float_type, {{3, 5}, {4, 4}, {5, 5}, {6, 6}}});
     auto r0 = mm->add_instruction(migraphx::make_op("reduce_max", {{"axes", {2}}}), l0);
     auto r1 = mm->add_instruction(migraphx::make_op("squeeze", {{"axes", {2}}}), r0);
     mm->add_return({r1});
     migraphx::onnx_options options;
-    options.map_dyn_input_dims["x"] = {{3, 3}, {4, 4}, {5, 5}, {6, 6}};
+    options.map_dyn_input_dims["x"] = {{3, 5}, {4, 4}, {5, 5}, {6, 6}};
     auto prog = migraphx::parse_onnx("reducemax_dyn_test.onnx", options);
     EXPECT(p == prog);
...
@@ -6953,6 +6953,23 @@ TEST_CASE(variable_batch_user_input_test6)
     EXPECT(test::throws([&] { migraphx::parse_onnx("variable_batch_test.onnx", options); }));
 }

+TEST_CASE(variable_batch_user_input_test7)
+{
+    // if entry in map_dyn_input_dims is all fixed dynamic_dimensions, convert it to a static shape
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    auto l0 = mm->add_parameter("0", migraphx::shape{migraphx::shape::float_type, {2, 3, 16, 16}});
+    auto r  = mm->add_instruction(migraphx::make_op("identity"), l0);
+    mm->add_return({r});
+    migraphx::onnx_options options;
+    options.map_dyn_input_dims["0"] = {{2, 2, {2}}, {3, 3}, {16, 16}, {16, 16}};
+    auto prog = migraphx::parse_onnx("variable_batch_test.onnx", options);
+    EXPECT(p == prog);
+}
+
 TEST_CASE(variable_batch_leq_zero_test)
 {
     migraphx::program p;
...
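The new variable_batch_user_input_test7 exercises the parser rule named in its comment: a user-supplied dynamic shape whose dimensions are all fixed (min == max) collapses to an ordinary static shape. A hedged sketch of that collapse with simplified stand-in types, not the parser's real ones:

    #include <cstddef>
    #include <optional>
    #include <vector>

    struct dyn_dim
    {
        std::size_t min;
        std::size_t max;
        bool is_fixed() const { return min == max; }
    };

    // If every dimension is fixed, return the equivalent static lengths;
    // otherwise the shape must stay dynamic.
    std::optional<std::vector<std::size_t>> to_static_lens(const std::vector<dyn_dim>& dims)
    {
        std::vector<std::size_t> lens;
        for(const auto& d : dims)
        {
            if(not d.is_fixed())
                return std::nullopt;
            lens.push_back(d.min);
        }
        return lens;
    }

    // {{2, 2}, {3, 3}, {16, 16}, {16, 16}} -> static lens {2, 3, 16, 16}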
test/op_shape_test.cpp (view file @ 80bf741a)

@@ -1822,6 +1822,33 @@ TEST_CASE(pad_dyn_shape1)
     expect_shape(output, migraphx::make_op("pad", {{"pads", {0, 0, 1, 1, 0, 0, 1, 1}}}), input);
 }

+TEST_CASE(pointwise_no_module)
+{
+    migraphx::shape input{migraphx::shape::float_type, {0}, {0}};
+    throws_shape(migraphx::make_op("pointwise"), input);
+}
+
+TEST_CASE(pointwise_no_input)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::module m;
+    std::vector<migraphx::instruction_ref> args{};
+    auto output = migraphx::shape(migraphx::shape::float_type, {1}, {0});
+    auto l      = m.add_literal(migraphx::literal(output, {1}));
+    m.add_return({l});
+    EXPECT(test::throws(
+        [&] { mm->add_instruction(migraphx::make_op("pointwise"), args, {&m}); }));
+}
+
+TEST_CASE(pointwise_no_output)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::module m;
+    std::vector<migraphx::instruction_ref> args{};
+    EXPECT(test::throws(
+        [&] { mm->add_instruction(migraphx::make_op("pointwise"), args, {&m}); }));
+}
+
 TEST_CASE(pooling_shape0)
 {
     migraphx::shape input{migraphx::shape::float_type, {4, 3, 3, 3}};
...
@@ -3114,14 +3141,22 @@ TEST_CASE(test_unsqueeze_scalar)
 TEST_CASE(test_unsqueeze_scalar_tensor1)
 {
-    migraphx::shape s{migraphx::shape::float_type, {4, 3, 3}, {0, 0, 0}};
-    throws_shape(migraphx::make_op("unsqueeze", {{"axes", {-2}}}), s);
+    migraphx::shape s1{migraphx::shape::float_type, {4, 3, 3}, {0, 0, 0}};
+    migraphx::shape s2{migraphx::shape::float_type, {4, 3, 1, 3}, {0, 0, 1, 0}};
+    expect_shape(s2, migraphx::make_op("unsqueeze", {{"axes", {-2}}}), s1);
 }

 TEST_CASE(test_unsqueeze_scalar_tensor2)
 {
-    migraphx::shape s{migraphx::shape::float_type, {1, 1, 1}, {0, 0, 0}};
-    throws_shape(migraphx::make_op("unsqueeze", {{"axes", {-2}}}), s);
+    migraphx::shape s1{migraphx::shape::float_type, {1, 1, 1}, {0, 0, 0}};
+    migraphx::shape s2{migraphx::shape::float_type, {1, 1, 1, 1}, {0, 0, 0, 1}};
+    expect_shape(s2, migraphx::make_op("unsqueeze", {{"axes", {-1}}}), s1);
 }

+TEST_CASE(test_unsqueeze_scalar_step)
+{
+    migraphx::shape s{migraphx::shape::float_type, {6, 1, 2}, {0, 0, 0}};
+    throws_shape(migraphx::make_op("unsqueeze", {{"axes", {0}}, {"steps", {3}}}), s);
+}
+
 TEST_CASE(test_unsqueeze_transpose)
...
test/py/test_gpu.py (view file @ 80bf741a)

@@ -86,8 +86,8 @@ def test_nonzero():
     params = {}
     shapes = p.get_parameter_shapes()
-    params["data"] = np.array([1, 1, 0, 1]).reshape(shapes["data"].lens()).astype(np.bool)
+    params["data"] = np.array([1, 1, 0, 1]).reshape(shapes["data"].lens()).astype(bool)
     r = p.run(params)
     print(r)
...
@@ -127,15 +127,54 @@ def test_if_pl():
     params["x"] = np.ones(6).reshape(shapes["x"].lens()).astype(np.float32)
     params["y"] = np.array([2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
                             2.0]).reshape(shapes["y"].lens()).astype(np.float32)
-    params["cond"] = np.array([1]).reshape(()).astype(np.bool)
+    params["cond"] = np.array([1]).reshape(()).astype(bool)
     r = p.run(params)[-1]
     print(r)


+def test_dyn_batch():
+    a = migraphx.shape.dynamic_dimension(1, 4, {2, 4})
+    b = migraphx.shape.dynamic_dimension(3, 3)
+    c = migraphx.shape.dynamic_dimension(32, 32)
+    dd_map = {"0": [a, b, c, c]}
+    p = migraphx.parse_onnx("conv_relu_maxpool_test.onnx",
+                            map_dyn_input_dims=dd_map)
+    print(p)
+    print("Compiling ...")
+    p.compile(migraphx.get_target("gpu"))
+    print(p)
+
+    def run_prog(batch_size):
+        params = {}
+        for key, value in p.get_parameter_shapes().items():
+            # convert to a static shape
+            if value.dynamic():
+                dds = value.dyn_dims()
+                new_lens = []
+                for dd in dds:
+                    if dd.is_fixed():
+                        new_lens.append(dd.min)
+                    else:
+                        new_lens.append(batch_size)
+                s = migraphx.shape(type=value.type_string(), lens=new_lens)
+            else:
+                s = value
+            print("Parameter {} -> {}".format(key, s))
+            params[key] = migraphx.generate_argument(s)
+        r = p.run(params)
+        print(r)
+
+    run_prog(1)
+    run_prog(2)
+    run_prog(3)
+    run_prog(4)
+

 test_conv_relu()
 test_sub_uint64()
 test_neg_int64()
 test_fp16_imagescaler()
 test_if_pl()
 test_nonzero()
+test_dyn_batch()
test/py/test_gpu_offload.py (view file @ 80bf741a)

@@ -23,16 +23,53 @@
 #####################################################################################
 import migraphx

-p = migraphx.parse_onnx("conv_relu_maxpool_test.onnx")
-print(p)
-print("Compiling ...")
-p.compile(migraphx.get_target("gpu"), offload_copy=False)
-print(p)
-params = {}
-
-for key, value in p.get_parameter_shapes().items():
-    print("Parameter {} -> {}".format(key, value))
-    params[key] = migraphx.to_gpu(migraphx.generate_argument(value))
-
-r = migraphx.from_gpu(p.run(params)[-1])
-print(r)
+def test_conv_relu():
+    p = migraphx.parse_onnx("conv_relu_maxpool_test.onnx")
+    print(p)
+    print("Compiling ...")
+    p.compile(migraphx.get_target("gpu"), offload_copy=False)
+    print(p)
+    params = {}
+
+    for key, value in p.get_parameter_shapes().items():
+        print("Parameter {} -> {}".format(key, value))
+        params[key] = migraphx.to_gpu(migraphx.generate_argument(value))
+
+    r = migraphx.from_gpu(p.run(params)[-1])
+    print(r)
+
# TODO: placeholder until tuple shapes and arguments exposed
#def test_dyn_batch():
# a = migraphx.shape.dynamic_dimension(1, 4, {2, 4})
# b = migraphx.shape.dynamic_dimension(3, 3)
# c = migraphx.shape.dynamic_dimension(32, 32)
# dd_map = {"0": [a, b, c, c]}
# p = migraphx.parse_onnx("conv_relu_maxpool_test.onnx",
# map_dyn_input_dims=dd_map)
# print(p)
# print("Compiling ...")
# p.compile(migraphx.get_target("gpu"), offload_copy=False)
#
# print(p)
#
# def run_prog(batch_size):
# params = {}
# for key, value in p.get_parameter_shapes().items():
# print("Parameter {} -> {}".format(key, value))
# params[key] = migraphx.to_gpu(
# migraphx.generate_argument(value.to_static(batch_size)))
#
# print("before_output")
# outputs = p.run(params)
# print(outputs)
# r = migraphx.from_gpu(p.run(params)[-1])
# print(r)
#
# run_prog(1)
# run_prog(2)
# run_prog(3)
# run_prog(4)
test_conv_relu()