Merge branch 'develop' into resnet50_partition

655e9646 · Umang Yadav · d9528ead · c3990622 · 655e9646 · 655e9646
Commit 655e9646 authored Oct 18, 2023 by Umang Yadav
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,20 +27,18 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
    message(FATAL_ERROR "The binary and source directroy cannot be the same")
 endif()
-get_property(_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+# Setup valid strings for build type
+if (NOT CMAKE_CONFIGURATION_TYPES)
+    set(CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo;MinSizeRel" CACHE STRING "Configs")
+endif()
+get_property(MIGRAPHX_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 # This has to be initialized before the project() command appears
 # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
-if(_GENERATOR_IS_MULTI_CONFIG)
+if(NOT MIGRAPHX_GENERATOR_IS_MULTI_CONFIG)
-    if (NOT CMAKE_CONFIGURATION_TYPES)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING
-        set(CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo;MinSizeRel" CACHE STRING
+        "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.")
-            "Available build types (configurations) on multi-config generators")
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
-    endif()
-else()
-    if(NOT CMAKE_BUILD_TYPE)
-        set(CMAKE_BUILD_TYPE Release CACHE STRING
-            "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.")
-    endif()
 endif()
 set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -109,10 +109,13 @@ def rocmnode(name, body) {
 rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
    stage('hipRTC Debug') {
-        def sanitizers = "undefined"
+        // Disable MLIR since it doesnt work with all ub sanitizers
-        def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize-recover=${sanitizers}"
+        withEnv(['MIGRAPHX_DISABLE_MLIR=1']) {
-        def gpu_targets = getgputargets()
+            def sanitizers = "undefined"
-        cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}' -DMIGRAPHX_USE_HIPRTC=On -DGPU_TARGETS='${gpu_targets}'", gpu_debug: true)
+            def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize-recover=${sanitizers}"
+            def gpu_targets = getgputargets()
+            cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}' -DMIGRAPHX_USE_HIPRTC=On -DGPU_TARGETS='${gpu_targets}'", gpu_debug: true)
+        }
    }
 }, clang_release: rocmnode('mi100+') { cmake_build ->
    stage('Hip Clang Release') {
@@ -131,7 +134,7 @@ rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
    }
 }, mlir_debug: rocmnode('mi100+') { cmake_build ->
    stage('MLIR Debug') {
-        withEnv(['MIGRAPHX_ENABLE_MLIR=1']) {
+        withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1']) {
            def sanitizers = "undefined"
            // Note: the -fno-sanitize= is copied from upstream LLVM_UBSAN_FLAGS.
            def debug_flags_cxx = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr,function -fno-sanitize-recover=${sanitizers}"
@@ -142,7 +145,7 @@ rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
    }
 }, ck_hiprtc: rocmnode('mi100+') { cmake_build ->
    stage('CK hipRTC') {
-        withEnv(['MIGRAPHX_ENABLE_CK=1', 'MIGRAPHX_TUNE_CK=1']) {
+        withEnv(['MIGRAPHX_ENABLE_CK=1', 'MIGRAPHX_TUNE_CK=1', 'MIGRAPHX_DISABLE_MLIR=1']) {
            def gpu_targets = getgputargets()
            cmake_build(flags: "-DCMAKE_BUILD_TYPE=release -DMIGRAPHX_USE_HIPRTC=On -DGPU_TARGETS='${gpu_targets}'")
        }

--- a/requirements.txt
+++ b/requirements.txt
@@ -28,5 +28,5 @@ ROCmSoftwarePlatform/half@rocm-5.6.0
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/composable_kernel@a22e479b8e1557961039db2d5c5ff89cff35e86b -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/rocMLIR@12748a3402c069f733ea7f2ba1f8d8a070b3622a -DBUILD_FAT_LIBROCKCOMPILER=On
+ROCmSoftwarePlatform/rocMLIR@507bb94ce7873786486d296ec81d2eadaab49003 -DBUILD_FAT_LIBROCKCOMPILER=On
\ No newline at end of file
--- a/src/driver/argument_parser.hpp
+++ b/src/driver/argument_parser.hpp
@@ -187,6 +187,13 @@ struct value_parser
    }
 };
+// version for std::optional object
+template <class T>
+struct value_parser<std::optional<T>>
+{
+    static T apply(const std::string& x) { return value_parser<T>::apply(x); }
+};
 struct argument_parser
 {
    struct argument

--- a/src/driver/main.cpp
+++ b/src/driver/main.cpp
@@ -540,22 +540,17 @@ struct params : command<params>
 struct verify : command<verify>
 {
    compiler c;
-    // Set to -1. as nonsense initial value
+    std::optional<double> rms_tol;
-    double rms_tol       = -1.0;
+    std::optional<double> atol;
-    double atol          = -1.0;
+    std::optional<double> rtol;
-    double rtol          = -1.0;
    bool per_instruction = false;
    bool reduce          = false;
    void parse(argument_parser& ap)
    {
        c.parse(ap);
-        ap(rms_tol, {"--rms-tol"}, ap.help("Tolerance for the RMS error (Default: 0.001)"));
+        ap(rms_tol, {"--rms-tol"}, ap.help("Tolerance for the RMS error"));
-        ap(atol,
+        ap(atol, {"--atol"}, ap.help("Tolerance for the elementwise absolute difference"));
-           {"--atol"},
+        ap(rtol, {"--rtol"}, ap.help("Tolerance for the elementwise relative difference"));
-           ap.help("Tolerance for the elementwise absolute difference (Default: 0.001)"));
-        ap(rtol,
-           {"--rtol"},
-           ap.help("Tolerance for the elementwise relative difference (Default: 0.001)"));
        ap(per_instruction,
           {"-i", "--per-instruction"},
           ap.help("Verify each instruction"),
@@ -572,33 +567,6 @@ struct verify : command<verify>
        auto t = c.ct.get_target();
        auto m = c.parameters.generate(p, t, true, c.l.batch);
-        // TODO remove this and make the driver able to figure out datatype most used in the model
-        //  then set the tolerances appropriately. Need to check here because c.to_fp16 only set
-        //  after argument_parser.parse() is run. This code is complicated because there's not a
-        //  good way to change the default tolerances after reading `--fp16` but before reading
-        //  `--rms-tol`, `--atol`, and `--rtol`.
-        migraphx::verify::tolerance tols{};
-        if(c.to_fp16)
-        {
-            tols = migraphx::verify::tolerance{8e-2, 4e-2, 4e-2};
-        }
-        if(not float_equal(this->rms_tol, -1.0))
-        {
-            tols.rms_tol = this->rms_tol;
-        }
-        if(not float_equal(this->atol, -1.0))
-        {
-            tols.atol = this->atol;
-        }
-        if(not float_equal(this->rtol, -1.0))
-        {
-            tols.rtol = this->rtol;
-        }
-        std::cout << "rms_tol: " << tols.rms_tol << std::endl;
-        std::cout << "atol: " << tols.atol << std::endl;
-        std::cout << "rtol: " << tols.rtol << std::endl;
        auto quantize = precision::fp32;
        if(c.to_fp16)
        {
@@ -609,6 +577,11 @@ struct verify : command<verify>
            quantize = precision::int8;
        }
+        auto tols = get_tolerances(p, quantize, rms_tol, atol, rtol);
+        std::cout << "rms_tol: " << tols.rms_tol << std::endl;
+        std::cout << "atol: " << tols.atol << std::endl;
+        std::cout << "rtol: " << tols.rtol << std::endl;
        if(per_instruction)
        {
            verify_instructions(p, t, c.co, quantize, tols);

--- a/src/driver/verify.cpp
+++ b/src/driver/verify.cpp
@@ -36,6 +36,42 @@ namespace migraphx {
 namespace driver {
 inline namespace MIGRAPHX_INLINE_NS {
+/**
+ * Gives tolerances based on user input (`rms_tol`, `atol`, `rtol` parameters) and defaults.
+ * Sets to fp16 tolerances if `quantize` input is fp16 or any fp16 instruction in found in the
+ * model.
+ */
+verify::tolerance get_tolerances(const program& p,
+                                 precision quantize,
+                                 std::optional<double> rms_tol,
+                                 std::optional<double> atol,
+                                 std::optional<double> rtol)
+{
+    bool has_fp16 = any_of(p.get_modules(), [](auto&& m) {
+        return any_of(*m, [](auto&& ins) { return (ins.get_shape().type() == shape::half_type); });
+    });
+    migraphx::verify::tolerance result{};
+    if(has_fp16 or quantize == precision::fp16)
+    {
+        result.rms_tol = 8e-2;
+        result.atol    = 4e-2;
+        result.rtol    = 4e-2;
+    }
+    if(rms_tol)
+    {
+        result.rms_tol = *rms_tol;
+    }
+    if(atol)
+    {
+        result.atol = *atol;
+    }
+    if(rtol)
+    {
+        result.rtol = *rtol;
+    }
+    return result;
+}
 std::vector<argument> run_ref(program p, const parameter_map& inputs)
 {
    p.compile(migraphx::make_target("ref"));

--- a/src/driver/verify.hpp
+++ b/src/driver/verify.hpp
@@ -32,6 +32,12 @@ namespace migraphx {
 namespace driver {
 inline namespace MIGRAPHX_INLINE_NS {
+verify::tolerance get_tolerances(const program& p,
+                                 precision quantize,
+                                 std::optional<double> rms_tol,
+                                 std::optional<double> atol,
+                                 std::optional<double> rtol);
 void verify_program(const std::string& name,
                    const program& p,
                    const target& t,

--- a/src/include/migraphx/op/reshape.hpp
+++ b/src/include/migraphx/op/reshape.hpp
@@ -36,6 +36,22 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
+/**
+ * 1 input version:
+ * reshape(input_data)
+ * this.dims = output_dims
+ * Makes a copy of input_data to the output shape.
+ *
+ * 2 input version:
+ * reshape(input_data, output_buffer)
+ * this.dims = unset
+ * Copies input_data to output_buffer; output_buffer already has the output shape.
+ * This version will not fail gracefully if the input shape and output_buffer shape are
+ * incompatible. There's a throw that will catch when the number of elements do not match at
+ * runtime. This version should only be used for dynamic reshapes (output dimensions only known at
+ * runtime). If output_buffer has a static shape during compile/parse, you can use the 1 input
+ * version.
+ */
 struct reshape
 {
    std::vector<int64_t> dims;
@@ -215,32 +231,56 @@ struct reshape
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this, true}.has(1);
+        check_shapes{inputs, *this, true}.has(1, 2);
        auto n_neg_dims = std::count(dims.begin(), dims.end(), -1);
        if(n_neg_dims > 1)
            MIGRAPHX_THROW("reshape: Dimensions for reshape can only have one -1 dim");
        auto s0 = inputs.front();
-        if(s0.dynamic())
+        if(inputs.size() == 1)
        {
-            return dyn_compute_shape(s0);
+            if(s0.dynamic())
+            {
+                return dyn_compute_shape(s0);
+            }
+            else
+            {
+                return static_compute_shape(inputs, n_neg_dims);
+            }
        }
        else
        {
-            return static_compute_shape(inputs, n_neg_dims);
+            return inputs.back();
        }
    }
    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
        assert(dyn_out.computed_shape.standard());
-        argument result{dyn_out.computed_shape};
+        if(args.size() == 1)
+        {
+            argument result{dyn_out.computed_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
+            visit_all(result, args[0])([&](auto output, auto input) {
-            std::copy(input.begin(), input.end(), output.begin());
+                std::copy(input.begin(), input.end(), output.begin());
-        });
+            });
-        return result;
+            return result;
+        }
+        else
+        {
+            // 2 arg
+            if(args[0].get_shape().elements() != args[1].get_shape().elements())
+            {
+                MIGRAPHX_THROW("Reshape: Number of elements must match at runtime. Input: " +
+                               std::to_string(args[0].get_shape().elements()) +
+                               " Output buffer: " + std::to_string(args[1].get_shape().elements()));
+            }
+            visit_all(args[1], args[0])([&](auto output, auto input) {
+                std::copy(input.begin(), input.end(), output.begin());
+            });
+            return args[1];
+        }
    }
 };

--- a/src/onnx/padding.cpp
+++ b/src/onnx/padding.cpp
@@ -47,7 +47,7 @@ void cal_auto_padding_size(onnx_parser::node_info info,
        return;
    }
-    auto auto_pad = info.attributes["auto_pad"].s();
+    auto auto_pad = to_upper(info.attributes["auto_pad"].s());
    if(auto_pad.find("SAME") != std::string::npos)
    {
        bool is_same_upper = (auto_pad.find("SAME_UPPER") != std::string::npos);

--- a/src/onnx/parse_groupnorm.cpp
+++ b/src/onnx/parse_groupnorm.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/instruction.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+struct parse_groupnorm : op_parser<parse_groupnorm>
+{
+    std::vector<op_desc> operators() const { return {{"GroupNormalization"}}; }
+    instruction_ref parse(const op_desc& /*opd*/,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          std::vector<instruction_ref> args) const
+    {
+        float epsilon = 1e-5f;
+        if(contains(info.attributes, "epsilon"))
+        {
+            epsilon = parser.parse_value(info.attributes.at("epsilon")).at<float>();
+        }
+        size_t num_groups;
+        if(contains(info.attributes, "num_groups"))
+        {
+            num_groups = parser.parse_value(info.attributes.at("num_groups")).at<size_t>();
+        }
+        else
+        {
+            MIGRAPHX_THROW("PARSE_GROUPNORM: num_groups must be available");
+        }
+        if(args.size() != 3)
+        {
+            MIGRAPHX_THROW("PARSE_GROUPNORM: invalid input count");
+        }
+        auto x     = args.at(0);
+        auto scale = args.at(1);
+        auto bias  = args.at(2);
+        auto x_shape = x->get_shape();
+        auto x_dtype = x_shape.type();
+        auto x_dims  = x_shape.lens();
+        if(x_shape.ndim() <= 2)
+        {
+            MIGRAPHX_THROW("PARSE_GROUPNORM: invalid input shape");
+        }
+        auto c = x_shape.lens().at(1);
+        if(c % num_groups != 0)
+        {
+            MIGRAPHX_THROW(
+                "PARSE_GROUPNORM: num_groups should be a divisor of the number of channels");
+        }
+        auto group_size = c / num_groups;
+        if(scale->get_shape().ndim() != 1 or scale->get_shape().lens().at(0) != num_groups)
+        {
+            MIGRAPHX_THROW("PARSE_GROUPNORM: scale tensor shape should be num_groups");
+        }
+        if(bias->get_shape().ndim() != 1 or bias->get_shape().lens().at(0) != num_groups)
+        {
+            MIGRAPHX_THROW("PARSE_GROUPNORM: bias tensor shape should be num_groups");
+        }
+        // Original shape: N x C x D1 x ... x Dn
+        // New shape: N x num_groups x C // num_groups x D1 x ... x Dn
+        std::vector<size_t> dims = {x_dims.at(0), num_groups, group_size};
+        std::copy(x_dims.begin() + 2, x_dims.end(), std::back_inserter(dims));
+        auto x_reshaped = info.add_instruction(make_op("reshape", {{"dims", dims}}), x);
+        // Axes for D1 x ... x Dn
+        std::vector<size_t> axes(dims.size() - 2);
+        std::iota(axes.begin(), axes.end(), 2);
+        // y = (x - mean) * rsqrt(variance + epsilon) * scale + bias
+        // mean = reduce_mean({D1, D2, ... Dk}, x)
+        // variance = reduce_mean({D1, D2, ... Dk}, (x - mean)^2)
+        auto mean = info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x_reshaped);
+        auto x_sub_mean    = info.add_common_op("sub", x_reshaped, mean);
+        auto x_sqdiff_mean = info.add_common_op("sqdiff", x_reshaped, mean);
+        auto variance =
+            info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x_sqdiff_mean);
+        epsilon =
+            (x_dtype == migraphx::shape::half_type and std::abs(epsilon) < 1e-7) ? 1e-7 : epsilon;
+        auto eps     = info.add_literal(migraphx::literal{migraphx::shape{x_dtype}, {epsilon}});
+        auto var_eps = info.add_common_op("add", variance, eps);
+        auto rsqrt   = info.add_instruction(make_op("rsqrt"), var_eps);
+        auto result  = info.add_common_op("mul", x_sub_mean, rsqrt);
+        auto scale_bcast =
+            info.add_instruction(make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), scale);
+        auto bias_bcast =
+            info.add_instruction(make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), bias);
+        auto scaled     = info.add_instruction(make_op("mul"), result, scale_bcast);
+        auto y          = info.add_instruction(make_op("add"), scaled, bias_bcast);
+        auto y_reshaped = info.add_instruction(make_op("reshape", {{"dims", x_dims}}), y);
+        return y_reshaped;
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_layernorm.cpp
+++ b/src/onnx/parse_layernorm.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/instruction.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+struct parse_layernorm : op_parser<parse_layernorm>
+{
+    std::vector<op_desc> operators() const { return {{"LayerNormalization"}}; }
+    std::vector<instruction_ref> parse(const op_desc& /*opd*/,
+                                       const onnx_parser& parser,
+                                       const onnx_parser::node_info& info,
+                                       std::vector<instruction_ref> args) const
+    {
+        int64_t axis = -1;
+        if(contains(info.attributes, "axis"))
+        {
+            axis = parser.parse_value(info.attributes.at("axis")).at<int64_t>();
+        }
+        float epsilon = 1e-5f;
+        if(contains(info.attributes, "epsilon"))
+        {
+            epsilon = parser.parse_value(info.attributes.at("epsilon")).at<float>();
+        }
+        if(contains(info.attributes, "stash_type"))
+        {
+            std::cerr << "WARNING: LAYERNORM does not support stash_type, it will be ignored.\n";
+        }
+        if(args.size() < 2 or args.size() > 3)
+        {
+            MIGRAPHX_THROW("PARSE_LAYERNORM: invalid input count");
+        }
+        auto x         = args.at(0);
+        auto scale     = args.at(1);
+        bool skip_bias = args.size() == 2;
+        instruction_ref bias;
+        if(not skip_bias)
+        {
+            bias = args.at(2);
+        }
+        auto x_shape   = x->get_shape();
+        auto x_dtype   = x_shape.type();
+        int64_t x_rank = x_shape.ndim();
+        if(x_rank < 2)
+        {
+            MIGRAPHX_THROW("PARSE_LAYERNORM: invalid input shape");
+        }
+        // If rank(X) is r, axis' allowed range is [-r, r)
+        if(axis < -x_rank or axis >= x_rank)
+        {
+            MIGRAPHX_THROW("PARSE_LAYERNORM: invalid axis");
+        }
+        // y = (x - mean) * rsqrt(variance + epsilon) * scale + bias
+        // mean = reduce_mean({D1, D2, ... Dk}, x)
+        // variance = reduce_mean({D1, D2, ... Dk}, (x - mean)^2)
+        // axis can be negative
+        axis = axis < 0 ? axis + x_rank : axis;
+        auto kdims = x_rank - axis;
+        std::vector<int64_t> axes(kdims);
+        std::iota(axes.begin(), axes.end(), axis);
+        auto skipped_axes = x_rank - kdims;
+        auto mean          = info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x);
+        auto x_sub_mean    = info.add_common_op("sub", x, mean);
+        auto x_sqdiff_mean = info.add_common_op("sqdiff", x, mean);
+        auto variance =
+            info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x_sqdiff_mean);
+        epsilon =
+            (x_dtype == migraphx::shape::half_type and std::abs(epsilon) < 1e-7) ? 1e-7 : epsilon;
+        auto eps     = info.add_literal(migraphx::literal{migraphx::shape{x_dtype}, {epsilon}});
+        auto var_eps = info.add_common_op("add", variance, eps);
+        auto rsqrt   = info.add_instruction(make_op("rsqrt"), var_eps);
+        auto result  = info.add_common_op("mul", x_sub_mean, rsqrt);
+        instruction_ref scale_bcast = scale;
+        instruction_ref bias_bcast  = bias;
+        if(skipped_axes > 0)
+        {
+            auto x_dims = x_shape.lens();
+            scale_bcast = info.add_instruction(
+                make_op("broadcast", {{"axis", skipped_axes}, {"out_lens", x_dims}}), scale);
+            if(not skip_bias)
+            {
+                bias_bcast = info.add_instruction(
+                    make_op("broadcast", {{"axis", skipped_axes}, {"out_lens", x_dims}}), bias);
+            }
+        }
+        auto scaled = info.add_instruction(make_op("mul"), result, scale_bcast);
+        auto y      = skip_bias ? scaled : info.add_instruction(make_op("add"), scaled, bias_bcast);
+        return {y, mean, rsqrt};
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_pad.cpp
+++ b/src/onnx/parse_pad.cpp
@@ -115,34 +115,9 @@ struct parse_pad : op_parser<parse_pad>
 {
    std::vector<op_desc> operators() const { return {{"Pad"}}; }
-    instruction_ref parse(const op_desc& /*opd*/,
+    std::string parse_mode(const onnx_parser::node_info& info,
-                          const onnx_parser& parser,
+                           const std::vector<instruction_ref>& args) const
-                          onnx_parser::node_info info,
-                          std::vector<instruction_ref> args) const
    {
-        std::vector<int64_t> pads{};
-        if(args.size() >= 2)
-        {
-            auto pad_arg = args.at(1)->eval();
-            check_arg_empty(pad_arg, "PARSE_PAD: pad input must be constant");
-            pad_arg.visit([&](auto v) { pads.assign(v.begin(), v.end()); });
-        }
-        else if(contains(info.attributes, "pads"))
-        {
-            auto&& pad_vals = info.attributes["pads"].ints();
-            pads            = std::vector<int64_t>(pad_vals.begin(), pad_vals.end());
-        }
-        else
-        {
-            MIGRAPHX_THROW("PARSE_PAD: pad must be available");
-        }
-        // check if padding is actually being done (at least one value is nonzero)
-        if(std::all_of(pads.begin(), pads.end(), [](const int& i) { return i == 0; }))
-        {
-            return info.add_instruction(make_op("identity"), args.front());
-        }
        if(contains(info.attributes, "mode"))
        {
            auto mode = info.attributes.at("mode").s();
@@ -152,28 +127,59 @@ struct parse_pad : op_parser<parse_pad>
                {
                    MIGRAPHX_THROW("PARSE_PAD: reflect padding with dynamic shape not supported");
                }
-                return reflect_pad(info, pads, args.front());
            }
-            if(mode != "constant")
+            else if(mode != "constant")
            {
                MIGRAPHX_THROW(
                    "PARSE_PAD: migraphx currently only supports constant and reflect padding");
            }
+            return mode;
+        }
+        else
+        {
+            // default mode
+            return "constant";
        }
+    }
+    std::vector<int64_t> parse_pads(const onnx_parser::node_info& info,
+                                    const std::vector<instruction_ref>& args) const
+    {
+        std::vector<int64_t> pads{};
+        if(args.size() >= 2)
+        {
+            auto pad_arg = args.at(1)->eval();
+            check_arg_empty(pad_arg, "PARSE_PAD: `pads` input must be constant");
+            pad_arg.visit([&](auto v) { pads.assign(v.begin(), v.end()); });
+        }
+        else if(contains(info.attributes, "pads"))
+        {
+            auto&& pad_vals = info.attributes.at("pads").ints();
+            pads            = std::vector<int64_t>(pad_vals.begin(), pad_vals.end());
+        }
+        else
+        {
+            MIGRAPHX_THROW("PARSE_PAD: `pads` must be available");
+        }
+        return pads;
+    }
+    float parse_constant_value(const onnx_parser& parser,
+                               const onnx_parser::node_info& info,
+                               const std::vector<instruction_ref>& args) const
+    {
        float value = 0.0f;
-        // third input is the value
+        if(args.size() >= 3 and args.at(2)->get_shape().scalar())
-        if(args.size() == 3)
        {
            auto val_ins = args.at(2);
            if(not val_ins->can_eval())
            {
-                MIGRAPHX_THROW("PARSE_PAD: input value must be constant");
+                MIGRAPHX_THROW("PARSE_PAD: input `value` must be constant");
            }
            auto val_arg = val_ins->eval();
            if(val_arg.get_shape().elements() != 1)
            {
-                MIGRAPHX_THROW("PARSE_PAD: value should contain only one element");
+                MIGRAPHX_THROW("PARSE_PAD: `value` should contain only one element");
            }
            value = val_arg.at<float>();
        }
@@ -181,6 +187,81 @@ struct parse_pad : op_parser<parse_pad>
        {
            value = parser.parse_value(info.attributes.at("value")).at<float>();
        }
+        return value;
+    }
+    std::vector<int64_t> parse_axes(const std::vector<instruction_ref>& args,
+                                    bool is_constant_mode) const
+    {
+        std::vector<int64_t> axes{};
+        // axes is 3rd or 4th, depending on constant mode
+        auto pos = is_constant_mode ? 4 : 3;
+        if(args.size() >= pos)
+        {
+            auto axes_arg = args.at(pos - 1)->eval();
+            check_arg_empty(axes_arg, "PARSE_PAD: variable `axes` input not supported");
+            axes_arg.visit([&](auto v) { axes.assign(v.begin(), v.end()); });
+        }
+        return axes;
+    }
+    std::vector<int64_t> calculate_pads_with_axes(const std::vector<int64_t>& pads,
+                                                  const std::vector<int64_t>& axes,
+                                                  size_t input_rank) const
+    {
+        size_t num_axes = axes.size();
+        if(num_axes * 2 != pads.size())
+        {
+            MIGRAPHX_THROW("PARSE_PAD: number of elements of pads should be equal to 2 * "
+                           "number of elements of axes");
+        }
+        std::vector<int64_t> new_pads(input_rank * 2);
+        for(size_t idx{0}; idx < num_axes; ++idx)
+        {
+            // axis can be negative
+            int64_t axis = axes[idx] < 0 ? input_rank + axes[idx] : axes[idx];
+            // pad format is x1_begin, x2_begin, ... , x3_end, x4_end
+            new_pads[axis]              = pads[idx];
+            new_pads[axis + input_rank] = pads[idx + num_axes];
+        }
+        return new_pads;
+    }
+    instruction_ref parse(const op_desc& /*opd*/,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        std::vector<int64_t> pads = parse_pads(info, args);
+        // check if padding is actually being done (at least one value is nonzero)
+        if(std::all_of(pads.begin(), pads.end(), [](const int& i) { return i == 0; }))
+        {
+            return info.add_instruction(make_op("identity"), args.front());
+        }
+        std::string mode      = parse_mode(info, args);
+        bool is_constant_mode = mode == "constant";
+        float value           = is_constant_mode ? parse_constant_value(parser, info, args) : 0.0f;
+        std::vector<int64_t> axes = parse_axes(args, is_constant_mode);
+        size_t input_rank         = args.front()->get_shape().ndim();
+        if(not axes.empty())
+        {
+            pads = calculate_pads_with_axes(pads, axes, input_rank);
+        }
+        if(pads.size() != input_rank * 2)
+        {
+            MIGRAPHX_THROW("PARSE_PAD: number of elements of pads should be equal to 2 * "
+                           "input rank");
+        }
+        if(mode == "reflect")
+        {
+            return reflect_pad(info, pads, args.front());
+        }
        return info.add_instruction(migraphx::make_op("pad", {{"pads", pads}, {"value", value}}),
                                    args.front());

--- a/src/onnx/parse_pooling.cpp
+++ b/src/onnx/parse_pooling.cpp
@@ -97,7 +97,7 @@ struct parse_pooling : op_parser<parse_pooling>
            values["lp_order"] = info.attributes.at("p").i();
        }
-        // ensure pads availabe only when auto_pad is "NOT_SET"
+        // ensure pads available only when auto_pad is "NOT_SET"
        check_padding_mode(info, "POOLING");
        return values;

--- a/src/onnx/parse_qlinearconv.cpp
+++ b/src/onnx/parse_qlinearconv.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/onnx/padding.hpp>
+#include <migraphx/onnx/conv.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/broadcast_qdq.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/stringutils.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+/*
+ *********************************************************************************
+ *  Reference: see QLinearConv in                                                *
+ *  https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md  *
+ *********************************************************************************
+com.microsoft.QLinearConv
+Version
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+ATTRIBUTES:
+auto_pad : string
+channels_last : int
+dilations : list of ints
+group : int
+kernel_shape : list of ints
+pads : list of ints
+strides : list of ints
+INPUTS (8 - 9):
+x : T1
+x_scale : tensor(float)
+x_zero_point : T1
+w : T2
+w_scale : tensor(float)
+w_zero_point : T2
+y_scale : tensor(float)
+y_zero_point : T3
+B (optional) : T4
+OUTPUTS:
+y : T3
+Type Constraints:
+T1 : tensor(int8), tensor(uint8)
+T2 : tensor(int8), tensor(uint8)
+T3 : tensor(int8), tensor(uint8)
+T4 : tensor(int32)
+More details also at:
+https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__QLinearConv.html
+*/
+struct parse_qlinearconv : op_parser<parse_qlinearconv>
+{
+    std::vector<op_desc> operators() const { return {{"QLinearConv"}}; }
+    // basic type checking for QLinearConv Operator
+    void check_inputs(const std::vector<instruction_ref>& inp_arg) const
+    {
+        if(inp_arg.size() < 8)
+            MIGRAPHX_THROW("QLINEARCONV: missing inputs");
+        const instruction_ref& in_x       = inp_arg[0];
+        const instruction_ref& in_scale_x = inp_arg[1];
+        const instruction_ref& in_w       = inp_arg[3];
+        const instruction_ref& in_scale_w = inp_arg[4];
+        const instruction_ref& in_scale_y = inp_arg[6];
+        auto sh_x   = in_x->get_shape();
+        auto sh_w   = in_w->get_shape();
+        auto type_x = sh_x.type();
+        auto type_w = sh_w.type();
+        assert(in_x->get_shape().ndim() > 2);
+        if(type_x != shape::int8_type and type_x != shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARCONV: unsupported input type");
+        if(type_w != shape::int8_type and type_w != shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARCONV: unsupported weight type");
+        if(in_scale_x->get_shape().type() != shape::float_type)
+            MIGRAPHX_THROW("QLINEARCONV x scale type should be float");
+        if(in_scale_w->get_shape().type() != shape::float_type)
+            MIGRAPHX_THROW("QLINEARCONV: wt scale type should be float");
+        if(in_scale_y->get_shape().type() != shape::float_type)
+            MIGRAPHX_THROW("QLINEARCONV: y scale type should be float");
+        if(inp_arg.size() > 8 and inp_arg[8]->get_shape().type() != shape::int32_type)
+            MIGRAPHX_THROW("QLINEARCONV y bias should be int32");
+    }
+    // process all attributes of QLinearConv Operator..
+    value process_attributes(const onnx_parser& parser,
+                             const onnx_parser::node_info& info,
+                             const std::vector<instruction_ref>& args) const
+    {
+        value values;
+        const auto& in_x = args[0];
+        const auto& wt   = args[3];
+        size_t kdims = in_x->get_shape().ndim() - 2;
+        check_padding_mode(info, "QLINEARCONV");
+        values["stride"]   = std::vector<int>(kdims, 1);
+        values["dilation"] = std::vector<int>(kdims, 1);
+        values["padding"]  = std::vector<int>(kdims, 0);
+        values["group"]    = 1;
+        if(contains(info.attributes, "group"))
+            values["group"] = parser.parse_value(info.attributes.at("group")).template at<int>();
+        if(contains(info.attributes, "strides"))
+        {
+            std::vector<int> st;
+            copy(info.attributes.at("strides").ints(), std::back_inserter(st));
+            check_attr_sizes(kdims, st.size(), "QLINEARCONV: inconsistent strides");
+            values["stride"] = st;
+        }
+        if(contains(info.attributes, "dilations"))
+        {
+            std::vector<int> dil;
+            copy(info.attributes.at("dilations").ints(), std::back_inserter(dil));
+            check_attr_sizes(kdims, dil.size(), "QLINEARCONV: inconsistent dilations");
+            values["dilation"] = dil;
+        }
+        if(contains(info.attributes, "pads"))
+        {
+            std::vector<int> pads;
+            copy(info.attributes.at("pads").ints(), std::back_inserter(pads));
+            check_attr_sizes(kdims, pads.size() / 2, "QLINEARCONV: inconsistent padding");
+            values["padding"] = pads;
+        }
+        else if(contains(info.attributes, "auto_pad"))
+        {
+            auto in_lens = in_x->get_shape().lens();
+            auto wt_lens = wt->get_shape().lens();
+            std::vector<std::size_t> k_lens(wt_lens.begin() + 2, wt_lens.end());
+            std::vector<int64_t> pads = values["padding"].to_vector<std::int64_t>();
+            cal_auto_padding_size(
+                info, values, k_lens, values["dilation"].to_vector<std::size_t>(), in_lens, pads);
+            values["padding"] = pads;
+        }
+        recalc_conv_attributes(values, kdims);
+        return values;
+    }
+    instruction_ref add_bias_to_conv(const instruction_ref bias_arg,
+                                     const instruction_ref conv_instr,
+                                     const onnx_parser::node_info& info) const
+    {
+        auto conv_sh   = conv_instr->get_shape();
+        auto conv_lens = conv_sh.lens();
+        auto conv_type = conv_sh.type();
+        auto broadcast_bias = info.add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", conv_lens}}), bias_arg);
+        auto f_bias =
+            info.add_instruction(make_op("convert", {{"target_type", conv_type}}), broadcast_bias);
+        return info.add_instruction(migraphx::make_op("add"), conv_instr, f_bias);
+    };
+    instruction_ref parse(const op_desc& /* opd */,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        check_inputs(args);
+        auto values = process_attributes(parser, info, args);
+        // input: quantized x, scale, zero_pt
+        const instruction_ref& in_x         = args[0];
+        const instruction_ref& in_scale_x   = args[1];
+        const instruction_ref& in_zero_pt_x = args[2];
+        // input: quantized weights, scale, zero_pt
+        const instruction_ref& in_w         = args[3];
+        const instruction_ref& in_scale_w   = args[4];
+        const instruction_ref& in_zero_pt_w = args[5];
+        // for the dequantized output  y: scale & zero_pt
+        const instruction_ref& in_scale_y   = args[6];
+        const instruction_ref& in_zero_pt_y = args[7];
+        auto dquant_x = bcast_qdq_instr("dequantizelinear", in_x, in_scale_x, in_zero_pt_x, info);
+        auto dquant_w = bcast_qdq_instr("dequantizelinear", in_w, in_scale_w, in_zero_pt_w, info);
+        auto conv_op = migraphx::make_op("convolution", values);
+        auto conv_x_w = info.add_instruction(conv_op, dquant_x, dquant_w);
+        // Biases, if any.. : is an optional argument.
+        if(args.size() > 8)
+            conv_x_w = add_bias_to_conv(args[8], conv_x_w, info);
+        auto quant_conv =
+            bcast_qdq_instr("quantizelinear", conv_x_w, in_scale_y, in_zero_pt_y, info);
+        return quant_conv;
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_qlinearglavgpool.cpp
+++ b/src/onnx/parse_qlinearglavgpool.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/op/pooling.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/broadcast_qdq.hpp>
+#include <migraphx/instruction.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+/*
+ *********************************************************************************
+ *  Reference: see QLinearGlobalAveragePool in                                   *
+ *  github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md          *
+ *********************************************************************************
+QLinearGlobalAveragePool consumes an input tensor X and applies
+Average pooling across the values in the same channel. This is
+equivalent to AveragePool with kernel size equal to the spatial
+dimension of input tensor. Input is of type uint8_t or int8_t.
+Version
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+Attributes
+channels_last : int
+Inputs
+X : T
+Input data tensor from the previous operator; According to channels_last, dimensions for image case
+are (N x C x H x W), or (N x H x W x C) where N is the batch size, C is the number of channels, and
+H and W are the height and the width of the data. For non image case, the dimensions are in the form
+of (N x C x D1 x D2 ... Dn), or (N x D1 X D2 ... Dn x C) where N is the batch size.
+x_scale : tensor(float)
+Scale of quantized input 'X'. It must be a scalar.
+x_zero_point : T
+Zero point tensor for input 'X'. It must be a scalar.
+y_scale : tensor(float)
+Scale of quantized output 'Y'. It must be a scalar.
+y_zero_point : T
+Zero point tensor for output 'Y'. It must be a scalar.
+Outputs
+Y : T
+Output data tensor from pooling across the input tensor. The output tensor has the same rank as the
+input. with the N and C value keep it value, while the other dimensions are all 1. Type Constraints
+T : tensor(uint8), tensor(int8)
+Constrain input and output types to signed/unsigned int8 tensors.
+*/
+struct parse_qlinearglobalaveragepool : op_parser<parse_qlinearglobalaveragepool>
+{
+    std::vector<op_desc> operators() const { return {{"QLinearGlobalAveragePool"}}; }
+    // basic type checking for QLinearGlobalAveragePool Operator
+    void check_inputs(const std::vector<instruction_ref>& args) const
+    {
+        if(args.size() < 5)
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: missing inputs");
+        const auto& in_x      = args[0];
+        const auto& zero_pt_x = args[2];
+        const auto& zero_pt_y = args[4];
+        if(in_x->get_shape().ndim() <= 2)
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: input dimensions too small");
+        auto type_x = in_x->get_shape().type();
+        if(type_x != migraphx::shape::int8_type and type_x != migraphx::shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: unsupported input type");
+        if(type_x != zero_pt_x->get_shape().type())
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: mismatched type: input zero point");
+        if(type_x != zero_pt_y->get_shape().type())
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: mismatched type: output zero point");
+    }
+    instruction_ref parse(const op_desc& /* opd */,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        int channels_last =
+            parser.parse_value(info.attributes.at("channels_last")).template at<int>();
+        if(channels_last != 0)
+            MIGRAPHX_THROW(
+                "QLINEARGLOBALAVERAGEPOOL: channels_last (N x D1..Dn x C) is not supported");
+        check_inputs(args);
+        // Input: X
+        const auto& in_x      = args[0];
+        const auto& scale_x   = args[1];
+        const auto& zero_pt_x = args[2];
+        auto dquant_x         = bcast_qdq_instr("dequantizelinear", in_x, scale_x, zero_pt_x, info);
+        // Output Y = globalaveragepool(X)
+        auto op   = migraphx::op::pooling{migraphx::op::pooling_mode::average};
+        auto lens = in_x->get_shape().lens();
+        std::vector<size_t> lengths(lens.begin() + 2, lens.end());
+        op.lengths = lengths;
+        op.padding = std::vector<size_t>(lens.size());
+        auto out_y = info.add_instruction(op, dquant_x);
+        const auto& scale_y   = args[3];
+        const auto& zero_pt_y = args[4];
+        auto out_quant_y = bcast_qdq_instr("quantizelinear", out_y, scale_y, zero_pt_y, info);
+        return out_quant_y;
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_qlinearmatmul.cpp
+++ b/src/onnx/parse_qlinearmatmul.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/op/pooling.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/broadcast_qdq.hpp>
+#include <migraphx/instruction.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+/*
+ *********************************************************************************
+ *  Reference: see QLinearMatMul in                                              *
+ *  https://onnx.ai/onnx/operators/onnx__QLinearMatMul.html                      *
+ *********************************************************************************
+Matrix product that behaves like numpy.matmul:
+https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html. It consumes two
+quantized input tensors, their scales and zero points, scale and zero point of output, and computes
+the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point). For (x
+/ y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding
+for details. Scale and zero point must have same shape. They must be either scalar (per tensor) or
+N-D tensor (per row for ‘a’ and per column for ‘b’). Scalar refers to per tensor quantization
+whereas N-D refers to per row or per column quantization. If the input is 2D of shape [M, K] then
+zero point and scale tensor may be an M element vector [v_1, v_2, …, v_M] for per row quantization
+and K element vector of shape [v_1, v_2, …, v_K] for per column quantization. If the input is N-D
+tensor with shape [D1, D2, M, K] then zero point and scale tensor may have shape [D1, D2, M, 1] for
+per row quantization and shape [D1, D2, 1, K] for per column quantization. Production must never
+overflow, and accumulation may overflow if and only if in 32 bits.
+Inputs
+a (heterogeneous) - T1: N-dimensional quantized matrix a
+a_scale (heterogeneous) - tensor(float): scale of quantized input a
+a_zero_point (heterogeneous) - T1: zero point of quantized input a
+b (heterogeneous) - T2: N-dimensional quantized matrix b
+b_scale (heterogeneous) - tensor(float): scale of quantized input b
+b_zero_point (heterogeneous) - T2: zero point of quantized input b
+y_scale (heterogeneous) - tensor(float): scale of quantized output y
+y_zero_point (heterogeneous) - T3: zero point of quantized output y
+Outputs
+y (heterogeneous) - T3: Quantized matrix multiply results from a * b
+Type Constraints
+T1 in ( tensor(int8), tensor(uint8) ): Constrain input a and its zero point data type to 8-bit
+integer tensor.
+T2 in ( tensor(int8), tensor(uint8) ): Constrain input b and its zero point data type to 8-bit
+integer tensor.
+T3 in ( tensor(int8), tensor(uint8) ): Constrain output y and its zero point data type to 8-bit
+integer tensor.
+*/
+struct parse_qlinearmatmul : op_parser<parse_qlinearmatmul>
+{
+    std::vector<op_desc> operators() const { return {{"QLinearMatMul"}}; }
+    // basic type checking for QLinearMatMul Operator
+    void check_inputs(const std::vector<instruction_ref>& args) const
+    {
+        if(args.size() < 8)
+            MIGRAPHX_THROW("QLINEARMATMUL: missing inputs");
+        const auto& in_a = args[0];
+        const auto& in_b = args[3];
+        auto sh_a = in_a->get_shape();
+        auto sh_b = in_b->get_shape();
+        auto type_a = sh_a.type();
+        auto type_b = sh_b.type();
+        if(type_a != migraphx::shape::int8_type and type_a != migraphx::shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARMATMUL: unsupported input type");
+        if(type_b != migraphx::shape::int8_type and type_b != migraphx::shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARMATMUL: unsupported input type");
+        auto lens_a = sh_a.lens();
+        auto lens_b = sh_b.lens();
+        size_t dim_a = lens_a.size();
+        size_t dim_b = lens_b.size();
+        if(dim_a == 0 or dim_b == 0)
+            MIGRAPHX_THROW("QLINEARMATMUL: empty input");
+        // broadcast supported if either is 1-D -- the other can be a 2-D tensor.
+        // if it is 1-D, just prepend/append that lens and check further constraints..
+        if(dim_a == 1)
+        {
+            lens_a.insert(lens_a.begin(), 1);
+            dim_a++;
+        }
+        if(dim_b == 1)
+        {
+            lens_b.push_back(1);
+            dim_b++;
+        }
+        // 2-D or higher-order mat mul
+        if(dim_a != dim_b or *lens_a.rbegin() != *(lens_b.rbegin() + 1) or
+           not std::equal(lens_a.rbegin() + 2, lens_a.rend(), lens_b.rbegin() + 2, lens_b.rend()))
+            MIGRAPHX_THROW("QLINEARMATMUL: mismatched input dimensions");
+        if(migraphx::any_of({args[1], args[2], args[4], args[5]},
+                            [](auto arg) { return not arg->get_shape().scalar(); }))
+            MIGRAPHX_THROW("QLINEARMATMUL: unsupported row/column quantization");
+    }
+    instruction_ref parse(const op_desc& /* opd */,
+                          const onnx_parser& /*parser*/,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        check_inputs(args);
+        // A
+        const auto& in_a         = args[0];
+        const auto& in_scale_a   = args[1];
+        const auto& in_zero_pt_a = args[2];
+        auto dquant_a = bcast_qdq_instr("dequantizelinear", in_a, in_scale_a, in_zero_pt_a, info);
+        // B
+        const auto& in_b         = args[3];
+        const auto& in_scale_b   = args[4];
+        const auto& in_zero_pt_b = args[5];
+        auto dquant_b = bcast_qdq_instr("dequantizelinear", in_b, in_scale_b, in_zero_pt_b, info);
+        bool is_a_prepended = false;
+        bool is_b_appended  = false;
+        // un-squeeze either tensor if 1-D.
+        if(in_a->get_shape().ndim() == 1)
+        {
+            is_a_prepended = true;
+            dquant_a       = info.add_instruction(make_op("unsqueeze", {{"axes", {0}}}), dquant_a);
+        }
+        if(in_b->get_shape().ndim() == 1)
+        {
+            is_b_appended = true;
+            dquant_b      = info.add_instruction(make_op("unsqueeze", {{"axes", {1}}}), dquant_b);
+        }
+        // Y = A * B
+        auto out_y = info.add_instruction(migraphx::make_op("dot"), dquant_a, dquant_b);
+        // squeeze just once if necessary.. not twice.
+        if(is_a_prepended)
+            out_y = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), out_y);
+        else if(is_b_appended)
+            out_y = info.add_instruction(make_op("squeeze", {{"axes", {1}}}), out_y);
+        const auto& scale_y   = args[6];
+        const auto& zero_pt_y = args[7];
+        return bcast_qdq_instr("quantizelinear", out_y, scale_y, zero_pt_y, info);
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_reshape.cpp
+++ b/src/onnx/parse_reshape.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -45,15 +45,25 @@ struct parse_reshape : op_parser<parse_reshape>
        {
            literal s = parser.parse_value(info.attributes.at("shape"));
            s.visit([&](auto v) { copy(v, std::back_inserter(dims)); });
+            return info.add_instruction(make_op("reshape", {{"dims", dims}}), args[0]);
        }
-        if(args.size() == 2)
+        else
        {
+            // 2 inputs
            auto s = args[1]->eval();
-            check_arg_empty(s, "Reshape: non-constant shape input is not supported");
+            if(s.empty())
-            s.visit([&](auto v) { copy(v, std::back_inserter(dims)); });
+            {
+                // arg[1] not eval-able
+                auto alloc_ins = info.add_instruction(
+                    make_op("allocate", {{"buf_type", args[0]->get_shape().type()}}), args[1]);
+                return info.add_instruction(make_op("reshape"), args[0], alloc_ins);
+            }
+            else
+            {
+                s.visit([&](auto v) { copy(v, std::back_inserter(dims)); });
+                return info.add_instruction(make_op("reshape", {{"dims", dims}}), args[0]);
+            }
        }
-        return info.add_instruction(make_op("reshape", {{"dims", dims}}), args[0]);
    }
 };

--- a/src/onnx/parse_shrink.cpp
+++ b/src/onnx/parse_shrink.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/make_op.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+struct parse_shrink : op_parser<parse_shrink>
+{
+    std::vector<op_desc> operators() const { return {{"Shrink"}}; }
+    instruction_ref parse(const op_desc&,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          std::vector<instruction_ref> args) const
+    {
+        float bias = 0.0;
+        if(contains(info.attributes, "bias"))
+        {
+            bias = parser.parse_value(info.attributes.at("bias")).at<float>();
+        }
+        float lambd = 0.5;
+        if(contains(info.attributes, "lambd"))
+        {
+            lambd = parser.parse_value(info.attributes.at("lambd")).at<float>();
+        }
+        auto x             = args[0];
+        auto x_shape       = x->get_shape();
+        auto x_type        = x_shape.type();
+        auto lit_bias      = info.add_literal(bias);
+        auto lit_neg_lambd = info.add_literal(-lambd);
+        auto lit_lambd     = info.add_literal(lambd);
+        auto x_plus_bias = info.add_common_op("add", x, lit_bias);
+        auto x_min_bias  = info.add_common_op("sub", x, lit_bias);
+        auto cond1   = info.add_common_op("less", x, lit_neg_lambd);
+        auto cond2_a = info.add_common_op("not", cond1);
+        auto cond2_b = info.add_common_op("greater", x, lit_lambd);
+        auto cond2   = info.add_common_op("logical_and", cond2_a, cond2_b);
+        auto mul1 = info.add_instruction(make_op("convert", {{"target_type", x_type}}), cond1);
+        auto mul2 = info.add_instruction(make_op("convert", {{"target_type", x_type}}), cond2);
+        auto first  = info.add_common_op("mul", mul1, x_plus_bias);
+        auto second = info.add_common_op("mul", mul2, x_min_bias);
+        auto ret    = info.add_common_op("add", first, second);
+        if(ret->get_shape().type() != x_type)
+        {
+            ret = info.add_instruction(make_op("convert", {{"target_type", x_type}}), ret);
+        }
+        return ret;
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -521,6 +521,27 @@ struct find_inner_broadcast
                      }) < (lens.size() - 1);
           }))
            return;
+        if(broadcasts.size() > 1)
+        {
+            auto bcast_strides = broadcasts.front()->get_shape().strides().size();
+            std::vector<size_t> common_axis(bcast_strides, 0);
+            // go through the strides of each broadcast,
+            // keep track of values that are equal to 0 in a dimension
+            for(auto i = 0; i < bcast_strides; i++)
+            {
+                for(const auto& broadcast : broadcasts)
+                {
+                    if(broadcast->get_shape().strides()[i] == 0)
+                        common_axis[i]++;
+                }
+            }
+            // if no common broadcast axis, transformation is not useful
+            if(std::find_if(common_axis.begin(), common_axis.end(), [](auto num_common) {
+                   return num_common > 1;
+               }) == common_axis.end())
+                return;
+        }
        std::vector<instruction_ref> inputs;
        std::transform(broadcasts.begin(),
                       broadcasts.end(),

--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -632,6 +632,9 @@ struct find_transpose_contiguous_reshaper_unary
    }
 };
+// simplifies broadcast->transpose to transpose->broadcast
+// in the case of a scalar, simply rewrite to broadcast
+// this can allow for further optimizations with find_inner_broadcast() in simplify_algebra.cpp
 struct find_broadcast_transpose
 {
    auto matcher() const
@@ -642,17 +645,30 @@ struct find_broadcast_transpose
    void apply(module& m, const match::matcher_result& r) const
    {
-        auto ins       = r.result;
+        auto transpose      = r.result;
-        auto ins_lens  = ins->get_shape().lens();
+        auto transpose_lens = transpose->get_shape().lens();
        auto bcast_ins = r.instructions["bcast_ins"];
        auto input     = bcast_ins->inputs().front();
-        // for now, focusing on scalar transformation
+        // scalar transformation does not need extra transpose
        if(not input->get_shape().scalar())
-            return;
+        {
+            // find common shape
+            auto in_lens  = input->get_shape().lens();
+            int lens_diff = transpose_lens.size() - in_lens.size();
+            // insert unsqueeze if input lens < transpose lens
+            if(lens_diff > 0)
+            {
+                std::vector<size_t> unsqueeze_axes(lens_diff);
+                std::iota(unsqueeze_axes.begin(), unsqueeze_axes.end(), 0);
+                input = m.insert_instruction(
+                    bcast_ins, make_op("unsqueeze", {{"axes", unsqueeze_axes}}), input);
+            }
+            // apply transpose before the multibroadcast
+            input = m.insert_instruction(bcast_ins, transpose->get_operator(), input);
+        }
        auto new_mbcast = m.insert_instruction(
-            bcast_ins, make_op("multibroadcast", {{"out_lens", ins_lens}}), input);
+            bcast_ins, make_op("multibroadcast", {{"out_lens", transpose_lens}}), input);
-        m.replace_instruction(ins, new_mbcast);
+        m.replace_instruction(transpose, new_mbcast);
    }
 };