Unverified commit ca300bd6 authored by Chris Austen, committed by GitHub

Merge branch 'develop' into blas_tuning

parents 5adb597c e7486577
@@ -35,7 +35,7 @@ fastjsonschema==2.16.3
     # via rocm-docs-core
 gitdb==4.0.10
     # via gitpython
-gitpython==3.1.32
+gitpython==3.1.37
     # via rocm-docs-core
 idna==3.4
     # via requests
@@ -87,7 +87,7 @@ requests==2.28.2
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.24.2
+rocm-docs-core==0.26.0
     # via -r requirements.in
 smmap==5.0.0
     # via gitdb
@@ -130,7 +130,7 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 typing-extensions==4.5.0
     # via pydata-sphinx-theme
-urllib3==1.26.15
+urllib3==1.26.18
     # via requests
 wrapt==1.15.0
     # via deprecated
@@ -187,6 +187,13 @@ struct value_parser
     }
 };
 
+// version for std::optional object
+template <class T>
+struct value_parser<std::optional<T>>
+{
+    static T apply(const std::string& x) { return value_parser<T>::apply(x); }
+};
+
 struct argument_parser
 {
     struct argument
...
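The new value_parser<std::optional<T>> specialization lets driver flags bind to std::optional members, so "flag not passed" no longer needs a sentinel value. Below is a minimal standalone sketch of the same pattern; the stream-based parser and main() are illustrative stand-ins, not the MIGraphX argument_parser.

#include <iostream>
#include <optional>
#include <sstream>
#include <string>

// Generic parser: read a value of type T from a string (illustrative only)
template <class T>
struct value_parser
{
    static T apply(const std::string& x)
    {
        T value{};
        std::stringstream ss(x);
        ss >> value;
        return value;
    }
};

// Specialization for std::optional<T>: parse the wrapped type; assigning the
// result to an std::optional<T> member engages it, so an absent flag stays empty.
template <class T>
struct value_parser<std::optional<T>>
{
    static T apply(const std::string& x) { return value_parser<T>::apply(x); }
};

int main()
{
    std::optional<double> rms_tol; // stays empty unless the flag was passed
    rms_tol = value_parser<std::optional<double>>::apply("0.001");
    std::cout << (rms_tol ? *rms_tol : -1.0) << "\n"; // prints 0.001
}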
@@ -540,22 +540,17 @@ struct params : command<params>
 struct verify : command<verify>
 {
     compiler c;
-    // Set to -1. as nonsense initial value
-    double rms_tol = -1.0;
-    double atol    = -1.0;
-    double rtol    = -1.0;
+    std::optional<double> rms_tol;
+    std::optional<double> atol;
+    std::optional<double> rtol;
     bool per_instruction = false;
     bool reduce          = false;
     void parse(argument_parser& ap)
     {
         c.parse(ap);
-        ap(rms_tol, {"--rms-tol"}, ap.help("Tolerance for the RMS error (Default: 0.001)"));
-        ap(atol,
-           {"--atol"},
-           ap.help("Tolerance for the elementwise absolute difference (Default: 0.001)"));
-        ap(rtol,
-           {"--rtol"},
-           ap.help("Tolerance for the elementwise relative difference (Default: 0.001)"));
+        ap(rms_tol, {"--rms-tol"}, ap.help("Tolerance for the RMS error"));
+        ap(atol, {"--atol"}, ap.help("Tolerance for the elementwise absolute difference"));
+        ap(rtol, {"--rtol"}, ap.help("Tolerance for the elementwise relative difference"));
         ap(per_instruction,
            {"-i", "--per-instruction"},
            ap.help("Verify each instruction"),
@@ -572,33 +567,6 @@ struct verify : command<verify>
         auto t = c.ct.get_target();
         auto m = c.parameters.generate(p, t, true, c.l.batch);
 
-        // TODO remove this and make the driver able to figure out datatype most used in the model
-        // then set the tolerances appropriately. Need to check here because c.to_fp16 only set
-        // after argument_parser.parse() is run. This code is complicated because there's not a
-        // good way to change the default tolerances after reading `--fp16` but before reading
-        // `--rms-tol`, `--atol`, and `--rtol`.
-        migraphx::verify::tolerance tols{};
-        if(c.to_fp16)
-        {
-            tols = migraphx::verify::tolerance{8e-2, 4e-2, 4e-2};
-        }
-        if(not float_equal(this->rms_tol, -1.0))
-        {
-            tols.rms_tol = this->rms_tol;
-        }
-        if(not float_equal(this->atol, -1.0))
-        {
-            tols.atol = this->atol;
-        }
-        if(not float_equal(this->rtol, -1.0))
-        {
-            tols.rtol = this->rtol;
-        }
-        std::cout << "rms_tol: " << tols.rms_tol << std::endl;
-        std::cout << "atol: " << tols.atol << std::endl;
-        std::cout << "rtol: " << tols.rtol << std::endl;
-
         auto quantize = precision::fp32;
         if(c.to_fp16)
         {
@@ -609,6 +577,11 @@ struct verify : command<verify>
             quantize = precision::int8;
         }
 
+        auto tols = get_tolerances(p, quantize, rms_tol, atol, rtol);
+        std::cout << "rms_tol: " << tols.rms_tol << std::endl;
+        std::cout << "atol: " << tols.atol << std::endl;
+        std::cout << "rtol: " << tols.rtol << std::endl;
+
         if(per_instruction)
         {
             verify_instructions(p, t, c.co, quantize, tols);
...
@@ -36,6 +36,42 @@ namespace migraphx {
 namespace driver {
 inline namespace MIGRAPHX_INLINE_NS {
 
+/**
+ * Gives tolerances based on user input (`rms_tol`, `atol`, `rtol` parameters) and defaults.
+ * Sets fp16 tolerances if the `quantize` input is fp16 or any fp16 instruction is found in the
+ * model.
+ */
+verify::tolerance get_tolerances(const program& p,
+                                 precision quantize,
+                                 std::optional<double> rms_tol,
+                                 std::optional<double> atol,
+                                 std::optional<double> rtol)
+{
+    bool has_fp16 = any_of(p.get_modules(), [](auto&& m) {
+        return any_of(*m, [](auto&& ins) { return (ins.get_shape().type() == shape::half_type); });
+    });
+    migraphx::verify::tolerance result{};
+    if(has_fp16 or quantize == precision::fp16)
+    {
+        result.rms_tol = 8e-2;
+        result.atol    = 4e-2;
+        result.rtol    = 4e-2;
+    }
+    if(rms_tol)
+    {
+        result.rms_tol = *rms_tol;
+    }
+    if(atol)
+    {
+        result.atol = *atol;
+    }
+    if(rtol)
+    {
+        result.rtol = *rtol;
+    }
+    return result;
+}
+
 std::vector<argument> run_ref(program p, const parameter_map& inputs)
 {
     p.compile(migraphx::make_target("ref"));
...
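For reference, the selection logic above follows a simple precedence: start from the defaults, widen them when the model runs in fp16, then let explicit user values win. A hedged standalone sketch of that precedence; the 0.001 defaults mirror the old help text, and pick_tolerances is illustrative, not the MIGraphX API.

#include <iostream>
#include <optional>

struct tolerance
{
    double rms_tol = 0.001; // defaults taken from the removed "(Default: 0.001)" help text
    double atol    = 0.001;
    double rtol    = 0.001;
};

tolerance pick_tolerances(bool fp16,
                          std::optional<double> rms_tol,
                          std::optional<double> atol,
                          std::optional<double> rtol)
{
    tolerance result{};
    if(fp16)
        result = tolerance{8e-2, 4e-2, 4e-2}; // fp16 models get looser defaults
    if(rms_tol)
        result.rms_tol = *rms_tol; // explicit flags override any default
    if(atol)
        result.atol = *atol;
    if(rtol)
        result.rtol = *rtol;
    return result;
}

int main()
{
    auto t = pick_tolerances(true, std::nullopt, 1e-3, std::nullopt);
    std::cout << t.rms_tol << " " << t.atol << " " << t.rtol << "\n"; // 0.08 0.001 0.04
}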
@@ -32,6 +32,12 @@ namespace migraphx {
 namespace driver {
 inline namespace MIGRAPHX_INLINE_NS {
 
+verify::tolerance get_tolerances(const program& p,
+                                 precision quantize,
+                                 std::optional<double> rms_tol,
+                                 std::optional<double> atol,
+                                 std::optional<double> rtol);
+
 void verify_program(const std::string& name,
                     const program& p,
                     const target& t,
...
@@ -46,7 +46,7 @@ struct MIGRAPHX_EXPORT argument : raw_data<argument>
 {
     argument() = default;
 
-    argument(const shape& s);
+    explicit argument(const shape& s);
 
     template <class F, MIGRAPHX_REQUIRES(std::is_pointer<decltype(std::declval<F>()())>{})>
     argument(shape s, F d)
...
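Marking argument(const shape&) as explicit is what forces the `return {output_shape};` call sites elsewhere in this commit to become `return argument{output_shape};`: copy-list-initialization in a return statement cannot select an explicit constructor. A tiny illustration with stand-in types (shape_like and arg are not the MIGraphX classes):

// Why `return {s};` stops compiling once the converting constructor is explicit.
struct shape_like
{
    int elements = 0;
};

struct arg
{
    arg() = default;
    explicit arg(const shape_like& s) : n(s.elements) {}
    int n = 0;
};

arg make_arg(const shape_like& s)
{
    // return {s};   // error: copy-list-initialization cannot use an explicit constructor
    return arg{s};   // OK: direct-list-initialization names the type
}

int main() { return make_arg(shape_like{4}).n == 4 ? 0 : 1; }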
@@ -88,13 +88,13 @@ struct allocate
     {
         if(args.empty())
         {
-            return {output_shape};
+            return argument{output_shape};
         }
         else
         {
            std::vector<std::size_t> output_dims(output_shape.ndim());
            args.at(0).visit([&](auto a) { output_dims.assign(a.begin(), a.end()); });
-            return {shape{buf_type, output_dims}};
+            return argument{shape{buf_type, output_dims}};
         }
     }
 };
...
@@ -411,7 +411,7 @@ struct pooling
             // for dynamic GlobalPooling, there's no padding
             kernel_dims.insert(kernel_dims.end(), input_lens.begin() + 2, input_lens.end());
             output_shape = dyn_out.computed_shape;
-            result       = dyn_out.computed_shape;
+            result       = argument{dyn_out.computed_shape};
         }
         else if((padding_mode != op::padding_mode_t::default_))
         {
@@ -439,7 +439,7 @@ struct pooling
         {
             kernel_dims  = this->lengths;
             output_shape = dyn_out.computed_shape;
-            result       = dyn_out.computed_shape;
+            result       = argument{dyn_out.computed_shape};
         }
         // Perform the computation and populate result
...
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_groupnorm : op_parser<parse_groupnorm>
{
std::vector<op_desc> operators() const { return {{"GroupNormalization"}}; }
instruction_ref parse(const op_desc& /*opd*/,
const onnx_parser& parser,
const onnx_parser::node_info& info,
std::vector<instruction_ref> args) const
{
float epsilon = 1e-5f;
if(contains(info.attributes, "epsilon"))
{
epsilon = parser.parse_value(info.attributes.at("epsilon")).at<float>();
}
size_t num_groups;
if(contains(info.attributes, "num_groups"))
{
num_groups = parser.parse_value(info.attributes.at("num_groups")).at<size_t>();
}
else
{
MIGRAPHX_THROW("PARSE_GROUPNORM: num_groups must be available");
}
if(args.size() != 3)
{
MIGRAPHX_THROW("PARSE_GROUPNORM: invalid input count");
}
auto x = args.at(0);
auto scale = args.at(1);
auto bias = args.at(2);
auto x_shape = x->get_shape();
auto x_dtype = x_shape.type();
auto x_dims = x_shape.lens();
if(x_shape.ndim() <= 2)
{
MIGRAPHX_THROW("PARSE_GROUPNORM: invalid input shape");
}
auto c = x_shape.lens().at(1);
if(c % num_groups != 0)
{
MIGRAPHX_THROW(
"PARSE_GROUPNORM: num_groups should be a divisor of the number of channels");
}
auto group_size = c / num_groups;
if(scale->get_shape().ndim() != 1 or scale->get_shape().lens().at(0) != num_groups)
{
MIGRAPHX_THROW("PARSE_GROUPNORM: scale tensor shape should be num_groups");
}
if(bias->get_shape().ndim() != 1 or bias->get_shape().lens().at(0) != num_groups)
{
MIGRAPHX_THROW("PARSE_GROUPNORM: bias tensor shape should be num_groups");
}
// Original shape: N x C x D1 x ... x Dn
// New shape: N x num_groups x C // num_groups x D1 x ... x Dn
std::vector<size_t> dims = {x_dims.at(0), num_groups, group_size};
std::copy(x_dims.begin() + 2, x_dims.end(), std::back_inserter(dims));
auto x_reshaped = info.add_instruction(make_op("reshape", {{"dims", dims}}), x);
// Axes for D1 x ... x Dn
std::vector<size_t> axes(dims.size() - 2);
std::iota(axes.begin(), axes.end(), 2);
// y = (x - mean) * rsqrt(variance + epsilon) * scale + bias
// mean = reduce_mean({D1, D2, ... Dk}, x)
// variance = reduce_mean({D1, D2, ... Dk}, (x - mean)^2)
auto mean = info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x_reshaped);
auto x_sub_mean = info.add_common_op("sub", x_reshaped, mean);
auto x_sqdiff_mean = info.add_common_op("sqdiff", x_reshaped, mean);
auto variance =
info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x_sqdiff_mean);
epsilon =
(x_dtype == migraphx::shape::half_type and std::abs(epsilon) < 1e-7) ? 1e-7 : epsilon;
auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_dtype}, {epsilon}});
auto var_eps = info.add_common_op("add", variance, eps);
auto rsqrt = info.add_instruction(make_op("rsqrt"), var_eps);
auto result = info.add_common_op("mul", x_sub_mean, rsqrt);
auto scale_bcast =
info.add_instruction(make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), scale);
auto bias_bcast =
info.add_instruction(make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), bias);
auto scaled = info.add_instruction(make_op("mul"), result, scale_bcast);
auto y = info.add_instruction(make_op("add"), scaled, bias_bcast);
auto y_reshaped = info.add_instruction(make_op("reshape", {{"dims", x_dims}}), y);
return y_reshaped;
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
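The GroupNormalization parser above lowers the op to reshape + reduce_mean + elementwise instructions. As a sanity reference, here is a standalone C++ computation of the same formula on a tiny NCHW tensor; the shapes and values are made up for illustration and this is not MIGraphX code.

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::size_t n = 1, c = 4, hw = 2, groups = 2;
    const std::size_t group_size = c / groups;
    const float epsilon          = 1e-5f;
    std::vector<float> x     = {1, 2, 3, 4, 5, 6, 7, 8}; // N*C*HW elements, NCHW layout
    std::vector<float> scale = {1.0f, 0.5f};              // one value per group
    std::vector<float> bias  = {0.0f, 1.0f};

    std::vector<float> y(x.size());
    for(std::size_t b = 0; b < n; ++b)
    {
        for(std::size_t g = 0; g < groups; ++g)
        {
            // mean and variance over the (C/G x HW) block belonging to this group
            const std::size_t elems = group_size * hw;
            const std::size_t base  = (b * c + g * group_size) * hw;
            double mean = 0, var = 0;
            for(std::size_t i = 0; i < elems; ++i)
                mean += x[base + i];
            mean /= elems;
            for(std::size_t i = 0; i < elems; ++i)
                var += (x[base + i] - mean) * (x[base + i] - mean);
            var /= elems;
            // y = (x - mean) * rsqrt(variance + epsilon) * scale[g] + bias[g]
            const double rstd = 1.0 / std::sqrt(var + epsilon);
            for(std::size_t i = 0; i < elems; ++i)
                y[base + i] = (x[base + i] - mean) * rstd * scale[g] + bias[g];
        }
    }
    for(float v : y)
        std::cout << v << " ";
    std::cout << "\n";
}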
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_layernorm : op_parser<parse_layernorm>
{
std::vector<op_desc> operators() const { return {{"LayerNormalization"}}; }
std::vector<instruction_ref> parse(const op_desc& /*opd*/,
const onnx_parser& parser,
const onnx_parser::node_info& info,
std::vector<instruction_ref> args) const
{
int64_t axis = -1;
if(contains(info.attributes, "axis"))
{
axis = parser.parse_value(info.attributes.at("axis")).at<int64_t>();
}
float epsilon = 1e-5f;
if(contains(info.attributes, "epsilon"))
{
epsilon = parser.parse_value(info.attributes.at("epsilon")).at<float>();
}
if(contains(info.attributes, "stash_type"))
{
std::cerr << "WARNING: LAYERNORM does not support stash_type, it will be ignored.\n";
}
if(args.size() < 2 or args.size() > 3)
{
MIGRAPHX_THROW("PARSE_LAYERNORM: invalid input count");
}
auto x = args.at(0);
auto scale = args.at(1);
bool skip_bias = args.size() == 2;
instruction_ref bias;
if(not skip_bias)
{
bias = args.at(2);
}
auto x_shape = x->get_shape();
auto x_dtype = x_shape.type();
int64_t x_rank = x_shape.ndim();
if(x_rank < 2)
{
MIGRAPHX_THROW("PARSE_LAYERNORM: invalid input shape");
}
// If rank(X) is r, axis' allowed range is [-r, r)
if(axis < -x_rank or axis >= x_rank)
{
MIGRAPHX_THROW("PARSE_LAYERNORM: invalid axis");
}
// y = (x - mean) * rsqrt(variance + epsilon) * scale + bias
// mean = reduce_mean({D1, D2, ... Dk}, x)
// variance = reduce_mean({D1, D2, ... Dk}, (x - mean)^2)
// axis can be negative
axis = axis < 0 ? axis + x_rank : axis;
auto kdims = x_rank - axis;
std::vector<int64_t> axes(kdims);
std::iota(axes.begin(), axes.end(), axis);
auto skipped_axes = x_rank - kdims;
auto mean = info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x);
auto x_sub_mean = info.add_common_op("sub", x, mean);
auto x_sqdiff_mean = info.add_common_op("sqdiff", x, mean);
auto variance =
info.add_instruction(make_op("reduce_mean", {{"axes", axes}}), x_sqdiff_mean);
epsilon =
(x_dtype == migraphx::shape::half_type and std::abs(epsilon) < 1e-7) ? 1e-7 : epsilon;
auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_dtype}, {epsilon}});
auto var_eps = info.add_common_op("add", variance, eps);
auto rsqrt = info.add_instruction(make_op("rsqrt"), var_eps);
auto result = info.add_common_op("mul", x_sub_mean, rsqrt);
instruction_ref scale_bcast = scale;
instruction_ref bias_bcast = bias;
if(skipped_axes > 0)
{
auto x_dims = x_shape.lens();
scale_bcast = info.add_instruction(
make_op("broadcast", {{"axis", skipped_axes}, {"out_lens", x_dims}}), scale);
if(not skip_bias)
{
bias_bcast = info.add_instruction(
make_op("broadcast", {{"axis", skipped_axes}, {"out_lens", x_dims}}), bias);
}
}
auto scaled = info.add_instruction(make_op("mul"), result, scale_bcast);
auto y = skip_bias ? scaled : info.add_instruction(make_op("add"), scaled, bias_bcast);
return {y, mean, rsqrt};
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
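The LayerNormalization parser normalizes over the trailing axes starting at `axis`, which may be negative. A small illustrative sketch of that axis handling follows; layernorm_axes is a hypothetical helper, not part of MIGraphX.

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Wrap a possibly negative axis into [0, rank) and list the reduction axes [axis, rank).
std::vector<int64_t> layernorm_axes(int64_t axis, int64_t rank)
{
    if(axis < 0)
        axis += rank; // ONNX allows axis in [-rank, rank)
    std::vector<int64_t> axes(rank - axis);
    std::iota(axes.begin(), axes.end(), axis);
    return axes;
}

int main()
{
    for(auto a : layernorm_axes(-1, 4)) // rank-4 input, default axis -1
        std::cout << a << " ";          // prints: 3
    std::cout << "\n";
    for(auto a : layernorm_axes(1, 4))
        std::cout << a << " ";          // prints: 1 2 3
    std::cout << "\n";
}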
@@ -115,65 +115,71 @@ struct parse_pad : op_parser<parse_pad>
 {
     std::vector<op_desc> operators() const { return {{"Pad"}}; }
 
-    instruction_ref parse(const op_desc& /*opd*/,
-                          const onnx_parser& parser,
-                          onnx_parser::node_info info,
-                          std::vector<instruction_ref> args) const
-    {
-        std::vector<int64_t> pads{};
-        if(args.size() >= 2)
-        {
-            auto pad_arg = args.at(1)->eval();
-            check_arg_empty(pad_arg, "PARSE_PAD: pad input must be constant");
-            pad_arg.visit([&](auto v) { pads.assign(v.begin(), v.end()); });
-        }
-        else if(contains(info.attributes, "pads"))
-        {
-            auto&& pad_vals = info.attributes["pads"].ints();
-            pads = std::vector<int64_t>(pad_vals.begin(), pad_vals.end());
-        }
-        else
-        {
-            MIGRAPHX_THROW("PARSE_PAD: pad must be available");
-        }
-
-        // check if padding is actually being done (at least one value is nonzero)
-        if(std::all_of(pads.begin(), pads.end(), [](const int& i) { return i == 0; }))
-        {
-            return info.add_instruction(make_op("identity"), args.front());
-        }
-
-        if(contains(info.attributes, "mode"))
-        {
-            auto mode = info.attributes.at("mode").s();
-            if(mode == "reflect")
-            {
-                if(args.front()->get_shape().dynamic())
-                {
-                    MIGRAPHX_THROW("PARSE_PAD: reflect padding with dynamic shape not supported");
-                }
-                return reflect_pad(info, pads, args.front());
-            }
-            if(mode != "constant")
-            {
-                MIGRAPHX_THROW(
-                    "PARSE_PAD: migraphx currently only supports constant and reflect padding");
-            }
-        }
-
+    std::string parse_mode(const onnx_parser::node_info& info,
+                           const std::vector<instruction_ref>& args) const
+    {
+        if(contains(info.attributes, "mode"))
+        {
+            auto mode = info.attributes.at("mode").s();
+            if(mode == "reflect")
+            {
+                if(args.front()->get_shape().dynamic())
+                {
+                    MIGRAPHX_THROW("PARSE_PAD: reflect padding with dynamic shape not supported");
+                }
+            }
+            else if(mode != "constant")
+            {
+                MIGRAPHX_THROW(
+                    "PARSE_PAD: migraphx currently only supports constant and reflect padding");
+            }
+            return mode;
+        }
+        else
+        {
+            // default mode
+            return "constant";
+        }
+    }
+
+    std::vector<int64_t> parse_pads(const onnx_parser::node_info& info,
+                                    const std::vector<instruction_ref>& args) const
+    {
+        std::vector<int64_t> pads{};
+        if(args.size() >= 2)
+        {
+            auto pad_arg = args.at(1)->eval();
+            check_arg_empty(pad_arg, "PARSE_PAD: `pads` input must be constant");
+            pad_arg.visit([&](auto v) { pads.assign(v.begin(), v.end()); });
+        }
+        else if(contains(info.attributes, "pads"))
+        {
+            auto&& pad_vals = info.attributes.at("pads").ints();
+            pads = std::vector<int64_t>(pad_vals.begin(), pad_vals.end());
+        }
+        else
+        {
+            MIGRAPHX_THROW("PARSE_PAD: `pads` must be available");
+        }
+        return pads;
+    }
+
+    float parse_constant_value(const onnx_parser& parser,
+                               const onnx_parser::node_info& info,
+                               const std::vector<instruction_ref>& args) const
+    {
         float value = 0.0f;
-        // third input is the value
-        if(args.size() == 3)
+        if(args.size() >= 3 and args.at(2)->get_shape().scalar())
         {
             auto val_ins = args.at(2);
             if(not val_ins->can_eval())
             {
-                MIGRAPHX_THROW("PARSE_PAD: input value must be constant");
+                MIGRAPHX_THROW("PARSE_PAD: input `value` must be constant");
             }
             auto val_arg = val_ins->eval();
             if(val_arg.get_shape().elements() != 1)
             {
-                MIGRAPHX_THROW("PARSE_PAD: value should contain only one element");
+                MIGRAPHX_THROW("PARSE_PAD: `value` should contain only one element");
             }
             value = val_arg.at<float>();
         }
@@ -181,6 +187,81 @@ struct parse_pad : op_parser<parse_pad>
         {
             value = parser.parse_value(info.attributes.at("value")).at<float>();
         }
+        return value;
+    }
+
+    std::vector<int64_t> parse_axes(const std::vector<instruction_ref>& args,
+                                    bool is_constant_mode) const
+    {
+        std::vector<int64_t> axes{};
+        // axes is 3rd or 4th, depending on constant mode
+        auto pos = is_constant_mode ? 4 : 3;
+        if(args.size() >= pos)
+        {
+            auto axes_arg = args.at(pos - 1)->eval();
+            check_arg_empty(axes_arg, "PARSE_PAD: variable `axes` input not supported");
+            axes_arg.visit([&](auto v) { axes.assign(v.begin(), v.end()); });
+        }
+        return axes;
+    }
+
+    std::vector<int64_t> calculate_pads_with_axes(const std::vector<int64_t>& pads,
+                                                  const std::vector<int64_t>& axes,
+                                                  size_t input_rank) const
+    {
+        size_t num_axes = axes.size();
+        if(num_axes * 2 != pads.size())
+        {
+            MIGRAPHX_THROW("PARSE_PAD: number of elements of pads should be equal to 2 * "
+                           "number of elements of axes");
+        }
+        std::vector<int64_t> new_pads(input_rank * 2);
+        for(size_t idx{0}; idx < num_axes; ++idx)
+        {
+            // axis can be negative
+            int64_t axis = axes[idx] < 0 ? input_rank + axes[idx] : axes[idx];
+            // pad format is x1_begin, x2_begin, ... , x3_end, x4_end
+            new_pads[axis]              = pads[idx];
+            new_pads[axis + input_rank] = pads[idx + num_axes];
+        }
+        return new_pads;
+    }
+
+    instruction_ref parse(const op_desc& /*opd*/,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        std::vector<int64_t> pads = parse_pads(info, args);
+        // check if padding is actually being done (at least one value is nonzero)
+        if(std::all_of(pads.begin(), pads.end(), [](const int& i) { return i == 0; }))
+        {
+            return info.add_instruction(make_op("identity"), args.front());
+        }
+
+        std::string mode      = parse_mode(info, args);
+        bool is_constant_mode = mode == "constant";
+
+        float value = is_constant_mode ? parse_constant_value(parser, info, args) : 0.0f;
+
+        std::vector<int64_t> axes = parse_axes(args, is_constant_mode);
+
+        size_t input_rank = args.front()->get_shape().ndim();
+        if(not axes.empty())
+        {
+            pads = calculate_pads_with_axes(pads, axes, input_rank);
+        }
+
+        if(pads.size() != input_rank * 2)
+        {
+            MIGRAPHX_THROW("PARSE_PAD: number of elements of pads should be equal to 2 * "
+                           "input rank");
+        }
+
+        if(mode == "reflect")
+        {
+            return reflect_pad(info, pads, args.front());
+        }
+
         return info.add_instruction(migraphx::make_op("pad", {{"pads", pads}, {"value", value}}),
                                     args.front());
...
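calculate_pads_with_axes above expands a per-axis `pads`/`axes` pair into the full 2*rank pad list, with all begin values first and all end values second. A standalone sketch of the same expansion; expand_pads is a hypothetical helper and the values are made up.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> expand_pads(const std::vector<int64_t>& pads,
                                 const std::vector<int64_t>& axes,
                                 std::size_t rank)
{
    std::vector<int64_t> full(rank * 2, 0); // unlisted axes get zero padding
    for(std::size_t i = 0; i < axes.size(); ++i)
    {
        const int64_t axis = axes[i] < 0 ? axes[i] + static_cast<int64_t>(rank) : axes[i];
        full[axis]        = pads[i];               // begin padding for this axis
        full[axis + rank] = pads[i + axes.size()]; // end padding for this axis
    }
    return full;
}

int main()
{
    // Pad only axis 2 and axis -1 of a rank-4 tensor: begins {1, 2}, ends {3, 4}
    for(auto p : expand_pads({1, 2, 3, 4}, {2, -1}, 4))
        std::cout << p << " "; // prints: 0 0 1 2 0 0 3 4
    std::cout << "\n";
}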
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_shrink : op_parser<parse_shrink>
{
std::vector<op_desc> operators() const { return {{"Shrink"}}; }
instruction_ref parse(const op_desc&,
const onnx_parser& parser,
const onnx_parser::node_info& info,
std::vector<instruction_ref> args) const
{
float bias = 0.0;
if(contains(info.attributes, "bias"))
{
bias = parser.parse_value(info.attributes.at("bias")).at<float>();
}
float lambd = 0.5;
if(contains(info.attributes, "lambd"))
{
lambd = parser.parse_value(info.attributes.at("lambd")).at<float>();
}
auto x = args[0];
auto x_shape = x->get_shape();
auto x_type = x_shape.type();
auto lit_bias = info.add_literal(bias);
auto lit_neg_lambd = info.add_literal(-lambd);
auto lit_lambd = info.add_literal(lambd);
auto x_plus_bias = info.add_common_op("add", x, lit_bias);
auto x_min_bias = info.add_common_op("sub", x, lit_bias);
auto cond1 = info.add_common_op("less", x, lit_neg_lambd);
auto cond2_a = info.add_common_op("not", cond1);
auto cond2_b = info.add_common_op("greater", x, lit_lambd);
auto cond2 = info.add_common_op("logical_and", cond2_a, cond2_b);
auto mul1 = info.add_instruction(make_op("convert", {{"target_type", x_type}}), cond1);
auto mul2 = info.add_instruction(make_op("convert", {{"target_type", x_type}}), cond2);
auto first = info.add_common_op("mul", mul1, x_plus_bias);
auto second = info.add_common_op("mul", mul2, x_min_bias);
auto ret = info.add_common_op("add", first, second);
if(ret->get_shape().type() != x_type)
{
ret = info.add_instruction(make_op("convert", {{"target_type", x_type}}), ret);
}
return ret;
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
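The Shrink parser builds its result from two comparison masks so the computation stays elementwise on tensors; the scalar formula it implements is y = x + bias when x < -lambd, y = x - bias when x > lambd, and 0 otherwise. A scalar reference sketch (not MIGraphX code):

#include <iostream>

float shrink(float x, float bias = 0.0f, float lambd = 0.5f)
{
    if(x < -lambd)
        return x + bias;
    if(x > lambd)
        return x - bias;
    return 0.0f;
}

int main()
{
    std::cout << shrink(-2.0f, 1.0f) << " "  // -1
              << shrink(0.25f, 1.0f) << " "  // 0
              << shrink(2.0f, 1.0f) << "\n"; // 1
}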
@@ -139,6 +139,12 @@ void hip_compile_options::set_launch_params(
     global = compute_global(local);
 }
 
+static bool hip_accept_non_uniform_wg()
+{
+    static bool non_uniform_wg = hip_has_flags({"-fno-offload-uniform-block"});
+    return non_uniform_wg;
+}
+
 std::function<std::size_t(std::size_t local)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over)
 {
@@ -146,11 +152,12 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
     std::size_t max_global = ctx.get_current_device().get_cu_count() *
                              ctx.get_current_device().get_max_workitems_per_cu();
     return [n, over, max_global](std::size_t local) {
-        // hip require global workitems multiple of local workitems. It may degrade performance.
-        // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available.
-        // https://reviews.llvm.org/D155213
-        std::size_t num_elements = ((n + local - 1) / local) * local;
-        std::size_t groups       = (num_elements + local - 1) / local;
+        std::size_t num_elements = n;
+        if(not hip_accept_non_uniform_wg())
+        {
+            num_elements = (1 + (n - 1) / local) * local;
+        }
+        std::size_t groups     = 1 + (num_elements - 1) / local;
         std::size_t max_blocks = max_global / local;
         std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
         return std::min(nglobal, num_elements);
@@ -183,6 +190,11 @@ operation compile_hip_code_object(const std::string& content, hip_compile_options
         generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
     srcs.emplace_back("args.hpp", args_hpp);
 
+    if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
+        options.params += " -fno-offload-uniform-block";
+    else
+        assert(options.global % options.local == 0);
+
     options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
     options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
     options.params += " " + join_strings(compiler_warnings(), " ");
...
@@ -199,9 +199,9 @@ struct miopen_convolution
         // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
         preallocate = true;
 #endif
-        auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
-        auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
-        auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+        auto x = preallocate ? to_gpu(generate_argument(x_shape)) : argument{inputs[0]};
+        auto w = preallocate ? to_gpu(generate_argument(w_shape)) : argument{inputs[1]};
+        auto y = preallocate ? allocate_gpu(output_shape) : argument{inputs[2]};
         auto workspace =
             preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);
...
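compute_global_for above now only rounds the element count up to a multiple of the local size when the compiler cannot accept non-uniform workgroups (i.e. -fno-offload-uniform-block is unavailable). A host-side sketch of that arithmetic follows; pick_nglobal is illustrative and the numbers are made up.

#include <algorithm>
#include <cstddef>
#include <iostream>

std::size_t pick_nglobal(std::size_t n, std::size_t local, std::size_t max_global,
                         std::size_t over, bool non_uniform_wg)
{
    std::size_t num_elements = n;
    if(not non_uniform_wg)
        num_elements = (1 + (n - 1) / local) * local; // round up to a multiple of local
    std::size_t groups     = 1 + (num_elements - 1) / local;
    std::size_t max_blocks = max_global / local;
    std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
    return std::min(nglobal, num_elements);
}

int main()
{
    // 1000 elements with 256-wide workgroups: a uniform launch pads to 1024,
    // a non-uniform launch keeps 1000 global work-items.
    std::cout << pick_nglobal(1000, 256, 1u << 20, 1, false) << " "
              << pick_nglobal(1000, 256, 1u << 20, 1, true) << "\n"; // 1024 1000
}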
@@ -31,6 +31,14 @@
 #include <migraphx/kernels/debug.hpp>
 #include <migraphx/kernels/functional.hpp>
 
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+extern "C" __device__ size_t __ockl_get_enqueued_local_size(uint); // NOLINT
+extern "C" __device__ size_t __ockl_get_local_size(uint);          // NOLINT
+#pragma clang diagnostic pop
+#endif
+
 namespace migraphx {
 
 #if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
@@ -49,39 +57,33 @@ inline __device__ __attribute__((const)) index_int compute_global_size()
 #endif
 }
 
-// We cant just use blockDim.x to get the local size since its broken on hip
-// when global is not divisible by local size. In this case, we calulate the
-// size for the last group.
+#ifdef MIGRAPHX_NGROUP
+// If global is divisible by local then local can be a const
+#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
+#define MIGRAPHX_HAS_CONST_LOCAL 1
+#endif
+#endif
+
 inline __device__ __attribute__((const)) index_int compute_local_size()
 {
-#ifdef MIGRAPHX_NLOCAL
-    const auto nlocal = MIGRAPHX_NLOCAL;
+#ifdef MIGRAPHX_HAS_CONST_LOCAL
+    return MIGRAPHX_NLOCAL;
 #else
-    const auto nlocal = blockDim.x; // NOLINT
-#endif
-#ifdef MIGRAPHX_NGROUP
-    const auto ngroup = MIGRAPHX_NGROUP;
-#else
-    const auto ngroup = gridDim.x; // NOLINT
+    // Returns the block size. For the non-uniform block it returns the size of the non-uniform
+    // block.
+    return __ockl_get_local_size(0); // NOLINT
 #endif
-    const auto group_id = blockIdx.x; // NOLINT
-    const auto nglobal  = compute_global_size();
-    if(group_id == ngroup - 1)
-    {
-        return 1 + (nglobal - 1) % nlocal;
-    }
-    else
-    {
-        return nlocal; // NOLINT
-    }
 }
 
-#ifdef MIGRAPHX_NGROUP
-// If global is divisible by local then local can be a const
-#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
-#define MIGRAPHX_HAS_CONST_LOCAL 1
-#endif
+inline __device__ __attribute__((const)) index_int compute_max_local_size()
+{
+#ifdef MIGRAPHX_LOCAL
+    return MIGRAPHX_NLOCAL;
+#else
+    // Returns the block size. When the workgroup has a non-uniform block, this returns the size
+    // of the uniform block.
+    return __ockl_get_enqueued_local_size(0); // NOLINT
 #endif
+}
 
 struct index
 {
@@ -126,8 +128,8 @@ struct index
 #else
     __device__ index_int max_nlocal() const
     {
-        MIGRAPHX_ASSERT(blockDim.x > 0);
-        return blockDim.x;
+        MIGRAPHX_ASSERT(compute_max_local_size() > 0);
+        return compute_max_local_size();
     }
 #endif
@@ -249,7 +251,8 @@ struct index
 #endif
 inline __device__ __attribute__((const)) index make_index()
 {
-    return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
+    return index{
+        blockIdx.x * compute_max_local_size() + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
 }
 
 } // namespace migraphx
...
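make_index() above switches from blockDim.x to compute_max_local_size() because, when the trailing workgroup is non-uniform, global ids must be derived from the enqueued (uniform) local size or the last group's ids collide with those of the previous group. A host-side illustration with made-up sizes:

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t nglobal    = 10, nlocal = 4;           // trailing group has only 2 items
    const std::size_t ngroup     = 1 + (nglobal - 1) / nlocal; // 3 groups
    const std::size_t last_local = 1 + (nglobal - 1) % nlocal; // 2
    // Correct: the last group's ids use the enqueued local size (4): 2*4+0=8, 2*4+1=9
    std::cout << (ngroup - 1) * nlocal + 0 << " " << (ngroup - 1) * nlocal + 1 << "\n";
    // Wrong: using the shrunken block size (2) gives 2*2+0=4, 2*2+1=5, colliding with group 1
    std::cout << (ngroup - 1) * last_local + 0 << " " << (ngroup - 1) * last_local + 1 << "\n";
}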
@@ -55,7 +55,7 @@ struct allocate
                                       const migraphx::shape& output_shape,
                                       const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
...
@@ -60,7 +60,7 @@ struct concat
                                       const migraphx::shape& output_shape,
                                       const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
@@ -104,7 +104,7 @@ struct allocate
                                       const migraphx::shape& output_shape,
                                       const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
...
@@ -55,7 +55,7 @@ struct allocate
                                       const migraphx::shape& output_shape,
                                       const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
...
@@ -57,7 +57,7 @@ struct normalize_test_op
                                       const migraphx::shape& output_shape,
                                       const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
...
-6d7bc2a097a1a08541cd0d4628831c79ab8092d5
+635d3faa3b3908d2806d009dc6872152cfcfcdda