Merge branch 'develop' into rocblas_fp8

c4cee345 · Umang Yadav · GitHub · c40a39c3 · eafd55de · c4cee345
Unverified Commit c4cee345 authored Dec 01, 2023 by Umang Yadav Committed by GitHub Dec 01, 2023
20 changed files
--- a/src/include/migraphx/op/binary.hpp
+++ b/src/include/migraphx/op/binary.hpp
@@ -29,6 +29,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/dyn_output.hpp>
+#include <migraphx/par.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -95,7 +96,7 @@ struct binary : op_name<Derived>
    {
        argument result{dyn_out.computed_shape};
        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
-            std::transform(input1.begin(),
+            par_transform(input1.begin(),
                          input1.end(),
                          input2.begin(),
                          output.begin(),

--- a/src/include/migraphx/op/pooling.hpp
+++ b/src/include/migraphx/op/pooling.hpp
@@ -70,7 +70,8 @@ struct pooling
    // 2 smaller than the input tensor rank (NCHW layout)
    std::vector<std::size_t> lengths = {1, 1};
-    // Dilations are not supported at this time.
+    // Spacing between the elements of the pooling kernel. Must be the same ndim as lengths.
+    std::vector<std::size_t> dilations = {1, 1};
    // ceiling mode is a flag affecting output size
    // or equivalently, placements of the pooling kernel.
@@ -99,6 +100,7 @@ struct pooling
                    f(self.padding_mode, "padding_mode"),
                    f(self.stride, "stride"),
                    f(self.lengths, "lengths"),
+                    f(self.dilations, "dilations"),
                    f(self.ceil_mode, "ceil_mode"),
                    f(self.lp_order, "lp_order"),
                    f(self.dyn_global, "dyn_global"));
@@ -112,14 +114,17 @@ struct pooling
            return;
        if((padding_mode != default_ and padding.size() != stride.size() and
            (padding.size()) != stride.size() * 2) or
-           stride.size() != lengths.size())
+           stride.size() != lengths.size() or dilations.size() != lengths.size())
        {
            MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
        }
-        if(std::any_of(lengths.begin(), lengths.end(), [&](auto i) { return (i == 0); }) or
-           std::any_of(stride.begin(), stride.end(), [&](auto i) { return (i == 0); }))
+        const auto is_zero = [](auto el) { return el == 0; };
+        if(std::any_of(lengths.begin(), lengths.end(), is_zero) or
+           std::any_of(stride.begin(), stride.end(), is_zero) or
+           std::any_of(dilations.begin(), dilations.end(), is_zero))
        {
-            MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride");
+            MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride or dilations");
        }
        // TODO:  update lowering to run the reference
@@ -142,6 +147,11 @@ struct pooling
    value attributes() const { return {{"normalize_padding", "padding"}}; }
+    // Extent spanned by `dim` kernel elements when consecutive elements are
+    // `dilation` positions apart: 1 + dilation * (dim - 1).
+    inline std::size_t dilate_dim(std::size_t dim, std::size_t dilation) const
+    {
+        const std::size_t gaps = dim - 1;
+        return dilation * gaps + 1;
+    }
    std::vector<std::size_t> calc_spatial_dim_out(const std::vector<std::size_t>& input_lens,
                                                  std::size_t kdims) const
    {
@@ -151,8 +161,9 @@ struct pooling
            std::size_t padding_factor = 2 * padding[i];
            if(padding.size() == 2 * kdims)
                padding_factor = padding[i] + padding[i + kdims];
+            std::size_t dilated_length = dilate_dim(lengths[i], dilations[i]);
            std::size_t dim_size;
-            if(input_lens[i + 2] + padding_factor < lengths[i])
+            if(input_lens[i + 2] + padding_factor < dilated_length)
            {
                if(padding_mode == default_)
                    MIGRAPHX_THROW("POOLING: not enough padding for the given kernel size");
@@ -162,7 +173,7 @@ struct pooling
            }
            else
            {
-                dim_size = input_lens[i + 2] + padding_factor - lengths[i];
+                dim_size = input_lens[i + 2] + padding_factor - dilated_length;
            }
            std::size_t len =
                (ceil_mode)
@@ -331,6 +342,7 @@ struct pooling
                int start = static_cast<int>(idx_o[dim] * stride[d_2]) -
                            static_cast<int>(padding_vals[d_2]);
                int end;
+                std::size_t dilated_kernel_dim = dilate_dim(kernel_dims[d_2], dilations[d_2]);
                // NOLINT
                if(count_include_pad and ceil_mode and (mode != pooling_mode::max))
                {
@@ -340,15 +352,14 @@ struct pooling
                    // padding.  Clip out-of-bounds indexes but not padding.
                    // Check if this kernel extends beyond the padding at end of dimension
-                    end = std::min(start + kernel_dims[d_2],
+                    end = std::min(start + dilated_kernel_dim,
                                   in_lens[dim] + static_cast<int>(padding_vals[d_2]));
                }
                else
                {
                    // In non-ceiling mode, when
                    // count_include_pad is false, or for max pooling, clip off padding.
-                    end   = std::min(start + kernel_dims[d_2], in_lens[dim]);
+                    end = std::min(start + dilated_kernel_dim, in_lens[dim]);
-                    start = std::max(start, 0);
                }
                win_start.push_back(start);
                if(end < start)
@@ -366,6 +377,16 @@ struct pooling
            // for each element in the window...
            shape_for_each(win_shape, [&](const auto& idx_w) {
+                // Skip elements that belong to the dilated area
+                for(size_t axis = 0; axis < idx_w.size(); ++axis)
+                {
+                    if(idx_w[axis] % dilations[axis])
+                    {
+                        pool_size -= 1;
+                        return;
+                    }
+                }
                // the coordinates of this element
                auto idx = idx_o;
@@ -390,7 +411,15 @@ struct pooling
                    // this is a padding element.  Padding locations
                    // don't contribute to average or max pooling total but can play in
                    // lpnorm pooling.
-                    output_val = op(output_val, 0);
+                    if(mode == pooling_mode::lpnorm)
+                    {
+                        output_val = op(output_val, op.template init<Type>());
+                    }
+                    if(mode == pooling_mode::average)
+                    {
+                        // Ignore padding
+                        pool_size -= 1;
+                    }
                }
            });
            output[i] = Type(op.final(output_val, pool_size));

--- a/src/include/migraphx/op/scatternd_max.hpp
+++ b/src/include/migraphx/op/scatternd_max.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_MAX_HPP
+#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_MAX_HPP
+#include <migraphx/op/scatternd_op.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// ScatterND variant whose reduction keeps the larger of the value already
+// held by the output element and the incoming update value.
+struct scatternd_max : scatternd_op<scatternd_max>
+{
+    scatternd_max() {}
+    // Returns the callable that merges an update (second argument) into an
+    // output element (first argument, modified in place).
+    auto reduction() const
+    {
+        return [](auto& dst, const auto& src) { dst = std::max(dst, src); };
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/scatternd_min.hpp
+++ b/src/include/migraphx/op/scatternd_min.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_MIN_HPP
+#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_MIN_HPP
+#include <migraphx/op/scatternd_op.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// ScatterND variant whose reduction keeps the smaller of the value already
+// held by the output element and the incoming update value.
+struct scatternd_min : scatternd_op<scatternd_min>
+{
+    scatternd_min() {}
+    // Returns the callable that merges an update (second argument) into an
+    // output element (first argument, modified in place).
+    auto reduction() const
+    {
+        return [](auto& dst, const auto& src) { dst = std::min(dst, src); };
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/scatternd_op.hpp
+++ b/src/include/migraphx/op/scatternd_op.hpp
@@ -121,7 +121,8 @@ struct scatternd_op : op_name<Derived>
                auto k             = indices_shape.lens().back();
                auto q             = indices_shape.ndim();
                auto r             = dyn_out.computed_shape.ndim();
-                par_for(updates_shape.elements(), [&](const auto i) {
+                for(auto i = 0u; i < updates_shape.elements(); ++i)
+                {
                    auto updates_idx = updates_std.multi(i);
                    std::vector<std::size_t> indices_idx(q, 0);
                    std::copy(
@@ -135,7 +136,7 @@ struct scatternd_op : op_name<Derived>
                    std::copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);
                    self.reduction()(output[dyn_out.computed_shape.index(out_idx)], updates[i]);
-                });
+                }
            });
        });

--- a/src/include/migraphx/op/unary.hpp
+++ b/src/include/migraphx/op/unary.hpp
@@ -31,6 +31,7 @@
 #include <migraphx/stringutils.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/dyn_output.hpp>
+#include <migraphx/par.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -84,7 +85,7 @@ struct unary : op_name<Derived>
        argument result{dyn_out.computed_shape};
        result.visit([&](auto output) {
            args[0].visit([&](auto input) {
-                std::transform(input.begin(),
+                par_transform(input.begin(),
                              input.end(),
                              output.begin(),
                              static_cast<const Derived&>(*this).apply());

--- a/src/include/migraphx/op/unique.hpp
+++ b/src/include/migraphx/op/unique.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
+#define MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/tune_axis.hpp>
+#include <utility>
+#include <map>
+#include <limits>
+#include <optional>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// https://onnx.ai/onnx/operators/onnx__Unique.html
+// The Onnx spec refers to numpy specification, used as a reference:
+// https://numpy.org/doc/stable/reference/generated/numpy.unique.html
+// Input : Given an array of elements : X.
+// Output(s) :
+// 1. Find the unique elements (Y) of input (X).
+//
+// There are three outputs in addition to the unique elements in Y:
+// 2. the indices of the input array that give the unique values
+// 3. the indices of the unique array that reconstruct the input array
+// 4. the number of times each unique value comes up in the input array
+// Optional Attribute: 'Sorted' = 1 for sorted; = 0 for unsorted.
+// Onnx specification makes 'sorted' a default, while Numpy always sorts.
+//
+// Optional Attribute: 'Axis' is 'None' (default) or a valid int < rank(X).
+// Negative values are allowed.
+//
+// Numpy has the following important note on Axis:
+// ------------------------------------------------------------------
+// When an axis is specified the subarrays indexed by the axis are
+// sorted. This is done by making the specified axis the first
+// dimension of the array (move the axis to the first dimension to
+// keep the order of the other axes) and then flattening the subarrays
+// in C order. The flattened subarrays are then viewed as a structured
+// type with each element given a label, with the effect that we end
+// up with a 1-D array of structured types that can be treated in the
+// same way as any other 1-D array. The result is that the flattened
+// subarrays are sorted in lexicographic order starting with the first
+// element.
+// ------------------------------------------------------------------
+struct unique
+{
+    // Builds a strict-weak-ordering predicate over flat offsets into `data`:
+    // offset a orders before offset b when the chunk_sz elements starting at a
+    // compare lexicographically less than the chunk_sz elements starting at b.
+    // The predicate holds a reference to `data`, so `data` must outlive it.
+    template <class T>
+    auto make_idx_less_fn(const T& data, size_t chunk_sz) const
+    {
+        return [&data, chunk_sz](auto lhs_offset, auto rhs_offset) {
+            const auto lhs_first = data.begin() + lhs_offset;
+            const auto rhs_first = data.begin() + rhs_offset;
+            return std::lexicographical_compare(
+                lhs_first, lhs_first + chunk_sz, rhs_first, rhs_first + chunk_sz);
+        };
+    }
+    // CASE SORTED:
+    //
+    // To process into a sorted unique series of elements/chunks:
+    // Chunk size == 1 means a simple element; >1 means a flat representation.
+    // Steps: first go through the input elements/chunks for uniqueness.
+    // At the end of this processing, per the sorted sequence of unique elements:
+    // update/create data structures: y, y_indices, x_rev_indices, y_count
+    //
+    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 1;
+    // OUTPUT(s): indices..
+    // y_indices: [1, 0, 3, 4]  --- first incidence, in terms of index in sequence x
+    // x_rev_indices: [1, 0, 0, 2, 3, 2] --- x seen in terms of indices of unique sequence y
+    // y_count: [2, 1, 2, 1] -- count at each y_index. sum = len(x)
+    // NOTE: y [1, 2, 3, 4]   --- the unique output is constructed from x[y_indices[...]]
+    // Returns the tuple (y_indices, x_rev_indices, y_count) for the sorted case.
+    // input_data is the flattened input; chunk_sz is the number of consecutive
+    // elements that form one comparable unit.
+    template <class T>
+    auto sorted_uniq_indices(const T& input_data, size_t chunk_sz) const
+    {
+        // Bookkeeping stored for each distinct chunk.
+        struct y_info
+        {
+            size_t y_idx;  // rank of the chunk in order of first appearance
+            size_t x_idx;  // index (counted in chunks) of its first appearance in x
+            size_t ct = 0; // occurrences counted so far
+        };
+        auto idx_less_fn = make_idx_less_fn(input_data, chunk_sz);
+        // Keys are flat offsets into input_data, ordered by the contents of the chunk they
+        // point at; iterating the map therefore visits the distinct chunks in sorted order.
+        std::map<size_t, y_info, decltype(idx_less_fn)> uniq_val_map(idx_less_fn);
+        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>> rv;
+        auto& [y_indices, x_rev_indices, y_count] = rv;
+        // go through all the elements and find the unique elements..
+        size_t count_x = input_data.size();
+        for(size_t f_idx = 0, x_idx = 0; f_idx < count_x; f_idx += chunk_sz, x_idx++)
+        {
+            // Positional aggregate initialization of (y_idx, x_idx); ct keeps its default of 0.
+            // Designated initializers are a C++20 feature and are avoided here.
+            y_info entry{uniq_val_map.size(), x_idx};
+            // insert() leaves an existing entry untouched, so the first appearance wins.
+            auto itr = uniq_val_map.insert({f_idx, entry}).first;
+            itr->second.ct++;
+            // provisional value: first-appearance rank, remapped to sorted rank below
+            x_rev_indices.push_back(itr->second.y_idx);
+        }
+        std::vector<std::size_t> y2x_indices(uniq_val_map.size());
+        y_indices.resize(uniq_val_map.size());
+        y_count.resize(uniq_val_map.size());
+        size_t idx = 0;
+        // the unique elements are now sorted:
+        // post-processing for all the return indices.
+        for(const auto& v : uniq_val_map)
+        {
+            y2x_indices[v.second.y_idx] = idx;
+            y_indices[idx]              = v.second.x_idx;
+            y_count[idx]                = v.second.ct;
+            idx++;
+        }
+        // update x_rev_indices as per the sorted order of y_indices
+        for(auto& i : x_rev_indices)
+            i = y2x_indices[i];
+        return rv;
+    }
+    // CASE UNSORTED:
+    //
+    // To process into an un-sorted unique series of elements/chunks:
+    // Chunk size == 1 means a simple element; >1 means a flat representation of a sub-tensor.
+    // Go through the input elements/chunks one by one with inline processing of indices..
+    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 0;
+    // OUTPUT(s): indices..
+    // y_indices: [0, 1, 3, 4]  --- first incidence, in terms of index in sequence x
+    // x_rev_indices: [0, 1, 1, 2, 3, 2] --- x seen in terms of indices of unique sequence y
+    // y_count: [1, 2, 2, 1] -- count at each y_index. sum = len(x)
+    // NOTE: y [2, 1, 3, 4]   --- the unique output is constructed from x[y_indices[...]]
+    // Output data structures: y_indices, x_rev_indices, y_count are processed inline.
+    // Returns the tuple (y_indices, x_rev_indices, y_count) for the unsorted case:
+    // distinct chunks keep the order in which they first appear in input_data.
+    template <class T>
+    auto unsorted_uniq_indices(const T& input_data, size_t chunk_sz) const
+    {
+        auto chunk_less = make_idx_less_fn(input_data, chunk_sz);
+        // key: flat offset of a chunk's first appearance; value: its position in y
+        std::map<size_t, size_t, decltype(chunk_less)> first_seen(chunk_less);
+        // a single named local is returned so that NRVO can apply
+        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>>
+            result;
+        auto& first_pos   = std::get<0>(result); // y_indices
+        auto& inverse     = std::get<1>(result); // x_rev_indices
+        auto& occurrences = std::get<2>(result); // y_count
+        // walk the input chunk by chunk, filling all three outputs as we go
+        const size_t total = input_data.size();
+        size_t offset      = 0;
+        while(offset < total)
+        {
+            const auto inserted = first_seen.insert({offset, first_pos.size()});
+            const size_t y_pos  = inserted.first->second;
+            if(inserted.second)
+            {
+                // first time this chunk is seen: open a counter and record where it was found
+                occurrences.push_back(0);
+                first_pos.push_back(inverse.size());
+            }
+            ++occurrences[y_pos];
+            inverse.push_back(y_pos);
+            offset += chunk_sz;
+        }
+        return result;
+    }
+    // Axis. Default: none. Range: [-rank, rank-1]
+    // When unset, every element is compared on its own and y is 1-D. When set,
+    // compute_shape() rejects any axis that does not normalize to 0.
+    std::optional<int64_t> axis;
+    // Sorted, Default: 1= sorted. 0 = unsorted.
+    // compute() calls sorted_uniq_indices() when true, unsorted_uniq_indices() when false.
+    bool sorted = true;
+    // Passes each attribute together with its name ("axis", "sorted") to the
+    // callback f and returns the packed results.
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"), f(self.sorted, "sorted"));
+    }
+    // Operator name; also passed to tune_axis() in compute_shape().
+    std::string name() const
+    {
+        return "unique";
+    }
+    // Builds the output shape: four sub-shapes, the unique values followed by three
+    // 1-D int64 index outputs. The unique count is unknown until compute(), so the
+    // leading dimension is a dynamic dimension bounded by [1, max_uniq_ct].
+    // Throws if an axis is given that does not normalize to 0.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        // presumably enforces exactly one input -- confirm against check_shapes::has
+        check_shapes{inputs, *this}.has(1);
+        auto& sh_x         = inputs[0];
+        auto lens_x        = sh_x.lens();
+        size_t dim_x       = sh_x.ndim();
+        // without an axis, at most every element of the input is unique
+        size_t max_uniq_ct = sh_x.elements();
+        std::vector<shape::dynamic_dimension> d_out;
+        if(axis)
+        {
+            // tune_axis is assumed to map a negative axis into [0, rank) -- TODO confirm
+            int64_t t_axis = migraphx::tune_axis(dim_x, *axis, name());
+            if(t_axis != 0)
+                MIGRAPHX_THROW("Unique: Only supports axis = 0 or None");
+            // start from the input's dimensions; only the leading one is replaced below
+            d_out = sh_x.to_dynamic().dyn_dims();
+            // only axis = 0 is supported:
+            max_uniq_ct = lens_x[0];
+            // min = 1 unique element; max = full dimension along axis 0
+            d_out[0] = {1, max_uniq_ct};
+        }
+        else
+        {
+            // no axis: y is 1-D with between 1 and max_uniq_ct entries
+            d_out.push_back({1, max_uniq_ct});
+        }
+        shape sh_y = {sh_x.type(), d_out};
+        // The three outputted Indices are just 1-D:
+        shape sh_idx{shape::int64_type, {d_out[0]}};
+        return {{sh_y, sh_idx, sh_idx, sh_idx}};
+    }
+    // Finds the unique elements (or, with an axis, the unique sub-tensors along axis 0)
+    // of args.front() and returns four results packed into one argument:
+    // y, y_indices, x_rev_indices and y_count.
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
+    {
+        auto sh_x          = args.front().get_shape();
+        auto lens_x        = sh_x.lens();
+        shape output_shape = dyn_out.computed_shape;
+        auto vec_ss        = output_shape.sub_shapes();
+        auto ct_x          = sh_x.elements();
+        // Every buffer is sized for the worst case (ct_x entries) and is reshaped to the
+        // real sizes at the end, once the number of unique chunks is known.
+        shape sh_y         = {vec_ss[0].type(), {ct_x}};
+        shape sh_idx       = {vec_ss[1].type(), {ct_x}};
+        shape sh_x_idx     = {vec_ss[1].type(), {ct_x}};
+        argument res_y{sh_y};
+        argument res_y_idx{sh_idx};
+        argument res_x_rev_idx{sh_idx};
+        argument res_y_ct_idx{sh_idx};
+        std::vector<size_t> out_y_idx;
+        std::vector<size_t> out_x_rev_idx;
+        std::vector<size_t> out_y_ct;
+        // With an axis, uniqueness is decided on whole sub-tensors along axis 0, each
+        // handled as a flat chunk of chunk_sz consecutive elements.
+        // Without an axis every element is its own chunk, so chunk_sz = 1.
+        size_t chunk_sz = 1;
+        // NOTE(review): lens_x[0] == 0 would divide by zero here -- confirm that an
+        // empty leading dimension cannot reach compute().
+        if(axis)
+            chunk_sz = ct_x / lens_x[0]; // axis = 0 is supported.
+        visit_all(args.front(), res_y)([&](auto x, auto y_flat) {
+            using o_type = typename decltype(x)::value_type;
+            // flat copy of the input that the index helpers compare chunks in
+            std::vector<o_type> x_in(x.begin(), x.end());
+            std::tie(out_y_idx, out_x_rev_idx, out_y_ct) =
+                sorted ? sorted_uniq_indices(x_in, chunk_sz)
+                       : unsorted_uniq_indices(x_in, chunk_sz);
+            const auto uniq_ct = out_y_idx.size();
+            // construct y from x[indices] in flattened form
+            // later we reshape y to the final shape..
+            auto y_dst = y_flat.begin();
+            // std::copy_n is qualified explicitly instead of relying on ADL through the
+            // vector iterator type.
+            for(size_t idx = 0; idx < uniq_ct; idx++)
+                y_dst = std::copy_n(x_in.begin() + out_y_idx[idx] * chunk_sz, chunk_sz, y_dst);
+            std::vector<size_t> lens_y;
+            // if axis is specified:
+            // the output shape keeps the n-1 dimensions of x
+            if(axis)
+            {
+                lens_y    = lens_x;
+                lens_y[0] = uniq_ct;
+            }
+            else
+            {
+                lens_y = {uniq_ct};
+            }
+            sh_y   = {sh_y.type(), lens_y};
+            sh_idx = {sh_idx.type(), {uniq_ct}};
+        });
+        // copy the size_t index vectors into the index-typed output buffers
+        visit_all(res_y_idx, res_x_rev_idx, res_y_ct_idx)(
+            [&](auto y_indices, auto x_rev_indices, auto y_count) {
+                std::copy(out_y_idx.begin(), out_y_idx.end(), y_indices.begin());
+                std::copy(out_x_rev_idx.begin(), out_x_rev_idx.end(), x_rev_indices.begin());
+                std::copy(out_y_ct.begin(), out_y_ct.end(), y_count.begin());
+                // x_rev_indices has one entry per input chunk, not per unique chunk
+                sh_x_idx = {sh_idx.type(), {out_x_rev_idx.size()}};
+            });
+        return {{res_y.reshape(sh_y),
+                 res_y_idx.reshape(sh_idx),
+                 res_x_rev_idx.reshape(sh_x_idx),
+                 res_y_ct_idx.reshape(sh_idx)}};
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -119,6 +119,8 @@
 #include <migraphx/op/scatternd_add.hpp>
 #include <migraphx/op/scatternd_none.hpp>
 #include <migraphx/op/scatternd_mul.hpp>
+#include <migraphx/op/scatternd_max.hpp>
+#include <migraphx/op/scatternd_min.hpp>
 #include <migraphx/op/sigmoid.hpp>
 #include <migraphx/op/sign.hpp>
 #include <migraphx/op/sinh.hpp>
@@ -137,6 +139,7 @@
 #include <migraphx/op/unary.hpp>
 #include <migraphx/op/unary_not.hpp>
 #include <migraphx/op/undefined.hpp>
+#include <migraphx/op/unique.hpp>
 #include <migraphx/op/unknown.hpp>
 #include <migraphx/op/unsqueeze.hpp>
 #include <migraphx/op/where.hpp>

--- a/src/include/migraphx/par.hpp
+++ b/src/include/migraphx/par.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_MIGRAPHX_PAR_HPP
+#define MIGRAPHX_GUARD_MIGRAPHX_PAR_HPP
+#include <migraphx/config.hpp>
+#if MIGRAPHX_HAS_EXECUTORS
+#include <execution>
+#else
+#include <migraphx/simple_par_for.hpp>
+#endif
+#include <algorithm>
+#include <mutex>
+#include <vector>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace detail {
+// Collects exceptions raised by concurrently running work items so that one of
+// them can be rethrown after the parallel algorithm has returned.
+struct exception_list
+{
+    std::vector<std::exception_ptr> exceptions;
+    std::mutex m;
+    // Records the exception currently being handled; call from inside a catch block.
+    void add_exception()
+    {
+        const auto current = std::current_exception();
+        std::lock_guard<std::mutex> guard(m);
+        exceptions.push_back(current);
+    }
+    // Wraps f in a callable that forwards its arguments to f and records, instead of
+    // propagating, anything f throws. The wrapper refers to this object, which must
+    // therefore outlive it.
+    template <class F>
+    auto collect(F f)
+    {
+        auto* self = this;
+        return [f, self](auto&&... args) {
+            try
+            {
+                f(std::forward<decltype(args)>(args)...);
+            }
+            catch(...)
+            {
+                self->add_exception();
+            }
+        };
+    }
+    // Rethrows the first recorded exception; does nothing when none was recorded.
+    void throw_if_exception() const
+    {
+        if(exceptions.empty())
+            return;
+        std::rethrow_exception(exceptions.front());
+    }
+};
+} // namespace detail
+// Unary transform: writes unary_op(x) for every x in [first1, last1) to the range
+// starting at d_first and returns the end of the written range. Uses the standard
+// parallel execution policy when MIGRAPHX_HAS_EXECUTORS is set, otherwise
+// simple_par_for (which needs iterator subtraction and operator[]).
+template <class InputIt, class OutputIt, class UnaryOperation>
+OutputIt par_transform(InputIt first1, InputIt last1, OutputIt d_first, UnaryOperation unary_op)
+{
+#if MIGRAPHX_HAS_EXECUTORS
+    return std::transform(std::execution::par, first1, last1, d_first, std::move(unary_op));
+#else
+    const auto count = last1 - first1;
+    simple_par_for(count, [&](auto i) { d_first[i] = unary_op(first1[i]); });
+    return d_first + count;
+#endif
+}
+// Binary transform: writes binary_op(a, b) for every pair taken from [first1, last1)
+// and the range starting at first2 to the range starting at d_first, and returns the
+// end of the written range. Uses the standard parallel execution policy when
+// MIGRAPHX_HAS_EXECUTORS is set, otherwise simple_par_for.
+template <class InputIt1, class InputIt2, class OutputIt, class BinaryOperation>
+OutputIt par_transform(
+    InputIt1 first1, InputIt1 last1, InputIt2 first2, OutputIt d_first, BinaryOperation binary_op)
+{
+#if MIGRAPHX_HAS_EXECUTORS
+    return std::transform(
+        std::execution::par, first1, last1, first2, d_first, std::move(binary_op));
+#else
+    const auto count = last1 - first1;
+    simple_par_for(count, [&](auto i) { d_first[i] = binary_op(first1[i], first2[i]); });
+    return d_first + count;
+#endif
+}
+// Calls f on every element of [first, last). With MIGRAPHX_HAS_EXECUTORS the standard
+// parallel execution policy is used; otherwise the work is handed to simple_par_for.
+template <class InputIt, class UnaryFunction>
+void par_for_each(InputIt first, InputIt last, UnaryFunction f)
+{
+#if MIGRAPHX_HAS_EXECUTORS
+    // Exceptions thrown by f are recorded while the algorithm runs, and the first
+    // recorded one is rethrown once it has returned.
+    detail::exception_list errors;
+    auto guarded = errors.collect(std::move(f));
+    std::for_each(std::execution::par, first, last, std::move(guarded));
+    errors.throw_if_exception();
+#else
+    const auto count = last - first;
+    simple_par_for(count, [&](auto i) { f(first[i]); });
+#endif
+}
+// Forwards its arguments to std::copy_if, adding the parallel execution policy
+// when MIGRAPHX_HAS_EXECUTORS is set. Returns whatever std::copy_if returns.
+template <class... Args>
+auto par_copy_if(Args&&... args)
+{
+#if MIGRAPHX_HAS_EXECUTORS
+    return std::copy_if(std::execution::par, std::forward<Args>(args)...);
+#else
+    return std::copy_if(std::forward<Args>(args)...);
+#endif
+}
+// Forwards its arguments to std::sort, adding the parallel execution policy
+// when MIGRAPHX_HAS_EXECUTORS is set.
+template <class... Args>
+auto par_sort(Args&&... args)
+{
+#if MIGRAPHX_HAS_EXECUTORS
+    return std::sort(std::execution::par, std::forward<Args>(args)...);
+#else
+    return std::sort(std::forward<Args>(args)...);
+#endif
+}
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_MIGRAPHX_PAR_HPP
--- a/src/include/migraphx/par_for.hpp
+++ b/src/include/migraphx/par_for.hpp
@@ -24,93 +24,23 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
 #define MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
-#include <thread>
+#include <migraphx/par.hpp>
-#include <cmath>
+#include <migraphx/ranges.hpp>
-#include <algorithm>
-#include <vector>
-#include <cassert>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-struct joinable_thread : std::thread
-{
-    template <class... Xs>
-    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
-    {
-    }
-    joinable_thread& operator=(joinable_thread&& other) = default;
-    joinable_thread(joinable_thread&& other)            = default;
-    ~joinable_thread()
-    {
-        if(this->joinable())
-            this->join();
-    }
-};
-template <class F>
-auto thread_invoke(std::size_t i, std::size_t tid, F f) -> decltype(f(i, tid))
-{
-    f(i, tid);
-}
-template <class F>
-auto thread_invoke(std::size_t i, std::size_t, F f) -> decltype(f(i))
-{
-    f(i);
-}
-template <class F>
-void par_for_impl(std::size_t n, std::size_t threadsize, F f)
-{
-    if(threadsize <= 1)
-    {
-        for(std::size_t i = 0; i < n; i++)
-            thread_invoke(i, 0, f);
-    }
-    else
-    {
-        std::vector<joinable_thread> threads(threadsize);
-// Using const here causes gcc 5 to ICE
-#if(!defined(__GNUC__) || __GNUC__ != 5)
-        const
-#endif
-            std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
-        std::size_t work = 0;
-        std::size_t tid  = 0;
-        std::generate(threads.begin(), threads.end(), [=, &work, &tid] {
-            auto result = joinable_thread([=] {
-                std::size_t start = work;
-                std::size_t last  = std::min(n, work + grainsize);
-                for(std::size_t i = start; i < last; i++)
-                {
-                    thread_invoke(i, tid, f);
-                }
-            });
-            work += grainsize;
-            ++tid;
-            return result;
-        });
-        assert(work >= n);
-    }
-}
 template <class F>
-void par_for(std::size_t n, std::size_t min_grain, F f)
+void par_for(std::size_t n, F f)
 {
-    const auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(),
+    using iterator = basic_iota_iterator<id, std::size_t>;
-                                                  n / std::max<std::size_t>(1, min_grain));
+    par_for_each(iterator{0, {}}, iterator{n, {}}, f);
-    par_for_impl(n, threadsize, f);
 }
 template <class F>
-void par_for(std::size_t n, F f)
+void par_for(std::size_t n, std::size_t, F f)
 {
-    const int min_grain = 8;
+    par_for(n, f);
-    par_for(n, min_grain, f);
 }
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/include/migraphx/rewrite_pooling.hpp
+++ b/src/include/migraphx/rewrite_pooling.hpp
@@ -26,6 +26,7 @@
 #include <string>
 #include <migraphx/config.hpp>
+#include <migraphx/instruction_ref.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/include/migraphx/simple_par_for.hpp
+++ b/src/include/migraphx/simple_par_for.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_SIMPLE_PAR_FOR_HPP
+#define MIGRAPHX_GUARD_RTGLIB_SIMPLE_PAR_FOR_HPP
+#include <thread>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+#include <cassert>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+// std::thread whose destructor joins the thread when it is still joinable,
+// so destroying the object waits for the worker to finish.
+struct joinable_thread : std::thread
+{
+    // Forwards every constructor argument to std::thread; with no arguments
+    // this default-constructs the std::thread base.
+    template <class... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
+    {
+    }
+    // Move operations are defaulted, i.e. they defer to std::thread's.
+    joinable_thread& operator=(joinable_thread&& other) = default;
+    joinable_thread(joinable_thread&& other)            = default;
+    // Joins only if there is a thread to join.
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+// Invokes f(i, tid) for callables that accept both the index and the thread
+// id. The trailing decltype removes this overload from consideration when
+// f(i, tid) is not a valid expression.
+template <class F>
+auto thread_invoke(std::size_t i, std::size_t tid, F f) -> decltype(f(i, tid))
+{
+    // Return the call's value: the declared return type is whatever f yields,
+    // so a non-void f must not flow off the end of this function.
+    return f(i, tid);
+}
+// Invokes f(i) for callables that only accept the index; the thread id
+// argument is unnamed and ignored. The trailing decltype removes this
+// overload when f(i) is not a valid expression.
+template <class F>
+auto thread_invoke(std::size_t i, std::size_t, F f) -> decltype(f(i))
+{
+    // Return the call's value: the declared return type is whatever f yields,
+    // so a non-void f must not flow off the end of this function.
+    return f(i);
+}
+// Calls f (through thread_invoke) for every index in [0, n), either inline
+// on the calling thread or split into contiguous chunks over `threadsize`
+// joinable_thread workers.
+template <class F>
+void simple_par_for_impl(std::size_t n, std::size_t threadsize, F f)
+{
+    if(threadsize <= 1)
+    {
+        // Sequential path: every index runs here with thread id 0.
+        for(std::size_t i = 0; i < n; i++)
+            thread_invoke(i, 0, f);
+    }
+    else
+    {
+        std::vector<joinable_thread> threads(threadsize);
+        // grainsize = indices per thread, rounded up so that
+        // threads.size() chunks cover all n indices.
+// Using const here causes gcc 5 to ICE
+#if(!defined(__GNUC__) || __GNUC__ != 5)
+        const
+#endif
+            std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+        std::size_t work = 0;
+        std::size_t tid  = 0;
+        // The outer lambda holds work/tid by reference and advances them after
+        // each thread is created; the inner lambda copies their current values,
+        // so each thread gets its own start offset and id.
+        std::generate(threads.begin(), threads.end(), [=, &work, &tid] {
+            auto result = joinable_thread([=] {
+                std::size_t start = work;
+                // Clamp the chunk end to n; chunks starting at or past n run
+                // zero iterations.
+                std::size_t last  = std::min(n, work + grainsize);
+                for(std::size_t i = start; i < last; i++)
+                {
+                    thread_invoke(i, tid, f);
+                }
+            });
+            work += grainsize;
+            ++tid;
+            return result;
+        });
+        assert(work >= n);
+        // `threads` goes out of scope here; each joinable_thread destructor
+        // joins its worker.
+    }
+}
+// Picks a thread count for n indices - at most hardware_concurrency() and at
+// most one thread per min_grain indices (a min_grain of 0 is treated as 1) -
+// then delegates to simple_par_for_impl.
+template <class F>
+void simple_par_for(std::size_t n, std::size_t min_grain, F f)
+{
+    const std::size_t grain       = std::max<std::size_t>(1, min_grain);
+    const std::size_t max_workers = std::thread::hardware_concurrency();
+    const std::size_t wanted      = n / grain;
+    simple_par_for_impl(n, std::min(max_workers, wanted), f);
+}
+// Convenience overload: same as simple_par_for(n, min_grain, f) with a
+// minimum grain of 8 indices per thread.
+template <class F>
+void simple_par_for(std::size_t n, F f)
+{
+    constexpr std::size_t default_min_grain = 8;
+    simple_par_for(n, default_min_grain, f);
+}
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/tune_axis.hpp
+++ b/src/include/migraphx/tune_axis.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -24,21 +24,21 @@
 #ifndef MIGRAPHX_GUARD_OPERATORS_TUNE_AXIS_HPP
 #define MIGRAPHX_GUARD_OPERATORS_TUNE_AXIS_HPP
-#include <utility>
-#include <cstdint>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/errors.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-inline int tune_axis(const int n_dim, const int axis, const std::string& op_name = "OPERATOR")
+// Normalizes `axis` against `n_dim`: a negative axis has n_dim added to it,
+// and the result must then lie in [0, n_dim). Otherwise MIGRAPHX_THROW is
+// raised with to_upper(op_name) prefixed to the message.
+// Returns the normalized axis.
+inline int tune_axis(int n_dim, int axis, const std::string& op_name = "OPERATOR")
 {
-    if(axis >= n_dim or std::abs(axis) > n_dim)
+    if(axis < 0)
-    {
+        axis += n_dim;
+    if(axis < 0 or axis >= n_dim)
         MIGRAPHX_THROW(to_upper(op_name) + ": axis is out of range.");
-    }
-    return (axis < 0) ? axis + n_dim : axis;
+    return axis;
 }
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/onnx/CMakeLists.txt
+++ b/src/onnx/CMakeLists.txt
@@ -26,7 +26,11 @@ find_package(Protobuf REQUIRED)
 protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS onnx.proto)
 add_library(onnx-proto STATIC ${PROTO_SRCS})
 target_include_directories(onnx-proto SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${PROTOBUF_INCLUDE_DIR})
-target_compile_options(onnx-proto PRIVATE -w)
+if(MSVC)
+    target_compile_options(onnx-proto PRIVATE /w)
+else()
+    target_compile_options(onnx-proto PRIVATE -w)
+endif()
 target_link_libraries(onnx-proto PRIVATE ${PROTOBUF_LIBRARY})
 set_target_properties(onnx-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
@@ -37,7 +41,10 @@ set_target_properties(migraphx_onnx PROPERTIES EXPORT_NAME onnx)
 migraphx_generate_export_header(migraphx_onnx)
 rocm_set_soversion(migraphx_onnx ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_onnx)
-target_link_libraries(migraphx_onnx PRIVATE onnx-proto "-Wl,--exclude-libs,ALL")
+target_link_libraries(migraphx_onnx PRIVATE onnx-proto)
+if(NOT WIN32)
+    target_link_libraries(migraphx_onnx PRIVATE "-Wl,--exclude-libs,ALL")
+endif()
 target_link_libraries(migraphx_onnx PUBLIC migraphx)
 rocm_install_targets(

--- a/src/onnx/include/migraphx/onnx/pooling.hpp
+++ b/src/onnx/include/migraphx/onnx/pooling.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_ONNX_POOLING_HPP
+#define MIGRAPHX_GUARD_AMDMIGRAPHX_ONNX_POOLING_HPP
+#include <migraphx/config.hpp>
+#include <migraphx/onnx/onnx_parser.hpp>
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/instruction.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+// Shared pooling helpers for the ONNX parsers; only the declarations live in
+// this header.
+// NOTE(review): presumably applies the node's pooling attributes to `values`
+// (taken by value) and returns the updated copy - confirm against the
+// definition.
+value handle_pooling_values(const op_desc& opd,
+                            onnx_parser::node_info info,
+                            const shape& in_shape,
+                            value values);
+// NOTE(review): presumably adds the pooling operation selected by `opd` with
+// `l0` as its input and returns the resulting instruction - confirm against
+// the definition.
+instruction_ref add_pooling_op(const op_desc& opd, onnx_parser::node_info info, instruction_ref l0);
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/onnx/parse_multinomial.cpp
+++ b/src/onnx/parse_multinomial.cpp
@@ -127,9 +127,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
            // use literal.  The array populated by random_uniform may have any shape, as long its
            // number of elements is batch_size * sample_size .
            size_t batch_size = s0.lens().front();
-            auto rand_dummy   = info.add_literal(
+            auto rand_dummy   = info.add_literal(migraphx::literal{
-                migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});
+                migraphx::shape{migraphx::shape::float_type, {batch_size, sample_size}},
+                std::vector<float>(batch_size * sample_size)});
            randoms =
                info.add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
        }

--- a/src/onnx/parse_pooling.cpp
+++ b/src/onnx/parse_pooling.cpp
@@ -22,14 +22,8 @@
 * THE SOFTWARE.
 */
 #include <migraphx/onnx/op_parser.hpp>
-#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/pooling.hpp>
-#include <migraphx/onnx/padding.hpp>
-#include <migraphx/op/pad.hpp>
-#include <migraphx/op/pooling.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/make_op.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -39,68 +33,14 @@ struct parse_pooling : op_parser<parse_pooling>
 {
    std::vector<op_desc> operators() const
    {
-        return {{"AveragePool", "average"},
+        return {
+            {"AveragePool", "average"},
            {"GlobalAveragePool", "average"},
            {"GlobalMaxPool", "max"},
            {"MaxPool", "max"},
            {"LpPool", "lpnorm"},
-                {"GlobalLpPool", "lpnorm"}};
+            {"GlobalLpPool", "lpnorm"},
-    }
+        };
-    value handle_values(const op_desc& opd,
-                        onnx_parser::node_info info,
-                        const shape& in_shape,
-                        value values) const
-    {
-        auto kdims = in_shape.ndim() - 2;
-        if(starts_with(opd.onnx_name, "Global"))
-        {
-            // if spatial dimensions are dynamic use dyn_global flag
-            if(in_shape.dynamic() and std::any_of(in_shape.dyn_dims().cbegin() + 2,
-                                                  in_shape.dyn_dims().cend(),
-                                                  [](auto dd) { return not dd.is_fixed(); }))
-            {
-                values["dyn_global"] = true;
-                values["lengths"]    = std::vector<size_t>();
-            }
-            else
-            {
-                // works with static and fixed dynamic shape
-                auto m_lens       = in_shape.max_lens();
-                values["lengths"] = std::vector<size_t>(m_lens.begin() + 2, m_lens.end());
-            }
-        }
-        if(contains(info.attributes, "ceil_mode"))
-        {
-            values["ceil_mode"] = static_cast<bool>(info.attributes.at("ceil_mode").i());
-        }
-        if(contains(info.attributes, "strides"))
-        {
-            values["stride"].clear();
-            copy(info.attributes["strides"].ints(), std::back_inserter(values["stride"]));
-            check_attr_sizes(kdims, values["stride"].size(), "PARSE_POOLING: inconsistent strides");
-        }
-        if(contains(info.attributes, "kernel_shape"))
-        {
-            values["lengths"].clear();
-            copy(info.attributes["kernel_shape"].ints(), std::back_inserter(values["lengths"]));
-            check_attr_sizes(
-                kdims, values["lengths"].size(), "PARSE_POOLING: inconsistent lengths");
-        }
-        // lp_order attribute
-        if(contains(info.attributes, "p"))
-        {
-            values["lp_order"] = info.attributes.at("p").i();
-        }
-        // ensure pads available only when auto_pad is "NOT_SET"
-        check_padding_mode(info, "POOLING");
-        return values;
    }
    instruction_ref parse(const op_desc& opd,
@@ -108,144 +48,8 @@ struct parse_pooling : op_parser<parse_pooling>
                          onnx_parser::node_info info,
                          std::vector<instruction_ref> args) const
    {
-        std::string mode                                                 = opd.op_name;
+        return add_pooling_op(opd, std::move(info), args[0]);
-        const std::unordered_map<std::string, op::pooling_mode> mode_map = {
+    };
-            {"max", op::pooling_mode::max},
-            {"average", op::pooling_mode::average},
-            {"lpnorm", op::pooling_mode::lpnorm}};
-        if(not contains(mode_map, mode))
-        {
-            MIGRAPHX_THROW(
-                "PARSE_POOLING: onnx pooling mode must be [\"max\", \"average\", \"lpnorm\"]");
-        }
-        operation op  = make_op("pooling", {{"mode", mode_map.at(mode)}});
-        value values  = op.to_value();
-        auto l0       = args[0];
-        auto in_shape = l0->get_shape();
-        assert(in_shape.ndim() > 2);
-        auto kdims = in_shape.ndim() - 2;
-        values = handle_values(opd, info, in_shape, values);
-        // count include padding, if count include pad is 1, we always use
-        // explicit pad
-        int count_include_pad = 0;
-        if(contains(info.attributes, "count_include_pad"))
-        {
-            if(in_shape.dynamic())
-            {
-                MIGRAPHX_THROW("PARSE_POOLING: count_include_pad attribute is not supported for "
-                               "dynamic input shape");
-            }
-            count_include_pad = info.attributes.at("count_include_pad").i();
-        }
-        std::vector<int64_t> paddings;
-        float pad_val = ((mode == "max") ? std::numeric_limits<float>::lowest() : 0.0f);
-        if(contains(info.attributes, "pads"))
-        {
-            values["padding"].clear();
-            copy(info.attributes["pads"].ints(), std::back_inserter(paddings));
-            check_attr_sizes(
-                kdims, paddings.size() / 2, "PARSE_POOLING: inconsistent explicit paddings");
-        }
-        if(paddings.size() != 2 * kdims)
-        {
-            paddings.resize(kdims * 2);
-            std::fill_n(paddings.begin(), 2 * kdims, 0);
-        }
-        if(values["padding"].size() != kdims)
-        {
-            values["padding"].resize(kdims);
-            std::fill_n(values["padding"].begin(), kdims, 0);
-        }
-        if(values["stride"].size() != kdims)
-        {
-            values["stride"].resize(kdims);
-            std::fill_n(values["stride"].begin(), kdims, 1);
-        }
-        // used to calculate the supposed output shape
-        std::vector<int64_t> orig_padding = paddings;
-        // TODO:  add parsing for dilations
-        if(contains(info.attributes, "auto_pad") and
-           to_upper(info.attributes["auto_pad"].s()) != "NOTSET")
-        {
-            auto auto_pad = to_upper(info.attributes["auto_pad"].s());
-            // don't use the given padding sizes, if any
-            // values["padding"].clear();
-            if(in_shape.dynamic())
-            {
-                // set padding_mode to trigger auto padding at runtime
-                bool is_same_upper     = (auto_pad.find("SAME_UPPER") != std::string::npos);
-                values["padding_mode"] = is_same_upper ? to_value(op::padding_mode_t::same_upper)
-                                                       : to_value(op::padding_mode_t::same_lower);
-            }
-            else
-            {
-                // Calculate auto padding
-                // dilations (argument 4) not supported; default to all 1's
-                cal_auto_padding_size(info,
-                                      values,
-                                      values["lengths"].to_vector<std::size_t>(),
-                                      std::vector<size_t>(in_shape.ndim() - 2, 1),
-                                      in_shape.lens(),
-                                      paddings);
-                values["padding"] = paddings;
-                // default padding_mode indicates that padding sizes are not calculated dynamically
-                values["padding_mode"] = migraphx::op::padding_mode_t::default_;
-            }
-        }
-        std::vector<int64_t> slice_start;
-        std::vector<int64_t> slice_end;
-        tune_padding_size(values, paddings, count_include_pad, slice_start);
-        if(not slice_start.empty())
-        {
-            if(in_shape.dynamic())
-            {
-                MIGRAPHX_THROW(
-                    "PARSE_POOLING: asymmetric padding not supported for dynamic input shape");
-            }
-            // calculate expected output shape
-            orig_padding.insert(orig_padding.begin() + kdims, 2, 0);
-            orig_padding.insert(orig_padding.begin(), 2, 0);
-            op::pad pad{orig_padding, 0.0f};
-            shape padded_shape = pad.compute_shape({l0->get_shape()});
-            // make an op just to get its output shape
-            auto out_lens = make_op("pooling", values).compute_shape({padded_shape}).lens();
-            // compute slice_end information
-            slice_end.resize(slice_start.size());
-            std::transform(out_lens.begin() + 2,
-                           out_lens.end(),
-                           slice_start.begin(),
-                           slice_end.begin(),
-                           [](auto i, auto j) { return i + j; });
-        }
-        values["padding"] = std::vector<size_t>(paddings.begin(), paddings.end());
-        check_asym_padding(info, l0, paddings, values, count_include_pad, pad_val);
-        op.from_value(values);
-        auto l1 = info.add_instruction(op, l0);
-        if(not slice_start.empty())
-        {
-            std::vector<int64_t> axes(kdims);
-            std::iota(axes.begin(), axes.end(), 2);
-            l1 = info.add_instruction(
-                make_op("slice", {{"axes", axes}, {"starts", slice_start}, {"ends", slice_end}}),
-                l1);
-        }
-        return l1;
-    }
 };
 } // namespace onnx

--- a/src/onnx/parse_qlinearglavgpool.cpp
+++ b/src/onnx/parse_qlinearglavgpool.cpp
@@ -23,6 +23,7 @@
 */
 #include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/onnx/pooling.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/op/pooling.hpp>
 #include <migraphx/make_op.hpp>
@@ -36,90 +37,56 @@ namespace onnx {
 /*
 *********************************************************************************
- *  Reference: see QLinearGlobalAveragePool in                                   *
+ *  Reference: see QLinearAveragePool and QLinearGlobalAveragePool in            *
 *  github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md          *
 *********************************************************************************
+ */
-QLinearGlobalAveragePool consumes an input tensor X and applies
+struct parse_qlinearpooling : op_parser<parse_qlinearpooling>
-Average pooling across the values in the same channel. This is
-equivalent to AveragePool with kernel size equal to the spatial
-dimension of input tensor. Input is of type uint8_t or int8_t.
-Version
-This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
-Attributes
-channels_last : int
-Inputs
-X : T
-Input data tensor from the previous operator; According to channels_last, dimensions for image case
-are (N x C x H x W), or (N x H x W x C) where N is the batch size, C is the number of channels, and
-H and W are the height and the width of the data. For non image case, the dimensions are in the form
-of (N x C x D1 x D2 ... Dn), or (N x D1 X D2 ... Dn x C) where N is the batch size.
-x_scale : tensor(float)
-Scale of quantized input 'X'. It must be a scalar.
-x_zero_point : T
-Zero point tensor for input 'X'. It must be a scalar.
-y_scale : tensor(float)
-Scale of quantized output 'Y'. It must be a scalar.
-y_zero_point : T
-Zero point tensor for output 'Y'. It must be a scalar.
-Outputs
-Y : T
-Output data tensor from pooling across the input tensor. The output tensor has the same rank as the
-input. with the N and C value keep it value, while the other dimensions are all 1. Type Constraints
-T : tensor(uint8), tensor(int8)
-Constrain input and output types to signed/unsigned int8 tensors.
-*/
-struct parse_qlinearglobalaveragepool : op_parser<parse_qlinearglobalaveragepool>
 {
-    std::vector<op_desc> operators() const { return {{"QLinearGlobalAveragePool"}}; }
+    std::vector<op_desc> operators() const
-    // basic type checking for QLinearGlobalAveragePool Operator
-    void check_inputs(const std::vector<instruction_ref>& args) const
    {
-        if(args.size() < 5)
+        return {{"QLinearGlobalAveragePool", "average"}, {"QLinearAveragePool", "average"}};
-            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: missing inputs");
+    }
+    // Validates the inputs of a QLinear pooling node: at least four inputs, an
+    // input X of rank > 2 and type int8/uint8, and zero points (args[2], and
+    // args[4] when five inputs are given) of the same type as X. Throws via
+    // MIGRAPHX_THROW, prefixed with the ONNX operator name, on any violation.
+    void check_inputs(const op_desc& opd, const std::vector<instruction_ref>& args) const
+    {
+        // args[0], args[2] (here) and args[3] (in parse) are indexed
+        // unconditionally, so reject shorter input lists before touching them.
+        if(args.size() < 4)
+            MIGRAPHX_THROW(opd.onnx_name + ": missing inputs");
         const auto& in_x     = args[0];
-        const auto& zero_pt_x = args[2];
+        const auto onnx_name = opd.onnx_name;
-        const auto& zero_pt_y = args[4];
         if(in_x->get_shape().ndim() <= 2)
-            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: input dimensions too small");
+            MIGRAPHX_THROW(onnx_name + ": input dimensions too small");
         auto type_x = in_x->get_shape().type();
         if(type_x != migraphx::shape::int8_type and type_x != migraphx::shape::uint8_type)
-            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: unsupported input type");
+            MIGRAPHX_THROW(onnx_name + ": unsupported input type");
+        const auto& zero_pt_x = args[2];
         if(type_x != zero_pt_x->get_shape().type())
-            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: mismatched type: input zero point");
+            MIGRAPHX_THROW(onnx_name + ": mismatched type: input zero point");
+        if(args.size() == 5)
+        {
+            const auto& zero_pt_y = args[4];
             if(type_x != zero_pt_y->get_shape().type())
-            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: mismatched type: output zero point");
+                MIGRAPHX_THROW(onnx_name + ": mismatched type: output zero point");
+        }
     }
+    // Lowers a QLinear pooling node as dequantizelinear -> pooling ->
+    // quantizelinear. Throws when the channels_last attribute is present and
+    // non-zero, since that layout is not supported.
-    instruction_ref parse(const op_desc& /* opd */,
+    instruction_ref parse(const op_desc& opd,
                           const onnx_parser& parser,
                           const onnx_parser::node_info& info,
                           const std::vector<instruction_ref>& args) const
+    {
+        // The attribute is named "channels_last"; the guard must test the same
+        // key that is read below, otherwise the layout check is never reached.
+        if(contains(info.attributes, "channels_last"))
         {
             int channels_last =
                 parser.parse_value(info.attributes.at("channels_last")).template at<int>();
             if(channels_last != 0)
-            MIGRAPHX_THROW(
+                MIGRAPHX_THROW(opd.onnx_name + ": channels_last (N x D1..Dn x C) is not supported");
-                "QLINEARGLOBALAVERAGEPOOL: channels_last (N x D1..Dn x C) is not supported");
+        }
-        check_inputs(args);
+        check_inputs(opd, args);
         // Input: X
@@ -128,21 +95,18 @@ struct parse_qlinearglobalaveragepool : op_parser<parse_qlinearglobalaveragepool
         const auto& zero_pt_x = args[2];
         auto dquant_x         = bcast_qdq_instr("dequantizelinear", in_x, scale_x, zero_pt_x, info);
-        // Output Y = globalaveragepool(X)
+        // Output Y = pooling_op(X)
-        auto op   = migraphx::op::pooling{migraphx::op::pooling_mode::average};
-        auto lens = in_x->get_shape().lens();
-        std::vector<size_t> lengths(lens.begin() + 2, lens.end());
-        op.lengths = lengths;
-        op.padding = std::vector<size_t>(lens.size());
-        auto out_y = info.add_instruction(op, dquant_x);
-        const auto& scale_y   = args[3];
+        auto out_y = add_pooling_op(opd, info, dquant_x);
-        const auto& zero_pt_y = args[4];
-        auto out_quant_y = bcast_qdq_instr("quantizelinear", out_y, scale_y, zero_pt_y, info);
+        const auto& in_scale_y = args[3];
+        // zero_pt for Y is supplied as the last optional argument..
+        if(args.size() == 5)
+            return (bcast_qdq_instr("quantizelinear", out_y, in_scale_y, args[4], info));
-        return out_quant_y;
+        // if no zero_pt: just broadcast the scale..
+        auto bcast_scale_y = bcast_scalar_instr(out_y->get_shape(), in_scale_y, info);
+        return (info.add_instruction(migraphx::make_op("quantizelinear"), out_y, bcast_scale_y));
     }
 };

--- a/src/onnx/parse_qlinearunary.cpp
+++ b/src/onnx/parse_qlinearunary.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/common.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/broadcast_qdq.hpp>
+#include <migraphx/op/pooling.hpp>
+#include <migraphx/instruction.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+/*
+ *********************************************************************************
+ *  Reference: see QLinearSigmoid, QLinearLeakyRelu in                           *
+ *  https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md  *
+ *********************************************************************************
+com.microsoft.QLinearSigmoid
+QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces
+one output data (Tensor) where the function f(x) = quantize(Sigmoid(dequantize(x))), is applied to
+the data tensor elementwise. Where the function Sigmoid(x) = 1 / (1 + exp(-x))
+Version
+This version of the operator has been available since version 1 of the 'com.microsoft' operator
+set.
+*****************************************************************************************************
+com.microsoft.QLinearLeakyRelu
+QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for
+output, and produces one output data (Tensor) where the function f(x) = quantize(alpha *
+dequantize(x)) for dequantize(x) < 0, f(x) = quantize(dequantize(x)) for dequantize(x) >= 0, is
+applied to the data tensor elementwise.
+Version
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+Attributes
+alpha : float
+Coefficient of leakage.
+******************************************************************************************************
+Generic input layout of QLinear unary operators:
+Inputs (4 - 5)
+X : T
+Input tensor
+X_scale : tensor(float)
+Input X's scale. It's a scalar, which means a per-tensor/layer quantization.
+X_zero_point (optional) : T
+Input X's zero point. Default value is 0 if it's not specified. It's a scalar, which means a
+per-tensor/layer quantization.
+Y_scale : tensor(float) Output Y's scale. It's a scalar, which means
+a per-tensor/layer quantization.
+Y_zero_point (optional) : T Output Y's zero point. Default value is
+0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.
+Outputs
+Y : T
+Output tensor
+Type Constraints
+T : tensor(uint8), tensor(int8)
+Constrain input and output types to 8 bit tensors.
+*/
+struct parse_qlinearunary : op_parser<parse_qlinearunary>
+{
+    // Maps each supported ONNX operator name to the operator name that is
+    // loaded and applied between the dequantize and quantize steps.
+    std::vector<op_desc> operators() const
+    {
+        return {{"QLinearSigmoid", "sigmoid"}, {"QLinearLeakyRelu", "leaky_relu"}};
+    }
+    // Requires at least four inputs and an int8/uint8 first input; throws via
+    // MIGRAPHX_THROW otherwise.
+    void check_inputs(const op_desc& opd, const std::vector<instruction_ref>& args) const
+    {
+        if(args.size() < 4)
+            MIGRAPHX_THROW(opd.op_name + ": missing inputs");
+        const auto x_type      = args[0]->get_shape().type();
+        const bool is_8bit_int = (x_type == migraphx::shape::int8_type or
+                                  x_type == migraphx::shape::uint8_type);
+        if(not is_8bit_int)
+            MIGRAPHX_THROW(opd.op_name + ": unsupported input type");
+    }
+    // Emits dequantizelinear(X) -> op -> quantizelinear and returns the final
+    // instruction.
+    instruction_ref parse(const op_desc& opd,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        check_inputs(opd, args);
+        // Dequantize X (args[0]) with its scale (args[1]) and zero point (args[2]).
+        auto x_real = bcast_qdq_instr("dequantizelinear", args[0], args[1], args[2], info);
+        // Apply the unary operator to the dequantized data.
+        auto y_real = info.add_instruction(parser.load(opd.op_name, info), x_real);
+        const auto& y_scale      = args[3];
+        const bool has_y_zero_pt = (args.size() == 5);
+        if(not has_y_zero_pt)
+        {
+            // No output zero point given: quantize with only the broadcast scale.
+            auto y_scale_bcast = bcast_scalar_instr(y_real->get_shape(), y_scale, info);
+            return info.add_instruction(
+                migraphx::make_op("quantizelinear"), y_real, y_scale_bcast);
+        }
+        // The output zero point is the optional fifth input.
+        return bcast_qdq_instr("quantizelinear", y_real, y_scale, args[4], info);
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_scatternd.cpp
+++ b/src/onnx/parse_scatternd.cpp
@@ -39,15 +39,17 @@ struct parse_scatternd : op_parser<parse_scatternd>
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref>& args) const
    {
+        std::string reduction = "none";
        if(contains(info.attributes, "reduction"))
        {
-            if(info.attributes.at("reduction").s() == "add")
+            reduction = info.attributes.at("reduction").s();
-                return info.add_instruction(migraphx::make_op("scatternd_add"), args);
+            if(not contains({"none", "add", "mul", "min", "max"}, reduction))
-            if(info.attributes.at("reduction").s() == "mul")
+            {
-                return info.add_instruction(migraphx::make_op("scatternd_mul"), args);
+                MIGRAPHX_THROW("PARSE_SCATTERND: unsupported reduction mode " + reduction);
+            }
        }
-        return info.add_instruction(migraphx::make_op("scatternd_none"), args);
+        return info.add_instruction(migraphx::make_op("scatternd_" + reduction), args);
    }
 };