Unverified Commit a24ed87e authored by Chris Austen's avatar Chris Austen Committed by GitHub
Browse files

Merge branch 'develop' into optimize_jenkinsfile

parents 6481cd69 a09dc502
/* /*
* The MIT License (MIT) * The MIT License (MIT)
* *
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a copy * Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal * of this software and associated documentation files (the "Software"), to deal
...@@ -21,31 +21,32 @@ ...@@ -21,31 +21,32 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE. * THE SOFTWARE.
*/ */
#ifndef MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP #ifndef MIGRAPHX_GUARD_OPERATORS_ISINF_HPP
#define MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP #define MIGRAPHX_GUARD_OPERATORS_ISINF_HPP
#include <migraphx/argument.hpp> #include <migraphx/op/unary.hpp>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <utility>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace op {
struct context; struct isinf : unary<isinf>
struct miopen_int8_conv_pack
{ {
std::string name() const { return "gpu::int8_conv_pack"; } auto apply() const
shape compute_shape(const std::vector<shape>& inputs) const; {
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const; return [&](auto x) { return std::isinf(static_cast<double>(x)); };
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const }
std::string name() const { return "isinf"; }
shape compute_shape(std::vector<shape> inputs) const
{ {
return shapes.size() - 1; return unary<isinf>::compute_shape(std::move(inputs)).with_type(shape::bool_type);
} }
}; };
} // namespace gpu } // namespace op
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
/* /*
* The MIT License (MIT) * The MIT License (MIT)
* *
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a copy * Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal * of this software and associated documentation files (the "Software"), to deal
...@@ -21,11 +21,52 @@ ...@@ -21,11 +21,52 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE. * THE SOFTWARE.
*/ */
/**
* * Multinomial or categorical distribution. Performs a sampling of random input
* and returns a count of
* each category, or bucket. This does not require the standard multinomial
* distribution but instead takes a probability distribution, i.e. cumulative
* distribution function (CDF) as its first input.
*
* Inputs: args[0] - a tensor of probabilities for each category. Values are
* cumulative density function
* totals as provided by operation prefix_scan_sum. Values are
* cumulative probabilities (i.e. start with any set of numbers > 0
* and then apply prefix_scan_sum). Values do not need to be
* normalized to sum to 1; this is done in runtime computation.
*
* This input has Rank 2. Dimension 0 is batch #, so that there can be
* a different CDF for each iteration in the batch. The size of dimension
* 1 is the number of categories.
*
* args[1] - a tensor of random numbers. The last dimension is the sample
* size, i.e. the number of
* random samples in each iteration of the batch. Nominally
* has two dimensions where the first dimension is batch size, but
* any reshaping such that the total
* number of elements is (batch_size * sample_size) is legal.
*
* Values as created by a std::mt19937 like this:
*
* size_t sample_size = 100000;
* float seed = 0.0f;
* std::mt19937 gen(seed);
* std::uniform_real_distribution<> dis(0.0, 1.0);
* std::vector<float> rand_samples(sample_size);
* std::generate(rand_samples.begin(), rand_samples.end(), [&]() { return
* dis(gen); });
*
* Output: A 2D vector of category each input. Dimensions are (Input 1[first], Input
2[last]).
*
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP #ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP #define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp> #include <migraphx/argument.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/dyn_output.hpp>
#include <migraphx/par_for.hpp> #include <migraphx/par_for.hpp>
#include <migraphx/reflect.hpp> #include <migraphx/reflect.hpp>
#include <random> #include <random>
...@@ -47,22 +88,35 @@ struct multinomial ...@@ -47,22 +88,35 @@ struct multinomial
std::string name() const { return "multinomial"; } std::string name() const { return "multinomial"; }
shape compute_shape(std::vector<shape> inputs) const shape compute_shape(std::vector<shape> inputs) const
{ {
check_shapes{inputs, *this}.has(2).only_dims(2); check_shapes{inputs, *this, true}.has(2).only_dims(2);
size_t sample_size = inputs.back().lens().back();
if(not contains({shape::int32_type, shape::int64_type}, dtype)) if(inputs.back().ndim() < 1)
MIGRAPHX_THROW( MIGRAPHX_THROW("Multinomial: Second input shape (sample) has no dimensions");
"Multinomial: Invalid output type. Valid types are int32_type and int64_type."); if(dtype == shape::bool_type)
MIGRAPHX_THROW("Multinomial: boolean output type invalid.");
return {dtype, {inputs.front().lens().front(), sample_size}}; // Output takes one dimension from each of the two input shapes. If they are both fixed,
// return a static shape
if((not inputs.front().dynamic()) or (inputs.front().dyn_dims().front().is_fixed()))
{
if((not inputs.back().dynamic()) or (inputs.back().dyn_dims().back().is_fixed()))
{
size_t batch = {inputs.front().max_lens().front()};
size_t sample_size{inputs.back().max_lens().back()};
return {dtype, {batch, sample_size}};
}
}
return {dtype,
{inputs.front().to_dynamic().dyn_dims().front(),
inputs.back().to_dynamic().dyn_dims().back()}};
} }
argument compute(const shape& output_shape, std::vector<argument> args) const argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{ {
argument result{output_shape}; argument result{dyn_out.computed_shape};
size_t batch_size = output_shape.lens().front(); size_t batch_size = dyn_out.computed_shape.lens().front();
size_t class_size = args[0].get_shape().lens().back(); size_t class_size = args[0].get_shape().lens().back();
size_t sample_size = output_shape.lens().back(); size_t sample_size = dyn_out.computed_shape.lens().back();
visit_all(args[0], args[1])([&](auto cdf, auto dist) { visit_all(args[0], args[1])([&](auto cdf, auto dist) {
result.visit([&](auto output) { result.visit([&](auto output) {
...@@ -70,13 +124,16 @@ struct multinomial ...@@ -70,13 +124,16 @@ struct multinomial
auto idx = args[1].get_shape().multi(i); auto idx = args[1].get_shape().multi(i);
auto cdf_begin = cdf.begin() + (idx[0] * class_size); auto cdf_begin = cdf.begin() + (idx[0] * class_size);
auto cdf_end = cdf_begin + class_size; auto cdf_end = cdf_begin + class_size;
// std::upper_bound returns an iterator to the bucket the value belongs in,
// when normalized by the probability distribution dist
auto sample_iter = auto sample_iter =
std::upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end))); std::upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
// convert iterator to an integer index
output[i] = std::distance(cdf_begin, sample_iter); output[i] = std::distance(cdf_begin, sample_iter);
}); });
}); });
}); });
return result; return result;
} }
}; };
......
/* /*
* The MIT License (MIT) * The MIT License (MIT)
* *
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a copy * Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal * of this software and associated documentation files (the "Software"), to deal
...@@ -21,25 +21,30 @@ ...@@ -21,25 +21,30 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE. * THE SOFTWARE.
*/ */
#include <migraphx/gpu/gather.hpp> #ifndef MIGRAPHX_GUARD_OPERATORS_NEARBYINT_HPP
#include <migraphx/gpu/context.hpp> #define MIGRAPHX_GUARD_OPERATORS_NEARBYINT_HPP
#include <migraphx/gpu/device/gather.hpp>
#include <migraphx/op/unary.hpp>
#include <migraphx/config.hpp>
#include <fenv.h>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace op {
struct nearbyint : unary<nearbyint>
shape hip_gather::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.normalize_compute_shape(inputs);
}
argument hip_gather::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{ {
return device::gather(ctx.get_stream().get(), args.back(), args[0], args[1], op.axis); auto apply() const
} {
return [](auto x) {
} // namespace gpu auto rounding_mode = fegetround();
fesetround(FE_TONEAREST);
return std::nearbyint(x);
fesetround(rounding_mode);
};
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
#endif
...@@ -40,6 +40,8 @@ namespace op { ...@@ -40,6 +40,8 @@ namespace op {
* 2. use_rank (default) vs use_len: * 2. use_rank (default) vs use_len:
* `use_rank` sets the max value/index of the attribute as the rank of lens. * `use_rank` sets the max value/index of the attribute as the rank of lens.
* `use_lens` sets the max value/index as the corresponding value in lens at the axes index. * `use_lens` sets the max value/index as the corresponding value in lens at the axes index.
* Uses the dynamic_dimension.max value for dynamic shapes. Returns the original vector
* (no normalization) if any of dynamic_dimension[axes] are not fixed.
* 3. `clip_min` vs. `not_clip_min` (default): * 3. `clip_min` vs. `not_clip_min` (default):
* Clip values less than the minimum to the minimum or not. * Clip values less than the minimum to the minimum or not.
* 4. `include_min` vs. `exclude_min` (default): * 4. `include_min` vs. `exclude_min` (default):
......
...@@ -70,7 +70,8 @@ struct pooling ...@@ -70,7 +70,8 @@ struct pooling
// 2 smaller than the input tensor rank (NCHW layout) // 2 smaller than the input tensor rank (NCHW layout)
std::vector<std::size_t> lengths = {1, 1}; std::vector<std::size_t> lengths = {1, 1};
// Dilations are not supported at this time. // Spacing between the elements of the pooling kernel. Must be the same ndim as lengths.
std::vector<std::size_t> dilations = {1, 1};
// ceiling mode is a flag affecting output size // ceiling mode is a flag affecting output size
// or equivalently, placements of the pooling kernel. // or equivalently, placements of the pooling kernel.
...@@ -99,6 +100,7 @@ struct pooling ...@@ -99,6 +100,7 @@ struct pooling
f(self.padding_mode, "padding_mode"), f(self.padding_mode, "padding_mode"),
f(self.stride, "stride"), f(self.stride, "stride"),
f(self.lengths, "lengths"), f(self.lengths, "lengths"),
f(self.dilations, "dilations"),
f(self.ceil_mode, "ceil_mode"), f(self.ceil_mode, "ceil_mode"),
f(self.lp_order, "lp_order"), f(self.lp_order, "lp_order"),
f(self.dyn_global, "dyn_global")); f(self.dyn_global, "dyn_global"));
...@@ -112,14 +114,17 @@ struct pooling ...@@ -112,14 +114,17 @@ struct pooling
return; return;
if((padding_mode != default_ and padding.size() != stride.size() and if((padding_mode != default_ and padding.size() != stride.size() and
(padding.size()) != stride.size() * 2) or (padding.size()) != stride.size() * 2) or
stride.size() != lengths.size()) stride.size() != lengths.size() or dilations.size() != lengths.size())
{ {
MIGRAPHX_THROW("POOLING: inconsistent attribute sizes"); MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
} }
if(std::any_of(lengths.begin(), lengths.end(), [&](auto i) { return (i == 0); }) or
std::any_of(stride.begin(), stride.end(), [&](auto i) { return (i == 0); })) const auto is_zero = [](auto el) { return el == 0; };
if(std::any_of(lengths.begin(), lengths.end(), is_zero) or
std::any_of(stride.begin(), stride.end(), is_zero) or
std::any_of(dilations.begin(), dilations.end(), is_zero))
{ {
MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride"); MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride or dilations");
} }
// TODO: update lowering to run the reference // TODO: update lowering to run the reference
...@@ -142,6 +147,11 @@ struct pooling ...@@ -142,6 +147,11 @@ struct pooling
value attributes() const { return {{"normalize_padding", "padding"}}; } value attributes() const { return {{"normalize_padding", "padding"}}; }
inline std::size_t dilate_dim(std::size_t dim, std::size_t dilation) const
{
return 1 + dilation * (dim - 1);
}
std::vector<std::size_t> calc_spatial_dim_out(const std::vector<std::size_t>& input_lens, std::vector<std::size_t> calc_spatial_dim_out(const std::vector<std::size_t>& input_lens,
std::size_t kdims) const std::size_t kdims) const
{ {
...@@ -151,8 +161,9 @@ struct pooling ...@@ -151,8 +161,9 @@ struct pooling
std::size_t padding_factor = 2 * padding[i]; std::size_t padding_factor = 2 * padding[i];
if(padding.size() == 2 * kdims) if(padding.size() == 2 * kdims)
padding_factor = padding[i] + padding[i + kdims]; padding_factor = padding[i] + padding[i + kdims];
std::size_t dilated_length = dilate_dim(lengths[i], dilations[i]);
std::size_t dim_size; std::size_t dim_size;
if(input_lens[i + 2] + padding_factor < lengths[i]) if(input_lens[i + 2] + padding_factor < dilated_length)
{ {
if(padding_mode == default_) if(padding_mode == default_)
MIGRAPHX_THROW("POOLING: not enough padding for the given kernel size"); MIGRAPHX_THROW("POOLING: not enough padding for the given kernel size");
...@@ -162,7 +173,7 @@ struct pooling ...@@ -162,7 +173,7 @@ struct pooling
} }
else else
{ {
dim_size = input_lens[i + 2] + padding_factor - lengths[i]; dim_size = input_lens[i + 2] + padding_factor - dilated_length;
} }
std::size_t len = std::size_t len =
(ceil_mode) (ceil_mode)
...@@ -331,6 +342,7 @@ struct pooling ...@@ -331,6 +342,7 @@ struct pooling
int start = static_cast<int>(idx_o[dim] * stride[d_2]) - int start = static_cast<int>(idx_o[dim] * stride[d_2]) -
static_cast<int>(padding_vals[d_2]); static_cast<int>(padding_vals[d_2]);
int end; int end;
std::size_t dilated_kernel_dim = dilate_dim(kernel_dims[d_2], dilations[d_2]);
// NOLINT // NOLINT
if(count_include_pad and ceil_mode and (mode != pooling_mode::max)) if(count_include_pad and ceil_mode and (mode != pooling_mode::max))
{ {
...@@ -340,15 +352,14 @@ struct pooling ...@@ -340,15 +352,14 @@ struct pooling
// padding. Clip out-of-bounds indexes but not padding. // padding. Clip out-of-bounds indexes but not padding.
// Check if this kernel extends beyond the padding at end of dimension // Check if this kernel extends beyond the padding at end of dimension
end = std::min(start + kernel_dims[d_2], end = std::min(start + dilated_kernel_dim,
in_lens[dim] + static_cast<int>(padding_vals[d_2])); in_lens[dim] + static_cast<int>(padding_vals[d_2]));
} }
else else
{ {
// In non-ceiling mode, when // In non-ceiling mode, when
// count_include_pad is false, or for max pooling, clip off padding. // count_include_pad is false, or for max pooling, clip off padding.
end = std::min(start + kernel_dims[d_2], in_lens[dim]); end = std::min(start + dilated_kernel_dim, in_lens[dim]);
start = std::max(start, 0);
} }
win_start.push_back(start); win_start.push_back(start);
if(end < start) if(end < start)
...@@ -366,6 +377,16 @@ struct pooling ...@@ -366,6 +377,16 @@ struct pooling
// for each element in the window... // for each element in the window...
shape_for_each(win_shape, [&](const auto& idx_w) { shape_for_each(win_shape, [&](const auto& idx_w) {
// Skip elements that belong to the dilated area
for(size_t axis = 0; axis < idx_w.size(); ++axis)
{
if(idx_w[axis] % dilations[axis])
{
pool_size -= 1;
return;
}
}
// the coordinates of this element // the coordinates of this element
auto idx = idx_o; auto idx = idx_o;
...@@ -390,7 +411,15 @@ struct pooling ...@@ -390,7 +411,15 @@ struct pooling
// this is a padding element. Padding locations // this is a padding element. Padding locations
// don't contribute to average or max pooling total but can play in // don't contribute to average or max pooling total but can play in
// lpnorm pooling. // lpnorm pooling.
output_val = op(output_val, 0); if(mode == pooling_mode::lpnorm)
{
output_val = op(output_val, op.template init<Type>());
}
if(mode == pooling_mode::average)
{
// Ignore padding
pool_size -= 1;
}
} }
}); });
output[i] = Type(op.final(output_val, pool_size)); output[i] = Type(op.final(output_val, pool_size));
......
...@@ -22,6 +22,12 @@ ...@@ -22,6 +22,12 @@
* THE SOFTWARE. * THE SOFTWARE.
*/ */
/**
* Parent struct for prefix scan ops. A prefix scan is a mathematical entity useful
* in parallelizing various computations. Given a list of numbers, a prefix scan
* op returns an equal size list of running totals of the values. Other operations
* besides addition can be supported by child ops.
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP #ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP #define MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
......
...@@ -30,11 +30,11 @@ ...@@ -30,11 +30,11 @@
#include <migraphx/par_for.hpp> #include <migraphx/par_for.hpp>
#include <migraphx/value.hpp> #include <migraphx/value.hpp>
#include <cmath> #include <cmath>
#include <fenv.h>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace op { namespace op {
struct quantizelinear struct quantizelinear
{ {
std::string name() const { return "quantizelinear"; } std::string name() const { return "quantizelinear"; }
...@@ -71,26 +71,26 @@ struct quantizelinear ...@@ -71,26 +71,26 @@ struct quantizelinear
{ {
y_zero_point = args.at(2); y_zero_point = args.at(2);
} }
argument result{output_shape}; argument result{output_shape};
auto rounding_mode = fegetround();
fesetround(FE_TONEAREST);
visit_all(result, y_zero_point)([&](auto output, auto zero_pts) { visit_all(result, y_zero_point)([&](auto output, auto zero_pts) {
visit_all(x, y_scale)([&](auto input, auto scales) { visit_all(x, y_scale)([&](auto input, auto scales) {
using quant_type = typename decltype(output)::value_type; using quant_type = typename decltype(output)::value_type;
auto min_value = std::numeric_limits<quant_type>::min(); auto min_value = std::numeric_limits<quant_type>::min();
auto max_value = std::numeric_limits<quant_type>::max(); auto max_value = std::numeric_limits<quant_type>::max();
par_for(output_shape.elements(), [&](auto i) { par_for(output_shape.elements(), [&](auto i) {
int64_t quantized = static_cast<int64_t>(std::round(input[i] / scales[i])) + int64_t quantized = static_cast<int64_t>(std::nearbyint(input[i] / scales[i])) +
static_cast<int64_t>(zero_pts[i]); static_cast<int64_t>(zero_pts[i]);
output[i] = std::max(static_cast<int64_t>(min_value), output[i] = std::max(static_cast<int64_t>(min_value),
std::min(static_cast<int64_t>(max_value), quantized)); std::min(static_cast<int64_t>(max_value), quantized));
}); });
}); });
}); });
fesetround(rounding_mode);
return result; return result;
} }
}; };
} // namespace op } // namespace op
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
...@@ -65,11 +65,10 @@ struct random_uniform ...@@ -65,11 +65,10 @@ struct random_uniform
return inputs.at(1); return inputs.at(1);
} }
argument compute(const shape&, std::vector<argument> args) const argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{ {
// Output goes into the passed buffer, not the shape output. // Output goes into the passed buffer, not the shape output.
auto result = args[1]; argument result{dyn_out.computed_shape};
uint64_t local_seed = args[0].at<uint64_t>(0); uint64_t local_seed = args[0].at<uint64_t>(0);
std::mt19937 gen(local_seed); std::mt19937 gen(local_seed);
......
/* /*
* The MIT License (MIT) * The MIT License (MIT)
* *
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a copy * Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal * of this software and associated documentation files (the "Software"), to deal
...@@ -21,25 +21,26 @@ ...@@ -21,25 +21,26 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE. * THE SOFTWARE.
*/ */
#ifndef MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP #ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_MAX_HPP
#define MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP #define MIGRAPHX_GUARD_OPERATORS_SCATTERND_MAX_HPP
#include <migraphx/program.hpp> #include <migraphx/op/scatternd_op.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace op {
namespace gpu { struct scatternd_max : scatternd_op<scatternd_max>
struct MIGRAPHX_GPU_EXPORT pack_int8_args
{ {
std::string name() const { return "gpu::pack_int8_args"; } scatternd_max() {}
void apply(module& m) const;
shape pack_int8_shape(const shape& s) const; auto reduction() const
{
return [](auto& x, const auto& y) { x = std::max(x, y); };
}
}; };
} // namespace gpu } // namespace op
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
/* /*
* The MIT License (MIT) * The MIT License (MIT)
* *
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a copy * Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal * of this software and associated documentation files (the "Software"), to deal
...@@ -21,23 +21,26 @@ ...@@ -21,23 +21,26 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE. * THE SOFTWARE.
*/ */
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP #ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_MIN_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP #define MIGRAPHX_GUARD_OPERATORS_SCATTERND_MIN_HPP
#include <migraphx/argument.hpp> #include <migraphx/op/scatternd_op.hpp>
#include <migraphx/gpu/device/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace op {
namespace device {
argument MIGRAPHX_DEVICE_EXPORT struct scatternd_min : scatternd_op<scatternd_min>
gather(hipStream_t stream, argument result, argument arg1, argument arg2, int64_t axis); {
scatternd_min() {}
} // namespace device auto reduction() const
} // namespace gpu {
return [](auto& x, const auto& y) { x = std::min(x, y); };
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
...@@ -121,7 +121,8 @@ struct scatternd_op : op_name<Derived> ...@@ -121,7 +121,8 @@ struct scatternd_op : op_name<Derived>
auto k = indices_shape.lens().back(); auto k = indices_shape.lens().back();
auto q = indices_shape.ndim(); auto q = indices_shape.ndim();
auto r = dyn_out.computed_shape.ndim(); auto r = dyn_out.computed_shape.ndim();
par_for(updates_shape.elements(), [&](const auto i) { for(auto i = 0u; i < updates_shape.elements(); ++i)
{
auto updates_idx = updates_std.multi(i); auto updates_idx = updates_std.multi(i);
std::vector<std::size_t> indices_idx(q, 0); std::vector<std::size_t> indices_idx(q, 0);
std::copy( std::copy(
...@@ -135,7 +136,7 @@ struct scatternd_op : op_name<Derived> ...@@ -135,7 +136,7 @@ struct scatternd_op : op_name<Derived>
std::copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k); std::copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);
self.reduction()(output[dyn_out.computed_shape.index(out_idx)], updates[i]); self.reduction()(output[dyn_out.computed_shape.index(out_idx)], updates[i]);
}); }
}); });
}); });
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <migraphx/dyn_output.hpp> #include <migraphx/dyn_output.hpp>
#include <migraphx/op/normalize_attribute.hpp> #include <migraphx/op/normalize_attribute.hpp>
#include <migraphx/normalize_attributes.hpp> #include <migraphx/normalize_attributes.hpp>
#include <array>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
...@@ -38,6 +39,18 @@ namespace op { ...@@ -38,6 +39,18 @@ namespace op {
/** /**
* Slice operator that accepts variable axes, starts and ends. * Slice operator that accepts variable axes, starts and ends.
* All of `starts`, `ends`, and `axes` must be supplied by either
* their attribute or an input (but not both).
*
* Valid calls:
* slice(input); axes, starts, ends set
* slice(input, starts); axes, ends set
* slice(input, ends); starts, axes set
* slice(input, axes); starts, ends set
* slice(input, starts, ends); axes set
* slice(input, starts, axes); ends set
* slice(input, ends, axes); starts set
* slice(input, start, ends, axes); none set
* *
* Attributes: * Attributes:
* axes: constant axes to slice over (optional) * axes: constant axes to slice over (optional)
...@@ -46,8 +59,8 @@ namespace op { ...@@ -46,8 +59,8 @@ namespace op {
* *
* Parameters: * Parameters:
* data: the input tensor to slice (dynamic or static shape) * data: the input tensor to slice (dynamic or static shape)
* input_starts: starting indicies of slice (optional, static shape) * input_starts: starting indices of slice (optional, static shape)
* input_ends: ending indicies of slice (optional, static shape) * input_ends: ending indices of slice (optional, static shape)
* input_axes: axes to slice over (optional, static shape) * input_axes: axes to slice over (optional, static shape)
*/ */
struct slice struct slice
...@@ -56,6 +69,18 @@ struct slice ...@@ -56,6 +69,18 @@ struct slice
std::vector<int64_t> starts{}; std::vector<int64_t> starts{};
std::vector<int64_t> ends{}; std::vector<int64_t> ends{};
/**
* Named arrays for the set attribute possibilities.
*/
static constexpr std::array<bool, 3> all_set = {true, true, true};
static constexpr std::array<bool, 3> ends_axes = {false, true, true};
static constexpr std::array<bool, 3> starts_axes = {true, false, true};
static constexpr std::array<bool, 3> starts_ends = {true, true, false};
static constexpr std::array<bool, 3> axes_only = {false, false, true};
static constexpr std::array<bool, 3> ends_only = {false, true, false};
static constexpr std::array<bool, 3> starts_only = {true, false, false};
static constexpr std::array<bool, 3> none_set = {false, false, false};
template <class Self, class F> template <class Self, class F>
static auto reflect(Self& self, F f) static auto reflect(Self& self, F f)
{ {
...@@ -63,24 +88,26 @@ struct slice ...@@ -63,24 +88,26 @@ struct slice
} }
/** /**
* Ensure that attribute vectors axes, starts, and ends are all the same size and values are * Ensure that attribute axes is within limits.
* within limits. * Will attempt to normalize starts and ends; but will use the dynamic_dimension.max
* values for dynamic shapes. This makes it so you have to renormalize for
* non-fixed dynamic_dimensions.
*/ */
value attributes() const value attributes() const
{ {
value normalize = value::object{}; value normalize_axes = value::object{};
normalize["axes"] = value::array{normalize_attribute::include_min}; normalize_axes["axes"] = value::array{normalize_attribute::include_min};
normalize["starts"] = value::array{normalize_attribute::clip_max, normalize_axes["starts"] = value::array{normalize_attribute::clip_max,
normalize_attribute::clip_min, normalize_attribute::clip_min,
normalize_attribute::include_max, normalize_attribute::include_max,
normalize_attribute::use_len, normalize_attribute::use_len,
normalize_attribute::include_min}; normalize_attribute::include_min};
normalize["ends"] = value::array{normalize_attribute::clip_max, normalize_axes["ends"] = value::array{normalize_attribute::clip_max,
normalize_attribute::clip_min, normalize_attribute::clip_min,
normalize_attribute::include_max, normalize_attribute::include_max,
normalize_attribute::use_len, normalize_attribute::use_len,
normalize_attribute::include_min}; normalize_attribute::include_min};
return {{"normalize_axes", normalize}}; return {{"normalize_axes", normalize_axes}};
} }
std::string name() const { return "slice"; } std::string name() const { return "slice"; }
...@@ -88,7 +115,7 @@ struct slice ...@@ -88,7 +115,7 @@ struct slice
/** /**
* Computes the slice output shape dimensions for given starts, ends,and axes. * Computes the slice output shape dimensions for given starts, ends,and axes.
* Templated to also handle tensor views. * Templated to also handle tensor views.
* Possibily different type between [in_starts, in_ends] and [in_axes] if in_axes is this * Possibly different type between [in_starts, in_ends] and [in_axes] if in_axes is this
* object's axes attribute. Assumes in_starts and in_ends are normalized; in_axes are valid. * object's axes attribute. Assumes in_starts and in_ends are normalized; in_axes are valid.
*/ */
template <class A, class B> template <class A, class B>
...@@ -104,62 +131,160 @@ struct slice ...@@ -104,62 +131,160 @@ struct slice
return new_lens; return new_lens;
} }
shape normalize_compute_shape(std::vector<shape> inputs) const /// Get the attributes that are non-empty
std::array<bool, 3> get_set_attributes() const
{ {
check_shapes{inputs, *this, true}.has(1, 3, 4); std::array<std::vector<int64_t>, 3> attrs = {this->starts, this->ends, this->axes};
auto input_shape = inputs[0]; std::array<bool, 3> bool_vec;
if(inputs.size() == 1) std::transform(
attrs.cbegin(), attrs.cend(), bool_vec.begin(), [](auto a) { return not a.empty(); });
return bool_vec;
}
/// Helper function for normalize_compute_shape()
shape compute_two_or_more(std::vector<shape> inputs) const
{
auto input_shape = inputs[0];
auto set_attributes = get_set_attributes();
// check that inputs [1, end) are all 1D, have the same
// dimension, and are static
check_shapes{inputs.begin() + 1,
inputs.end(),
std::string("SLICE: inputs (starts, ends, and input_axes)"),
false}
.only_dims(1)
.same_dims();
auto dds = input_shape.to_dynamic().dyn_dims();
if(inputs.size() == 2)
{ {
auto t = input_shape.type(); if(set_attributes == ends_axes)
if(input_shape.dynamic() and std::any_of(axes.begin(), axes.end(), [&](auto axis) {
return not input_shape.dyn_dims()[axis].is_fixed();
}))
{ {
MIGRAPHX_THROW("SLICE: slicing is not allowed on non-fixed dynamic input axis "); // attr ends and axes set; inputs are (data, input_starts)
if(inputs[1].lens().at(0) != axes.size())
{
MIGRAPHX_THROW("SLICE: 2 input and attributes mismatch");
}
std::for_each(axes.cbegin(), axes.cend(), [&](const auto& axis) {
dds.at(axis) = {0, dds.at(axis).max};
});
} }
if(input_shape.dynamic()) else if(set_attributes == starts_axes)
{ {
return shape{t, // attr starts and axes set; inputs are (data, input_ends)
lens_calc(input_shape.min_lens(), starts, ends, axes), if(inputs[1].lens().at(0) != axes.size())
lens_calc(input_shape.max_lens(), starts, ends, axes), {
{}}; MIGRAPHX_THROW("SLICE: 2 input and attributes mismatch");
}
std::for_each(axes.cbegin(), axes.cend(), [&](const auto& axis) {
dds.at(axis) = {0, dds.at(axis).max};
});
}
else if(set_attributes == starts_ends)
{
// attr starts and ends set; inputs are (data, input_axes)
if(inputs[1].lens().at(0) != starts.size())
{
MIGRAPHX_THROW("SLICE: 2 input and attributes mismatch");
}
std::transform(dds.begin(), dds.end(), dds.begin(), [](auto dd) {
return shape::dynamic_dimension{0, dd.max};
});
} }
else else
{ {
return shape{ MIGRAPHX_THROW("SLICE: Invalid 2 input and attributes configuration");
t, lens_calc(input_shape.lens(), starts, ends, axes), input_shape.strides()};
} }
} }
else else if(inputs.size() == 3)
{ {
// check that starts, ends, and optionally input_axes are all 1D, have the same if(set_attributes == axes_only)
// dimension, and are static
check_shapes{inputs.begin() + 1,
inputs.end(),
std::string("SLICE: inputs (starts, ends, and input_axes)"),
false}
.only_dims(1)
.same_dims();
auto dds = input_shape.to_dynamic().dyn_dims();
if(inputs.size() == 3)
{ {
// attr axes set; inputs are (data, input_starts, input_ends)
if(inputs[1].lens().at(0) != axes.size()) if(inputs[1].lens().at(0) != axes.size())
{ {
MIGRAPHX_THROW("SLICE: inputs starts and ends do not have the same dimension " MIGRAPHX_THROW("SLICE: 3 input and attributes mismatch");
"as the axes attribute");
} }
std::for_each(axes.cbegin(), axes.cend(), [&](const auto& axis) { std::for_each(axes.cbegin(), axes.cend(), [&](const auto& axis) {
dds.at(axis) = {0, dds.at(axis).max}; dds.at(axis) = {0, dds.at(axis).max};
}); });
} }
else else if(set_attributes == ends_only)
{
// attr ends set; inputs are (data, input_starts, input_axes)
if(inputs[1].lens().at(0) != ends.size())
{
MIGRAPHX_THROW("SLICE: 3 input and attributes mismatch");
}
std::transform(dds.begin(), dds.end(), dds.begin(), [](auto dd) {
return shape::dynamic_dimension{0, dd.max};
});
}
else if(set_attributes == starts_only)
{ {
// if axes is an input, then all the output dimensions could be 0 to the max value // attr starts set; inputs are (data, input_ends, input_axes)
if(inputs[1].lens().at(0) != starts.size())
{
MIGRAPHX_THROW("SLICE: 3 input and attributes mismatch");
}
std::transform(dds.begin(), dds.end(), dds.begin(), [](auto dd) { std::transform(dds.begin(), dds.end(), dds.begin(), [](auto dd) {
return shape::dynamic_dimension{0, dd.max}; return shape::dynamic_dimension{0, dd.max};
}); });
} }
return shape{input_shape.type(), dds}; else
{
MIGRAPHX_THROW("Invalid 3 input and attributes configuration");
}
}
else
{
// all 4 inputs (data, inputs_starts, input_ends, input_axes)
std::transform(dds.begin(), dds.end(), dds.begin(), [](auto dd) {
return shape::dynamic_dimension{0, dd.max};
});
}
return shape{input_shape.type(), dds};
}
// uses the normalize_axes flag to normalize axes, starts, and ends
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this, true}.has(1, 2, 3, 4);
if(inputs.size() == 1)
{
auto input_shape = inputs[0];
auto set_attributes = get_set_attributes();
if(set_attributes != all_set)
{
MIGRAPHX_THROW("SLICE 1_arg: Invalid 1 input and attributes configuration");
}
// NOTE: make sure to update how normalization works here if this type of slicing is
// changed to be allowed
if(input_shape.dynamic() and std::any_of(axes.begin(), axes.end(), [&](auto axis) {
return not input_shape.dyn_dims()[axis].is_fixed();
}))
{
MIGRAPHX_THROW(
"SLICE 1_arg: slicing is not allowed on non-fixed dynamic input axis ");
}
if(input_shape.dynamic())
{
return shape{
input_shape.type(),
lens_calc(input_shape.min_lens(), this->starts, this->ends, this->axes),
lens_calc(input_shape.max_lens(), this->starts, this->ends, this->axes),
{}};
}
else
{
return shape{input_shape.type(),
lens_calc(input_shape.lens(), this->starts, this->ends, this->axes),
input_shape.strides()};
}
}
else
{
return compute_two_or_more(inputs);
} }
} }
...@@ -194,14 +319,14 @@ struct slice ...@@ -194,14 +319,14 @@ struct slice
/** /**
* Calculates the starting offset for the sliced tensor (for aliasing). * Calculates the starting offset for the sliced tensor (for aliasing).
 * Used when the starts and/or the axes are inputs. * Used for 2-4 inputs to `slice`.
* *
* \param s static input shape * \param s static input shape
* \param input_starts starting indices of slice * \param input_starts starting indices of slice
* \param ax_vec axes to slice on * \param ax_vec axes to slice on
*/ */
template <class IndView, class Axes> template <class T>
auto compute_offset(const shape& s, const IndView& input_starts, const Axes& ax_vec) const auto compute_offset(const shape& s, const T& input_starts, const T& ax_vec) const
{ {
auto ret = 0; auto ret = 0;
for(std::size_t i = 0; i < ax_vec.size(); ++i) for(std::size_t i = 0; i < ax_vec.size(); ++i)
...@@ -212,106 +337,168 @@ struct slice ...@@ -212,106 +337,168 @@ struct slice
return ret * s.type_size(); return ret * s.type_size();
} }
std::unordered_map<std::string, std::vector<int64_t>>
normalize_inputs(const shape& input_shape,
const std::vector<int64_t>& input_starts,
const std::vector<int64_t>& input_ends) const
{
auto attrs = this->attributes().at("normalize_axes");
return {{"input_starts",
normalize_indices(input_starts,
this->axes,
input_shape,
attrs.at("starts"),
"Slice variable input_starts")},
{"input_ends",
normalize_indices(input_ends,
this->axes,
input_shape,
attrs.at("ends"),
"Slice variable input_ends")}};
}
/** /**
* Three input version of the normalize_inputs. * If given, normalize the inputs. Otherwise get from operator attributes.
* This one also checks that the input_axes are valid. * Return the values in a map.
*
* Parameters
* input_shape: static shape of the input
* input_starts: optional
* input_ends: optional
* input_ends: optional
*/ */
std::unordered_map<std::string, std::vector<int64_t>> std::unordered_map<std::string, std::vector<int64_t>>
normalize_inputs(shape input_shape, normalize_starts_ends_axes(shape input_shape,
const std::vector<int64_t>& input_starts, const optional<std::vector<int64_t>>& input_starts,
const std::vector<int64_t>& input_ends, const optional<std::vector<int64_t>>& input_ends,
const std::vector<int64_t>& input_axes) const const optional<std::vector<int64_t>>& input_axes) const
{ {
auto attrs = this->attributes().at("normalize_axes"); auto axes_attrs = this->attributes().at("normalize_axes");
auto norm_axes = std::vector<int64_t> norm_starts;
normalize_axes(input_axes, input_shape, attrs.at("axes"), "Slice variable input_axes"); std::vector<int64_t> norm_ends;
return {{"input_starts", std::vector<int64_t> norm_axes;
normalize_indices(input_starts, if(input_axes)
norm_axes, {
input_shape, norm_axes = normalize_axes(input_axes.value(),
attrs.at("starts"), input_shape,
"Slice variable input_starts")}, axes_attrs.at("axes"),
{"input_ends", "Slice variable input_axes");
normalize_indices(input_ends, }
norm_axes, else
input_shape, {
attrs.at("ends"), norm_axes = this->axes;
"Slice variable input ends")}, }
{"input_axes", norm_axes}}; if(input_starts)
{
norm_starts = normalize_indices(input_starts.value(),
norm_axes,
input_shape,
axes_attrs.at("starts"),
"Slice variable input_starts");
}
else
{
norm_starts = this->starts;
}
if(input_ends)
{
norm_ends = normalize_indices(input_ends.value(),
norm_axes,
input_shape,
axes_attrs.at("ends"),
"Slice variable input ends");
}
else
{
norm_ends = this->ends;
}
return {{"norm_starts", norm_starts}, {"norm_ends", norm_ends}, {"norm_axes", norm_axes}};
} }
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{ {
auto input = args[0]; auto input = args[0];
auto input_shape = input.get_shape(); auto input_shape = input.get_shape();
switch(args.size()) if(args.size() == 1)
{ {
case 1: {
std::size_t offset = compute_offset(input_shape); std::size_t offset = compute_offset(input_shape);
return {dyn_out.computed_shape, [=] { return input.data() + offset; }}; return {dyn_out.computed_shape, [=] { return input.data() + offset; }};
} }
case 3: { else
shape calc_shape; {
std::size_t offset = 0; // Note that we re-normalize both the attributes and inputs because of the non-fixed
visit_all(args[1], args[2])([&](auto input_starts, auto input_ends) { // dynamic input shape case. It's possible to only re-normalize if slicing over
auto norm_inputs = normalize_inputs(input_shape, // non-fixed dynamic_dimensions.
input_starts.template to_vector<int64_t>(), auto set_attributes = get_set_attributes();
input_ends.template to_vector<int64_t>()); std::unordered_map<std::string, std::vector<int64_t>> norm_inputs;
offset = compute_offset(input_shape, norm_inputs.at("input_starts"), this->axes); if(set_attributes == ends_axes)
calc_shape = {input_shape.type(), {
lens_calc(input_shape.lens(), // attr ends and axes set; inputs are (data, input_starts)
norm_inputs.at("input_starts"), args[1].visit([&](auto input_starts) {
norm_inputs.at("input_ends"), norm_inputs =
this->axes), normalize_starts_ends_axes(input_shape,
input_shape.strides()}; input_starts.template to_vector<int64_t>(),
}); this->ends,
return {calc_shape, [=] { return input.data() + offset; }}; this->axes);
} });
case 4: { }
shape calc_shape; else if(set_attributes == starts_axes)
std::size_t offset = 0; {
visit_all(args[1], args[2], args[3])( // attr starts and axes set; inputs are (data, input_ends)
[&](auto input_starts, auto input_ends, auto input_axes) { args[1].visit([&](auto input_ends) {
auto norm_inputs = normalize_inputs(input_shape, norm_inputs =
input_starts.template to_vector<int64_t>(), normalize_starts_ends_axes(input_shape,
input_ends.template to_vector<int64_t>(), this->starts,
input_axes.template to_vector<int64_t>()); input_ends.template to_vector<int64_t>(),
offset = compute_offset( this->axes);
input_shape, norm_inputs.at("input_starts"), norm_inputs.at("input_axes")); });
calc_shape = shape{input_shape.type(), }
lens_calc(input_shape.lens(), else if(set_attributes == starts_ends)
norm_inputs.at("input_starts"), {
norm_inputs.at("input_ends"), // attr starts and ends set; inputs are (data, input_axes)
norm_inputs.at("input_axes")), args[1].visit([&](auto input_axes) {
input_shape.strides()}; norm_inputs =
normalize_starts_ends_axes(input_shape,
this->starts,
this->ends,
input_axes.template to_vector<int64_t>());
}); });
}
else if(set_attributes == axes_only)
{
// attr axes set; inputs are (data, input_starts, input_ends)
visit_all(args[1], args[2])([&](auto input_starts, auto input_ends) {
norm_inputs =
normalize_starts_ends_axes(input_shape,
input_starts.template to_vector<int64_t>(),
input_ends.template to_vector<int64_t>(),
this->axes);
});
}
else if(set_attributes == ends_only)
{
// attr ends set; inputs are (data, input_starts, input_axes)
visit_all(args[1], args[2])([&](auto input_starts, auto input_axes) {
norm_inputs =
normalize_starts_ends_axes(input_shape,
input_starts.template to_vector<int64_t>(),
this->ends,
input_axes.template to_vector<int64_t>());
});
}
else if(set_attributes == starts_only)
{
// attr starts set; inputs are (data, input_ends, input_axes)
visit_all(args[1], args[2])([&](auto input_ends, auto input_axes) {
norm_inputs =
normalize_starts_ends_axes(input_shape,
this->starts,
input_ends.template to_vector<int64_t>(),
input_axes.template to_vector<int64_t>());
});
}
else
{
// no attr set, all inputs
visit_all(args[1], args[2], args[3])(
[&](auto input_starts, auto input_ends, auto input_axes) {
norm_inputs =
normalize_starts_ends_axes(input_shape,
input_starts.template to_vector<int64_t>(),
input_ends.template to_vector<int64_t>(),
input_axes.template to_vector<int64_t>());
});
}
auto offset = compute_offset(
input_shape, norm_inputs.at("norm_starts"), norm_inputs.at("norm_axes"));
shape calc_shape = shape{input_shape.type(),
lens_calc(input_shape.lens(),
norm_inputs.at("norm_starts"),
norm_inputs.at("norm_ends"),
norm_inputs.at("norm_axes")),
input_shape.strides()};
return {calc_shape, [=] { return input.data() + offset; }}; return {calc_shape, [=] { return input.data() + offset; }};
} }
default: {
// Should never get here; covering in case some code change occurs
MIGRAPHX_THROW("SLICE: invalid number of inputs");
}
}
} }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; } std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <migraphx/stringutils.hpp> #include <migraphx/stringutils.hpp>
#include <migraphx/value.hpp> #include <migraphx/value.hpp>
#include <migraphx/dyn_output.hpp> #include <migraphx/dyn_output.hpp>
#include <migraphx/par.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
...@@ -84,10 +85,10 @@ struct unary : op_name<Derived> ...@@ -84,10 +85,10 @@ struct unary : op_name<Derived>
argument result{dyn_out.computed_shape}; argument result{dyn_out.computed_shape};
result.visit([&](auto output) { result.visit([&](auto output) {
args[0].visit([&](auto input) { args[0].visit([&](auto input) {
std::transform(input.begin(), par_transform(input.begin(),
input.end(), input.end(),
output.begin(), output.begin(),
static_cast<const Derived&>(*this).apply()); static_cast<const Derived&>(*this).apply());
}); });
}); });
return result; return result;
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
#define MIGRAPHX_GUARD_OPERATORS_UNIQUE_HPP
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/tune_axis.hpp>
#include <utility>
#include <map>
#include <limits>
#include <optional>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// https://onnx.ai/onnx/operators/onnx__Unique.html
// The Onnx spec refers to numpy specification, used as a reference:
// https://numpy.org/doc/stable/reference/generated/numpy.unique.html
// Input : Given an array of elements : X.
// Output(s) :
// 1. Find the unique elements (Y) of input (X).
//
// There are three outputs in addition to the unique elements in Y:
// 2. the indices of the input array that give the unique values
// 3. the indices of the unique array that reconstruct the input array
// 4. the number of times each unique value comes up in the input array
// Optional Attribute: 'Sorted' = 1 for sorted; = 0 for unsorted.
// Onnx specification makes 'sorted' a default, while Numpy always sorts.
//
// Optional Attribute: 'Axis' is 'None' (default) or a valid int < rank(X).
// Negative values are allowed.
//
// Numpy has the following important note on Axis:
// ------------------------------------------------------------------
// When an axis is specified the subarrays indexed by the axis are
// sorted. This is done by making the specified axis the first
// dimension of the array (move the axis to the first dimension to
// keep the order of the other axes) and then flattening the subarrays
// in C order. The flattened subarrays are then viewed as a structured
// type with each element given a label, with the effect that we end
// up with a 1-D array of structured types that can be treated in the
// same way as any other 1-D array. The result is that the flattened
// subarrays are sorted in lexicographic order starting with the first
// element.
// ------------------------------------------------------------------
struct unique
{
    /// Builds a "less-than" comparator over chunk start indices into `data`.
    /// A chunk is `chunk_sz` consecutive elements beginning at the given flat
    /// index; chunks are ordered lexicographically (chunk_sz == 1 degenerates
    /// to plain element comparison). NOTE: the returned lambda captures `data`
    /// by reference, so it must not outlive the container it was built from.
    template <class T>
    auto make_idx_less_fn(const T& data, size_t chunk_sz) const
    {
        return [&data, chunk_sz](auto idx1, auto idx2) {
            return std::lexicographical_compare(data.begin() + idx1,
                                                data.begin() + idx1 + chunk_sz,
                                                data.begin() + idx2,
                                                data.begin() + idx2 + chunk_sz);
        };
    }
    // CASE SORTED:
    //
    // To process into a sorted unique series of elements/chunks:
    // Chunk size == 1 means a simple element; >1 means a flat representation.
    // Steps: first go through the input elements/chunks for uniqueness.
    // At the end of this processing, per the sorted sequence of unique elements:
    // update/create data structures: y, y_indices, x_rev_indices, y_count
    //
    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 1;
    // OUTPUT(s): indices..
    // y_indices: [1, 0, 3, 4] --- first incidence, in terms of index in sequence x
    // x_rev_indices: [1, 0, 0, 2, 3, 2] --- x seen in terms of indices of unique sequence y
    // y_count: [2, 1, 2, 1] -- count at each y_index. sum = len(x)
    // NOTE: y [1, 2, 3, 4] --- the unique output is constructed from x[y_indices[...]]

    /// Returns the tuple (y_indices, x_rev_indices, y_count) for the SORTED
    /// case. Uses a std::map keyed by chunk start index with a lexicographic
    /// comparator, so iterating the map yields the unique chunks in sorted
    /// order; a second pass remaps indices to that sorted order.
    template <class T>
    auto sorted_uniq_indices(const T& input_data, size_t chunk_sz) const
    {
        // Per-unique-chunk bookkeeping: insertion-order index (y_idx), index
        // of the first occurrence in x (x_idx), and occurrence count (ct).
        struct y_info
        {
            size_t y_idx;
            size_t x_idx;
            size_t ct = 0;
        };
        auto idx_less_fn = make_idx_less_fn(input_data, chunk_sz);
        std::map<size_t, y_info, decltype(idx_less_fn)> uniq_val_map(idx_less_fn);
        // rv is used for NRVO (named return value optimization) below..
        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>> rv;
        auto& [y_indices, x_rev_indices, y_count] = rv;
        // go through all the elements and find the unique elements..
        size_t count_x = input_data.size();
        for(size_t f_idx = 0, x_idx = 0; f_idx < count_x; f_idx += chunk_sz, x_idx++)
        {
            // insert() is a no-op when an equivalent chunk is already present,
            // so itr always refers to the canonical entry for this chunk
            y_info entry = {.y_idx = uniq_val_map.size(), .x_idx = x_idx};
            auto [itr, added_new] = uniq_val_map.insert({f_idx, entry});
            itr->second.ct++;
            x_rev_indices.push_back(itr->second.y_idx);
        }
        std::vector<std::size_t> y2x_indices(uniq_val_map.size());
        y_indices.resize(uniq_val_map.size());
        y_count.resize(uniq_val_map.size());
        size_t idx = 0;
        // the unique elements are now sorted:
        // post-processing for all the return indices.
        for(const auto& v : uniq_val_map)
        {
            y2x_indices[v.second.y_idx] = idx;
            y_indices[idx] = v.second.x_idx;
            y_count[idx] = v.second.ct;
            idx++;
        }
        // update x_rev_indices as per the sorted order of y_indices
        for(auto& i : x_rev_indices)
            i = y2x_indices[i];
        return rv;
    }
    // CASE UNSORTED:
    //
    // To process into an un-sorted unique series of elements/chunks:
    // For chunk size = 1 is a simple element, else use a flat representation of a tensor obj
    // Go through the input elements/chunks one by one with inline processing of indices..
    // INPUT x: [2, 1, 1, 3, 4, 3], attr_sorted = 0;
    // OUTPUT(s): indices..
    // y_indices: [0, 1, 3, 4] --- first incidence, in terms of index in sequence x
    // x_rev_indices: [0, 1, 1, 2, 3, 2] --- x seen in terms of indices of unique sequence y
    // y_count: [1, 2, 2, 1] -- count at each y_index. sum = len(x)
    // NOTE: y [2, 1, 3, 4] --- the unique output is constructed from x[y_indices[...]]
    // Output data structures: y_indices, x_rev_indices, y_count are processed inline.

    /// Returns the tuple (y_indices, x_rev_indices, y_count) for the UNSORTED
    /// case: unique chunks keep their first-appearance order, so all three
    /// outputs can be built in a single pass.
    template <class T>
    auto unsorted_uniq_indices(const T& input_data, size_t chunk_sz) const
    {
        auto idx_less_fn = make_idx_less_fn(input_data, chunk_sz);
        // maps chunk start index -> position in y (first-appearance order)
        std::map<size_t, size_t, decltype(idx_less_fn)> uniq_val_map(idx_less_fn);
        // rv is used for NRVO (named return value optimization) below..
        std::tuple<std::vector<std::size_t>, std::vector<std::size_t>, std::vector<std::size_t>> rv;
        auto& [y_indices, x_rev_indices, y_count] = rv;
        // go through all the elements and add the unique elements into the map..
        // inline processing for outputs: y_indices, x_rev_indices, y_count
        size_t count_x = input_data.size();
        for(size_t f_idx = 0; f_idx < count_x; f_idx += chunk_sz)
        {
            auto [itr, added_new] = uniq_val_map.insert({f_idx, y_indices.size()});
            if(added_new)
            {
                y_count.push_back(0);
                y_indices.push_back(x_rev_indices.size());
            }
            y_count[itr->second]++;
            x_rev_indices.push_back(itr->second);
        }
        return rv;
    }
    // Axis. Default: none. Range: [-rank, rank-1]
    std::optional<int64_t> axis;
    // Sorted, Default: 1= sorted. 0 = unsorted.
    bool sorted = true;

    /// Reflection for serialization/hashing of the operator's attributes.
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.axis, "axis"), f(self.sorted, "sorted"));
    }
    std::string name() const { return "unique"; }

    /// Computes the (dynamic) output shapes. The number of unique elements is
    /// only known at run time, so the unique dimension is dynamic: [1, N]
    /// where N is the total element count (axis == nullopt) or the size of
    /// axis 0 (only axis 0 is supported). Returns a tuple shape of
    /// (y, y_indices, x_rev_indices, y_count); the three index outputs are 1-D
    /// int64 tensors.
    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has(1);
        auto& sh_x = inputs[0];
        auto lens_x = sh_x.lens();
        size_t dim_x = sh_x.ndim();
        size_t max_uniq_ct = sh_x.elements();
        std::vector<shape::dynamic_dimension> d_out;
        if(axis)
        {
            int64_t t_axis = migraphx::tune_axis(dim_x, *axis, name());
            if(t_axis != 0)
                MIGRAPHX_THROW("Unique: Only supports axis = 0 or None");
            d_out = sh_x.to_dynamic().dyn_dims();
            // only axis = 0 is supported:
            max_uniq_ct = lens_x[0];
            // min = 1 unique element; max = full dimension along axis 0
            d_out[0] = {1, max_uniq_ct};
        }
        else
        {
            d_out.push_back({1, max_uniq_ct});
        }
        shape sh_y = {sh_x.type(), d_out};
        // The three outputted Indices are just 1-D:
        shape sh_idx{shape::int64_type, {d_out[0]}};
        return {{sh_y, sh_idx, sh_idx, sh_idx}};
    }

    /// Evaluates unique: allocates worst-case (element-count sized) flat
    /// buffers for all four outputs, fills them, then reshapes each argument
    /// down to the actual number of unique elements found.
    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
        auto sh_x = args.front().get_shape();
        auto lens_x = sh_x.lens();
        shape output_shape = dyn_out.computed_shape;
        auto vec_ss = output_shape.sub_shapes();
        auto ct_x = sh_x.elements();
        // worst-case sized flat shapes; shrunk after the unique count is known
        shape sh_y = {vec_ss[0].type(), {ct_x}};
        shape sh_idx = {vec_ss[1].type(), {ct_x}};
        shape sh_x_idx = {vec_ss[1].type(), {ct_x}};
        argument res_y{sh_y};
        argument res_y_idx{sh_idx};
        argument res_x_rev_idx{sh_idx};
        argument res_y_ct_idx{sh_idx};
        std::vector<size_t> out_y_idx;
        std::vector<size_t> out_x_rev_idx;
        std::vector<size_t> out_y_ct;
        // If axis is not none, for >1D tensors, we have to consider
        // then, the uniqueness of chunks of sub-tensors: a subsequence of built-ins..
        // For a built-in type, chunk_sz is of course = 1
        size_t chunk_sz = 1;
        if(axis)
            chunk_sz = ct_x / lens_x[0]; // axis = 0 is supported.
        visit_all(args.front(), res_y)([&](auto x, auto y_flat) {
            using o_type = typename decltype(x)::value_type;
            // copy into a plain vector so the index comparators can address it
            std::vector<o_type> x_in(x.begin(), x.end());
            std::tie(out_y_idx, out_x_rev_idx, out_y_ct) =
                sorted ? sorted_uniq_indices(x_in, chunk_sz)
                       : unsorted_uniq_indices(x_in, chunk_sz);
            const auto uniq_ct = out_y_idx.size();
            // construct y from x[indices] in flattened form
            // later we reshape y to the final shape..
            auto y_dst = y_flat.begin();
            for(size_t idx = 0; idx < uniq_ct; idx++)
                y_dst = copy_n(x_in.begin() + out_y_idx[idx] * chunk_sz, chunk_sz, y_dst);
            std::vector<size_t> lens_y;
            // if axis is specified:
            // the output shape keeps the n-1 dimensions of x
            if(axis)
            {
                lens_y = lens_x;
                lens_y[0] = uniq_ct;
            }
            else
            {
                lens_y = {uniq_ct};
            }
            // record the final (shrunk) shapes for the reshapes below
            sh_y = {sh_y.type(), lens_y};
            sh_idx = {sh_idx.type(), {uniq_ct}};
        });
        visit_all(res_y_idx, res_x_rev_idx, res_y_ct_idx)(
            [&](auto y_indices, auto x_rev_indices, auto y_count) {
                std::copy(out_y_idx.begin(), out_y_idx.end(), y_indices.begin());
                std::copy(out_x_rev_idx.begin(), out_x_rev_idx.end(), x_rev_indices.begin());
                std::copy(out_y_ct.begin(), out_y_ct.end(), y_count.begin());
                // x_rev_indices always has one entry per input chunk
                sh_x_idx = {sh_idx.type(), {out_x_rev_idx.size()}};
            });
        return {{res_y.reshape(sh_y),
                 res_y_idx.reshape(sh_idx),
                 res_x_rev_idx.reshape(sh_x_idx),
                 res_y_ct_idx.reshape(sh_idx)}};
    }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
...@@ -84,6 +84,7 @@ ...@@ -84,6 +84,7 @@
#include <migraphx/op/mod.hpp> #include <migraphx/op/mod.hpp>
#include <migraphx/op/mul.hpp> #include <migraphx/op/mul.hpp>
#include <migraphx/op/multibroadcast.hpp> #include <migraphx/op/multibroadcast.hpp>
#include <migraphx/op/nearbyint.hpp>
#include <migraphx/op/neg.hpp> #include <migraphx/op/neg.hpp>
#include <migraphx/op/nonmaxsuppression.hpp> #include <migraphx/op/nonmaxsuppression.hpp>
#include <migraphx/op/nonzero.hpp> #include <migraphx/op/nonzero.hpp>
...@@ -110,7 +111,6 @@ ...@@ -110,7 +111,6 @@
#include <migraphx/op/rnn_variable_seq_lens.hpp> #include <migraphx/op/rnn_variable_seq_lens.hpp>
#include <migraphx/op/rnn_var_sl_last_output.hpp> #include <migraphx/op/rnn_var_sl_last_output.hpp>
#include <migraphx/op/roialign.hpp> #include <migraphx/op/roialign.hpp>
#include <migraphx/op/round.hpp>
#include <migraphx/op/rsqrt.hpp> #include <migraphx/op/rsqrt.hpp>
#include <migraphx/op/scalar.hpp> #include <migraphx/op/scalar.hpp>
#include <migraphx/op/scatter_add.hpp> #include <migraphx/op/scatter_add.hpp>
...@@ -119,6 +119,8 @@ ...@@ -119,6 +119,8 @@
#include <migraphx/op/scatternd_add.hpp> #include <migraphx/op/scatternd_add.hpp>
#include <migraphx/op/scatternd_none.hpp> #include <migraphx/op/scatternd_none.hpp>
#include <migraphx/op/scatternd_mul.hpp> #include <migraphx/op/scatternd_mul.hpp>
#include <migraphx/op/scatternd_max.hpp>
#include <migraphx/op/scatternd_min.hpp>
#include <migraphx/op/sigmoid.hpp> #include <migraphx/op/sigmoid.hpp>
#include <migraphx/op/sign.hpp> #include <migraphx/op/sign.hpp>
#include <migraphx/op/sinh.hpp> #include <migraphx/op/sinh.hpp>
...@@ -137,6 +139,7 @@ ...@@ -137,6 +139,7 @@
#include <migraphx/op/unary.hpp> #include <migraphx/op/unary.hpp>
#include <migraphx/op/unary_not.hpp> #include <migraphx/op/unary_not.hpp>
#include <migraphx/op/undefined.hpp> #include <migraphx/op/undefined.hpp>
#include <migraphx/op/unique.hpp>
#include <migraphx/op/unknown.hpp> #include <migraphx/op/unknown.hpp>
#include <migraphx/op/unsqueeze.hpp> #include <migraphx/op/unsqueeze.hpp>
#include <migraphx/op/where.hpp> #include <migraphx/op/where.hpp>
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_MIGRAPHX_PAR_HPP
#define MIGRAPHX_GUARD_MIGRAPHX_PAR_HPP
#include <migraphx/config.hpp>
#if MIGRAPHX_HAS_EXECUTORS
#include <execution>
#else
#include <migraphx/simple_par_for.hpp>
#endif
#include <algorithm>
#include <mutex>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace detail {
/// Thread-safe collector of exceptions thrown by worker tasks.
/// Wrap a callable with collect() so that anything it throws is captured
/// instead of escaping (which would terminate a parallel algorithm), then
/// call throw_if_exception() on the coordinating thread to rethrow.
struct exception_list
{
    std::vector<std::exception_ptr> exceptions;
    std::mutex m;

    /// Record the exception currently being handled; safe to call
    /// concurrently from multiple threads.
    void add_exception()
    {
        const std::lock_guard<std::mutex> lock(m);
        exceptions.emplace_back(std::current_exception());
    }

    /// Wrap `f` so any exception it throws is stored in this list
    /// rather than propagated out of the call.
    template <class F>
    auto collect(F f)
    {
        return [this, f](auto&&... xs) {
            try
            {
                f(std::forward<decltype(xs)>(xs)...);
            }
            catch(...)
            {
                add_exception();
            }
        };
    }

    /// Rethrow the first captured exception, if any. Call only after all
    /// workers have finished (no synchronization is done here).
    void throw_if_exception() const
    {
        if(exceptions.empty())
            return;
        std::rethrow_exception(exceptions.front());
    }
};
} // namespace detail
/// Parallel version of std::transform (unary form).
/// Uses the C++17 parallel execution policy when the toolchain provides it;
/// otherwise falls back to the project's simple_par_for index loop, which
/// requires random-access iterators.
template <class InputIt, class OutputIt, class UnaryOperation>
OutputIt par_transform(InputIt first1, InputIt last1, OutputIt d_first, UnaryOperation unary_op)
{
#if MIGRAPHX_HAS_EXECUTORS
    return std::transform(std::execution::par, first1, last1, d_first, std::move(unary_op));
#else
    const auto count = last1 - first1;
    // each output element is independent, so indices can be processed concurrently
    simple_par_for(count, [&](auto i) { d_first[i] = unary_op(first1[i]); });
    return d_first + count;
#endif
}
/// Parallel version of std::transform (binary form).
/// Uses the C++17 parallel execution policy when available; otherwise falls
/// back to the project's simple_par_for index loop, which requires
/// random-access iterators.
template <class InputIt1, class InputIt2, class OutputIt, class BinaryOperation>
OutputIt par_transform(
    InputIt1 first1, InputIt1 last1, InputIt2 first2, OutputIt d_first, BinaryOperation binary_op)
{
#if MIGRAPHX_HAS_EXECUTORS
    return std::transform(
        std::execution::par, first1, last1, first2, d_first, std::move(binary_op));
#else
    const auto count = last1 - first1;
    // each output element is independent, so indices can be processed concurrently
    simple_par_for(count, [&](auto i) { d_first[i] = binary_op(first1[i], first2[i]); });
    return d_first + count;
#endif
}
/// Parallel version of std::for_each.
/// Uses the C++17 parallel execution policy when available; otherwise falls
/// back to the project's simple_par_for index loop.
template <class InputIt, class UnaryFunction>
void par_for_each(InputIt first, InputIt last, UnaryFunction f)
{
#if MIGRAPHX_HAS_EXECUTORS
    // An exception escaping a parallel algorithm would call std::terminate,
    // so capture exceptions per element and rethrow the first one afterwards.
    detail::exception_list errors;
    std::for_each(std::execution::par, first, last, errors.collect(std::move(f)));
    errors.throw_if_exception();
#else
    simple_par_for(last - first, [&](auto i) { f(first[i]); });
#endif
}
/// std::copy_if, executed with the parallel policy when the toolchain
/// provides C++17 parallel algorithms; otherwise runs sequentially.
template <class... Args>
auto par_copy_if(Args&&... args)
{
#if MIGRAPHX_HAS_EXECUTORS
    return std::copy_if(std::execution::par, std::forward<Args>(args)...);
#else
    return std::copy_if(std::forward<Args>(args)...);
#endif
}
/// std::sort, executed with the parallel policy when the toolchain provides
/// C++17 parallel algorithms; otherwise runs sequentially.
template <class... Args>
auto par_sort(Args&&... args)
{
#if MIGRAPHX_HAS_EXECUTORS
    return std::sort(std::execution::par, std::forward<Args>(args)...);
#else
    return std::sort(std::forward<Args>(args)...);
#endif
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_MIGRAPHX_PAR_HPP
...@@ -24,93 +24,23 @@ ...@@ -24,93 +24,23 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP #ifndef MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP #define MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
#include <thread> #include <migraphx/par.hpp>
#include <cmath> #include <migraphx/ranges.hpp>
#include <algorithm>
#include <vector>
#include <cassert>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
struct joinable_thread : std::thread
{
template <class... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
{
}
joinable_thread& operator=(joinable_thread&& other) = default;
joinable_thread(joinable_thread&& other) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
template <class F>
auto thread_invoke(std::size_t i, std::size_t tid, F f) -> decltype(f(i, tid))
{
f(i, tid);
}
template <class F>
auto thread_invoke(std::size_t i, std::size_t, F f) -> decltype(f(i))
{
f(i);
}
template <class F>
void par_for_impl(std::size_t n, std::size_t threadsize, F f)
{
if(threadsize <= 1)
{
for(std::size_t i = 0; i < n; i++)
thread_invoke(i, 0, f);
}
else
{
std::vector<joinable_thread> threads(threadsize);
// Using const here causes gcc 5 to ICE
#if(!defined(__GNUC__) || __GNUC__ != 5)
const
#endif
std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
std::size_t work = 0;
std::size_t tid = 0;
std::generate(threads.begin(), threads.end(), [=, &work, &tid] {
auto result = joinable_thread([=] {
std::size_t start = work;
std::size_t last = std::min(n, work + grainsize);
for(std::size_t i = start; i < last; i++)
{
thread_invoke(i, tid, f);
}
});
work += grainsize;
++tid;
return result;
});
assert(work >= n);
}
}
template <class F> template <class F>
void par_for(std::size_t n, std::size_t min_grain, F f) void par_for(std::size_t n, F f)
{ {
const auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(), using iterator = basic_iota_iterator<id, std::size_t>;
n / std::max<std::size_t>(1, min_grain)); par_for_each(iterator{0, {}}, iterator{n, {}}, f);
par_for_impl(n, threadsize, f);
} }
template <class F> template <class F>
void par_for(std::size_t n, F f) void par_for(std::size_t n, std::size_t, F f)
{ {
const int min_grain = 8; par_for(n, f);
par_for(n, min_grain, f);
} }
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <string> #include <string>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <migraphx/instruction_ref.hpp>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <migraphx/functional.hpp> #include <migraphx/functional.hpp>
#include <migraphx/errors.hpp> #include <migraphx/errors.hpp>
#include <migraphx/half.hpp> #include <migraphx/half.hpp>
#include <migraphx/float8.hpp>
#include <migraphx/serialize.hpp> #include <migraphx/serialize.hpp>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
...@@ -60,7 +61,8 @@ struct MIGRAPHX_EXPORT shape ...@@ -60,7 +61,8 @@ struct MIGRAPHX_EXPORT shape
m(int32_type, int32_t) \ m(int32_type, int32_t) \
m(int64_type, int64_t) \ m(int64_type, int64_t) \
m(uint32_type, uint32_t) \ m(uint32_type, uint32_t) \
m(uint64_type, uint64_t) m(uint64_type, uint64_t) \
m(fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz)
// clang-format on // clang-format on
#define MIGRAPHX_SHAPE_GENERATE_ENUM_TYPES(x, t) x, #define MIGRAPHX_SHAPE_GENERATE_ENUM_TYPES(x, t) x,
......
...@@ -21,40 +21,98 @@ ...@@ -21,40 +21,98 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE. * THE SOFTWARE.
*/ */
#ifndef MIGRAPHX_GUARD_RTGLIB_PAD_HPP #ifndef MIGRAPHX_GUARD_RTGLIB_SIMPLE_PAR_FOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_PAD_HPP #define MIGRAPHX_GUARD_RTGLIB_SIMPLE_PAR_FOR_HPP
#include <migraphx/argument.hpp> #include <thread>
#include <migraphx/reflect.hpp> #include <cmath>
#include <migraphx/op/pad.hpp> #include <algorithm>
#include <vector>
#include <cassert>
namespace migraphx { namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS { inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context; struct joinable_thread : std::thread
struct hip_pad
{ {
op::pad op; template <class... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
template <class Self, class F>
static auto reflect(Self& self, F f)
{ {
return migraphx::reflect(self.op, f);
} }
std::string name() const { return "gpu::pad"; } joinable_thread& operator=(joinable_thread&& other) = default;
shape compute_shape(std::vector<shape> inputs) const; joinable_thread(joinable_thread&& other) = default;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const; ~joinable_thread()
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{ {
return shapes.size() - 1; if(this->joinable())
this->join();
} }
}; };
} // namespace gpu template <class F>
auto thread_invoke(std::size_t i, std::size_t tid, F f) -> decltype(f(i, tid))
{
f(i, tid);
}
template <class F>
auto thread_invoke(std::size_t i, std::size_t, F f) -> decltype(f(i))
{
f(i);
}
template <class F>
void simple_par_for_impl(std::size_t n, std::size_t threadsize, F f)
{
if(threadsize <= 1)
{
for(std::size_t i = 0; i < n; i++)
thread_invoke(i, 0, f);
}
else
{
std::vector<joinable_thread> threads(threadsize);
// Using const here causes gcc 5 to ICE
#if(!defined(__GNUC__) || __GNUC__ != 5)
const
#endif
std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
std::size_t work = 0;
std::size_t tid = 0;
std::generate(threads.begin(), threads.end(), [=, &work, &tid] {
auto result = joinable_thread([=] {
std::size_t start = work;
std::size_t last = std::min(n, work + grainsize);
for(std::size_t i = start; i < last; i++)
{
thread_invoke(i, tid, f);
}
});
work += grainsize;
++tid;
return result;
});
assert(work >= n);
}
}
template <class F>
void simple_par_for(std::size_t n, std::size_t min_grain, F f)
{
const auto threadsize = std::min<std::size_t>(std::thread::hardware_concurrency(),
n / std::max<std::size_t>(1, min_grain));
simple_par_for_impl(n, threadsize, f);
}
template <class F>
void simple_par_for(std::size_t n, F f)
{
const int min_grain = 8;
simple_par_for(n, min_grain, f);
}
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment