Commit 9bff4331 authored by Paul

Merge

parents 214b313f 94a7f6ee
......@@ -66,6 +66,7 @@ any_ptr get_queue_context(T&)
{
return {};
}
template <class T>
void wait_for_context(T&, any_ptr)
{
......@@ -302,7 +303,7 @@ struct context
PrivateDetailTypeErasedT value,
typename std::enable_if<not std::is_reference<PrivateDetailTypeErasedU>::value,
int>::type* = nullptr) noexcept
: private_detail_te_value(value)
: private_detail_te_value(std::move(value))
{
}
......@@ -412,6 +413,7 @@ inline const ValueType& any_cast(const context& x)
#endif
inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }
inline void migraphx_from_value(const value& v, context& ctx) { ctx.from_value(v); }
#endif
......
......@@ -31,7 +31,7 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
std::vector<char> read_buffer(const std::string& filename);
std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0);
std::string read_string(const std::string& filename);
void write_buffer(const std::string& filename, const char* buffer, std::size_t size);
......
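A hedged usage sketch for the widened read_buffer signature above. The offset/nbytes semantics are inferred from the defaults (offset = 0, nbytes = 0 presumably reading the whole file), and the header path <migraphx/file_buffer.hpp> is assumed:

#include <migraphx/file_buffer.hpp>
#include <string>
#include <vector>

std::vector<char> load_window(const std::string& filename)
{
    // Whole file, as before
    std::vector<char> all = migraphx::read_buffer(filename);
    // Assumed: read 256 bytes starting at byte offset 128
    std::vector<char> window = migraphx::read_buffer(filename, 128, 256);
    return window;
}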
......@@ -25,7 +25,7 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_HALF_HPP
#define MIGRAPHX_GUARD_RTGLIB_HALF_HPP
#include <half.hpp>
#include <half/half.hpp>
#include <migraphx/config.hpp>
namespace migraphx {
......@@ -58,12 +58,12 @@ using deduce = typename detail::deduce<T>::type;
namespace std {
template <class T>
struct common_type<migraphx::half, T> : std::common_type<float, T>
struct common_type<migraphx::half, T> : std::common_type<float, T> // NOLINT
{
};
template <class T>
struct common_type<T, migraphx::half> : std::common_type<float, T>
struct common_type<T, migraphx::half> : std::common_type<float, T> // NOLINT
{
};
......
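What these common_type specializations buy (the NOLINTs silence the usual lint about adding to namespace std, which is legal for program-defined types): mixed half/other arithmetic deduces through float. A minimal self-contained sketch, assuming <migraphx/half.hpp> is the public header for migraphx::half:

#include <type_traits>
#include <migraphx/half.hpp>

static_assert(std::is_same<std::common_type_t<migraphx::half, int>, float>{},
              "half combined with int deduces float");
static_assert(std::is_same<std::common_type_t<double, migraphx::half>, double>{},
              "half combined with double deduces double");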
......@@ -121,6 +121,8 @@ struct instruction
bool can_eval() const;
bool is_undefined() const;
argument eval(bool check_eval = true) const;
void finalize(context& ctx);
......
......@@ -41,7 +41,7 @@ migraphx::instruction* as_address(const instruction_ref& ins) noexcept;
namespace std {
template <>
struct hash<migraphx::instruction_ref>
struct hash<migraphx::instruction_ref> // NOLINT
{
using argument_type = migraphx::instruction_ref;
using result_type = std::size_t;
......@@ -52,7 +52,7 @@ struct hash<migraphx::instruction_ref>
};
template <>
struct equal_to<migraphx::instruction_ref>
struct equal_to<migraphx::instruction_ref> // NOLINT
{
using argument_type = migraphx::instruction_ref;
using result_type = bool;
......
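Context for the NOLINTs above: migraphx::instruction_ref is an iterator type with no std::hash of its own, so these specializations are what let it key unordered containers. A sketch, assuming <migraphx/instruction_ref.hpp> declares the type:

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <migraphx/instruction_ref.hpp>

// Both rely on the hash/equal_to specializations above
using ins_set   = std::unordered_set<migraphx::instruction_ref>;
using ins_names = std::unordered_map<migraphx::instruction_ref, std::string>;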
......@@ -36,22 +36,46 @@ template <class F>
struct layernorm_matcher
{
F f;
auto last_axis() const
{
return make_basic_pred_matcher([](instruction_ref ins) {
auto v = ins->get_operator().to_value();
if(not v.contains("axes"))
return false;
auto axes = v["axes"].to_vector<std::size_t>();
if(axes.size() != 1)
return false;
return axes.front() == ins->inputs().front()->get_shape().lens().size() - 1;
});
}
auto reduce_mean() const { return f("reduce_mean")(last_axis()); }
auto x_minus_mean() const
{
return f("sub")(arg(0)(any().bind("x")), arg(1)(skip_broadcasts(f("reduce_mean"))));
return f("sub")(arg(0)(any().bind("x")), arg(1)(skip_broadcasts(reduce_mean())));
}
auto variance() const
{
return f("reduce_mean")(arg(0)(f("pow")(arg(0)(x_minus_mean()), arg(1)(has_value(2.0f)))));
return reduce_mean()(arg(0)(any_of(
f("pow")(arg(0)(x_minus_mean()), arg(1)(has_value(2.0f))),
f("mul")(arg(0)(x_minus_mean()), arg(1)(x_minus_mean())),
f("sqdiff")(either_arg(0, 1)(any().bind("x"), skip_broadcasts(reduce_mean()))))));
}
auto layernorm_onnx() const
{
    return f("div")(arg(0)(x_minus_mean()),
                    arg(1)(skip_broadcasts(f("sqrt")(arg(0)(
                        f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps"))))))));
}
auto sqrt_add_eps(const std::string& name) const
{
    auto add_eps = f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps")));
    return skip_broadcasts(f(name)(arg(0)(any_of(add_eps, variance()))));
}
auto layernorm_onnx() const
{
    auto div_sqrt  = f("div")(arg(0)(x_minus_mean()), arg(1)(sqrt_add_eps("sqrt")));
    auto mul_rsqrt = f("mul")(either_arg(0, 1)(x_minus_mean(), sqrt_add_eps("rsqrt")));
    return any(any_of(div_sqrt, mul_rsqrt));
}
auto matcher() const { return layernorm_onnx(); }
......
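In effect, the revised matcher accepts three algebraically equivalent spellings of the variance,

Var[x] = E[(x - E[x])^2]            // pow(x - mean, 2)
       = E[(x - E[x]) * (x - E[x])] // mul(x - mean, x - mean)
       = E[sqdiff(x, E[x])]         // fused squared-difference op

and both normalization forms, (x - E[x]) / sqrt(Var[x] + eps) and (x - E[x]) * rsqrt(Var[x] + eps), where reduce_mean must reduce over exactly the last axis and the eps add may be absent.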
......@@ -33,13 +33,14 @@ inline namespace MIGRAPHX_INLINE_NS {
struct module;
/**
* Remove memory allocations. It uses graph coloring to find memory allocations that can be reused.
* Remove multiple memory allocations using graph coloring to find memory allocations that can be
* reused.
*/
struct memory_coloring
{
std::string allocation_op{};
bool verify = false;
std::string name() const { return "memory coloring"; }
std::string name() const { return "memory_coloring"; }
void apply(module& m) const;
};
......
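A minimal sketch of invoking the pass, assuming a populated migraphx::module m; "hip::allocate" is an illustrative name standing in for whatever allocation op the target uses:

#include <migraphx/memory_coloring.hpp>

void reuse_allocations(migraphx::module& m)
{
    // allocation_op tells the pass which instructions represent buffer allocations
    migraphx::memory_coloring pass{"hip::allocate"};
    pass.apply(m);
}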
......@@ -205,6 +205,12 @@ struct module
void print_graph(std::ostream& os, bool brief = false) const;
void print_py(std::ostream& os) const;
std::unordered_map<instruction_ref, std::string>
print_py(std::ostream& os,
const std::string& mname,
std::unordered_map<instruction_ref, std::string> names) const;
void print_cpp(std::ostream& os) const;
std::unordered_map<instruction_ref, std::string>
print_cpp(std::ostream& os,
......
......@@ -44,7 +44,7 @@ struct allocate
std::string name() const { return "allocate"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
migraphx::check_shapes{inputs, *this}.has(0);
migraphx::check_shapes{inputs, *this, true}.has(0);
return s;
}
argument compute(const shape& output_shape, const std::vector<argument>&) const
......
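The check_shapes change here recurs throughout this commit: judging by its usage, the third constructor argument flags that the op tolerates dynamic input shapes, while the two-argument form keeps the old static-only checking. Schematically:

check_shapes{inputs, *this, true}.has(0); // dynamic input shapes permitted
check_shapes{inputs, *this}.has(0);       // static shapes only (prior behavior)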
......@@ -30,6 +30,7 @@
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <migraphx/dyn_output.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -56,12 +57,20 @@ struct argmax
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1);
auto lens = inputs[0].lens();
lens[axis] = 1;
return {shape::int64_type, lens};
check_shapes{inputs, *this, true}.has(1);
const auto& s0 = inputs[0];
if(s0.dynamic())
{
auto dyn_dims = s0.dyn_dims();
dyn_dims[axis] = {1, 1, 0};
return {shape::int64_type, dyn_dims};
}
else
{
auto lens = s0.lens();
lens[axis] = 1;
return {shape::int64_type, lens};
}
}
template <class T>
......@@ -79,19 +88,18 @@ struct argmax
max_index = i;
}
}
return max_index;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result{output_shape};
argument result{dyn_out.computed_shape};
auto batch_item_num = args.front().get_shape().lens()[axis];
result.visit([&](auto output) {
args[0].visit([&](auto input) {
par_for(output_shape.elements(), [&](auto i) {
auto data_idx = output_shape.multi(i);
par_for(dyn_out.computed_shape.elements(), [&](auto i) {
auto data_idx = dyn_out.computed_shape.multi(i);
output[i] = this->calc_argmax(input, data_idx, batch_item_num);
});
});
......
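A worked example of the new dynamic branch, using {min, max, optimal} dynamic dimensions: for an input with dyn_dims {{1, 4, 0}, {3, 3, 0}, {32, 64, 0}} and axis = 2, the reduced axis collapses to {1, 1, 0} and the result is {shape::int64_type, {{1, 4, 0}, {3, 3, 0}, {1, 1, 0}}}; the static branch still produces fixed lens with lens[axis] = 1.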
......@@ -26,6 +26,7 @@
#include <array>
#include <migraphx/check_shapes.hpp>
#include <migraphx/dyn_output.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
......@@ -73,49 +74,87 @@ struct concat
}
return offsets;
}
shape normalize_compute_shape(std::vector<shape> inputs) const
{
if(inputs.empty())
// inputs can contain 1 or more shapes (variadic); compute_shape_op ensures there is
// at least 1.
check_shapes{inputs, *this, true}.same_ndims().same_type();
if(std::none_of(inputs.begin(), inputs.end(), [&](const shape& s) { return s.dynamic(); }))
{
MIGRAPHX_THROW("CONCAT: Number of input tensors should exceed 0");
// Static input shapes
const auto& first_shape_lens = inputs.front().lens();
const auto& type = inputs.front().type();
for(std::size_t ll = 0; ll < first_shape_lens.size(); ll++)
{
if(ll != axis)
{
if(not std::all_of(inputs.begin(), inputs.end(), [&](auto s) {
return s.lens()[ll] == first_shape_lens[ll];
}))
{
MIGRAPHX_THROW("CONCAT: all input dimensions should match along axis " +
std::to_string(ll));
}
}
}
std::size_t new_dim_axis = 0;
for(const auto& input : inputs)
{
const auto& lens = input.lens();
new_dim_axis += lens[axis];
}
std::vector<std::size_t> new_lens = first_shape_lens;
new_lens[axis] = new_dim_axis;
return shape::from_permutation(type, new_lens, find_permutation(inputs));
}
const auto& first_shape_lens = inputs.front().lens();
const auto& type = inputs.front().type();
for(std::size_t l = 0; l < first_shape_lens.size(); l++)
else if(std::all_of(
inputs.begin(), inputs.end(), [&](const shape& s) { return s.dynamic(); }))
{
if(l != axis)
// Dynamic input shapes
for(std::size_t index = 0; index < inputs[0].ndim(); index++)
{
if(not std::all_of(inputs.begin(), inputs.end(), [&](auto s) {
return s.lens()[l] == first_shape_lens[l];
}))
if(index != axis)
{
MIGRAPHX_THROW("CONCAT: Non-axis dimensions should match");
if(not std::all_of(inputs.begin(), inputs.end(), [&](const shape& s) {
return s.dyn_dims()[index] == inputs[0].dyn_dims()[index];
}))
MIGRAPHX_THROW("CONCAT: all input dimensions should match in axis " +
std::to_string(index));
}
}
std::size_t new_min = 0;
std::size_t new_max = 0;
for(const auto& input : inputs)
{
auto ddim = input.dyn_dims()[axis];
new_min += ddim.min;
new_max += ddim.max;
}
auto new_dims = inputs[0].dyn_dims();
new_dims[axis] = migraphx::shape::dynamic_dimension{new_min, new_max, 0};
return {inputs[0].type(), new_dims};
}
std::size_t new_dim_axis = 0;
for(const auto& input : inputs)
else
{
const auto& lens = input.lens();
new_dim_axis += lens[axis];
MIGRAPHX_THROW("CONCAT: Cannot mix static and dynamic input shapes.");
}
std::vector<std::size_t> new_lens;
std::copy(first_shape_lens.begin(), first_shape_lens.end(), std::back_inserter(new_lens));
new_lens[axis] = new_dim_axis;
return shape::from_permutation(type, new_lens, find_permutation(inputs));
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result{output_shape};
std::vector<std::size_t> coffsets = compute_offsets(output_shape, args);
argument result{dyn_out.computed_shape};
std::vector<std::size_t> coffsets = compute_offsets(dyn_out.computed_shape, args);
for(std::size_t l = 0; l < args.size(); l++)
{
auto argl = args[l];
visit_all(result, argl)([&](auto output, auto input) {
auto slice_shape =
shape{output_shape.type(), input.get_shape().lens(), output_shape.strides()};
auto slice = make_view(slice_shape, output.data() + coffsets[l]);
auto slice_shape = shape{dyn_out.computed_shape.type(),
input.get_shape().lens(),
dyn_out.computed_shape.strides()};
auto slice = make_view(slice_shape, output.data() + coffsets[l]);
std::copy(input.begin(), input.end(), slice.begin());
});
}
......
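A worked example of the dynamic branch: concatenating two inputs with dyn_dims {{1, 4, 0}, {2, 6, 0}} and {{1, 4, 0}, {3, 5, 0}} along axis = 1 sums the axis bounds to give {{1, 4, 0}, {5, 11, 0}}; every non-axis dynamic_dimension must compare equal, and mixing static with dynamic inputs now throws.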
......@@ -28,6 +28,7 @@
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gemm.hpp>
#include <migraphx/dyn_output.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -38,41 +39,69 @@ struct dot
std::string name() const { return "dot"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.same_type().has(2);
check_shapes{inputs, *this, true}.same_type().same_ndims().has(2);
const shape& a = inputs.at(0);
const shape& b = inputs.at(1);
auto t = a.type();
if(not std::all_of(
inputs.begin(), inputs.end(), [](auto s) { return s.lens().size() >= 2; }))
if(not std::all_of(inputs.begin(), inputs.end(), [](auto s) { return s.ndim() >= 2; }))
{
MIGRAPHX_THROW("DOT: dot only accept 2 or more dims operands");
MIGRAPHX_THROW("DOT: dot only accepts operands with 2 or more dimensions ");
}
// only handle the case that the batch size of a and b are the same
if(not std::equal(
a.lens().rbegin() + 2, a.lens().rend(), b.lens().rbegin() + 2, b.lens().rend()))
if(a.dynamic() or b.dynamic())
{
MIGRAPHX_THROW("DOT: batch size of A and B mismatch: {" + to_string_range(a.lens()) +
"} x {" + to_string_range(b.lens()) + "}");
auto s0 = a.to_dynamic();
auto s1 = b.to_dynamic();
if(not std::equal(s0.dyn_dims().rbegin() + 2,
s0.dyn_dims().rend(),
s1.dyn_dims().rbegin() + 2,
s1.dyn_dims().rend()))
{
MIGRAPHX_THROW("DOT: dynamic outer dimensions of A and B mismatch: {" +
to_string_range(s0.dyn_dims()) + "} x {" +
to_string_range(s1.dyn_dims()) + "}");
}
std::size_t dim_0 = s0.ndim() - 2;
std::size_t dim_1 = s0.ndim() - 1;
if(s0.dyn_dims()[dim_1] != s1.dyn_dims()[dim_0])
{
MIGRAPHX_THROW("DOT: dynamic inner dimensions do not match: {" +
to_string_range(s0.dyn_dims()) + "} x {" +
to_string_range(s1.dyn_dims()) + "}");
}
auto out_dyn_dims = s0.dyn_dims();
out_dyn_dims[dim_1] = s1.dyn_dims()[dim_1];
return {t, out_dyn_dims};
}
std::size_t dim_0 = a.lens().size() - 2;
std::size_t dim_1 = a.lens().size() - 1;
if(a.lens()[dim_1] != b.lens()[dim_0])
else
{
MIGRAPHX_THROW("DOT: inner dimensions do not match: {" + to_string_range(a.lens()) +
"} x {" + to_string_range(b.lens()) + "}");
}
// only handle the case that all the dimensions except the last two are the same
if(not std::equal(
a.lens().rbegin() + 2, a.lens().rend(), b.lens().rbegin() + 2, b.lens().rend()))
{
MIGRAPHX_THROW("DOT: static outer dimensions of A and B mismatch: {" +
to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) +
"}");
}
auto out_lens = a.lens();
out_lens[dim_1] = b.lens()[dim_1];
return {t, out_lens};
std::size_t dim_0 = a.ndim() - 2;
std::size_t dim_1 = a.ndim() - 1;
if(a.lens()[dim_1] != b.lens()[dim_0])
{
MIGRAPHX_THROW("DOT: static inner dimensions do not match: {" +
to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) +
"}");
}
auto out_lens = a.lens();
out_lens[dim_1] = b.lens()[dim_1];
return {t, out_lens};
}
}
argument compute(shape output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result = argument{output_shape};
argument result = argument{dyn_out.computed_shape};
visit_all(result, args[0], args[1])(
[&](auto cmat, auto amat, auto bmat) { gemm(cmat, amat, bmat, 1.0f, 0.0f); });
return result;
......
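A worked example of the dynamic branch: for A with dyn_dims {{1, 4, 0}, {6, 6, 0}, {8, 8, 0}} and B with {{1, 4, 0}, {8, 8, 0}, {10, 10, 0}}, the outer dimensions {1, 4, 0} match, the inner pair ({8, 8, 0} in A against {8, 8, 0} in B) matches, and the output is {t, {{1, 4, 0}, {6, 6, 0}, {10, 10, 0}}}; the static branch performs the same two checks on exact lens.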
......@@ -55,17 +55,47 @@ struct flatten
std::string name() const { return "flatten"; }
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1).standard();
auto&& lens = inputs.front().lens();
auto x =
std::accumulate(lens.begin(), lens.begin() + axis, std::size_t{1}, std::multiplies<>{});
auto y =
std::accumulate(lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<>{});
return {inputs.at(0).type(), {x, y}};
check_shapes{inputs, *this, true}.has(1);
auto s = inputs[0];
if(s.dynamic())
{
auto min_lens = s.min_lens();
auto max_lens = s.max_lens();
auto opt_lens = s.opt_lens();
// If any of the opt values is 0, output opt will be 0
shape::dynamic_dimension x = {
std::accumulate(
min_lens.begin(), min_lens.begin() + axis, std::size_t{1}, std::multiplies<>{}),
std::accumulate(
max_lens.begin(), max_lens.begin() + axis, std::size_t{1}, std::multiplies<>{}),
std::accumulate(opt_lens.begin(),
opt_lens.begin() + axis,
std::size_t{1},
std::multiplies<>{})};
shape::dynamic_dimension y = {
std::accumulate(
min_lens.begin() + axis, min_lens.end(), std::size_t{1}, std::multiplies<>{}),
std::accumulate(
max_lens.begin() + axis, max_lens.end(), std::size_t{1}, std::multiplies<>{}),
std::accumulate(
opt_lens.begin() + axis, opt_lens.end(), std::size_t{1}, std::multiplies<>{}),
};
return {s.type(), {x, y}};
}
else
{
check_shapes{inputs, *this}.standard();
auto&& lens = s.lens();
auto x = std::accumulate(
lens.begin(), lens.begin() + axis, std::size_t{1}, std::multiplies<>{});
auto y = std::accumulate(
lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<>{});
return {s.type(), {x, y}};
}
}
argument compute(shape output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
return args[0].reshape(output_shape);
return args[0].reshape(dyn_out.computed_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
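A worked example of the dynamic branch: flattening dyn_dims {{1, 4, 0}, {3, 3, 0}, {5, 5, 0}} at axis = 1 multiplies the bounds elementwise on each side of the axis, so x = {1, 4, 0} and y = {3*5, 3*5, 0} = {15, 15, 0}, giving the rank-2 output {{1, 4, 0}, {15, 15, 0}}; per the comment in the code, any optimal value of 0 propagates a 0 optimal to the output.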
......@@ -26,6 +26,7 @@
#include <array>
#include <migraphx/check_shapes.hpp>
#include <migraphx/dyn_output.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
......@@ -61,35 +62,59 @@ struct gather
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
auto lens = inputs[0].lens();
auto type = inputs[0].type();
lens.erase(lens.begin() + axis);
if(not inputs[1].scalar())
check_shapes{inputs, *this, true}.has(2);
shape data = inputs[0];
shape indices = inputs[1];
auto type = data.type();
// If the indices shape is dynamic, convert the data shape to dynamic too.
if(indices.dynamic())
{
auto ind_lens = inputs[1].lens();
lens.insert(lens.begin() + axis, ind_lens.begin(), ind_lens.end());
data = data.to_dynamic();
}
// for scalar output
if(lens.empty())
if(data.dynamic())
{
return {type};
auto dims = data.dyn_dims();
dims.erase(dims.begin() + axis);
if(not indices.scalar())
{
auto index_dims = indices.to_dynamic().dyn_dims();
dims.insert(dims.begin() + axis, index_dims.begin(), index_dims.end());
}
return {type, dims};
}
else
{
// Both data and indices are static. indices may be scalar
auto lens = data.lens();
lens.erase(lens.begin() + axis);
return {type, lens};
if(not indices.scalar())
{
auto ind_lens = indices.lens();
lens.insert(lens.begin() + axis, ind_lens.begin(), ind_lens.end());
}
// for scalar output
if(lens.empty())
{
return {type};
}
return {type, lens};
}
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result{output_shape};
argument result{dyn_out.computed_shape};
// negative axis means counting dimensions from back
auto lens = args[0].get_shape().lens();
std::size_t axis_dim_size = lens[axis];
// max dimension in axis
visit_all(result, args[0])([&](auto output, auto data) {
args[1].visit([&](auto indices) {
if(output_shape.scalar())
if(dyn_out.computed_shape.scalar())
{
auto in_index = indices.front();
in_index = (in_index < 0) ? in_index + axis_dim_size : in_index;
......
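A worked example of the static branch: for data {3, 4, 5}, axis = 1, and non-scalar indices {2, 2}, the axis dimension is erased and the indices dims spliced in, yielding {3, 2, 2, 5}; scalar indices simply drop the axis ({3, 5}), and erasing the only dimension returns a scalar shape. When indices is dynamic, data is first converted to dynamic and the same splice happens on dyn_dims.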
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
......@@ -25,6 +25,7 @@
#define MIGRAPHX_GUARD_OPERATORS_GATHERND_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/dyn_output.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/argument.hpp>
......@@ -47,33 +48,103 @@ struct gathernd
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
auto r = inputs.front().lens().size();
auto q = inputs.back().lens().size();
auto k = inputs.back().lens().back();
check_shapes{inputs, *this, true}.has(2);
auto i_shape = inputs.back();
auto data_shape = inputs.front();
auto r = data_shape.ndim();
auto q = i_shape.ndim();
size_t k;
if(i_shape.dynamic())
{
// the rank of the output is a function of k, so it must be fixed.
if(not i_shape.dyn_dims().back().is_fixed())
{
MIGRAPHX_THROW(
"GATHERND: last dimension of indices tensor must be fixed (min=max)");
}
k = i_shape.dyn_dims().back().min;
}
else
k = i_shape.lens().back();
// Begin input validation checks.
int output_ndim = int(q) + r - k - batch_dims - 1;
if(k > r - batch_dims)
{
MIGRAPHX_THROW("GATHERND: Indices of length " + std::to_string(k) +
" cannot be used to access data of rank " +
std::to_string(r - batch_dims));
}
auto indices_lens_iter = inputs.back().lens().begin();
auto output_lens_size = q + r - k - batch_dims - 1;
std::vector<std::size_t> output_lens(output_lens_size);
std::copy(indices_lens_iter, indices_lens_iter + (q - 1), output_lens.begin());
if(k < r - batch_dims)
if(batch_dims >= q or batch_dims >= r)
{
MIGRAPHX_THROW("GATHERND: rank of an input cannot be less than batch_dims=" +
std::to_string(batch_dims));
}
if(output_ndim < 0)
{
MIGRAPHX_THROW("GATHERND: Indices too large for static data input: k=" +
std::to_string(k));
}
if(migraphx::none_of(inputs, [](auto v) { return v.dynamic(); }))
{
auto indices_lens_iter = i_shape.lens().begin();
// A rank 0 output is a scalar
if(output_ndim == 0)
return shape{data_shape.type(), {1}};
// Part of the output shape comes from indices tensor, part from data tensor
std::vector<std::size_t> output_lens(output_ndim);
std::copy(indices_lens_iter, indices_lens_iter + (q - 1), output_lens.begin());
// fill the rest of output shape from data tensor
if(k + batch_dims < r)
{
auto data_lens = data_shape.lens();
std::copy(data_lens.begin() + batch_dims + k,
data_lens.end(),
output_lens.begin() + q - 1);
}
shape output_shape{data_shape.type(), output_lens};
return output_shape;
}
else
{
auto data_lens = inputs.front().lens();
std::copy(
data_lens.begin() + batch_dims + k, data_lens.end(), output_lens.begin() + q - 1);
// If one or both inputs are dynamic shapes, the output is dynamic.
// Make both inputs dynamic to simplify computations.
data_shape = data_shape.to_dynamic();
i_shape = i_shape.to_dynamic();
// A rank 0 output is a scalar
if(output_ndim == 0)
return shape(data_shape.type(), {shape::dynamic_dimension({1, 1, 0})});
// Part of the output shape comes from indices tensor, part from data tensor
std::vector<shape::dynamic_dimension> output_dims(output_ndim);
std::copy(i_shape.dyn_dims().begin(),
i_shape.dyn_dims().begin() + q - 1,
output_dims.begin());
// fill the rest of output shape from data tensor
if(k + batch_dims < r)
{
auto data_dims = data_shape.dyn_dims();
std::copy(data_dims.begin() + batch_dims + k,
data_dims.begin() + r,
output_dims.begin() + q - 1);
}
shape output_shape(data_shape.type(), output_dims);
return output_shape;
}
shape output_shape{inputs.front().type(), output_lens};
return output_shape;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result{output_shape};
argument result{dyn_out.computed_shape};
visit_all(result, args[0])([&](auto output, auto data) {
args[1].visit([&](auto indices) {
auto indices_shape = indices.get_shape();
......
......@@ -143,16 +143,22 @@ struct nonmaxsuppression
void sort()
{
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
if(x[0] > x[1])
{
std::swap(x[0], x[1]);
}
if(y[0] > y[1])
{
std::swap(y[0], y[1]);
}
}
std::array<double, 2>& operator[](std::size_t i) { return i == 0 ? x : y; }
double area() const
{
assert(std::is_sorted(x.begin(), x.end()));
assert(std::is_sorted(y.begin(), y.end()));
assert(x[0] <= x[1]);
assert(y[0] <= y[1]);
return (x[1] - x[0]) * (y[1] - y[0]);
}
};
......@@ -190,14 +196,10 @@ struct nonmaxsuppression
{
intersection[i][0] = std::max(b1[i][0], b2[i][0]);
intersection[i][1] = std::min(b1[i][1], b2[i][1]);
}
std::vector<std::array<double, 2>> bbox = {intersection.x, intersection.y};
if(std::any_of(bbox.begin(), bbox.end(), [](auto bx) {
return not std::is_sorted(bx.begin(), bx.end());
}))
{
return false;
if(intersection[i][0] > intersection[i][1])
{
return false;
}
}
const double area1 = b1.area();
......@@ -265,31 +267,31 @@ struct nonmaxsuppression
auto batch_boxes_start = boxes.begin() + batch_idx * num_boxes * 4;
auto boxes_heap = filter_boxes_by_score(scores_start, num_boxes, score_threshold);
selected_boxes_inside_class.clear();
// Get the next box with top score, filter by iou_threshold
while(not boxes_heap.empty() &&
selected_boxes_inside_class.size() < max_output_boxes_per_class)
{
// Check with existing selected boxes for this class, remove box if it
// exceeds the IOU (Intersection Over Union) threshold
// Select the next top-scoring box, then remove from boxes_heap any boxes that exceed
// the IOU (Intersection over Union) threshold with the selected box
const auto next_top_score = boxes_heap.top();
bool not_selected =
std::any_of(selected_boxes_inside_class.begin(),
selected_boxes_inside_class.end(),
[&](auto selected_index) {
return this->suppress_by_iou(
batch_box(batch_boxes_start, next_top_score.second),
batch_box(batch_boxes_start, selected_index.second),
iou_threshold);
});
if(not not_selected)
boxes_heap.pop();
selected_boxes_inside_class.push_back(next_top_score);
selected_indices.push_back(batch_idx);
selected_indices.push_back(class_idx);
selected_indices.push_back(next_top_score.second);
std::priority_queue<std::pair<double, int64_t>> remainder_boxes;
while(not boxes_heap.empty())
{
selected_boxes_inside_class.push_back(next_top_score);
selected_indices.push_back(batch_idx);
selected_indices.push_back(class_idx);
selected_indices.push_back(next_top_score.second);
auto iou_candidate_box = boxes_heap.top();
if(not this->suppress_by_iou(
batch_box(batch_boxes_start, iou_candidate_box.second),
batch_box(batch_boxes_start, next_top_score.second),
iou_threshold))
{
remainder_boxes.push(iou_candidate_box);
}
boxes_heap.pop();
}
boxes_heap.pop();
boxes_heap = remainder_boxes;
}
});
std::copy(selected_indices.begin(), selected_indices.end(), output.begin());
......
......@@ -31,18 +31,30 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// different attributes
// 1) use_input(default)/use_output
// 2) use_rank(default)/use_len
// 3) clip_min(default)/not_clip_min
// 3.1) include_min(default)/exclude_min
// 4) clip_max(default)/not_clip_max
// 4.1) exclude_max(default)/include_max
// 5) normalize padding
/**
* `normalize_attribute` settings:
* Note that default options are not included as enums.
* 1. `use_input` (default) vs. `use_output`:
* Affects the rank of the attribute.
* `use_input -> lens.size()`, `use_output -> lens.size() + vec.size()`.
* 2. `use_rank` (default) vs. `use_len`:
* `use_rank` sets the max value/index of the attribute to the rank of lens.
* `use_len` sets the max value/index to the corresponding value in lens at the axis index.
* 3. `clip_min` vs. `not_clip_min` (default):
* Clip values less than the minimum to the minimum or not.
* 4. `include_min` vs. `exclude_min` (default):
* Include or exclude the minimum value/index for range checking and clipping.
* 5. `clip_max` vs. `not_clip_max` (default):
* Clip values greater than the maximum or not.
* 6. `include_max` vs. `exclude_max` (default):
* Include or exclude the maximum value/index for range checking and clipping.
* 7. `normalize_padding`:
* To normalize the padding to `2*(pad ndim)` dimensions.
*/
enum class normalize_attribute
{
use_len,
use_output,
use_len,
clip_max,
clip_min,
include_max,
......
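A hedged sketch of how an op typically opts into these settings via its attributes; the "normalize_axes" wiring below follows the pattern used by slice-like ops, and the chosen enum value is purely illustrative:

value attributes() const
{
    value normalize;
    // Illustrative choice: include the minimum value/index when range-checking axes
    normalize["axes"] = value::array{normalize_attribute::include_min};
    return {{"normalize_axes", normalize}};
}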
......@@ -59,18 +59,29 @@ struct pad
std::string name() const { return "pad"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1);
auto&& idims = inputs.front().lens();
std::vector<std::size_t> rdims(idims.begin(), idims.end());
std::size_t num_dims = rdims.size();
for(std::size_t i = 0; i < num_dims; i++)
check_shapes{inputs, *this, true}.has(1);
const auto& s0 = inputs.front();
if(s0.dynamic())
{
rdims[i] += pads[i] + pads[i + num_dims];
auto out_dyn_dims = s0.dyn_dims();
for(std::size_t i = 0; i < s0.ndim(); ++i)
{
out_dyn_dims[i] += pads[i] + pads[i + s0.ndim()];
}
return {s0.type(), out_dyn_dims};
}
else
{
auto&& idims = s0.lens();
std::vector<std::size_t> rdims(idims.begin(), idims.end());
std::size_t num_dims = rdims.size();
for(std::size_t i = 0; i < num_dims; i++)
{
rdims[i] += pads[i] + pads[i + num_dims];
}
shape s{s0.type(), rdims};
return s;
}
shape s{inputs.front().type(), rdims};
return s;
}
std::size_t pad_ndims() const
......
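A worked example of the dynamic branch, with pads laid out as all leading pads followed by all trailing pads: for a 2-D input with dyn_dims {{2, 4, 0}, {3, 3, 0}} and pads = {1, 1, 2, 2}, dimension i grows by pads[i] + pads[i + ndim], so the bounds of dim 0 gain 1 + 2 = 3 and those of dim 1 gain 1 + 2 = 3, giving min/max bounds {5, 7} and {6, 6}.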
......@@ -26,6 +26,7 @@
#include <migraphx/op/name.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/dyn_output.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/tensor_view.hpp>
#include <migraphx/shape_for_each.hpp>
......@@ -105,18 +106,41 @@ struct reduce_op : op_name<Derived>
return tuned_axes;
}
/**
* @brief Returns a shape in which the axis or axes named
* for reduction by this op are set to size 1.
*
* @param inputs list of input shapes
* @return shape
*/
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1);
auto s = inputs.at(0);
auto lens = s.lens();
auto tuned_axes = tune_axes(lens.size());
for(auto axis : tuned_axes)
check_shapes{inputs, *this, true}.has(1);
auto s = inputs.at(0);
if(s.dynamic())
{
lens[axis] = 1;
auto output_dyn_dims = s.dyn_dims();
auto tuned_axes = tune_axes(output_dyn_dims.size());
for(const auto& axis : tuned_axes)
{
// At the time of writing, there's no functional difference between
// an optimum of 0 (no optimum) and 1.
output_dyn_dims[axis] = {1, 1, 0};
}
return shape{s.type(), output_dyn_dims};
}
else
{
auto lens = s.lens();
auto tuned_axes = tune_axes(lens.size());
for(const auto& axis : tuned_axes)
{
lens[axis] = 1;
}
return inputs[0].with_lens(lens);
}
return inputs[0].with_lens(lens);
}
template <class T>
......@@ -124,7 +148,7 @@ struct reduce_op : op_name<Derived>
const std::vector<T>& in_lens,
std::vector<T>& out_lens) const
{
for(auto axis : tuned_axes)
for(const auto& axis : tuned_axes)
{
out_lens[axis] = in_lens[axis];
}
......@@ -151,17 +175,17 @@ struct reduce_op : op_name<Derived>
static_cast<const Derived&>(*this).output(batch_shape)(val);
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result{output_shape};
argument result{dyn_out.computed_shape};
auto arg_lens = args.front().get_shape().lens();
auto tuned_axes = tune_axes(arg_lens.size());
std::vector<std::size_t> batch_lens(output_shape.lens().size(), 1);
std::vector<std::size_t> batch_lens(dyn_out.computed_shape.lens().size(), 1);
tune_dims(tuned_axes, arg_lens, batch_lens);
shape batch_shape{output_shape.type(), batch_lens};
shape batch_shape{dyn_out.computed_shape.type(), batch_lens};
visit_all(result, args[0])([&](auto output, auto input) {
par_for(output_shape.elements(), [&](auto i) {
auto out_idx = output_shape.multi(i);
par_for(dyn_out.computed_shape.elements(), [&](auto i) {
auto out_idx = dyn_out.computed_shape.multi(i);
this->reduce(input, batch_shape, tuned_axes, out_idx, output);
});
});
......
......@@ -28,6 +28,7 @@
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/dyn_output.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -46,14 +47,60 @@ struct reshape
value attributes() const { return {{"require_std_shape", true}}; }
std::string name() const { return "reshape"; }
shape compute_shape(std::vector<shape> inputs) const
shape dyn_compute_shape(shape s0) const
{
auto dyn_dims = s0.dyn_dims();
auto num_not_fixed = std::count_if(
dyn_dims.cbegin(), dyn_dims.cend(), [](auto dd) { return not dd.is_fixed(); });
if(num_not_fixed != 1)
{
MIGRAPHX_THROW("Reshape: Only supports one non-fixed dynamic_dimension");
}
// track number of fixed elements in input and output
std::size_t num_dims_ele = 1;
std::size_t num_dd_ele = 1;
for(std::size_t i = 0; i < dyn_dims.size(); ++i)
{
if(dyn_dims[i].is_fixed())
{
num_dims_ele *= dims[i];
num_dd_ele *= dyn_dims[i].min;
}
else
{
if(dims[i] != 0 and dims[i] != -1)
{
MIGRAPHX_THROW(
"Reshape: Non-fixed dynamic_dimension doesn't match with 0 or -1 "
"output dimension");
}
}
}
if(num_dims_ele != num_dd_ele)
{
MIGRAPHX_THROW("Reshape: Number of fixed elements must match. Input: " +
std::to_string(num_dd_ele) + " Output: " + std::to_string(num_dims_ele));
}
// construct output dynamic shape from dims attribute
std::vector<shape::dynamic_dimension> output_dyn_dims(dims.size());
std::transform(dims.cbegin(),
dims.cend(),
dyn_dims.cbegin(),
output_dyn_dims.begin(),
[](std::size_t dim, auto dyn_dim) {
if(not dyn_dim.is_fixed())
return dyn_dim;
return shape::dynamic_dimension{dim, dim};
});
return {s0.type(), output_dyn_dims};
}
shape static_compute_shape(std::vector<shape> inputs, std::size_t n_neg_dims) const
{
check_shapes{inputs, *this}.has(1).standard();
check_shapes{inputs, *this}.standard();
auto&& idims = inputs.front().lens();
std::vector<std::size_t> rdims(dims.begin(), dims.end());
auto n_neg_dims = std::count(dims.begin(), dims.end(), -1);
if(n_neg_dims > 1)
MIGRAPHX_THROW("Reshape: Dimensions for reshape can only have one -1 dim");
for(std::size_t i = 0; i < dims.size(); i++)
{
......@@ -86,9 +133,26 @@ struct reshape
return s;
}
argument compute(shape output_shape, std::vector<argument> args) const
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this, true}.has(1);
auto n_neg_dims = std::count(dims.begin(), dims.end(), -1);
if(n_neg_dims > 1)
MIGRAPHX_THROW("Reshape: Dimensions for reshape can only have one -1 dim");
auto s0 = inputs[0];
if(s0.dynamic())
{
return dyn_compute_shape(s0);
}
else
{
return static_compute_shape(inputs, n_neg_dims);
}
}
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
return args[0].reshape(output_shape);
return args[0].reshape(dyn_out.computed_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
......
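A worked example of dyn_compute_shape, which only supports rank-preserving reshapes with exactly one non-fixed dimension: for input dyn_dims {{1, 4, 0}, {3, 3, 0}, {4, 4, 0}} and dims = {-1, 3, 4}, the fixed positions contribute 3 * 4 = 12 elements on both sides, the element-count check passes, and the non-fixed {1, 4, 0} is carried through, giving {{1, 4, 0}, {3, 3, 0}, {4, 4, 0}}; a dims entry of 0 or -1 must line up with the non-fixed input dimension, otherwise the op throws.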