Merge branch 'develop' into enable_navi_32_ci

9d3fb0b5 · Ted Themistokleous · GitHub · 9c91c08d · aeb9f78c · 9d3fb0b5
Unverified Commit 9d3fb0b5 authored Aug 05, 2023 by Ted Themistokleous Committed by GitHub Aug 05, 2023
20 changed files
--- a/src/include/migraphx/json.hpp
+++ b/src/include/migraphx/json.hpp
@@ -31,10 +31,10 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-std::string to_pretty_json_string(const value& val, std::size_t indent = 4);
-std::string to_json_string(const value& val);
-value from_json_string(const std::string& str);
-value from_json_string(const char* str, std::size_t size);
+MIGRAPHX_EXPORT std::string to_pretty_json_string(const value& val, std::size_t indent = 4);
+MIGRAPHX_EXPORT std::string to_json_string(const value& val);
+MIGRAPHX_EXPORT value from_json_string(const std::string& str);
+MIGRAPHX_EXPORT value from_json_string(const char* str, std::size_t size);

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/layout_nhwc.hpp
+++ b/src/include/migraphx/layout_nhwc.hpp
@@ -36,7 +36,7 @@ struct module_pass_manager;
 /**
 * Transform convolutions to nhwc
 */
-struct layout_nhwc
+struct MIGRAPHX_EXPORT layout_nhwc
 {
    std::string name() const { return "layout_nhwc"; }
    void apply(module_pass_manager& mpm) const;

--- a/src/include/migraphx/literal.hpp
+++ b/src/include/migraphx/literal.hpp
@@ -147,8 +147,8 @@ literal transform(literal l1, literal l2, F f)
    return result;
 }

-void migraphx_to_value(value& v, const literal& l);
-void migraphx_from_value(const value& v, literal& l);
+MIGRAPHX_EXPORT void migraphx_to_value(value& v, const literal& l);
+MIGRAPHX_EXPORT void migraphx_from_value(const value& v, literal& l);

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/load_save.hpp
+++ b/src/include/migraphx/load_save.hpp
@@ -36,15 +36,18 @@ struct file_options
    std::string format = "msgpack";
 };

-program load(const std::string& filename, const file_options& options = file_options{});
-program load_buffer(const std::vector<char>& buffer, const file_options& options = file_options{});
-program
-load_buffer(const char* buffer, std::size_t size, const file_options& options = file_options{});
+MIGRAPHX_EXPORT program load(const std::string& filename,
+                             const file_options& options = file_options{});
+MIGRAPHX_EXPORT program load_buffer(const std::vector<char>& buffer,
+                                    const file_options& options = file_options{});
+MIGRAPHX_EXPORT program load_buffer(const char* buffer,
+                                    std::size_t size,
+                                    const file_options& options = file_options{});

-void save(const program& p,
-          const std::string& filename,
-          const file_options& options = file_options{});
-std::vector<char> save_buffer(const program& p, const file_options& options = file_options{});
+MIGRAPHX_EXPORT void
+save(const program& p, const std::string& filename, const file_options& options = file_options{});
+MIGRAPHX_EXPORT std::vector<char> save_buffer(const program& p,
+                                              const file_options& options = file_options{});

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/make_op.hpp
+++ b/src/include/migraphx/make_op.hpp
@@ -33,10 +33,10 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-operation make_op(const std::string& name);
-operation make_op(const std::string& name,
-                  const std::initializer_list<std::pair<std::string, value>>& v);
-operation make_op_from_value(const std::string& name, const value& v);
+MIGRAPHX_EXPORT operation make_op(const std::string& name);
+MIGRAPHX_EXPORT operation make_op(const std::string& name,
+                                  const std::initializer_list<std::pair<std::string, value>>& v);
+MIGRAPHX_EXPORT operation make_op_from_value(const std::string& name, const value& v);

 // A template overload is added for migraphx::value so the initializer_list
 // cannot be passed in directly. This is to enforce at compile-time that all
@@ -48,7 +48,7 @@ operation make_op(const std::string& name, const Value& v)
    return make_op_from_value(name, v);
 }

-operation make_json_op(const std::string& name, const std::string& s);
+MIGRAPHX_EXPORT operation make_json_op(const std::string& name, const std::string& s);

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/marker.hpp
+++ b/src/include/migraphx/marker.hpp
@@ -46,7 +46,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 #ifdef TYPE_ERASED_DECLARATION

 // Type-erased interface for:
-struct marker
+struct MIGRAPHX_EXPORT marker
 {
    //
    void mark_start(instruction_ref ins_ref);
@@ -80,7 +80,7 @@ struct marker
    {
        using std::swap;
        auto* derived = this->any_cast<PrivateDetailTypeErasedT>();
-        if(derived and private_detail_te_handle_mem_var.unique())
+        if(derived and private_detail_te_handle_mem_var.use_count() == 1)
        {
            *derived = std::forward<PrivateDetailTypeErasedT>(value);
        }
@@ -233,7 +233,7 @@ struct marker
    private_detail_te_handle_base_type& private_detail_te_get_handle()
    {
        assert(private_detail_te_handle_mem_var != nullptr);
-        if(not private_detail_te_handle_mem_var.unique())
+        if(private_detail_te_handle_mem_var.use_count() > 1)
            private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
        return *private_detail_te_handle_mem_var;
    }

--- a/src/include/migraphx/memory_coloring.hpp
+++ b/src/include/migraphx/memory_coloring.hpp
@@ -36,7 +36,7 @@ struct module;
 * Remove multiple memory allocations using graph coloring to find memory allocations that can be
 * reused.
 */
-struct memory_coloring
+struct MIGRAPHX_EXPORT memory_coloring
 {
    std::string allocation_op{};
    bool verify = false;

--- a/src/include/migraphx/module.hpp
+++ b/src/include/migraphx/module.hpp
@@ -52,7 +52,7 @@ using ins_dep_map   = std::unordered_map<instruction_ref, std::unordered_set<ins
 /**
 * @brief Stores the instruction stream
 */
-struct module
+struct MIGRAPHX_EXPORT module
 {
    module(const std::string& name = "");

@@ -222,11 +222,21 @@ struct module
    void annotate(std::ostream& os, std::function<void(instruction_ref)> a) const;

    std::vector<module_ref> get_sub_modules(bool shallow = false) const;
+    /* sorts the module in topological order aka reverse-post order (RPO) DFS order
+       it takes last instruction or @return as the root and walks back the graph and moves inputs
+       of the each instruction such that it appears before the instruction itself.
+    */
    module& sort();
+    /* Any instruction "X" can have module arguments and those modules inside them can use any other
+     * instruction "Y" from predecessor modules of the instruction "X". Such instruction "Y" inside
+     * module args are not listed as input instructions to "X". But those instructions "Y" must be
+     * evaluted before the instruction "X" can. Therefore such "Y" instructions are considered
+     * implicit dependency to "X".
+     */
    ins_dep_map calc_implicit_deps() const;

-    friend std::ostream& operator<<(std::ostream& os, const module& m);
-    friend bool operator==(const module& x, const module& y);
+    MIGRAPHX_EXPORT friend std::ostream& operator<<(std::ostream& os, const module& m);
+    MIGRAPHX_EXPORT friend bool operator==(const module& x, const module& y);
    friend bool operator!=(const module& x, const module& y) { return not(x == y); }

    private:

--- a/src/include/migraphx/msgpack.hpp
+++ b/src/include/migraphx/msgpack.hpp
@@ -31,10 +31,11 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-void to_msgpack(const value& v, std::function<void(const char*, std::size_t)> writer);
-std::vector<char> to_msgpack(const value& v);
-value from_msgpack(const std::vector<char>& buffer);
-value from_msgpack(const char* buffer, std::size_t size);
+MIGRAPHX_EXPORT void to_msgpack(const value& v,
+                                std::function<void(const char*, std::size_t)> writer);
+MIGRAPHX_EXPORT std::vector<char> to_msgpack(const value& v);
+MIGRAPHX_EXPORT value from_msgpack(const std::vector<char>& buffer);
+MIGRAPHX_EXPORT value from_msgpack(const char* buffer, std::size_t size);

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/normalize_attributes.hpp
+++ b/src/include/migraphx/normalize_attributes.hpp
@@ -42,7 +42,8 @@ struct select_dependent_type
 template <class T, class... Ts>
 using dependent_type = typename select_dependent_type<T, Ts...>::type;

-bool normalize_attributes(operation& op, const std::vector<std::size_t>& lens);
+MIGRAPHX_EXPORT
+bool normalize_attributes(operation& op, const shape& input_shape);

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/normalize_ops.hpp
+++ b/src/include/migraphx/normalize_ops.hpp
@@ -39,7 +39,7 @@ struct module;
 * Process negative axis attributes of ops
 */

-struct normalize_ops
+struct MIGRAPHX_EXPORT normalize_ops
 {
    std::string name() const { return "normalize_ops"; }
    void apply(module& m) const;

--- a/src/include/migraphx/onnx.hpp
+++ b/src/include/migraphx/onnx.hpp
@@ -26,6 +26,7 @@

 #include <migraphx/program.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/onnx/export.h>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -54,15 +55,19 @@ struct onnx_options
 };

 /// Create a program from an onnx file
-program parse_onnx(const std::string& name, const onnx_options& = onnx_options{});
+MIGRAPHX_ONNX_EXPORT program parse_onnx(const std::string& name,
+                                        const onnx_options& = onnx_options{});

 /// Create a program from an onnx buffer
-program parse_onnx_buffer(const std::string& buffer, const onnx_options& options);
+MIGRAPHX_ONNX_EXPORT program parse_onnx_buffer(const std::string& buffer,
+                                               const onnx_options& options);

 /// Create a program from an onnx buffer
-program parse_onnx_buffer(const void* data, std::size_t size, const onnx_options& options);
+MIGRAPHX_ONNX_EXPORT program parse_onnx_buffer(const void* data,
+                                               std::size_t size,
+                                               const onnx_options& options);

-std::vector<std::string> get_onnx_operators();
+MIGRAPHX_ONNX_EXPORT std::vector<std::string> get_onnx_operators();

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/op/common.hpp
+++ b/src/include/migraphx/op/common.hpp
@@ -59,8 +59,8 @@ enum class rnn_direction
    bidirectional,
 };

-std::ostream& operator<<(std::ostream& os, pooling_mode v);
-std::ostream& operator<<(std::ostream& os, rnn_direction v);
+MIGRAPHX_EXPORT std::ostream& operator<<(std::ostream& os, pooling_mode v);
+MIGRAPHX_EXPORT std::ostream& operator<<(std::ostream& os, rnn_direction v);

 } // namespace op
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/include/migraphx/op/convert.hpp
+++ b/src/include/migraphx/op/convert.hpp
@@ -66,7 +66,19 @@ struct convert : unary<convert>
        auto type = target_type;
        return [type](auto x) {
            auto y = x;
-            shape::visit(type, [&](auto as) { y = as(x); });
+            shape::visit(type, [&](auto as) {
+                // clamping value between target_type's max and min doesn't work for NaNs,
+                if(std::isnan(x))
+                {
+                    y = as.nan();
+                }
+                else
+                {
+                    // clamp overflowing/underflowing values to min()/max() instead of +/-infinity
+                    // during downcasting
+                    y = std::min(std::max(as(x), as.min()), as.max());
+                }
+            });
            return y;
        };
    }

--- a/src/include/migraphx/op/convolution.hpp
+++ b/src/include/migraphx/op/convolution.hpp
@@ -79,17 +79,17 @@ struct convolution
        check_shapes{inputs, *this, true}.has(2).same_type().same_ndims().min_ndims(3);
        check_attribute_size();
        // num of dims of input and attribute should match
-        const auto input_size   = inputs[0].max_lens().size();
+        const auto input_ndim   = inputs[0].ndim();
        const auto padding_size = padding.size();

-        if(input_size != padding_size / 2 + 2 && input_size != padding_size + 2)
+        if(input_ndim != padding_size / 2 + 2 && input_ndim != padding_size + 2)
        {
            MIGRAPHX_THROW("CONVOLUTION: input and attribute size mismatch!");
        }

        const shape& x_shape          = inputs.at(0);
        const shape& w_shape          = inputs.at(1);
-        const size_t num_spatial_dims = input_size - 2;
+        const size_t num_spatial_dims = input_ndim - 2;
        if(num_spatial_dims != this->kdims())
        {
            MIGRAPHX_THROW("CONVOLUTION: input k-dims does not match attribute size");
@@ -105,7 +105,7 @@ struct convolution
        }
        else
        {
-            return fixed_compute_shape(x_shape, w_shape);
+            return static_compute_shape(x_shape, w_shape);
        }
    }

@@ -143,23 +143,10 @@ struct convolution
    shape dynamic_compute_shape(shape x_shape, shape w_shape) const
    {
        std::vector<shape::dynamic_dimension> output_dyn_dims = {};
+        output_dyn_dims.push_back(x_shape.to_dynamic().dyn_dims().at(0));
+        output_dyn_dims.push_back(w_shape.to_dynamic().dyn_dims().at(0));

-        auto dynamic_shape_push_back = [&](const shape& input_shape) {
-            if(input_shape.dynamic())
-            {
-                output_dyn_dims.push_back(input_shape.dyn_dims().at(0));
-            }
-            else
-            {
-                auto l = input_shape.lens().at(0);
-                output_dyn_dims.push_back({l, l});
-            }
-        };
-
-        dynamic_shape_push_back(x_shape);
-        dynamic_shape_push_back(w_shape);
-
-        const size_t num_spatial_dims = x_shape.max_lens().size() - 2;
+        const size_t num_spatial_dims = x_shape.ndim() - 2;
        if(padding_mode != default_)
        {
            for(std::size_t i = 0; i < num_spatial_dims; ++i)
@@ -198,7 +185,7 @@ struct convolution
        return shape{x_shape.type(), output_dyn_dims};
    }

-    shape fixed_compute_shape(shape x_shape, shape w_shape) const
+    shape static_compute_shape(shape x_shape, shape w_shape) const
    {
        std::vector<size_t> output_lens{x_shape.lens()[0], w_shape.lens()[0]};
        auto spatial_lens = calc_conv_lens(x_shape.lens(), w_shape.lens());

--- a/src/include/migraphx/op/deconvolution.hpp
+++ b/src/include/migraphx/op/deconvolution.hpp
@@ -21,9 +21,11 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_OPERATORS_DECONVOLUTION_HPP
-#define MIGRAPHX_GUARD_OPERATORS_DECONVOLUTION_HPP
+#ifndef MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_BACKWARDS_HPP
+#define MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_BACKWARDS_HPP

+#include <cmath>
+#include <utility>
 #include <migraphx/op/common.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/config.hpp>
@@ -31,14 +33,13 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/par_dfor.hpp>
 #include <migraphx/shape_for_each.hpp>
-#include <cmath>
-#include <utility>
+#include <migraphx/dyn_output.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {

-struct deconvolution
+struct convolution_backwards
 {
    std::vector<std::size_t> padding  = {0, 0};
    std::vector<std::size_t> stride   = {1, 1};
@@ -57,45 +58,91 @@ struct deconvolution
                    f(self.group, "group"));
    }

-    std::string name() const { return "deconvolution"; }
+    std::string name() const { return "convolution_backwards"; }

    void check_attribute_size() const
    {
-        if((padding.size() != stride.size() and (padding.size() / 2) != stride.size()) or
-           stride.size() != dilation.size())
+        if(padding.size() != stride.size() or stride.size() != dilation.size())
        {
-            MIGRAPHX_THROW("deconvolution: inconsistent attribute sizes");
+            MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: inconsistent attribute sizes");
        }
    }

    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2).same_type().same_ndims().min_ndims(3);
+        check_shapes{inputs, *this, true}.has(2).same_type().same_ndims().min_ndims(3);

-        const shape& input   = inputs.at(0);
-        const shape& weights = inputs.at(1);
-        size_t kdims         = input.lens().size() - 2;
-        if(kdims != this->kdims())
+        const shape& x_shape = inputs.at(0);
+        const shape& w_shape = inputs.at(1);
+        if(x_shape.ndim() - 2 != this->kdims())
        {
-            MIGRAPHX_THROW("deconvolution: input k-dims does not match attribute size");
+            MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: input k-dims does not match attribute size");
        }

-        std::vector<size_t> output_lens{input.lens()[0], weights.lens()[1]};
+        if(not x_shape.dynamic() and not w_shape.dynamic() and
+           x_shape.lens().at(1) != (w_shape.lens().at(0) * group))
+        {
+            MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: mismatched channel numbers");
+        }

-        for(size_t i = 0; i < kdims; i++)
+        if(x_shape.dynamic() or w_shape.dynamic())
        {
-            output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(
+            return dynamic_compute_shape(x_shape, w_shape);
+        }
+        else
+        {
+            return static_compute_shape(x_shape, w_shape);
+        }
+    }
+
+    std::vector<std::size_t> calc_spatial_lens(std::vector<std::size_t> x_lens,
+                                               std::vector<std::size_t> w_lens) const
+    {
+        std::vector<size_t> spatial_lens(x_lens.size() - 2);
+
+        // stride * (input - 1) + output_padding + ((kernel - 1) * dilation + 1) - padding_L -
+        // padding_R. This assumes padding_L = padding_R and output_padding handled in parser.
+        for(size_t i = 0; i < spatial_lens.size(); i++)
+        {
+            spatial_lens.at(i) = (std::size_t(std::max<std::ptrdiff_t>(
                1,
-                stride[i] * (input.lens()[i + 2] - 1) +
-                    ((weights.lens()[i + 2] - 1) * dilation[i] + 1) - 2 * padding[i])));
+                stride[i] * (x_lens[i + 2] - 1) + ((w_lens[i + 2] - 1) * dilation[i] + 1) -
+                    2 * padding[i])));
        }
-        return inputs[0].with_lens(output_lens);
+        return spatial_lens;
+    }
+
+    shape dynamic_compute_shape(shape x_shape, shape w_shape) const
+    {
+        std::vector<shape::dynamic_dimension> output_dyn_dims = {};
+        output_dyn_dims.push_back(x_shape.to_dynamic().dyn_dims().at(0));
+        output_dyn_dims.push_back(w_shape.to_dynamic().dyn_dims().at(1));
+        const std::size_t num_spatial_dims = x_shape.ndim() - 2;
+        // Does not compute for optimals
+        auto min_spatial_dims = calc_spatial_lens(x_shape.min_lens(), w_shape.min_lens());
+        auto max_spatial_dims = calc_spatial_lens(x_shape.max_lens(), w_shape.max_lens());
+        for(size_t i = 0; i < num_spatial_dims; ++i)
+        {
+            output_dyn_dims.push_back(
+                shape::dynamic_dimension{min_spatial_dims[i], max_spatial_dims[i], {}});
+        }
+        return shape{x_shape.type(), output_dyn_dims};
+    }
+
+    shape static_compute_shape(shape x_shape, shape w_shape) const
+    {
+        std::vector<size_t> output_lens{x_shape.lens()[0], w_shape.lens()[1]};
+        auto spatial_lens = calc_spatial_lens(x_shape.lens(), w_shape.lens());
+        std::for_each(spatial_lens.begin(), spatial_lens.end(), [&output_lens](auto x) {
+            output_lens.push_back(x);
+        });
+        return x_shape.with_lens(output_lens);
    }

-    argument compute(shape output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
-        auto kdims = this->kdims();
+        argument result{dyn_out.computed_shape};
+        auto num_spatial_dims = this->kdims();
        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
            using type = typename decltype(output)::value_type;

@@ -109,22 +156,22 @@ struct deconvolution
            auto wei_n = wei[0];
            auto wei_c = wei[1];

-            auto out_lens = output_shape.lens();
+            auto out_lens = dyn_out.computed_shape.lens();

            std::vector<std::size_t> win_size{in_c};
            std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
            std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
-            shape win_shape{output_shape.type(), win_size};
+            shape win_shape{dyn_out.computed_shape.type(), win_size};

            par_dfor(in_n, wei_c)([&](int o, int k) {
                shape_for_each(win_shape, [&](auto idx_win) {
                    const int w = idx_win[0];

                    auto input_dims_start = idx_win.begin() + 1;
-                    auto wei_dims_start   = idx_win.begin() + kdims + 1;
+                    auto wei_dims_start   = idx_win.begin() + num_spatial_dims + 1;

                    std::vector<std::ptrdiff_t> win_start;
-                    for(std::size_t n = 0; n < kdims; ++n)
+                    for(std::size_t n = 0; n < num_spatial_dims; ++n)
                    {
                        win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * stride[n]) -
                                            std::ptrdiff_t(padding[n]));
@@ -135,7 +182,7 @@ struct deconvolution

                    std::vector<std::ptrdiff_t> idx_out{o, in_ch};

-                    for(size_t n = 0; n < kdims; n++)
+                    for(size_t n = 0; n < num_spatial_dims; n++)
                    {
                        idx_out.push_back(win_start[n] + *(wei_dims_start + n) * dilation[n]);
                    }

--- a/src/include/migraphx/op/dimensions_of.hpp
+++ b/src/include/migraphx/op/dimensions_of.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_OPERATORS_DIMENSIONS_OF_HPP
+#define MIGRAPHX_GUARD_OPERATORS_DIMENSIONS_OF_HPP
+
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/dyn_output.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+
+/**
+ * Returns the dimensions of the input argument from starting axis to ending axis.
+ * Atleast `end` must be set to use this operator (set `end` to ndim for default ONNX behavior of
+ * `Shape` operator) This should only be used for dynamic shapes as this can be simplified to a
+ * literal for static shapes.
+ */
+struct dimensions_of
+{
+    std::size_t start = 0;
+    std::size_t end   = 0;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.start, "start"), f(self.end, "end"));
+    }
+
+    std::string name() const { return "dimensions_of"; }
+
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, *this, true}.has(1);
+        if(start >= end)
+        {
+            MIGRAPHX_THROW("DIMENSIONS_OF: start >= end. start = " + std::to_string(start) +
+                           ", end = " + std::to_string(end));
+        }
+        return shape{shape::int64_type, {end - start}};
+    }
+
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        auto input_lens = args[0].get_shape().lens();
+        result.visit([&](auto output) {
+            std::copy(input_lens.cbegin() + start, input_lens.cbegin() + end, output.begin());
+        });
+        return result;
+    }
+};
+
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/op/multibroadcast.hpp
+++ b/src/include/migraphx/op/multibroadcast.hpp
@@ -69,7 +69,7 @@ struct multibroadcast

        auto make_bcast_strides = [&](std::vector<std::size_t> bcast_lens, std::size_t offset) {
            std::vector<size_t> bcast_strides(bcast_lens.size(), 0);
-            for(std::ptrdiff_t i = s0.lens().size() - 1; i >= 0; i--)
+            for(std::ptrdiff_t i = s0.ndim() - 1; i >= 0; i--)
            {
                if(bcast_lens[i + offset] == s0.lens()[i])
                {
@@ -84,13 +84,13 @@ struct multibroadcast
            if(s0.dynamic())
                MIGRAPHX_THROW(
                    "MULTIBROADCAST: Single dynamic input shape not supported.  Use two inputs.");
-            if(s0.lens().size() > output_lens.size())
+            if(s0.ndim() > output_lens.size())
            {
                MIGRAPHX_THROW("MULTIBROADCAST: input dimensions should <= output size");
            }

-            auto offset = output_lens.size() - s0.lens().size();
-            for(std::ptrdiff_t i = s0.lens().size() - 1; i >= 0; i--)
+            auto offset = output_lens.size() - s0.ndim();
+            for(std::ptrdiff_t i = s0.ndim() - 1; i >= 0; i--)
            {
                if(output_lens[i + offset] != s0.lens()[i] and s0.lens()[i] != 1)
                {
@@ -119,7 +119,7 @@ struct multibroadcast
            {
                // output_lens will not be set for 2+ input version
                auto bcast_lens    = compute_common_lens(inputs);
-                auto offset        = bcast_lens.size() - s0.lens().size();
+                auto offset        = bcast_lens.size() - s0.ndim();
                auto bcast_strides = make_bcast_strides(bcast_lens, offset);
                return {t, std::move(bcast_lens), std::move(bcast_strides)};
            }

--- a/src/include/migraphx/op/pooling.hpp
+++ b/src/include/migraphx/op/pooling.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -42,16 +42,43 @@ namespace op {

 struct pooling
 {
-    pooling_mode mode                = {pooling_mode::average};
+    pooling_mode mode = {pooling_mode::average};
+
+    // Padding along each spatial input dimension
+    // Can be ndim or 2*ndim values where ndim is size of lengths
+    // ndim values means pad the same before and after each dimension
+    // 2*ndim values contains n pre and then n post padding values
    std::vector<std::size_t> padding = {0, 0};
-    std::vector<std::size_t> stride  = {1, 1};
+
+    // Size of stride to take from one placement of the pooling kernel to the next.
+    // This is distinct from the strides used by the shape class.  Must be the same
+    // ndim as lengths.
+    std::vector<std::size_t> stride = {1, 1};
+
+    // Spatial dimensions of the pooling kernel or window,
+    // 2 smaller than the input tensor rank (NCHW layout)
    std::vector<std::size_t> lengths = {1, 1};
-    bool ceil_mode                   = false;
-    int lp_order                     = 2;
+
+    // Dilations are not supported at this time.
+
+    // ceiling mode is a flag affecting output size
+    // or equivalently, placements of the pooling kernel.
+    // When true, round the size upwards, possibly
+    // including partial placements where the kernel extends beyond the edge
+    // of input and even padding.  When false, round down so that all
+    // kernel placements fit but some input values may be dropped.
+    bool ceil_mode = false;
+    int lp_order   = 2;

    // Global pooling with dynamic shape input
    bool dyn_global = false;

+    // an attribute of the Onnx pooling operator, not currently enabled here because MIOpen can't
+    // support it. We currently implement padding for average pooling by inserting a Padding
+    // operator during Onnx parsing. But to support dynamic shape inputs and count_include_pad
+    // together, it would be necessary to do this calculation at runtime in MIOpen.
+    bool count_include_pad = false;
+
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
@@ -68,11 +95,29 @@ struct pooling

    void check_attribute_size() const
    {
-        if((padding.size() != stride.size() and (padding.size() / 2) != stride.size()) or
-           (not dyn_global and stride.size() != lengths.size()))
+        if(dyn_global)
+            return;
+        if((padding.size() != stride.size() and (padding.size()) != stride.size() * 2) or
+           stride.size() != lengths.size())
        {
            MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
        }
+        if(std::any_of(lengths.begin(), lengths.end(), [&](auto i) { return (i == 0); }) or
+           std::any_of(stride.begin(), stride.end(), [&](auto i) { return (i == 0); }))
+        {
+            MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride");
+        }
+
+        // TODO:  update lowering to run the reference
+        // code when OneDNN can't execute pooling for a CPU
+
+        // OneDNN has a limitation on padding size for pooling.  see
+        // https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html#doxid-dev-guide-convolution
+
+        // padding = {2}; stride = {1}; lengths = {3} succeeds in oneDNN but
+        // padding = {2}; stride = {1}; lengths = {2} fails.
+        // Also, the referenced documentation contains a max. dimension size of 14 for the kernel
+        // ("weights tensor") that MIGraphX doesn't enforce.
    }

    size_t kdims() const
@@ -112,7 +157,11 @@ struct pooling
        const shape& input = inputs.at(0);
        auto padding_size  = padding.size();
        size_t kdims       = input.ndim() - 2;
-        if(input.ndim() != padding_size / 2 + 2 and input.ndim() != padding_size + 2)
+        if(input.ndim() < 3)
+        {
+            MIGRAPHX_THROW("POOLING: input must have 3 or more dimensions and be nonempty");
+        }
+        if(input.ndim() * 2 != padding_size + 4 and input.ndim() != padding_size + 2)
        {
            MIGRAPHX_THROW("POOLING: input and attribute size mismatch!");
        }
@@ -132,7 +181,7 @@ struct pooling
            }
            else
            {
-                // does not compute for optimals
+                // does not compute optimals
                auto min_spatial_dims = calc_spatial_dim_out(input.min_lens(), kdims);
                auto max_spatial_dims = calc_spatial_dim_out(input.max_lens(), kdims);
                for(size_t i = 0; i < kdims; ++i)
@@ -149,7 +198,7 @@ struct pooling

            std::vector<std::size_t> output_lens(input_lens.begin(), input_lens.begin() + 2);
            // Used for when normalize_compute_shape() is called again at model eval time
-            // for an originally dynamic shape. Since kernel shape is not used with dyn_global.
+            // for an originally dynamic shape. Kernel shape is not used with dyn_global.
            if(dyn_global)
            {
                for(size_t i = 0; i < kdims; ++i)
@@ -184,7 +233,7 @@ struct pooling

        double operator()(double x, double y) const { return x + std::pow(std::abs(y), p); }

-        double final(double x, std::size_t) const { return std::pow(x, 1. / p); }
+        double final(double x, std::size_t) const { return (p == 0) ? 1 : std::pow(x, 1. / p); }
    };

    struct avg_pool
@@ -222,37 +271,82 @@ struct pooling
    {
        auto in_s    = input.get_shape();
        auto in_lens = in_s.lens();
+
+        // For each element of output; i.e., for each placement of pooling kernel...
        par_for(output_shape.elements(), [&](auto i) {
            auto idx_o = output_shape.multi(i);
            auto n_dim = idx_o.size();
-            std::vector<std::size_t> win_start;
+            // starting offset of the pooling window
+            std::vector<int> win_start;
            std::vector<std::size_t> win_size;
+
+            // For each spatial dimension, find starting and ending index of pooling kernel
            for(std::size_t dim = 2; dim < n_dim; ++dim)
            {
                auto d_2 = dim - 2;
                int start =
                    static_cast<int>(idx_o[dim] * stride[d_2]) - static_cast<int>(padding[d_2]);
-                int end = std::min(start + kernel_dims[d_2], in_lens[dim]);
-                start   = std::max(start, 0);
+                int end;
+                // NOLINT
+                if(count_include_pad and ceil_mode and (mode != pooling_mode::max))
+                {
+                    // TODO: this block can't execute until we enable count_include_pad
+                    // Even when using padding, if in ceil_mode a window
+                    // could extend beyond the end of both input and
+                    // padding.  Clip out-of-bounds indexes but not padding.
+
+                    // Check if this kernel extends beyond the padding at end of dimension
+                    end = std::min(start + kernel_dims[d_2],
+                                   in_lens[dim] + static_cast<int>(padding[d_2]));
+                }
+                else
+                {
+                    // In non-ceiling mode, when
+                    // count_include_pad is false, or for max pooling, clip off padding.
+                    end   = std::min(start + kernel_dims[d_2], in_lens[dim]);
+                    start = std::max(start, 0);
+                }
                win_start.push_back(start);
+                if(end < start)
+                {
+                    // This error can be caused by misc. bad input combinations
+                    MIGRAPHX_THROW("POOLING:  invalid attributes");
+                }
                win_size.push_back(end - start);
            }

            shape win_shape{output_shape.type(), win_size};
            auto pool_size    = win_shape.elements();
            double output_val = op.template init<Type>();
+
+            // for each element in the window...
            shape_for_each(win_shape, [&](auto idx_w) {
+                // the coordinates of this element
                auto idx = idx_o;
+
+                // Add the kernel location idx_w and the offset win_start, for each dimension.
+                // Negative results are cast to very large unsigned integers.
                std::transform(idx_w.begin(),
                               idx_w.end(),
                               win_start.begin(),
                               idx.begin() + 2,
                               [](auto ii, auto jj) { return ii + jj; });
-                if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
-                   idx < in_lens)
+                // Check if any of coordinates are out of input tensor's range
+                if(std::mismatch(idx.begin() + 2,
+                                 idx.end(),
+                                 in_lens.begin() + 2,
+                                 in_lens.end(),
+                                 std::less<>{}) == std::make_pair(idx.end(), in_lens.end()))
                {
                    output_val = op(output_val, input[in_s.index(idx)]);
                }
+                else
+                {
+                    // this is a padding element.  Padding locations
+                    // don't contribute to average or max pooling total but can play in
+                    // lpnorm pooling.
+                    output_val = op(output_val, 0);
+                }
            });
            output[i] = Type(op.final(output_val, pool_size));
        });

--- a/src/include/migraphx/op/prefix_scan_op.hpp
+++ b/src/include/migraphx/op/prefix_scan_op.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+
 #ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
 #define MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP

@@ -37,6 +38,12 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {

+/**
+ * Parent struct for prefix scan operations.  A prefix scan is equivalent to the C++
+ * std::exclusive_scan or std::inclusive_scan.  Given a list of numbers, a prefix scan
+ * sum op returns an equal size list of running totals of the values.  Other operations
+ * besides addition can be supported by their own child ops.
+ */
 template <class Derived>
 struct prefix_scan_op : op_name<Derived>
 {
@@ -60,9 +67,13 @@ struct prefix_scan_op : op_name<Derived>

    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(1);
+        check_shapes{inputs, *this, true}.has(1);
        auto s = inputs.front();
-        if(s.broadcasted())
+        if(s.dynamic())
+        {
+            return s;
+        }
+        else if(s.broadcasted())
        {
            return {s.type(), s.lens()};
        }
@@ -72,8 +83,9 @@ struct prefix_scan_op : op_name<Derived>
        }
    }

-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
+        shape output_shape(dyn_out.computed_shape);
        argument result{output_shape};
        auto s = args[0].get_shape();
        if(s == output_shape)