Unverified commit 23cb7917 authored by Brian Pickrell, committed by GitHub

Merge branch 'develop' into blas_tuning

parents b5fcc0bc ea32ca70
......@@ -31,10 +31,15 @@
#include <migraphx/optional.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/type_name.hpp>
#include <migraphx/source_location.hpp>
#include <migraphx/config.hpp>
#include <unordered_map>
#include <unordered_set>
#ifndef MIGRAPHX_USE_TYPE_ERASED_MATCHERS
#define MIGRAPHX_USE_TYPE_ERASED_MATCHERS 0
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -103,6 +108,13 @@ struct predicate_matcher
}
};
/// Convert a predicate function into a matcher
template <class P>
predicate_matcher<P> make_predicate_matcher(P p)
{
return {p};
}
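// Illustrative usage (a sketch, not part of this diff): wrap a plain predicate
// lambda into a matcher. The "add" name check is a hypothetical example predicate.
//   auto is_add = make_predicate_matcher(
//       [](instruction_ref ins) { return ins->name() == "add"; });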
/// Convert a function into a matcher
template <class F>
struct function_matcher
......@@ -124,14 +136,14 @@ template <class M>
auto bind_match(M m, std::string name)
{
return make_function_matcher(
[=, name = std::move(name)](matcher_context& ctx,
instruction_ref ins) -> optional<instruction_ref> {
[=, m_name = std::move(name)](matcher_context& ctx,
instruction_ref ins) -> optional<instruction_ref> {
auto result = m.match(ctx, ins);
if(result)
{
if(not ctx.has_instruction(ins))
return nullopt;
ctx.instructions[name] = ins;
ctx.instructions[m_name] = ins;
}
return result;
});
......@@ -183,14 +195,26 @@ struct id_matcher
template <class M>
struct basic_matcher;
struct any_matcher;
template <class M>
basic_matcher<M> make_basic_matcher(M m);
template <class M>
struct type_erased_matcher
{
#if MIGRAPHX_USE_TYPE_ERASED_MATCHERS
using type = any_matcher;
#else
using type = basic_matcher<M>;
#endif
};
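// How the selection above plays out: with MIGRAPHX_USE_TYPE_ERASED_MATCHERS set to 1,
// make_basic_matcher returns the type-erased any_matcher (a single std::function-based
// signature for every matcher), which presumably trades some inlining for shorter
// compile times; with 0 it keeps the fully-typed basic_matcher<M>.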
template <class M>
typename type_erased_matcher<M>::type make_basic_matcher(M m);
template <class F>
basic_matcher<function_matcher<F>> make_basic_fun_matcher(F f);
auto make_basic_fun_matcher(F f);
template <class P>
basic_matcher<predicate_matcher<P>> make_basic_pred_matcher(P p);
auto make_basic_pred_matcher(P p);
/// The basic matcher provides the all_of composability of the matcher
template <class M>
......@@ -222,38 +246,38 @@ struct basic_matcher
auto match(matcher_context& ctx, instruction_ref ins) const { return m.match(ctx, ins); }
};
/// Create a type-erased matcher
using any_matcher_base = basic_matcher<
function_matcher<std::function<optional<instruction_ref>(matcher_context&, instruction_ref)>>>;
struct any_matcher : any_matcher_base
{
template <class M>
any_matcher(M mm) : any_matcher_base({[=](auto& ctx, auto ins) { return mm.match(ctx, ins); }})
{
}
};
/// Create a basic matcher from a matcher
template <class M>
basic_matcher<M> make_basic_matcher(M m)
typename type_erased_matcher<M>::type make_basic_matcher(M m)
{
return {m};
}
/// Create a basic matcher from a function
template <class F>
basic_matcher<function_matcher<F>> make_basic_fun_matcher(F f)
auto make_basic_fun_matcher(F f)
{
return {{f}};
return make_basic_matcher(make_function_matcher(f));
}
/// Create a basic matcher from a predicate function
template <class P>
basic_matcher<predicate_matcher<P>> make_basic_pred_matcher(P p)
auto make_basic_pred_matcher(P p)
{
return {{p}};
return make_basic_matcher(make_predicate_matcher(p));
}
/// Create a type-erased matcher
using any_matcher_base = basic_matcher<
function_matcher<std::function<optional<instruction_ref>(matcher_context&, instruction_ref)>>>;
struct any_matcher : any_matcher_base
{
template <class M>
any_matcher(M mm) : any_matcher_base({[=](auto& ctx, auto ins) { return mm.match(ctx, ins); }})
{
}
};
/// This macro takes care of the boilerplate for defining a matcher
#define MIGRAPHX_BASIC_MATCHER(name, ...) \
struct name##_m \
......@@ -347,31 +371,30 @@ match::matcher_result find_match(module& modl, M&& m)
}
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MATCHES)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MATCHES_FOR)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_VALIDATE_MATCHES)
/// Find matches for an instruction in the module
/// Find matches for an instruction in the module for each section of matchers
template <class Mod, class... Ms>
void find_matches(Mod& mod, instruction_ref ins, Ms&&... ms)
{
#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 5
const
#endif
int trace = value_of(MIGRAPHX_TRACE_MATCHES{});
#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 5
const
#endif
bool validate = enabled(MIGRAPHX_VALIDATE_MATCHES{});
bool match = false;
void find_matches_for(source_location location, Mod& mod, instruction_ref ins, Ms&&... ms)
{
const int trace = value_of(MIGRAPHX_TRACE_MATCHES{});
const bool validate = enabled(MIGRAPHX_VALIDATE_MATCHES{});
const auto trace_filter = string_value_of(MIGRAPHX_TRACE_MATCHES_FOR{});
const bool trace_for = not trace_filter.empty() and
(contains(std::string{location.file_name()}, trace_filter) or
contains(std::string{location.function_name()}, trace_filter));
bool match = false;
each_args(
[&](auto&& m) {
if(match)
return;
if(trace > 1)
if(trace > 1 or trace_for)
std::cout << "Match: " << get_type_name(m) << std::endl;
auto r = match_instruction(get_module(mod), ins, m.matcher());
if(r.result == get_module(mod).end())
return;
if(trace > 0)
if(trace > 0 or trace_for)
{
std::cout << "Matched by " << get_type_name(m) << std::endl;
get_module(mod).debug_print(ins);
......@@ -397,13 +420,19 @@ void find_matches(Mod& mod, instruction_ref ins, Ms&&... ms)
/// Find matches in a module
template <class Mod, class... Ms>
void find_matches(Mod& mod, Ms&&... ms)
struct find_matches
{
for(auto ins : iterator_for(get_module(mod)))
find_matches(Mod& mod, Ms&&... ms, source_location location = source_location::current())
{
find_matches(mod, ins, ms...);
for(auto ins : iterator_for(get_module(mod)))
{
find_matches_for(location, mod, ins, ms...);
}
}
}
};
template <class Mod, class... Ms>
find_matches(Mod& mod, Ms&&... ms) -> find_matches<Mod, Ms...>;
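// A sketch of the pattern used above (call is illustrative): a defaulted
// source_location argument cannot follow a parameter pack in an ordinary function
// template, so find_matches is written as a class template whose constructor does
// the work, and the deduction guide lets call sites keep the function-call syntax:
//   find_matches(mod, matcher1, matcher2);   // caller's location captured implicitly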
template <class M, class F>
struct find_generic_match
......@@ -632,9 +661,9 @@ auto skip_output(Ms... ms)
inline auto var(std::string s)
{
return make_basic_fun_matcher(
[=, s = std::move(s)](const matcher_context& ctx,
instruction_ref) -> optional<instruction_ref> {
auto it = ctx.instructions.find(s);
[=, m_s = std::move(s)](const matcher_context& ctx,
instruction_ref) -> optional<instruction_ref> {
auto it = ctx.instructions.find(m_s);
if(it == ctx.instructions.end())
return nullopt;
return it->second;
......@@ -644,7 +673,7 @@ inline auto var(std::string s)
inline auto name(std::string s)
{
return make_basic_pred_matcher(
[=, s = std::move(s)](instruction_ref ins) { return ins->name() == s; });
[=, m_s = std::move(s)](instruction_ref ins) { return ins->name() == m_s; });
}
inline auto name_contains(const std::string& name)
......@@ -655,8 +684,8 @@ inline auto name_contains(const std::string& name)
inline auto name(std::unordered_set<std::string> names)
{
return make_basic_pred_matcher([=, names = std::move(names)](instruction_ref ins) {
return names.count(ins->name()) > 0;
return make_basic_pred_matcher([=, m_names = std::move(names)](instruction_ref ins) {
return m_names.count(ins->name()) > 0;
});
}
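// Illustrative usage (the operator names are examples): name("add") matches
// instructions whose operator is "add", while name({"add", "mul"}) matches
// either one via the unordered_set overload above.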
......
......@@ -36,7 +36,7 @@ struct module;
* Remove multiple memory allocations using graph coloring to find memory allocations that can be
* reused.
*/
struct memory_coloring
struct MIGRAPHX_EXPORT memory_coloring
{
std::string allocation_op{};
bool verify = false;
......
......@@ -52,7 +52,7 @@ using ins_dep_map = std::unordered_map<instruction_ref, std::unordered_set<ins
/**
* @brief Stores the instruction stream
*/
struct module
struct MIGRAPHX_EXPORT module
{
module(const std::string& name = "");
......@@ -189,7 +189,7 @@ struct module
instruction_ref validate() const;
instruction_ref find_dangling_reference() const;
void finalize(context& ctx);
void finalize(std::vector<context>& contexts);
void debug_print() const;
void debug_print(instruction_ref ins) const;
......@@ -222,11 +222,21 @@ struct module
void annotate(std::ostream& os, std::function<void(instruction_ref)> a) const;
std::vector<module_ref> get_sub_modules(bool shallow = false) const;
/* Sorts the module in topological order, aka reverse post-order (RPO) DFS order.
   It takes the last instruction (or @return) as the root, walks back through the graph,
   and moves the inputs of each instruction so that they appear before the instruction itself.
*/
module& sort();
/* Any instruction "X" can have module arguments, and the modules inside them can use any
 * other instruction "Y" from predecessor modules of the instruction "X". Such instructions
 * "Y" inside module args are not listed as input instructions to "X", but those instructions
 * "Y" must be evaluated before the instruction "X" can be. Therefore such "Y" instructions
 * are considered implicit dependencies of "X".
 */
ins_dep_map calc_implicit_deps() const;
friend std::ostream& operator<<(std::ostream& os, const module& m);
friend bool operator==(const module& x, const module& y);
MIGRAPHX_EXPORT friend std::ostream& operator<<(std::ostream& os, const module& m);
MIGRAPHX_EXPORT friend bool operator==(const module& x, const module& y);
friend bool operator!=(const module& x, const module& y) { return not(x == y); }
private:
......
......@@ -31,10 +31,11 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
void to_msgpack(const value& v, std::function<void(const char*, std::size_t)> writer);
std::vector<char> to_msgpack(const value& v);
value from_msgpack(const std::vector<char>& buffer);
value from_msgpack(const char* buffer, std::size_t size);
MIGRAPHX_EXPORT void to_msgpack(const value& v,
std::function<void(const char*, std::size_t)> writer);
MIGRAPHX_EXPORT std::vector<char> to_msgpack(const value& v);
MIGRAPHX_EXPORT value from_msgpack(const std::vector<char>& buffer);
MIGRAPHX_EXPORT value from_msgpack(const char* buffer, std::size_t size);
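// Illustrative round trip (a sketch, assuming a populated value v):
//   std::vector<char> buf = to_msgpack(v);
//   value restored = from_msgpack(buf);  // or from_msgpack(buf.data(), buf.size())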
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -42,7 +42,8 @@ struct select_dependent_type
template <class T, class... Ts>
using dependent_type = typename select_dependent_type<T, Ts...>::type;
bool normalize_attributes(operation& op, const std::vector<std::size_t>& lens);
MIGRAPHX_EXPORT
bool normalize_attributes(operation& op, const shape& input_shape);
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -39,7 +39,7 @@ struct module;
* Process negative axis attributes of ops
*/
struct normalize_ops
struct MIGRAPHX_EXPORT normalize_ops
{
std::string name() const { return "normalize_ops"; }
void apply(module& m) const;
......
......@@ -26,6 +26,7 @@
#include <migraphx/program.hpp>
#include <migraphx/config.hpp>
#include <migraphx/onnx/export.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -54,15 +55,19 @@ struct onnx_options
};
/// Create a program from an onnx file
program parse_onnx(const std::string& name, const onnx_options& = onnx_options{});
MIGRAPHX_ONNX_EXPORT program parse_onnx(const std::string& name,
const onnx_options& = onnx_options{});
/// Create a program from an onnx buffer
program parse_onnx_buffer(const std::string& buffer, const onnx_options& options);
MIGRAPHX_ONNX_EXPORT program parse_onnx_buffer(const std::string& buffer,
const onnx_options& options);
/// Create a program from an onnx buffer
program parse_onnx_buffer(const void* data, std::size_t size, const onnx_options& options);
MIGRAPHX_ONNX_EXPORT program parse_onnx_buffer(const void* data,
std::size_t size,
const onnx_options& options);
std::vector<std::string> get_onnx_operators();
MIGRAPHX_ONNX_EXPORT std::vector<std::string> get_onnx_operators();
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -37,10 +37,13 @@ namespace op {
* 1 input version:
* Broadcasts a tensor from the original shape to the broadcast_lens by setting the stride of
* broadcasted dimensions to zero. `axis` attribute for a 1D input shape is the output dimension
* that stays the same. ex: broadcasting shape [1024] -> [4, 1024, 3] has axis = 1 For higher rank
* input shapes, axis is an offset parameter for the broadcasting. Such that this operator would
* work in the opposite direction of NumPy broadcasting. ex: broadcasting shape [2, 2] -> [2, 2, 3]
* with axis = 0
* that stays the same.
* ex: broadcasting shape [1024] -> [4, 1024, 3] has axis = 1.
*
 * For higher rank input shapes, axis is an offset parameter for the broadcasting,
 * such that this operator works in the opposite direction of NumPy broadcasting
 * (dimensions are compared element-wise from left to right).
* ex: broadcasting shape [2, 2] -> [2, 2, 3] with axis = 0
*
* 2 input version:
* Broadcast the first input 1D shape into the second input shape based on the axis parameter.
......@@ -68,6 +71,9 @@ struct broadcast
{
// the ONNX broadcast op is deprecated now, so not handling the negative
// value of axis anymore
if(s0.dynamic())
MIGRAPHX_THROW(
"BROADCAST: Single dynamic input shape not supported. Use two inputs.");
if(axis >= broadcast_lens.size())
{
MIGRAPHX_THROW("BROADCAST : axis " + migraphx::to_string(axis) +
......
......@@ -25,12 +25,13 @@
#define MIGRAPHX_GUARD_OPERATORS_CLIP_HPP
#include <array>
#include <cmath>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <cmath>
#include <migraphx/dyn_output.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -48,15 +49,15 @@ struct clip
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).same_type().same_dims();
check_shapes{inputs, *this, true}.has(3).same_type().same_dims();
return inputs.front();
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result{output_shape};
argument result{dyn_out.computed_shape};
visit_all(result, args[0], args[1], args[2])([&](auto output, auto x, auto min, auto max) {
par_for(output_shape.elements(),
par_for(dyn_out.computed_shape.elements(),
[&](auto i) { output[i] = std::min(std::max(min[i], x[i]), max[i]); });
});
......
......@@ -59,8 +59,8 @@ enum class rnn_direction
bidirectional,
};
std::ostream& operator<<(std::ostream& os, pooling_mode v);
std::ostream& operator<<(std::ostream& os, rnn_direction v);
MIGRAPHX_EXPORT std::ostream& operator<<(std::ostream& os, pooling_mode v);
MIGRAPHX_EXPORT std::ostream& operator<<(std::ostream& os, rnn_direction v);
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -66,7 +66,19 @@ struct convert : unary<convert>
auto type = target_type;
return [type](auto x) {
auto y = x;
shape::visit(type, [&](auto as) { y = std::min(std::max(as(x), as.min()), as.max()); });
shape::visit(type, [&](auto as) {
// Clamping a value between the target type's max and min doesn't work for NaNs.
if(std::isnan(x))
{
y = as.nan();
}
else
{
// clamp overflowing/underflowing values to min()/max() instead of +/-infinity
// during downcasting
y = std::min(std::max(as(x), as.min()), as.max());
}
});
return y;
};
}
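// Illustrative behavior when downcasting float to half (values are examples):
// NaN now maps to the target type's NaN instead of being clamped, and 1e30f
// saturates to the half max (65504) rather than overflowing to +infinity.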
......
......@@ -79,17 +79,17 @@ struct convolution
check_shapes{inputs, *this, true}.has(2).same_type().same_ndims().min_ndims(3);
check_attribute_size();
// num of dims of input and attribute should match
const auto input_size = inputs[0].max_lens().size();
const auto input_ndim = inputs[0].ndim();
const auto padding_size = padding.size();
if(input_size != padding_size / 2 + 2 && input_size != padding_size + 2)
if(input_ndim != padding_size / 2 + 2 and input_ndim != padding_size + 2)
{
MIGRAPHX_THROW("CONVOLUTION: input and attribute size mismatch!");
}
const shape& x_shape = inputs.at(0);
const shape& w_shape = inputs.at(1);
const size_t num_spatial_dims = input_size - 2;
const size_t num_spatial_dims = input_ndim - 2;
if(num_spatial_dims != this->kdims())
{
MIGRAPHX_THROW("CONVOLUTION: input k-dims does not match attribute size");
......@@ -105,7 +105,7 @@ struct convolution
}
else
{
return fixed_compute_shape(x_shape, w_shape);
return static_compute_shape(x_shape, w_shape);
}
}
......@@ -143,23 +143,10 @@ struct convolution
shape dynamic_compute_shape(shape x_shape, shape w_shape) const
{
std::vector<shape::dynamic_dimension> output_dyn_dims = {};
output_dyn_dims.push_back(x_shape.to_dynamic().dyn_dims().at(0));
output_dyn_dims.push_back(w_shape.to_dynamic().dyn_dims().at(0));
auto dynamic_shape_push_back = [&](const shape& input_shape) {
if(input_shape.dynamic())
{
output_dyn_dims.push_back(input_shape.dyn_dims().at(0));
}
else
{
auto l = input_shape.lens().at(0);
output_dyn_dims.push_back({l, l});
}
};
dynamic_shape_push_back(x_shape);
dynamic_shape_push_back(w_shape);
const size_t num_spatial_dims = x_shape.max_lens().size() - 2;
const size_t num_spatial_dims = x_shape.ndim() - 2;
if(padding_mode != default_)
{
for(std::size_t i = 0; i < num_spatial_dims; ++i)
......@@ -198,7 +185,7 @@ struct convolution
return shape{x_shape.type(), output_dyn_dims};
}
shape fixed_compute_shape(shape x_shape, shape w_shape) const
shape static_compute_shape(shape x_shape, shape w_shape) const
{
std::vector<size_t> output_lens{x_shape.lens()[0], w_shape.lens()[0]};
auto spatial_lens = calc_conv_lens(x_shape.lens(), w_shape.lens());
......
......@@ -21,9 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_DECONVOLUTION_HPP
#define MIGRAPHX_GUARD_OPERATORS_DECONVOLUTION_HPP
#ifndef MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_BACKWARDS_HPP
#define MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_BACKWARDS_HPP
#include <cmath>
#include <utility>
#include <migraphx/op/common.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
......@@ -31,14 +33,13 @@
#include <migraphx/argument.hpp>
#include <migraphx/par_dfor.hpp>
#include <migraphx/shape_for_each.hpp>
#include <cmath>
#include <utility>
#include <migraphx/dyn_output.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct deconvolution
struct convolution_backwards
{
std::vector<std::size_t> padding = {0, 0};
std::vector<std::size_t> stride = {1, 1};
......@@ -57,45 +58,91 @@ struct deconvolution
f(self.group, "group"));
}
std::string name() const { return "deconvolution"; }
std::string name() const { return "convolution_backwards"; }
void check_attribute_size() const
{
if((padding.size() != stride.size() and (padding.size() / 2) != stride.size()) or
stride.size() != dilation.size())
if(padding.size() != stride.size() or stride.size() != dilation.size())
{
MIGRAPHX_THROW("deconvolution: inconsistent attribute sizes");
MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: inconsistent attribute sizes");
}
}
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2).same_type().same_ndims().min_ndims(3);
check_shapes{inputs, *this, true}.has(2).same_type().same_ndims().min_ndims(3);
const shape& input = inputs.at(0);
const shape& weights = inputs.at(1);
size_t kdims = input.lens().size() - 2;
if(kdims != this->kdims())
const shape& x_shape = inputs.at(0);
const shape& w_shape = inputs.at(1);
if(x_shape.ndim() - 2 != this->kdims())
{
MIGRAPHX_THROW("deconvolution: input k-dims does not match attribute size");
MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: input k-dims does not match attribute size");
}
std::vector<size_t> output_lens{input.lens()[0], weights.lens()[1]};
if(not x_shape.dynamic() and not w_shape.dynamic() and
x_shape.lens().at(1) != (w_shape.lens().at(0) * group))
{
MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: mismatched channel numbers");
}
for(size_t i = 0; i < kdims; i++)
if(x_shape.dynamic() or w_shape.dynamic())
{
output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(
return dynamic_compute_shape(x_shape, w_shape);
}
else
{
return static_compute_shape(x_shape, w_shape);
}
}
std::vector<std::size_t> calc_spatial_lens(std::vector<std::size_t> x_lens,
std::vector<std::size_t> w_lens) const
{
std::vector<size_t> spatial_lens(x_lens.size() - 2);
// stride * (input - 1) + output_padding + ((kernel - 1) * dilation + 1) - padding_L -
// padding_R. This assumes padding_L = padding_R and output_padding handled in parser.
for(size_t i = 0; i < spatial_lens.size(); i++)
{
spatial_lens.at(i) = (std::size_t(std::max<std::ptrdiff_t>(
1,
stride[i] * (input.lens()[i + 2] - 1) +
((weights.lens()[i + 2] - 1) * dilation[i] + 1) - 2 * padding[i])));
stride[i] * (x_lens[i + 2] - 1) + ((w_lens[i + 2] - 1) * dilation[i] + 1) -
2 * padding[i])));
}
return inputs[0].with_lens(output_lens);
return spatial_lens;
}
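// Worked example of the formula above, with illustrative values: stride = {2},
// input spatial length 3, kernel = 3, dilation = 1, padding = {1} gives
//   2 * (3 - 1) + ((3 - 1) * 1 + 1) - 2 * 1 = 4 + 3 - 2 = 5,
// i.e. convolution_backwards maps a length-3 input to a length-5 output, the
// inverse of a forward convolution with the same attributes.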
shape dynamic_compute_shape(shape x_shape, shape w_shape) const
{
std::vector<shape::dynamic_dimension> output_dyn_dims = {};
output_dyn_dims.push_back(x_shape.to_dynamic().dyn_dims().at(0));
output_dyn_dims.push_back(w_shape.to_dynamic().dyn_dims().at(1));
const std::size_t num_spatial_dims = x_shape.ndim() - 2;
// Does not compute for optimals
auto min_spatial_dims = calc_spatial_lens(x_shape.min_lens(), w_shape.min_lens());
auto max_spatial_dims = calc_spatial_lens(x_shape.max_lens(), w_shape.max_lens());
for(size_t i = 0; i < num_spatial_dims; ++i)
{
output_dyn_dims.push_back(
shape::dynamic_dimension{min_spatial_dims[i], max_spatial_dims[i], {}});
}
return shape{x_shape.type(), output_dyn_dims};
}
shape static_compute_shape(shape x_shape, shape w_shape) const
{
std::vector<size_t> output_lens{x_shape.lens()[0], w_shape.lens()[1]};
auto spatial_lens = calc_spatial_lens(x_shape.lens(), w_shape.lens());
std::for_each(spatial_lens.begin(), spatial_lens.end(), [&output_lens](auto x) {
output_lens.push_back(x);
});
return x_shape.with_lens(output_lens);
}
argument compute(shape output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
argument result{output_shape};
auto kdims = this->kdims();
argument result{dyn_out.computed_shape};
auto num_spatial_dims = this->kdims();
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
using type = typename decltype(output)::value_type;
......@@ -109,22 +156,22 @@ struct deconvolution
auto wei_n = wei[0];
auto wei_c = wei[1];
auto out_lens = output_shape.lens();
auto out_lens = dyn_out.computed_shape.lens();
std::vector<std::size_t> win_size{in_c};
std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
shape win_shape{output_shape.type(), win_size};
shape win_shape{dyn_out.computed_shape.type(), win_size};
par_dfor(in_n, wei_c)([&](int o, int k) {
shape_for_each(win_shape, [&](auto idx_win) {
const int w = idx_win[0];
auto input_dims_start = idx_win.begin() + 1;
auto wei_dims_start = idx_win.begin() + kdims + 1;
auto wei_dims_start = idx_win.begin() + num_spatial_dims + 1;
std::vector<std::ptrdiff_t> win_start;
for(std::size_t n = 0; n < kdims; ++n)
for(std::size_t n = 0; n < num_spatial_dims; ++n)
{
win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * stride[n]) -
std::ptrdiff_t(padding[n]));
......@@ -135,7 +182,7 @@ struct deconvolution
std::vector<std::ptrdiff_t> idx_out{o, in_ch};
for(size_t n = 0; n < kdims; n++)
for(size_t n = 0; n < num_spatial_dims; n++)
{
idx_out.push_back(win_start[n] + *(wei_dims_start + n) * dilation[n]);
}
......
......@@ -37,6 +37,15 @@ namespace op {
struct dequantizelinear
{
value attributes() const
{
// Note: point_op attribute is not used in this op. Instead, in
// gpu compilation pipeline, rewrite_quantization will be invoked
// from generate_pointwise() to rewrite this op.
return {{"pointwise", true}};
}
std::string name() const { return "dequantizelinear"; }
shape compute_shape(std::vector<shape> inputs) const
{
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_DIMENSIONS_OF_HPP
#define MIGRAPHX_GUARD_OPERATORS_DIMENSIONS_OF_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dyn_output.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
/**
* Returns the dimensions of the input argument from starting axis to ending axis.
 * At least `end` must be set to use this operator (set `end` to ndim for the default ONNX
 * behavior of the `Shape` operator). This should only be used for dynamic shapes, as it can
 * be simplified to a literal for static shapes.
*/
struct dimensions_of
{
std::size_t start = 0;
std::size_t end = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.start, "start"), f(self.end, "end"));
}
std::string name() const { return "dimensions_of"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this, true}.has(1);
if(start >= end)
{
MIGRAPHX_THROW("DIMENSIONS_OF: start >= end. start = " + std::to_string(start) +
", end = " + std::to_string(end));
}
return shape{shape::int64_type, {end - start}};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
auto input_lens = args[0].get_shape().lens();
result.visit([&](auto output) {
std::copy(input_lens.cbegin() + start, input_lens.cbegin() + end, output.begin());
});
return result;
}
};
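// Illustrative example (hypothetical dimensions): for an input of shape
// {2, 4, 4, 3}, dimensions_of with start = 1 and end = 3 copies lens[1..3)
// and returns the int64 tensor {4, 4} with shape {2}.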
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -71,7 +71,7 @@ struct if_op
std::unordered_map<std::string, argument> params;
std::set<std::string> pnames;
for(const auto& smod : mods)
for(const_module_ref smod : mods)
{
auto names = smod->get_parameter_names();
pnames.insert(names.begin(), names.end());
......
......@@ -59,9 +59,9 @@ struct loop
MIGRAPHX_THROW("LOOP: operator should have one submodule.");
}
const auto& mod = mods.front();
auto mod_out_shapes = mod->get_output_shapes();
auto dep_param_num = inputs.size() - 2;
const_module_ref mod = mods.front();
auto mod_out_shapes = mod->get_output_shapes();
auto dep_param_num = inputs.size() - 2;
// first item of the mod output shapes is condition used in loop,
// which is not needed to compute output shape
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
......@@ -36,9 +36,9 @@ namespace op {
/**
* Broadcast multiple dimensions between two tensors.
* Two versions of this operator: one input and two inputs.
* Two versions of this operator: 1 input and 2+ inputs.
* One input version uses output_lens attribute and broadcasts to it.
* Two inputs version broadcasts both inputs to the common shape at evaluation time.
* 2+ inputs version broadcasts first input to the common shape at evaluation time.
*/
struct multibroadcast
{
......@@ -57,19 +57,19 @@ struct multibroadcast
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this, true}.has(1, 2);
check_shapes{inputs, *this, true}.has_at_least(1);
auto t = inputs.at(0).type();
auto s0 = inputs.at(0);
if(s0.max_lens().empty())
if(s0.ndim() < 1)
{
MIGRAPHX_THROW("MULTIBROADCAST: input dimensions should be > 0");
}
auto make_bcast_strides = [&](std::vector<std::size_t> bcast_lens, std::size_t offset) {
std::vector<size_t> bcast_strides(bcast_lens.size(), 0);
for(std::ptrdiff_t i = s0.lens().size() - 1; i >= 0; i--)
for(std::ptrdiff_t i = s0.ndim() - 1; i >= 0; i--)
{
if(bcast_lens[i + offset] == s0.lens()[i])
{
......@@ -81,13 +81,16 @@ struct multibroadcast
if(inputs.size() == 1)
{
if(s0.lens().size() > output_lens.size())
if(s0.dynamic())
MIGRAPHX_THROW(
"MULTIBROADCAST: Single dynamic input shape not supported. Use two inputs.");
if(s0.ndim() > output_lens.size())
{
MIGRAPHX_THROW("MULTIBROADCAST: input dimensions should <= output size");
}
auto offset = output_lens.size() - s0.lens().size();
for(std::ptrdiff_t i = s0.lens().size() - 1; i >= 0; i--)
auto offset = output_lens.size() - s0.ndim();
for(std::ptrdiff_t i = s0.ndim() - 1; i >= 0; i--)
{
if(output_lens[i + offset] != s0.lens()[i] and s0.lens()[i] != 1)
{
......@@ -102,20 +105,21 @@ struct multibroadcast
}
else
{
// two inputs
auto s1 = inputs.at(1);
if(s0.dynamic() or s1.dynamic())
// 2+ inputs
if(std::any_of(
inputs.cbegin(), inputs.cend(), [](auto input) { return input.dynamic(); }))
{
if(not output_dyn_dims.empty())
{
return {t, output_dyn_dims};
}
return {t, compute_broadcasted_dyn_dims(s0, s1)};
return {t, compute_common_dyn_dims(inputs)};
}
else
{
auto bcast_lens = compute_broadcasted_lens(s0.lens(), s1.lens());
auto offset = bcast_lens.size() - s0.lens().size();
// output_lens will not be set for 2+ input version
auto bcast_lens = compute_common_lens(inputs);
auto offset = bcast_lens.size() - s0.ndim();
auto bcast_strides = make_bcast_strides(bcast_lens, offset);
return {t, std::move(bcast_lens), std::move(bcast_strides)};
}
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
......@@ -42,16 +42,43 @@ namespace op {
struct pooling
{
pooling_mode mode = {pooling_mode::average};
pooling_mode mode = {pooling_mode::average};
// Padding along each spatial input dimension
// Can be ndim or 2*ndim values, where ndim is the size of lengths:
// ndim values pad the same amount before and after each dimension;
// 2*ndim values contain n pre-padding values followed by n post-padding values
std::vector<std::size_t> padding = {0, 0};
std::vector<std::size_t> stride = {1, 1};
// Size of the stride to take from one placement of the pooling kernel to the next.
// This is distinct from the strides used by the shape class. Must have the same
// number of values (ndim) as lengths.
std::vector<std::size_t> stride = {1, 1};
// Spatial dimensions of the pooling kernel or window,
// 2 smaller than the input tensor rank (NCHW layout)
std::vector<std::size_t> lengths = {1, 1};
bool ceil_mode = false;
int lp_order = 2;
// Dilations are not supported at this time.
// Ceiling mode is a flag affecting the output size or, equivalently, the
// placement of the pooling kernel. When true, round the output size upwards,
// possibly including partial placements where the kernel extends beyond the
// edge of the input and even the padding. When false, round down so that all
// kernel placements fit, but some input values may be dropped.
bool ceil_mode = false;
int lp_order = 2;
// Global pooling with dynamic shape input
bool dyn_global = false;
// An attribute of the ONNX pooling operator, not currently enabled here because MIOpen
// can't support it. We currently implement padding for average pooling by inserting a
// pad operator during ONNX parsing. But to support dynamic shape inputs and
// count_include_pad together, it would be necessary to do this calculation at runtime
// in MIOpen.
bool count_include_pad = false;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
......@@ -68,11 +95,29 @@ struct pooling
void check_attribute_size() const
{
if((padding.size() != stride.size() and (padding.size() / 2) != stride.size()) or
(not dyn_global and stride.size() != lengths.size()))
if(dyn_global)
return;
if((padding.size() != stride.size() and (padding.size()) != stride.size() * 2) or
stride.size() != lengths.size())
{
MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
}
if(std::any_of(lengths.begin(), lengths.end(), [&](auto i) { return (i == 0); }) or
std::any_of(stride.begin(), stride.end(), [&](auto i) { return (i == 0); }))
{
MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride");
}
// TODO: update lowering to run the reference
// code when OneDNN can't execute pooling for a CPU
// OneDNN has a limitation on padding size for pooling. see
// https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html#doxid-dev-guide-convolution
// padding = {2}; stride = {1}; lengths = {3} succeeds in oneDNN but
// padding = {2}; stride = {1}; lengths = {2} fails.
// Also, the referenced documentation contains a max. dimension size of 14 for the kernel
// ("weights tensor") that MIGraphX doesn't enforce.
}
size_t kdims() const
......@@ -112,7 +157,11 @@ struct pooling
const shape& input = inputs.at(0);
auto padding_size = padding.size();
size_t kdims = input.ndim() - 2;
if(input.ndim() != padding_size / 2 + 2 and input.ndim() != padding_size + 2)
if(input.ndim() < 3)
{
MIGRAPHX_THROW("POOLING: input must have 3 or more dimensions and be nonempty");
}
if(input.ndim() * 2 != padding_size + 4 and input.ndim() != padding_size + 2)
{
MIGRAPHX_THROW("POOLING: input and attribute size mismatch!");
}
......@@ -132,7 +181,7 @@ struct pooling
}
else
{
// does not compute for optimals
// does not compute optimals
auto min_spatial_dims = calc_spatial_dim_out(input.min_lens(), kdims);
auto max_spatial_dims = calc_spatial_dim_out(input.max_lens(), kdims);
for(size_t i = 0; i < kdims; ++i)
......@@ -149,7 +198,7 @@ struct pooling
std::vector<std::size_t> output_lens(input_lens.begin(), input_lens.begin() + 2);
// Used for when normalize_compute_shape() is called again at model eval time
// for an originally dynamic shape. Since kernel shape is not used with dyn_global.
// for an originally dynamic shape. Kernel shape is not used with dyn_global.
if(dyn_global)
{
for(size_t i = 0; i < kdims; ++i)
......@@ -184,7 +233,7 @@ struct pooling
double operator()(double x, double y) const { return x + std::pow(std::abs(y), p); }
double final(double x, std::size_t) const { return std::pow(x, 1. / p); }
double final(double x, std::size_t) const { return (p == 0) ? 1 : std::pow(x, 1. / p); }
};
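// Illustrative check: for p = 2 over a window holding {3, 4}, the running value
// accumulates 3^2 + 4^2 = 25 and final() returns 25^(1/2) = 5, the L2 norm;
// p == 0 is special-cased above to return 1 rather than raising to the 1/0 power.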
struct avg_pool
......@@ -222,37 +271,82 @@ struct pooling
{
auto in_s = input.get_shape();
auto in_lens = in_s.lens();
// For each element of output; i.e., for each placement of pooling kernel...
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto n_dim = idx_o.size();
std::vector<std::size_t> win_start;
// starting offset of the pooling window
std::vector<int> win_start;
std::vector<std::size_t> win_size;
// For each spatial dimension, find starting and ending index of pooling kernel
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
int start =
static_cast<int>(idx_o[dim] * stride[d_2]) - static_cast<int>(padding[d_2]);
int end = std::min(start + kernel_dims[d_2], in_lens[dim]);
start = std::max(start, 0);
int end;
// NOLINT
if(count_include_pad and ceil_mode and (mode != pooling_mode::max))
{
// TODO: this block can't execute until we enable count_include_pad
// Even when using padding, if in ceil_mode a window
// could extend beyond the end of both input and
// padding. Clip out-of-bounds indexes but not padding.
// Check if this kernel extends beyond the padding at end of dimension
end = std::min(start + kernel_dims[d_2],
in_lens[dim] + static_cast<int>(padding[d_2]));
}
else
{
// In non-ceiling mode, when
// count_include_pad is false, or for max pooling, clip off padding.
end = std::min(start + kernel_dims[d_2], in_lens[dim]);
start = std::max(start, 0);
}
win_start.push_back(start);
if(end < start)
{
// This error can be caused by miscellaneous bad input combinations
MIGRAPHX_THROW("POOLING: invalid attributes");
}
win_size.push_back(end - start);
}
shape win_shape{output_shape.type(), win_size};
auto pool_size = win_shape.elements();
double output_val = op.template init<Type>();
// for each element in the window...
shape_for_each(win_shape, [&](auto idx_w) {
// the coordinates of this element
auto idx = idx_o;
// Add the kernel location idx_w and the offset win_start, for each dimension.
// Negative results are cast to very large unsigned integers.
std::transform(idx_w.begin(),
idx_w.end(),
win_start.begin(),
idx.begin() + 2,
[](auto ii, auto jj) { return ii + jj; });
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
idx < in_lens)
// Check if any of the coordinates are out of the input tensor's range
if(std::mismatch(idx.begin() + 2,
idx.end(),
in_lens.begin() + 2,
in_lens.end(),
std::less<>{}) == std::make_pair(idx.end(), in_lens.end()))
{
output_val = op(output_val, input[in_s.index(idx)]);
}
else
{
// this is a padding element. Padding locations
// don't contribute to the average or max pooling total, but they can
// play a role in lpnorm pooling.
}
});
output[i] = Type(op.final(output_val, pool_size));
});
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
......@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
......@@ -37,6 +38,12 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
/**
 * Parent struct for prefix scan operations. A prefix scan is equivalent to the C++
 * std::exclusive_scan or std::inclusive_scan. Given a list of numbers, a prefix-scan
 * sum op returns an equal-size list of running totals of the values. Operations other
 * than addition can be supported by their own child ops.
*/
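// Example (illustrative): an inclusive prefix-sum over {1, 2, 3, 4} produces
// {1, 3, 6, 10}; the exclusive variant produces {0, 1, 3, 6}.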
template <class Derived>
struct prefix_scan_op : op_name<Derived>
{
......@@ -60,9 +67,13 @@ struct prefix_scan_op : op_name<Derived>
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1);
check_shapes{inputs, *this, true}.has(1);
auto s = inputs.front();
if(s.broadcasted())
if(s.dynamic())
{
return s;
}
else if(s.broadcasted())
{
return {s.type(), s.lens()};
}
......@@ -72,8 +83,9 @@ struct prefix_scan_op : op_name<Derived>
}
}
argument compute(const shape& output_shape, std::vector<argument> args) const
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
shape output_shape(dyn_out.computed_shape);
argument result{output_shape};
auto s = args[0].get_shape();
if(s == output_shape)
......