Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
264a7647
Commit
264a7647
authored
Jul 26, 2023
by
Brian Pickrell
Browse files
Merge branch 'develop' into multinomial_parse_merge
parents
d99729f8
8e18544f
Changes
269
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
351 additions
and
112 deletions
+351
-112
src/include/migraphx/op/common.hpp
src/include/migraphx/op/common.hpp
+2
-2
src/include/migraphx/op/convert.hpp
src/include/migraphx/op/convert.hpp
+13
-1
src/include/migraphx/op/convolution.hpp
src/include/migraphx/op/convolution.hpp
+8
-21
src/include/migraphx/op/convolution_backwards.hpp
src/include/migraphx/op/convolution_backwards.hpp
+76
-29
src/include/migraphx/op/dimensions_of.hpp
src/include/migraphx/op/dimensions_of.hpp
+80
-0
src/include/migraphx/op/multibroadcast.hpp
src/include/migraphx/op/multibroadcast.hpp
+5
-5
src/include/migraphx/op/pooling.hpp
src/include/migraphx/op/pooling.hpp
+110
-16
src/include/migraphx/op/prefix_scan_op.hpp
src/include/migraphx/op/prefix_scan_op.hpp
+6
-0
src/include/migraphx/operation.hpp
src/include/migraphx/operation.hpp
+19
-15
src/include/migraphx/operators.hpp
src/include/migraphx/operators.hpp
+2
-1
src/include/migraphx/optimize_module.hpp
src/include/migraphx/optimize_module.hpp
+1
-1
src/include/migraphx/pad_calc.hpp
src/include/migraphx/pad_calc.hpp
+3
-0
src/include/migraphx/pass.hpp
src/include/migraphx/pass.hpp
+4
-4
src/include/migraphx/pass_manager.hpp
src/include/migraphx/pass_manager.hpp
+8
-6
src/include/migraphx/permutation.hpp
src/include/migraphx/permutation.hpp
+4
-4
src/include/migraphx/preallocate_param.hpp
src/include/migraphx/preallocate_param.hpp
+1
-1
src/include/migraphx/process.hpp
src/include/migraphx/process.hpp
+1
-1
src/include/migraphx/program.hpp
src/include/migraphx/program.hpp
+6
-3
src/include/migraphx/promote_literals.hpp
src/include/migraphx/promote_literals.hpp
+1
-1
src/include/migraphx/propagate_constant.hpp
src/include/migraphx/propagate_constant.hpp
+1
-1
No files found.
src/include/migraphx/op/common.hpp
View file @
264a7647
...
@@ -59,8 +59,8 @@ enum class rnn_direction
...
@@ -59,8 +59,8 @@ enum class rnn_direction
bidirectional
,
bidirectional
,
};
};
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
pooling_mode
v
);
MIGRAPHX_EXPORT
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
pooling_mode
v
);
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
rnn_direction
v
);
MIGRAPHX_EXPORT
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
rnn_direction
v
);
}
// namespace op
}
// namespace op
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/include/migraphx/op/convert.hpp
View file @
264a7647
...
@@ -66,7 +66,19 @@ struct convert : unary<convert>
...
@@ -66,7 +66,19 @@ struct convert : unary<convert>
auto
type
=
target_type
;
auto
type
=
target_type
;
return
[
type
](
auto
x
)
{
return
[
type
](
auto
x
)
{
auto
y
=
x
;
auto
y
=
x
;
shape
::
visit
(
type
,
[
&
](
auto
as
)
{
y
=
as
(
x
);
});
shape
::
visit
(
type
,
[
&
](
auto
as
)
{
// clamping value between target_type's max and min doesn't work for NaNs,
if
(
std
::
isnan
(
x
))
{
y
=
as
.
nan
();
}
else
{
// clamp overflowing/underflowing values to min()/max() instead of +/-infinity
// during downcasting
y
=
std
::
min
(
std
::
max
(
as
(
x
),
as
.
min
()),
as
.
max
());
}
});
return
y
;
return
y
;
};
};
}
}
...
...
src/include/migraphx/op/convolution.hpp
View file @
264a7647
...
@@ -79,17 +79,17 @@ struct convolution
...
@@ -79,17 +79,17 @@ struct convolution
check_shapes
{
inputs
,
*
this
,
true
}.
has
(
2
).
same_type
().
same_ndims
().
min_ndims
(
3
);
check_shapes
{
inputs
,
*
this
,
true
}.
has
(
2
).
same_type
().
same_ndims
().
min_ndims
(
3
);
check_attribute_size
();
check_attribute_size
();
// num of dims of input and attribute should match
// num of dims of input and attribute should match
const
auto
input_
size
=
inputs
[
0
].
max_lens
().
size
();
const
auto
input_
ndim
=
inputs
[
0
].
ndim
();
const
auto
padding_size
=
padding
.
size
();
const
auto
padding_size
=
padding
.
size
();
if
(
input_
size
!=
padding_size
/
2
+
2
&&
input_
size
!=
padding_size
+
2
)
if
(
input_
ndim
!=
padding_size
/
2
+
2
&&
input_
ndim
!=
padding_size
+
2
)
{
{
MIGRAPHX_THROW
(
"CONVOLUTION: input and attribute size mismatch!"
);
MIGRAPHX_THROW
(
"CONVOLUTION: input and attribute size mismatch!"
);
}
}
const
shape
&
x_shape
=
inputs
.
at
(
0
);
const
shape
&
x_shape
=
inputs
.
at
(
0
);
const
shape
&
w_shape
=
inputs
.
at
(
1
);
const
shape
&
w_shape
=
inputs
.
at
(
1
);
const
size_t
num_spatial_dims
=
input_
size
-
2
;
const
size_t
num_spatial_dims
=
input_
ndim
-
2
;
if
(
num_spatial_dims
!=
this
->
kdims
())
if
(
num_spatial_dims
!=
this
->
kdims
())
{
{
MIGRAPHX_THROW
(
"CONVOLUTION: input k-dims does not match attribute size"
);
MIGRAPHX_THROW
(
"CONVOLUTION: input k-dims does not match attribute size"
);
...
@@ -105,7 +105,7 @@ struct convolution
...
@@ -105,7 +105,7 @@ struct convolution
}
}
else
else
{
{
return
fixed
_compute_shape
(
x_shape
,
w_shape
);
return
static
_compute_shape
(
x_shape
,
w_shape
);
}
}
}
}
...
@@ -143,23 +143,10 @@ struct convolution
...
@@ -143,23 +143,10 @@ struct convolution
shape
dynamic_compute_shape
(
shape
x_shape
,
shape
w_shape
)
const
shape
dynamic_compute_shape
(
shape
x_shape
,
shape
w_shape
)
const
{
{
std
::
vector
<
shape
::
dynamic_dimension
>
output_dyn_dims
=
{};
std
::
vector
<
shape
::
dynamic_dimension
>
output_dyn_dims
=
{};
output_dyn_dims
.
push_back
(
x_shape
.
to_dynamic
().
dyn_dims
().
at
(
0
));
output_dyn_dims
.
push_back
(
w_shape
.
to_dynamic
().
dyn_dims
().
at
(
0
));
auto
dynamic_shape_push_back
=
[
&
](
const
shape
&
input_shape
)
{
const
size_t
num_spatial_dims
=
x_shape
.
ndim
()
-
2
;
if
(
input_shape
.
dynamic
())
{
output_dyn_dims
.
push_back
(
input_shape
.
dyn_dims
().
at
(
0
));
}
else
{
auto
l
=
input_shape
.
lens
().
at
(
0
);
output_dyn_dims
.
push_back
({
l
,
l
});
}
};
dynamic_shape_push_back
(
x_shape
);
dynamic_shape_push_back
(
w_shape
);
const
size_t
num_spatial_dims
=
x_shape
.
max_lens
().
size
()
-
2
;
if
(
padding_mode
!=
default_
)
if
(
padding_mode
!=
default_
)
{
{
for
(
std
::
size_t
i
=
0
;
i
<
num_spatial_dims
;
++
i
)
for
(
std
::
size_t
i
=
0
;
i
<
num_spatial_dims
;
++
i
)
...
@@ -198,7 +185,7 @@ struct convolution
...
@@ -198,7 +185,7 @@ struct convolution
return
shape
{
x_shape
.
type
(),
output_dyn_dims
};
return
shape
{
x_shape
.
type
(),
output_dyn_dims
};
}
}
shape
fixed
_compute_shape
(
shape
x_shape
,
shape
w_shape
)
const
shape
static
_compute_shape
(
shape
x_shape
,
shape
w_shape
)
const
{
{
std
::
vector
<
size_t
>
output_lens
{
x_shape
.
lens
()[
0
],
w_shape
.
lens
()[
0
]};
std
::
vector
<
size_t
>
output_lens
{
x_shape
.
lens
()[
0
],
w_shape
.
lens
()[
0
]};
auto
spatial_lens
=
calc_conv_lens
(
x_shape
.
lens
(),
w_shape
.
lens
());
auto
spatial_lens
=
calc_conv_lens
(
x_shape
.
lens
(),
w_shape
.
lens
());
...
...
src/include/migraphx/op/
de
convolution.hpp
→
src/include/migraphx/op/convolution
_backwards
.hpp
View file @
264a7647
...
@@ -21,9 +21,11 @@
...
@@ -21,9 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
* THE SOFTWARE.
*/
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_
DE
CONVOLUTION_HPP
#ifndef MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_
BACKWARDS_
HPP
#define MIGRAPHX_GUARD_OPERATORS_
DE
CONVOLUTION_HPP
#define MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_
BACKWARDS_
HPP
#include <cmath>
#include <utility>
#include <migraphx/op/common.hpp>
#include <migraphx/op/common.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/config.hpp>
...
@@ -31,14 +33,13 @@
...
@@ -31,14 +33,13 @@
#include <migraphx/argument.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/par_dfor.hpp>
#include <migraphx/par_dfor.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/shape_for_each.hpp>
#include <cmath>
#include <migraphx/dyn_output.hpp>
#include <utility>
namespace
migraphx
{
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
op
{
namespace
op
{
struct
de
convolution
struct
convolution
_backwards
{
{
std
::
vector
<
std
::
size_t
>
padding
=
{
0
,
0
};
std
::
vector
<
std
::
size_t
>
padding
=
{
0
,
0
};
std
::
vector
<
std
::
size_t
>
stride
=
{
1
,
1
};
std
::
vector
<
std
::
size_t
>
stride
=
{
1
,
1
};
...
@@ -57,45 +58,91 @@ struct deconvolution
...
@@ -57,45 +58,91 @@ struct deconvolution
f
(
self
.
group
,
"group"
));
f
(
self
.
group
,
"group"
));
}
}
std
::
string
name
()
const
{
return
"
de
convolution"
;
}
std
::
string
name
()
const
{
return
"convolution
_backwards
"
;
}
void
check_attribute_size
()
const
void
check_attribute_size
()
const
{
{
if
((
padding
.
size
()
!=
stride
.
size
()
and
(
padding
.
size
()
/
2
)
!=
stride
.
size
())
or
if
(
padding
.
size
()
!=
stride
.
size
()
or
stride
.
size
()
!=
dilation
.
size
())
stride
.
size
()
!=
dilation
.
size
())
{
{
MIGRAPHX_THROW
(
"
deconvolution
: inconsistent attribute sizes"
);
MIGRAPHX_THROW
(
"
CONVOLUTION_BACKWARDS
: inconsistent attribute sizes"
);
}
}
}
}
shape
compute_shape
(
std
::
vector
<
shape
>
inputs
)
const
shape
compute_shape
(
std
::
vector
<
shape
>
inputs
)
const
{
{
check_shapes
{
inputs
,
*
this
}.
has
(
2
).
same_type
().
same_ndims
().
min_ndims
(
3
);
check_shapes
{
inputs
,
*
this
,
true
}.
has
(
2
).
same_type
().
same_ndims
().
min_ndims
(
3
);
const
shape
&
input
=
inputs
.
at
(
0
);
const
shape
&
x_shape
=
inputs
.
at
(
0
);
const
shape
&
weights
=
inputs
.
at
(
1
);
const
shape
&
w_shape
=
inputs
.
at
(
1
);
size_t
kdims
=
input
.
lens
().
size
()
-
2
;
if
(
x_shape
.
ndim
()
-
2
!=
this
->
kdims
())
if
(
kdims
!=
this
->
kdims
())
{
{
MIGRAPHX_THROW
(
"
deconvolution
: input k-dims does not match attribute size"
);
MIGRAPHX_THROW
(
"
CONVOLUTION_BACKWARDS
: input k-dims does not match attribute size"
);
}
}
std
::
vector
<
size_t
>
output_lens
{
input
.
lens
()[
0
],
weights
.
lens
()[
1
]};
if
(
not
x_shape
.
dynamic
()
and
not
w_shape
.
dynamic
()
and
x_shape
.
lens
().
at
(
1
)
!=
(
w_shape
.
lens
().
at
(
0
)
*
group
))
{
MIGRAPHX_THROW
(
"CONVOLUTION_BACKWARDS: mismatched channel numbers"
);
}
for
(
size_t
i
=
0
;
i
<
kdims
;
i
++
)
if
(
x_shape
.
dynamic
()
or
w_shape
.
dynamic
()
)
{
{
output_lens
.
push_back
(
std
::
size_t
(
std
::
max
<
std
::
ptrdiff_t
>
(
return
dynamic_compute_shape
(
x_shape
,
w_shape
);
}
else
{
return
static_compute_shape
(
x_shape
,
w_shape
);
}
}
std
::
vector
<
std
::
size_t
>
calc_spatial_lens
(
std
::
vector
<
std
::
size_t
>
x_lens
,
std
::
vector
<
std
::
size_t
>
w_lens
)
const
{
std
::
vector
<
size_t
>
spatial_lens
(
x_lens
.
size
()
-
2
);
// stride * (input - 1) + output_padding + ((kernel - 1) * dilation + 1) - padding_L -
// padding_R. This assumes padding_L = padding_R and output_padding handled in parser.
for
(
size_t
i
=
0
;
i
<
spatial_lens
.
size
();
i
++
)
{
spatial_lens
.
at
(
i
)
=
(
std
::
size_t
(
std
::
max
<
std
::
ptrdiff_t
>
(
1
,
1
,
stride
[
i
]
*
(
input
.
lens
()[
i
+
2
]
-
1
)
+
stride
[
i
]
*
(
x_lens
[
i
+
2
]
-
1
)
+
((
w_lens
[
i
+
2
]
-
1
)
*
dilation
[
i
]
+
1
)
-
((
weights
.
lens
()[
i
+
2
]
-
1
)
*
dilation
[
i
]
+
1
)
-
2
*
padding
[
i
])));
2
*
padding
[
i
])));
}
}
return
inputs
[
0
].
with_lens
(
output_lens
);
return
spatial_lens
;
}
shape
dynamic_compute_shape
(
shape
x_shape
,
shape
w_shape
)
const
{
std
::
vector
<
shape
::
dynamic_dimension
>
output_dyn_dims
=
{};
output_dyn_dims
.
push_back
(
x_shape
.
to_dynamic
().
dyn_dims
().
at
(
0
));
output_dyn_dims
.
push_back
(
w_shape
.
to_dynamic
().
dyn_dims
().
at
(
1
));
const
std
::
size_t
num_spatial_dims
=
x_shape
.
ndim
()
-
2
;
// Does not compute for optimals
auto
min_spatial_dims
=
calc_spatial_lens
(
x_shape
.
min_lens
(),
w_shape
.
min_lens
());
auto
max_spatial_dims
=
calc_spatial_lens
(
x_shape
.
max_lens
(),
w_shape
.
max_lens
());
for
(
size_t
i
=
0
;
i
<
num_spatial_dims
;
++
i
)
{
output_dyn_dims
.
push_back
(
shape
::
dynamic_dimension
{
min_spatial_dims
[
i
],
max_spatial_dims
[
i
],
{}});
}
return
shape
{
x_shape
.
type
(),
output_dyn_dims
};
}
shape
static_compute_shape
(
shape
x_shape
,
shape
w_shape
)
const
{
std
::
vector
<
size_t
>
output_lens
{
x_shape
.
lens
()[
0
],
w_shape
.
lens
()[
1
]};
auto
spatial_lens
=
calc_spatial_lens
(
x_shape
.
lens
(),
w_shape
.
lens
());
std
::
for_each
(
spatial_lens
.
begin
(),
spatial_lens
.
end
(),
[
&
output_lens
](
auto
x
)
{
output_lens
.
push_back
(
x
);
});
return
x_shape
.
with_lens
(
output_lens
);
}
}
argument
compute
(
shape
output_shape
,
std
::
vector
<
argument
>
args
)
const
argument
compute
(
const
dyn_output
&
dyn_out
,
std
::
vector
<
argument
>
args
)
const
{
{
argument
result
{
out
put_shape
};
argument
result
{
dyn_out
.
com
put
ed
_shape
};
auto
k
dims
=
this
->
kdims
();
auto
num_spatial_
dims
=
this
->
kdims
();
visit_all
(
result
,
args
[
0
],
args
[
1
])([
&
](
auto
output
,
auto
input
,
auto
weights
)
{
visit_all
(
result
,
args
[
0
],
args
[
1
])([
&
](
auto
output
,
auto
input
,
auto
weights
)
{
using
type
=
typename
decltype
(
output
)
::
value_type
;
using
type
=
typename
decltype
(
output
)
::
value_type
;
...
@@ -109,22 +156,22 @@ struct deconvolution
...
@@ -109,22 +156,22 @@ struct deconvolution
auto
wei_n
=
wei
[
0
];
auto
wei_n
=
wei
[
0
];
auto
wei_c
=
wei
[
1
];
auto
wei_c
=
wei
[
1
];
auto
out_lens
=
out
put_shape
.
lens
();
auto
out_lens
=
dyn_out
.
com
put
ed
_shape
.
lens
();
std
::
vector
<
std
::
size_t
>
win_size
{
in_c
};
std
::
vector
<
std
::
size_t
>
win_size
{
in_c
};
std
::
copy
(
in_lens
.
begin
()
+
2
,
in_lens
.
end
(),
std
::
back_inserter
(
win_size
));
std
::
copy
(
in_lens
.
begin
()
+
2
,
in_lens
.
end
(),
std
::
back_inserter
(
win_size
));
std
::
copy
(
wei
.
begin
()
+
2
,
wei
.
end
(),
std
::
back_inserter
(
win_size
));
std
::
copy
(
wei
.
begin
()
+
2
,
wei
.
end
(),
std
::
back_inserter
(
win_size
));
shape
win_shape
{
out
put_shape
.
type
(),
win_size
};
shape
win_shape
{
dyn_out
.
com
put
ed
_shape
.
type
(),
win_size
};
par_dfor
(
in_n
,
wei_c
)([
&
](
int
o
,
int
k
)
{
par_dfor
(
in_n
,
wei_c
)([
&
](
int
o
,
int
k
)
{
shape_for_each
(
win_shape
,
[
&
](
auto
idx_win
)
{
shape_for_each
(
win_shape
,
[
&
](
auto
idx_win
)
{
const
int
w
=
idx_win
[
0
];
const
int
w
=
idx_win
[
0
];
auto
input_dims_start
=
idx_win
.
begin
()
+
1
;
auto
input_dims_start
=
idx_win
.
begin
()
+
1
;
auto
wei_dims_start
=
idx_win
.
begin
()
+
k
dims
+
1
;
auto
wei_dims_start
=
idx_win
.
begin
()
+
num_spatial_
dims
+
1
;
std
::
vector
<
std
::
ptrdiff_t
>
win_start
;
std
::
vector
<
std
::
ptrdiff_t
>
win_start
;
for
(
std
::
size_t
n
=
0
;
n
<
k
dims
;
++
n
)
for
(
std
::
size_t
n
=
0
;
n
<
num_spatial_
dims
;
++
n
)
{
{
win_start
.
push_back
(
std
::
ptrdiff_t
(
*
(
input_dims_start
+
n
)
*
stride
[
n
])
-
win_start
.
push_back
(
std
::
ptrdiff_t
(
*
(
input_dims_start
+
n
)
*
stride
[
n
])
-
std
::
ptrdiff_t
(
padding
[
n
]));
std
::
ptrdiff_t
(
padding
[
n
]));
...
@@ -135,7 +182,7 @@ struct deconvolution
...
@@ -135,7 +182,7 @@ struct deconvolution
std
::
vector
<
std
::
ptrdiff_t
>
idx_out
{
o
,
in_ch
};
std
::
vector
<
std
::
ptrdiff_t
>
idx_out
{
o
,
in_ch
};
for
(
size_t
n
=
0
;
n
<
k
dims
;
n
++
)
for
(
size_t
n
=
0
;
n
<
num_spatial_
dims
;
n
++
)
{
{
idx_out
.
push_back
(
win_start
[
n
]
+
*
(
wei_dims_start
+
n
)
*
dilation
[
n
]);
idx_out
.
push_back
(
win_start
[
n
]
+
*
(
wei_dims_start
+
n
)
*
dilation
[
n
]);
}
}
...
...
src/include/migraphx/op/dimensions_of.hpp
0 → 100644
View file @
264a7647
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_DIMENSIONS_OF_HPP
#define MIGRAPHX_GUARD_OPERATORS_DIMENSIONS_OF_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dyn_output.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
op
{
/**
* Returns the dimensions of the input argument from starting axis to ending axis.
* Atleast `end` must be set to use this operator (set `end` to ndim for default ONNX behavior of
* `Shape` operator) This should only be used for dynamic shapes as this can be simplified to a
* literal for static shapes.
*/
struct
dimensions_of
{
std
::
size_t
start
=
0
;
std
::
size_t
end
=
0
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
pack
(
f
(
self
.
start
,
"start"
),
f
(
self
.
end
,
"end"
));
}
std
::
string
name
()
const
{
return
"dimensions_of"
;
}
shape
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
check_shapes
{
inputs
,
*
this
,
true
}.
has
(
1
);
if
(
start
>=
end
)
{
MIGRAPHX_THROW
(
"DIMENSIONS_OF: start >= end. start = "
+
std
::
to_string
(
start
)
+
", end = "
+
std
::
to_string
(
end
));
}
return
shape
{
shape
::
int64_type
,
{
end
-
start
}};
}
argument
compute
(
const
shape
&
output_shape
,
std
::
vector
<
argument
>
args
)
const
{
argument
result
{
output_shape
};
auto
input_lens
=
args
[
0
].
get_shape
().
lens
();
result
.
visit
([
&
](
auto
output
)
{
std
::
copy
(
input_lens
.
cbegin
()
+
start
,
input_lens
.
cbegin
()
+
end
,
output
.
begin
());
});
return
result
;
}
};
}
// namespace op
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif
src/include/migraphx/op/multibroadcast.hpp
View file @
264a7647
...
@@ -69,7 +69,7 @@ struct multibroadcast
...
@@ -69,7 +69,7 @@ struct multibroadcast
auto
make_bcast_strides
=
[
&
](
std
::
vector
<
std
::
size_t
>
bcast_lens
,
std
::
size_t
offset
)
{
auto
make_bcast_strides
=
[
&
](
std
::
vector
<
std
::
size_t
>
bcast_lens
,
std
::
size_t
offset
)
{
std
::
vector
<
size_t
>
bcast_strides
(
bcast_lens
.
size
(),
0
);
std
::
vector
<
size_t
>
bcast_strides
(
bcast_lens
.
size
(),
0
);
for
(
std
::
ptrdiff_t
i
=
s0
.
lens
().
size
()
-
1
;
i
>=
0
;
i
--
)
for
(
std
::
ptrdiff_t
i
=
s0
.
ndim
()
-
1
;
i
>=
0
;
i
--
)
{
{
if
(
bcast_lens
[
i
+
offset
]
==
s0
.
lens
()[
i
])
if
(
bcast_lens
[
i
+
offset
]
==
s0
.
lens
()[
i
])
{
{
...
@@ -84,13 +84,13 @@ struct multibroadcast
...
@@ -84,13 +84,13 @@ struct multibroadcast
if
(
s0
.
dynamic
())
if
(
s0
.
dynamic
())
MIGRAPHX_THROW
(
MIGRAPHX_THROW
(
"MULTIBROADCAST: Single dynamic input shape not supported. Use two inputs."
);
"MULTIBROADCAST: Single dynamic input shape not supported. Use two inputs."
);
if
(
s0
.
lens
().
size
()
>
output_lens
.
size
())
if
(
s0
.
ndim
()
>
output_lens
.
size
())
{
{
MIGRAPHX_THROW
(
"MULTIBROADCAST: input dimensions should <= output size"
);
MIGRAPHX_THROW
(
"MULTIBROADCAST: input dimensions should <= output size"
);
}
}
auto
offset
=
output_lens
.
size
()
-
s0
.
lens
().
size
();
auto
offset
=
output_lens
.
size
()
-
s0
.
ndim
();
for
(
std
::
ptrdiff_t
i
=
s0
.
lens
().
size
()
-
1
;
i
>=
0
;
i
--
)
for
(
std
::
ptrdiff_t
i
=
s0
.
ndim
()
-
1
;
i
>=
0
;
i
--
)
{
{
if
(
output_lens
[
i
+
offset
]
!=
s0
.
lens
()[
i
]
and
s0
.
lens
()[
i
]
!=
1
)
if
(
output_lens
[
i
+
offset
]
!=
s0
.
lens
()[
i
]
and
s0
.
lens
()[
i
]
!=
1
)
{
{
...
@@ -119,7 +119,7 @@ struct multibroadcast
...
@@ -119,7 +119,7 @@ struct multibroadcast
{
{
// output_lens will not be set for 2+ input version
// output_lens will not be set for 2+ input version
auto
bcast_lens
=
compute_common_lens
(
inputs
);
auto
bcast_lens
=
compute_common_lens
(
inputs
);
auto
offset
=
bcast_lens
.
size
()
-
s0
.
lens
().
size
();
auto
offset
=
bcast_lens
.
size
()
-
s0
.
ndim
();
auto
bcast_strides
=
make_bcast_strides
(
bcast_lens
,
offset
);
auto
bcast_strides
=
make_bcast_strides
(
bcast_lens
,
offset
);
return
{
t
,
std
::
move
(
bcast_lens
),
std
::
move
(
bcast_strides
)};
return
{
t
,
std
::
move
(
bcast_lens
),
std
::
move
(
bcast_strides
)};
}
}
...
...
src/include/migraphx/op/pooling.hpp
View file @
264a7647
/*
/*
* The MIT License (MIT)
* The MIT License (MIT)
*
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* of this software and associated documentation files (the "Software"), to deal
...
@@ -42,16 +42,43 @@ namespace op {
...
@@ -42,16 +42,43 @@ namespace op {
struct
pooling
struct
pooling
{
{
pooling_mode
mode
=
{
pooling_mode
::
average
};
pooling_mode
mode
=
{
pooling_mode
::
average
};
// Padding along each spatial input dimension
// Can be ndim or 2*ndim values where ndim is size of lengths
// ndim values means pad the same before and after each dimension
// 2*ndim values contains n pre and then n post padding values
std
::
vector
<
std
::
size_t
>
padding
=
{
0
,
0
};
std
::
vector
<
std
::
size_t
>
padding
=
{
0
,
0
};
std
::
vector
<
std
::
size_t
>
stride
=
{
1
,
1
};
// Size of stride to take from one placement of the pooling kernel to the next.
// This is distinct from the strides used by the shape class. Must be the same
// ndim as lengths.
std
::
vector
<
std
::
size_t
>
stride
=
{
1
,
1
};
// Spatial dimensions of the pooling kernel or window,
// 2 smaller than the input tensor rank (NCHW layout)
std
::
vector
<
std
::
size_t
>
lengths
=
{
1
,
1
};
std
::
vector
<
std
::
size_t
>
lengths
=
{
1
,
1
};
bool
ceil_mode
=
false
;
int
lp_order
=
2
;
// Dilations are not supported at this time.
// ceiling mode is a flag affecting output size
// or equivalently, placements of the pooling kernel.
// When true, round the size upwards, possibly
// including partial placements where the kernel extends beyond the edge
// of input and even padding. When false, round down so that all
// kernel placements fit but some input values may be dropped.
bool
ceil_mode
=
false
;
int
lp_order
=
2
;
// Global pooling with dynamic shape input
// Global pooling with dynamic shape input
bool
dyn_global
=
false
;
bool
dyn_global
=
false
;
// an attribute of the Onnx pooling operator, not currently enabled here because MIOpen can't
// support it. We currently implement padding for average pooling by inserting a Padding
// operator during Onnx parsing. But to support dynamic shape inputs and count_include_pad
// together, it would be necessary to do this calculation at runtime in MIOpen.
bool
count_include_pad
=
false
;
template
<
class
Self
,
class
F
>
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
static
auto
reflect
(
Self
&
self
,
F
f
)
{
{
...
@@ -68,11 +95,29 @@ struct pooling
...
@@ -68,11 +95,29 @@ struct pooling
void
check_attribute_size
()
const
void
check_attribute_size
()
const
{
{
if
((
padding
.
size
()
!=
stride
.
size
()
and
(
padding
.
size
()
/
2
)
!=
stride
.
size
())
or
if
(
dyn_global
)
(
not
dyn_global
and
stride
.
size
()
!=
lengths
.
size
()))
return
;
if
((
padding
.
size
()
!=
stride
.
size
()
and
(
padding
.
size
())
!=
stride
.
size
()
*
2
)
or
stride
.
size
()
!=
lengths
.
size
())
{
{
MIGRAPHX_THROW
(
"POOLING: inconsistent attribute sizes"
);
MIGRAPHX_THROW
(
"POOLING: inconsistent attribute sizes"
);
}
}
if
(
std
::
any_of
(
lengths
.
begin
(),
lengths
.
end
(),
[
&
](
auto
i
)
{
return
(
i
==
0
);
})
or
std
::
any_of
(
stride
.
begin
(),
stride
.
end
(),
[
&
](
auto
i
)
{
return
(
i
==
0
);
}))
{
MIGRAPHX_THROW
(
"POOLING: size 0 pooling kernel or stride"
);
}
// TODO: update lowering to run the reference
// code when OneDNN can't execute pooling for a CPU
// OneDNN has a limitation on padding size for pooling. see
// https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html#doxid-dev-guide-convolution
// padding = {2}; stride = {1}; lengths = {3} succeeds in oneDNN but
// padding = {2}; stride = {1}; lengths = {2} fails.
// Also, the referenced documentation contains a max. dimension size of 14 for the kernel
// ("weights tensor") that MIGraphX doesn't enforce.
}
}
size_t
kdims
()
const
size_t
kdims
()
const
...
@@ -112,7 +157,11 @@ struct pooling
...
@@ -112,7 +157,11 @@ struct pooling
const
shape
&
input
=
inputs
.
at
(
0
);
const
shape
&
input
=
inputs
.
at
(
0
);
auto
padding_size
=
padding
.
size
();
auto
padding_size
=
padding
.
size
();
size_t
kdims
=
input
.
ndim
()
-
2
;
size_t
kdims
=
input
.
ndim
()
-
2
;
if
(
input
.
ndim
()
!=
padding_size
/
2
+
2
and
input
.
ndim
()
!=
padding_size
+
2
)
if
(
input
.
ndim
()
<
3
)
{
MIGRAPHX_THROW
(
"POOLING: input must have 3 or more dimensions and be nonempty"
);
}
if
(
input
.
ndim
()
*
2
!=
padding_size
+
4
and
input
.
ndim
()
!=
padding_size
+
2
)
{
{
MIGRAPHX_THROW
(
"POOLING: input and attribute size mismatch!"
);
MIGRAPHX_THROW
(
"POOLING: input and attribute size mismatch!"
);
}
}
...
@@ -132,7 +181,7 @@ struct pooling
...
@@ -132,7 +181,7 @@ struct pooling
}
}
else
else
{
{
// does not compute
for
optimals
// does not compute optimals
auto
min_spatial_dims
=
calc_spatial_dim_out
(
input
.
min_lens
(),
kdims
);
auto
min_spatial_dims
=
calc_spatial_dim_out
(
input
.
min_lens
(),
kdims
);
auto
max_spatial_dims
=
calc_spatial_dim_out
(
input
.
max_lens
(),
kdims
);
auto
max_spatial_dims
=
calc_spatial_dim_out
(
input
.
max_lens
(),
kdims
);
for
(
size_t
i
=
0
;
i
<
kdims
;
++
i
)
for
(
size_t
i
=
0
;
i
<
kdims
;
++
i
)
...
@@ -149,7 +198,7 @@ struct pooling
...
@@ -149,7 +198,7 @@ struct pooling
std
::
vector
<
std
::
size_t
>
output_lens
(
input_lens
.
begin
(),
input_lens
.
begin
()
+
2
);
std
::
vector
<
std
::
size_t
>
output_lens
(
input_lens
.
begin
(),
input_lens
.
begin
()
+
2
);
// Used for when normalize_compute_shape() is called again at model eval time
// Used for when normalize_compute_shape() is called again at model eval time
// for an originally dynamic shape.
Since k
ernel shape is not used with dyn_global.
// for an originally dynamic shape.
K
ernel shape is not used with dyn_global.
if
(
dyn_global
)
if
(
dyn_global
)
{
{
for
(
size_t
i
=
0
;
i
<
kdims
;
++
i
)
for
(
size_t
i
=
0
;
i
<
kdims
;
++
i
)
...
@@ -184,7 +233,7 @@ struct pooling
...
@@ -184,7 +233,7 @@ struct pooling
double
operator
()(
double
x
,
double
y
)
const
{
return
x
+
std
::
pow
(
std
::
abs
(
y
),
p
);
}
double
operator
()(
double
x
,
double
y
)
const
{
return
x
+
std
::
pow
(
std
::
abs
(
y
),
p
);
}
double
final
(
double
x
,
std
::
size_t
)
const
{
return
std
::
pow
(
x
,
1.
/
p
);
}
double
final
(
double
x
,
std
::
size_t
)
const
{
return
(
p
==
0
)
?
1
:
std
::
pow
(
x
,
1.
/
p
);
}
};
};
struct
avg_pool
struct
avg_pool
...
@@ -222,37 +271,82 @@ struct pooling
...
@@ -222,37 +271,82 @@ struct pooling
{
{
auto
in_s
=
input
.
get_shape
();
auto
in_s
=
input
.
get_shape
();
auto
in_lens
=
in_s
.
lens
();
auto
in_lens
=
in_s
.
lens
();
// For each element of output; i.e., for each placement of pooling kernel...
par_for
(
output_shape
.
elements
(),
[
&
](
auto
i
)
{
par_for
(
output_shape
.
elements
(),
[
&
](
auto
i
)
{
auto
idx_o
=
output_shape
.
multi
(
i
);
auto
idx_o
=
output_shape
.
multi
(
i
);
auto
n_dim
=
idx_o
.
size
();
auto
n_dim
=
idx_o
.
size
();
std
::
vector
<
std
::
size_t
>
win_start
;
// starting offset of the pooling window
std
::
vector
<
int
>
win_start
;
std
::
vector
<
std
::
size_t
>
win_size
;
std
::
vector
<
std
::
size_t
>
win_size
;
// For each spatial dimension, find starting and ending index of pooling kernel
for
(
std
::
size_t
dim
=
2
;
dim
<
n_dim
;
++
dim
)
for
(
std
::
size_t
dim
=
2
;
dim
<
n_dim
;
++
dim
)
{
{
auto
d_2
=
dim
-
2
;
auto
d_2
=
dim
-
2
;
int
start
=
int
start
=
static_cast
<
int
>
(
idx_o
[
dim
]
*
stride
[
d_2
])
-
static_cast
<
int
>
(
padding
[
d_2
]);
static_cast
<
int
>
(
idx_o
[
dim
]
*
stride
[
d_2
])
-
static_cast
<
int
>
(
padding
[
d_2
]);
int
end
=
std
::
min
(
start
+
kernel_dims
[
d_2
],
in_lens
[
dim
]);
int
end
;
start
=
std
::
max
(
start
,
0
);
// NOLINT
if
(
count_include_pad
and
ceil_mode
and
(
mode
!=
pooling_mode
::
max
))
{
// TODO: this block can't execute until we enable count_include_pad
// Even when using padding, if in ceil_mode a window
// could extend beyond the end of both input and
// padding. Clip out-of-bounds indexes but not padding.
// Check if this kernel extends beyond the padding at end of dimension
end
=
std
::
min
(
start
+
kernel_dims
[
d_2
],
in_lens
[
dim
]
+
static_cast
<
int
>
(
padding
[
d_2
]));
}
else
{
// In non-ceiling mode, when
// count_include_pad is false, or for max pooling, clip off padding.
end
=
std
::
min
(
start
+
kernel_dims
[
d_2
],
in_lens
[
dim
]);
start
=
std
::
max
(
start
,
0
);
}
win_start
.
push_back
(
start
);
win_start
.
push_back
(
start
);
if
(
end
<
start
)
{
// This error can be caused by misc. bad input combinations
MIGRAPHX_THROW
(
"POOLING: invalid attributes"
);
}
win_size
.
push_back
(
end
-
start
);
win_size
.
push_back
(
end
-
start
);
}
}
shape
win_shape
{
output_shape
.
type
(),
win_size
};
shape
win_shape
{
output_shape
.
type
(),
win_size
};
auto
pool_size
=
win_shape
.
elements
();
auto
pool_size
=
win_shape
.
elements
();
double
output_val
=
op
.
template
init
<
Type
>();
double
output_val
=
op
.
template
init
<
Type
>();
// for each element in the window...
shape_for_each
(
win_shape
,
[
&
](
auto
idx_w
)
{
shape_for_each
(
win_shape
,
[
&
](
auto
idx_w
)
{
// the coordinates of this element
auto
idx
=
idx_o
;
auto
idx
=
idx_o
;
// Add the kernel location idx_w and the offset win_start, for each dimension.
// Negative results are cast to very large unsigned integers.
std
::
transform
(
idx_w
.
begin
(),
std
::
transform
(
idx_w
.
begin
(),
idx_w
.
end
(),
idx_w
.
end
(),
win_start
.
begin
(),
win_start
.
begin
(),
idx
.
begin
()
+
2
,
idx
.
begin
()
+
2
,
[](
auto
ii
,
auto
jj
)
{
return
ii
+
jj
;
});
[](
auto
ii
,
auto
jj
)
{
return
ii
+
jj
;
});
if
(
std
::
all_of
(
idx
.
begin
()
+
2
,
idx
.
end
(),
[
&
](
auto
ii
)
{
return
ii
>=
0
;
})
and
// Check if any of coordinates are out of input tensor's range
idx
<
in_lens
)
if
(
std
::
mismatch
(
idx
.
begin
()
+
2
,
idx
.
end
(),
in_lens
.
begin
()
+
2
,
in_lens
.
end
(),
std
::
less
<>
{})
==
std
::
make_pair
(
idx
.
end
(),
in_lens
.
end
()))
{
{
output_val
=
op
(
output_val
,
input
[
in_s
.
index
(
idx
)]);
output_val
=
op
(
output_val
,
input
[
in_s
.
index
(
idx
)]);
}
}
else
{
// this is a padding element. Padding locations
// don't contribute to average or max pooling total but can play in
// lpnorm pooling.
output_val
=
op
(
output_val
,
0
);
}
});
});
output
[
i
]
=
Type
(
op
.
final
(
output_val
,
pool_size
));
output
[
i
]
=
Type
(
op
.
final
(
output_val
,
pool_size
));
});
});
...
...
src/include/migraphx/op/prefix_scan_op.hpp
View file @
264a7647
...
@@ -44,6 +44,12 @@ namespace migraphx {
...
@@ -44,6 +44,12 @@ namespace migraphx {
inline
namespace
MIGRAPHX_INLINE_NS
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
op
{
namespace
op
{
/**
* Parent struct for prefix scan operations. A prefix scan is equivalent to the C++
* std::exclusive_scan or std::inclusive_scan. Given a list of numbers, a prefix scan
* sum op returns an equal size list of running totals of the values. Other operations
* besides addition can be supported by their own child ops.
*/
template
<
class
Derived
>
template
<
class
Derived
>
struct
prefix_scan_op
:
op_name
<
Derived
>
struct
prefix_scan_op
:
op_name
<
Derived
>
{
{
...
...
src/include/migraphx/operation.hpp
View file @
264a7647
...
@@ -143,7 +143,7 @@ auto compute_shape_op(rank<2>, const T& x, const std::vector<shape>& inputs)
...
@@ -143,7 +143,7 @@ auto compute_shape_op(rank<2>, const T& x, const std::vector<shape>& inputs)
if
(
inputs
.
empty
())
if
(
inputs
.
empty
())
MIGRAPHX_THROW
(
"At least one input is required for "
+
x
.
name
());
MIGRAPHX_THROW
(
"At least one input is required for "
+
x
.
name
());
dependent_type
<
operation
,
T
>
y
=
x
;
dependent_type
<
operation
,
T
>
y
=
x
;
normalize_attributes
(
y
,
inputs
[
0
]
.
max_lens
()
);
normalize_attributes
(
y
,
inputs
[
0
]);
return
any_cast
<
T
>
(
y
).
normalize_compute_shape
(
inputs
);
return
any_cast
<
T
>
(
y
).
normalize_compute_shape
(
inputs
);
}
}
...
@@ -251,9 +251,10 @@ auto compute_op(rank<1>,
...
@@ -251,9 +251,10 @@ auto compute_op(rank<1>,
const
shape
&
output
,
const
shape
&
output
,
const
std
::
vector
<
argument
>&
inputs
,
const
std
::
vector
<
argument
>&
inputs
,
const
std
::
vector
<
module_ref
>&
module_args
,
const
std
::
vector
<
module_ref
>&
module_args
,
F
f
)
F
f
)
->
decltype
(
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
->
decltype
(
inputs
,
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
inputs
,
module_args
,
f
))
module_args
,
f
))
{
{
return
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
inputs
,
module_args
,
f
);
return
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
inputs
,
module_args
,
f
);
}
}
...
@@ -261,11 +262,13 @@ auto compute_op(rank<1>,
...
@@ -261,11 +262,13 @@ auto compute_op(rank<1>,
/// Lowest-ranked fallback overload of compute_op. Selected only when no
/// higher-ranked overload (one whose operation actually implements a
/// matching compute()) is viable. If there are no module arguments it
/// simply forwards to the plain three-argument compute_op; otherwise the
/// operation cannot be computed here and an error is raised.
template <class T, class F>
argument compute_op(rank<0>,
                    const T& x,
                    const shape& output,
                    const std::vector<argument>& inputs,
                    const std::vector<module_ref>& module_args,
                    F)
{
    if(module_args.empty())
        return compute_op(x, output, inputs);
    // Reaching this point with module arguments means no suitable
    // compute() overload exists for this operation.
    const std::string op_name = x.name();
    MIGRAPHX_THROW("Not computable: " + op_name);
}
...
@@ -307,9 +310,10 @@ auto compute_op(rank<3>,
...
@@ -307,9 +310,10 @@ auto compute_op(rank<3>,
const
shape
&
output
,
const
shape
&
output
,
const
std
::
vector
<
argument
>&
inputs
,
const
std
::
vector
<
argument
>&
inputs
,
const
std
::
vector
<
module_ref
>&
module_args
,
const
std
::
vector
<
module_ref
>&
module_args
,
F
f
)
F
f
)
->
decltype
(
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
->
decltype
(
inputs
,
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
inputs
,
module_args
,
f
))
module_args
,
f
))
{
{
return
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
inputs
,
module_args
,
f
);
return
x
.
compute
(
make_compute_output_shape
(
pack
(
x
,
output
,
inputs
)),
inputs
,
module_args
,
f
);
}
}
...
@@ -497,7 +501,7 @@ lifetime get_lifetime_op(const T&)
...
@@ -497,7 +501,7 @@ lifetime get_lifetime_op(const T&)
#ifdef TYPE_ERASED_DECLARATION
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
// Type-erased interface for:
struct
operation
struct
MIGRAPHX_EXPORT
operation
{
{
//
//
std
::
string
name
()
const
;
std
::
string
name
()
const
;
...
@@ -571,7 +575,7 @@ struct operation
...
@@ -571,7 +575,7 @@ struct operation
{
{
using
std
::
swap
;
using
std
::
swap
;
auto
*
derived
=
this
->
any_cast
<
PrivateDetailTypeErasedT
>
();
auto
*
derived
=
this
->
any_cast
<
PrivateDetailTypeErasedT
>
();
if
(
derived
and
private_detail_te_handle_mem_var
.
u
nique
()
)
if
(
derived
and
private_detail_te_handle_mem_var
.
u
se_count
()
==
1
)
{
{
*
derived
=
std
::
forward
<
PrivateDetailTypeErasedT
>
(
value
);
*
derived
=
std
::
forward
<
PrivateDetailTypeErasedT
>
(
value
);
}
}
...
@@ -1261,7 +1265,7 @@ struct operation
...
@@ -1261,7 +1265,7 @@ struct operation
// Returns a mutable reference to the type-erased implementation.
// Copy-on-write: when the handle is co-owned (use_count() > 1) it is
// cloned first, so mutations through this reference cannot be observed
// via other copies of the wrapper.
private_detail_te_handle_base_type& private_detail_te_get_handle()
{
    assert(private_detail_te_handle_mem_var != nullptr);
    const bool co_owned = private_detail_te_handle_mem_var.use_count() > 1;
    if(co_owned)
        private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
    return *private_detail_te_handle_mem_var;
}
...
@@ -1388,8 +1392,8 @@ bool has_finalize(const T& x)
...
@@ -1388,8 +1392,8 @@ bool has_finalize(const T& x)
return
detail
::
has_finalize_op
(
x
);
return
detail
::
has_finalize_op
(
x
);
}
}
void
migraphx_to_value
(
value
&
v
,
const
operation
&
op
);
MIGRAPHX_EXPORT
void
migraphx_to_value
(
value
&
v
,
const
operation
&
op
);
void
migraphx_from_value
(
const
value
&
v
,
operation
&
op
);
MIGRAPHX_EXPORT
void
migraphx_from_value
(
const
value
&
v
,
operation
&
op
);
#endif
#endif
...
...
src/include/migraphx/operators.hpp
View file @
264a7647
...
@@ -45,9 +45,10 @@
...
@@ -45,9 +45,10 @@
#include <migraphx/op/contiguous.hpp>
#include <migraphx/op/contiguous.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/convolution_backwards.hpp>
#include <migraphx/op/cosh.hpp>
#include <migraphx/op/cosh.hpp>
#include <migraphx/op/cos.hpp>
#include <migraphx/op/cos.hpp>
#include <migraphx/op/d
econvolution
.hpp>
#include <migraphx/op/d
imensions_of
.hpp>
#include <migraphx/op/div.hpp>
#include <migraphx/op/div.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/elu.hpp>
...
...
src/include/migraphx/optimize_module.hpp
View file @
264a7647
...
@@ -36,7 +36,7 @@ struct module_pass_manager;
...
@@ -36,7 +36,7 @@ struct module_pass_manager;
/**
/**
* Runs several passes in a loop
* Runs several passes in a loop
*/
*/
struct
optimize_module
struct
MIGRAPHX_EXPORT
optimize_module
{
{
std
::
string
name
()
const
{
return
"optimize_module"
;
}
std
::
string
name
()
const
{
return
"optimize_module"
;
}
void
apply
(
module_pass_manager
&
mpm
)
const
;
void
apply
(
module_pass_manager
&
mpm
)
const
;
...
...
src/include/migraphx/pad_calc.hpp
View file @
264a7647
...
@@ -32,6 +32,7 @@
...
@@ -32,6 +32,7 @@
namespace
migraphx
{
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
inline
namespace
MIGRAPHX_INLINE_NS
{
MIGRAPHX_EXPORT
void
calculate_padding
(
int64_t
idx
,
void
calculate_padding
(
int64_t
idx
,
std
::
vector
<
int64_t
>&
pads
,
std
::
vector
<
int64_t
>&
pads
,
int64_t
input_dim
,
int64_t
input_dim
,
...
@@ -45,6 +46,7 @@ void calculate_padding(int64_t idx,
...
@@ -45,6 +46,7 @@ void calculate_padding(int64_t idx,
* where the padding calculation must be done at evaluation time.
* where the padding calculation must be done at evaluation time.
* \return padding in the form of {x0_begin, x1_begin, ... x0_end , x1_end, ...}
* \return padding in the form of {x0_begin, x1_begin, ... x0_end , x1_end, ...}
*/
*/
MIGRAPHX_EXPORT
std
::
vector
<
std
::
size_t
>
calc_dyn_auto_pad
(
const
std
::
vector
<
std
::
size_t
>&
input_lens
,
std
::
vector
<
std
::
size_t
>
calc_dyn_auto_pad
(
const
std
::
vector
<
std
::
size_t
>&
input_lens
,
const
std
::
vector
<
std
::
size_t
>&
wei_lens
,
const
std
::
vector
<
std
::
size_t
>&
wei_lens
,
const
std
::
vector
<
std
::
size_t
>&
strides
,
const
std
::
vector
<
std
::
size_t
>&
strides
,
...
@@ -53,6 +55,7 @@ std::vector<std::size_t> calc_dyn_auto_pad(const std::vector<std::size_t>& input
...
@@ -53,6 +55,7 @@ std::vector<std::size_t> calc_dyn_auto_pad(const std::vector<std::size_t>& input
// Used for dynamic auto padding of convolution operators since padding needs to be computed at
// Used for dynamic auto padding of convolution operators since padding needs to be computed at
// evaluation time.
// evaluation time.
MIGRAPHX_EXPORT
shape
compute_padded_shape
(
const
shape
&
input
,
shape
compute_padded_shape
(
const
shape
&
input
,
const
shape
&
weights
,
const
shape
&
weights
,
const
std
::
vector
<
std
::
size_t
>&
padding
,
const
std
::
vector
<
std
::
size_t
>&
padding
,
...
...
src/include/migraphx/pass.hpp
View file @
264a7647
...
@@ -57,7 +57,7 @@ struct pass
...
@@ -57,7 +57,7 @@ struct pass
#else
#else
module
&
get_module
(
module_pass_manager
&
mpm
);
MIGRAPHX_EXPORT
module
&
get_module
(
module_pass_manager
&
mpm
);
namespace
detail
{
namespace
detail
{
...
@@ -84,7 +84,7 @@ void module_pass_manager_apply(const T& x, module_pass_manager& mpm)
...
@@ -84,7 +84,7 @@ void module_pass_manager_apply(const T& x, module_pass_manager& mpm)
#ifdef TYPE_ERASED_DECLARATION
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
// Type-erased interface for:
struct
pass
struct
MIGRAPHX_EXPORT
pass
{
{
//
//
std
::
string
name
()
const
;
std
::
string
name
()
const
;
...
@@ -116,7 +116,7 @@ struct pass
...
@@ -116,7 +116,7 @@ struct pass
{
{
using
std
::
swap
;
using
std
::
swap
;
auto
*
derived
=
this
->
any_cast
<
PrivateDetailTypeErasedT
>
();
auto
*
derived
=
this
->
any_cast
<
PrivateDetailTypeErasedT
>
();
if
(
derived
and
private_detail_te_handle_mem_var
.
u
nique
()
)
if
(
derived
and
private_detail_te_handle_mem_var
.
u
se_count
()
==
1
)
{
{
*
derived
=
std
::
forward
<
PrivateDetailTypeErasedT
>
(
value
);
*
derived
=
std
::
forward
<
PrivateDetailTypeErasedT
>
(
value
);
}
}
...
@@ -292,7 +292,7 @@ struct pass
...
@@ -292,7 +292,7 @@ struct pass
// Accessor for the underlying type-erased handle, guaranteed unique.
// If another owner also references the handle, clone it before handing
// out a mutable reference (copy-on-write semantics).
private_detail_te_handle_base_type& private_detail_te_get_handle()
{
    assert(private_detail_te_handle_mem_var != nullptr);
    if(private_detail_te_handle_mem_var.use_count() > 1)
    {
        private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
    }
    return *private_detail_te_handle_mem_var;
}
...
...
src/include/migraphx/pass_manager.hpp
View file @
264a7647
...
@@ -47,12 +47,14 @@ struct module_pass_manager
...
@@ -47,12 +47,14 @@ struct module_pass_manager
virtual
~
module_pass_manager
()
{}
virtual
~
module_pass_manager
()
{}
};
};
void
run_passes
(
program
&
prog
,
MIGRAPHX_EXPORT
void
run_passes
(
program
&
prog
,
module_ref
root_mod
,
module_ref
root_mod
,
const
std
::
vector
<
pass
>&
passes
,
const
std
::
vector
<
pass
>&
passes
,
tracer
trace
=
tracer
{});
tracer
trace
=
tracer
{});
void
run_passes
(
module
&
mod
,
const
std
::
vector
<
pass
>&
passes
,
tracer
trace
=
tracer
{});
MIGRAPHX_EXPORT
void
void
run_passes
(
program
&
prog
,
const
std
::
vector
<
pass
>&
passes
,
tracer
trace
=
tracer
{});
run_passes
(
module
&
mod
,
const
std
::
vector
<
pass
>&
passes
,
tracer
trace
=
tracer
{});
MIGRAPHX_EXPORT
void
run_passes
(
program
&
prog
,
const
std
::
vector
<
pass
>&
passes
,
tracer
trace
=
tracer
{});
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
}
// namespace migraphx
...
...
src/include/migraphx/permutation.hpp
View file @
264a7647
...
@@ -43,7 +43,7 @@ inline Vector reorder_dims(const Vector& dims, const std::vector<int64_t>& permu
...
@@ -43,7 +43,7 @@ inline Vector reorder_dims(const Vector& dims, const std::vector<int64_t>& permu
return
result
;
return
result
;
}
}
shape
reorder_shape
(
const
shape
&
s
,
const
std
::
vector
<
int64_t
>&
permutation
);
MIGRAPHX_EXPORT
shape
reorder_shape
(
const
shape
&
s
,
const
std
::
vector
<
int64_t
>&
permutation
);
template
<
class
Vector
,
class
Op
>
template
<
class
Vector
,
class
Op
>
inline
std
::
vector
<
int64_t
>
sort_permutation
(
const
Vector
&
data
,
Op
op
)
inline
std
::
vector
<
int64_t
>
sort_permutation
(
const
Vector
&
data
,
Op
op
)
...
@@ -58,13 +58,13 @@ inline std::vector<int64_t> sort_permutation(const Vector& data, Op op)
...
@@ -58,13 +58,13 @@ inline std::vector<int64_t> sort_permutation(const Vector& data, Op op)
/*!
/*!
* Returns the inverse permutation that could be applied to undo the inputted permutation
* Returns the inverse permutation that could be applied to undo the inputted permutation
*/
*/
std
::
vector
<
int64_t
>
invert_permutation
(
const
std
::
vector
<
int64_t
>&
permutation
);
MIGRAPHX_EXPORT
std
::
vector
<
int64_t
>
invert_permutation
(
const
std
::
vector
<
int64_t
>&
permutation
);
/*!
/*!
* Finds the permutation that would make the shape not transposed (referring to shape.transposed())
* Finds the permutation that would make the shape not transposed (referring to shape.transposed())
*/
*/
std
::
vector
<
int64_t
>
find_permutation
(
const
shape
&
s
);
MIGRAPHX_EXPORT
std
::
vector
<
int64_t
>
find_permutation
(
const
shape
&
s
);
std
::
vector
<
int64_t
>
find_permutation
(
const
std
::
vector
<
shape
>&
shapes
);
MIGRAPHX_EXPORT
std
::
vector
<
int64_t
>
find_permutation
(
const
std
::
vector
<
shape
>&
shapes
);
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
}
// namespace migraphx
...
...
src/include/migraphx/preallocate_param.hpp
View file @
264a7647
...
@@ -32,7 +32,7 @@ inline namespace MIGRAPHX_INLINE_NS {
...
@@ -32,7 +32,7 @@ inline namespace MIGRAPHX_INLINE_NS {
struct
module
;
struct
module
;
struct
preallocate_param
struct
MIGRAPHX_EXPORT
preallocate_param
{
{
std
::
string
param
;
std
::
string
param
;
allocation_model
model
;
allocation_model
model
;
...
...
src/include/migraphx/process.hpp
View file @
264a7647
...
@@ -35,7 +35,7 @@ inline namespace MIGRAPHX_INLINE_NS {
...
@@ -35,7 +35,7 @@ inline namespace MIGRAPHX_INLINE_NS {
struct
process_impl
;
struct
process_impl
;
struct
process
struct
MIGRAPHX_EXPORT
process
{
{
using
writer
=
std
::
function
<
void
(
const
char
*
,
std
::
size_t
)
>
;
using
writer
=
std
::
function
<
void
(
const
char
*
,
std
::
size_t
)
>
;
process
(
const
std
::
string
&
cmd
);
process
(
const
std
::
string
&
cmd
);
...
...
src/include/migraphx/program.hpp
View file @
264a7647
...
@@ -54,7 +54,7 @@ struct marker;
...
@@ -54,7 +54,7 @@ struct marker;
/**
/**
* @brief Stores the instruction stream
* @brief Stores the instruction stream
*/
*/
struct
program
struct
MIGRAPHX_EXPORT
program
{
{
program
();
program
();
...
@@ -79,6 +79,9 @@ struct program
...
@@ -79,6 +79,9 @@ struct program
std
::
vector
<
argument
>
eval
(
parameter_map
params
,
std
::
vector
<
argument
>
eval
(
parameter_map
params
,
execution_environment
exec_env
=
execution_environment
{})
const
;
execution_environment
exec_env
=
execution_environment
{})
const
;
void
finish
()
const
;
std
::
size_t
size
()
const
;
std
::
size_t
size
()
const
;
std
::
vector
<
shape
>
get_output_shapes
()
const
;
std
::
vector
<
shape
>
get_output_shapes
()
const
;
...
@@ -127,8 +130,8 @@ struct program
...
@@ -127,8 +130,8 @@ struct program
program
&
sort
();
program
&
sort
();
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
program
&
p
);
MIGRAPHX_EXPORT
friend
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
program
&
p
);
friend
bool
operator
==
(
const
program
&
x
,
const
program
&
y
);
MIGRAPHX_EXPORT
friend
bool
operator
==
(
const
program
&
x
,
const
program
&
y
);
friend
bool
operator
!=
(
const
program
&
x
,
const
program
&
y
)
{
return
not
(
x
==
y
);
}
friend
bool
operator
!=
(
const
program
&
x
,
const
program
&
y
)
{
return
not
(
x
==
y
);
}
// module related api
// module related api
...
...
src/include/migraphx/promote_literals.hpp
View file @
264a7647
...
@@ -35,7 +35,7 @@ inline namespace MIGRAPHX_INLINE_NS {
...
@@ -35,7 +35,7 @@ inline namespace MIGRAPHX_INLINE_NS {
* Replace literals in submodules with literals in the root module.
* Replace literals in submodules with literals in the root module.
* Intended to allow for reuse of the literals between submodules.
* Intended to allow for reuse of the literals between submodules.
*/
*/
struct
promote_literals
struct
MIGRAPHX_EXPORT
promote_literals
{
{
std
::
string
name
()
const
{
return
"promote_literals"
;
}
std
::
string
name
()
const
{
return
"promote_literals"
;
}
void
apply
(
module_pass_manager
&
)
const
;
void
apply
(
module_pass_manager
&
)
const
;
...
...
src/include/migraphx/propagate_constant.hpp
View file @
264a7647
...
@@ -35,7 +35,7 @@ struct module;
...
@@ -35,7 +35,7 @@ struct module;
/**
/**
* Replace instructions which take all literals with a literal of the computation.
* Replace instructions which take all literals with a literal of the computation.
*/
*/
struct
propagate_constant
struct
MIGRAPHX_EXPORT
propagate_constant
{
{
std
::
string
name
()
const
{
return
"propagate_constant"
;
}
std
::
string
name
()
const
{
return
"propagate_constant"
;
}
void
apply
(
module
&
m
)
const
;
void
apply
(
module
&
m
)
const
;
...
...
Prev
1
2
3
4
5
6
7
8
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment