Merge branch 'pointwise-nhwc' of...

Merge branch 'pointwise-nhwc' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into nhwc_workaround

Merge branch 'pointwise-nhwc' of...
Merge branch 'pointwise-nhwc' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into nhwc_workaround
f85ba189 · Khalique Ahmed · 122ffe97 · dfbab16e · f85ba189 · f85ba189
Commit f85ba189 authored Jul 18, 2023 by Khalique Ahmed
20 changed files
--- a/src/include/migraphx/load_save.hpp
+++ b/src/include/migraphx/load_save.hpp
@@ -36,15 +36,18 @@ struct file_options
    std::string format = "msgpack";
 };
-program load(const std::string& filename, const file_options& options = file_options{});
+MIGRAPHX_EXPORT program load(const std::string& filename,
-program load_buffer(const std::vector<char>& buffer, const file_options& options = file_options{});
+                             const file_options& options = file_options{});
-program
+MIGRAPHX_EXPORT program load_buffer(const std::vector<char>& buffer,
-load_buffer(const char* buffer, std::size_t size, const file_options& options = file_options{});
+                                    const file_options& options = file_options{});
+MIGRAPHX_EXPORT program load_buffer(const char* buffer,
+                                    std::size_t size,
+                                    const file_options& options = file_options{});
-void save(const program& p,
+MIGRAPHX_EXPORT void
-          const std::string& filename,
+save(const program& p, const std::string& filename, const file_options& options = file_options{});
-          const file_options& options = file_options{});
+MIGRAPHX_EXPORT std::vector<char> save_buffer(const program& p,
-std::vector<char> save_buffer(const program& p, const file_options& options = file_options{});
+                                              const file_options& options = file_options{});
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/make_op.hpp
+++ b/src/include/migraphx/make_op.hpp
@@ -33,10 +33,10 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-operation make_op(const std::string& name);
+MIGRAPHX_EXPORT operation make_op(const std::string& name);
-operation make_op(const std::string& name,
+MIGRAPHX_EXPORT operation make_op(const std::string& name,
-                  const std::initializer_list<std::pair<std::string, value>>& v);
+                                  const std::initializer_list<std::pair<std::string, value>>& v);
-operation make_op_from_value(const std::string& name, const value& v);
+MIGRAPHX_EXPORT operation make_op_from_value(const std::string& name, const value& v);
 // A template overload is added for migraphx::value so the initializer_list
 // cannot be passed in directly. This is to enforce at compile-time that all
@@ -48,7 +48,7 @@ operation make_op(const std::string& name, const Value& v)
    return make_op_from_value(name, v);
 }
-operation make_json_op(const std::string& name, const std::string& s);
+MIGRAPHX_EXPORT operation make_json_op(const std::string& name, const std::string& s);
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/marker.hpp
+++ b/src/include/migraphx/marker.hpp
@@ -46,7 +46,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 #ifdef TYPE_ERASED_DECLARATION
 // Type-erased interface for:
-struct marker
+struct MIGRAPHX_EXPORT marker
 {
    //
    void mark_start(instruction_ref ins_ref);

--- a/src/include/migraphx/memory_coloring.hpp
+++ b/src/include/migraphx/memory_coloring.hpp
@@ -36,7 +36,7 @@ struct module;
 * Remove multiple memory allocations using graph coloring to find memory allocations that can be
 * reused.
 */
-struct memory_coloring
+struct MIGRAPHX_EXPORT memory_coloring
 {
    std::string allocation_op{};
    bool verify = false;

--- a/src/include/migraphx/module.hpp
+++ b/src/include/migraphx/module.hpp
@@ -52,7 +52,7 @@ using ins_dep_map   = std::unordered_map<instruction_ref, std::unordered_set<ins
 /**
 * @brief Stores the instruction stream
 */
-struct module
+struct MIGRAPHX_EXPORT module
 {
    module(const std::string& name = "");
@@ -225,8 +225,8 @@ struct module
    module& sort();
    ins_dep_map calc_implicit_deps() const;
-    friend std::ostream& operator<<(std::ostream& os, const module& m);
+    MIGRAPHX_EXPORT friend std::ostream& operator<<(std::ostream& os, const module& m);
-    friend bool operator==(const module& x, const module& y);
+    MIGRAPHX_EXPORT friend bool operator==(const module& x, const module& y);
    friend bool operator!=(const module& x, const module& y) { return not(x == y); }
    private:

--- a/src/include/migraphx/msgpack.hpp
+++ b/src/include/migraphx/msgpack.hpp
@@ -31,10 +31,11 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-void to_msgpack(const value& v, std::function<void(const char*, std::size_t)> writer);
+MIGRAPHX_EXPORT void to_msgpack(const value& v,
-std::vector<char> to_msgpack(const value& v);
+                                std::function<void(const char*, std::size_t)> writer);
-value from_msgpack(const std::vector<char>& buffer);
+MIGRAPHX_EXPORT std::vector<char> to_msgpack(const value& v);
-value from_msgpack(const char* buffer, std::size_t size);
+MIGRAPHX_EXPORT value from_msgpack(const std::vector<char>& buffer);
+MIGRAPHX_EXPORT value from_msgpack(const char* buffer, std::size_t size);
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/normalize_attributes.hpp
+++ b/src/include/migraphx/normalize_attributes.hpp
@@ -42,6 +42,7 @@ struct select_dependent_type
 template <class T, class... Ts>
 using dependent_type = typename select_dependent_type<T, Ts...>::type;
+MIGRAPHX_EXPORT
 bool normalize_attributes(operation& op, const std::vector<std::size_t>& lens);
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/include/migraphx/normalize_ops.hpp
+++ b/src/include/migraphx/normalize_ops.hpp
@@ -39,7 +39,7 @@ struct module;
 * Process negative axis attributes of ops
 */
-struct normalize_ops
+struct MIGRAPHX_EXPORT normalize_ops
 {
    std::string name() const { return "normalize_ops"; }
    void apply(module& m) const;

--- a/src/include/migraphx/onnx.hpp
+++ b/src/include/migraphx/onnx.hpp
@@ -26,6 +26,7 @@
 #include <migraphx/program.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/onnx/export.h>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -54,15 +55,19 @@ struct onnx_options
 };
 /// Create a program from an onnx file
-program parse_onnx(const std::string& name, const onnx_options& = onnx_options{});
+MIGRAPHX_ONNX_EXPORT program parse_onnx(const std::string& name,
+                                        const onnx_options& = onnx_options{});
 /// Create a program from an onnx buffer
-program parse_onnx_buffer(const std::string& buffer, const onnx_options& options);
+MIGRAPHX_ONNX_EXPORT program parse_onnx_buffer(const std::string& buffer,
+                                               const onnx_options& options);
 /// Create a program from an onnx buffer
-program parse_onnx_buffer(const void* data, std::size_t size, const onnx_options& options);
+MIGRAPHX_ONNX_EXPORT program parse_onnx_buffer(const void* data,
+                                               std::size_t size,
+                                               const onnx_options& options);
-std::vector<std::string> get_onnx_operators();
+MIGRAPHX_ONNX_EXPORT std::vector<std::string> get_onnx_operators();
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/op/common.hpp
+++ b/src/include/migraphx/op/common.hpp
@@ -59,8 +59,8 @@ enum class rnn_direction
    bidirectional,
 };
-std::ostream& operator<<(std::ostream& os, pooling_mode v);
+MIGRAPHX_EXPORT std::ostream& operator<<(std::ostream& os, pooling_mode v);
-std::ostream& operator<<(std::ostream& os, rnn_direction v);
+MIGRAPHX_EXPORT std::ostream& operator<<(std::ostream& os, rnn_direction v);
 } // namespace op
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/include/migraphx/op/convolution.hpp
+++ b/src/include/migraphx/op/convolution.hpp
@@ -79,17 +79,17 @@ struct convolution
        check_shapes{inputs, *this, true}.has(2).same_type().same_ndims().min_ndims(3);
        check_attribute_size();
        // num of dims of input and attribute should match
-        const auto input_size   = inputs[0].max_lens().size();
+        const auto input_ndim   = inputs[0].ndim();
        const auto padding_size = padding.size();
-        if(input_size != padding_size / 2 + 2 && input_size != padding_size + 2)
+        if(input_ndim != padding_size / 2 + 2 && input_ndim != padding_size + 2)
        {
            MIGRAPHX_THROW("CONVOLUTION: input and attribute size mismatch!");
        }
        const shape& x_shape          = inputs.at(0);
        const shape& w_shape          = inputs.at(1);
-        const size_t num_spatial_dims = input_size - 2;
+        const size_t num_spatial_dims = input_ndim - 2;
        if(num_spatial_dims != this->kdims())
        {
            MIGRAPHX_THROW("CONVOLUTION: input k-dims does not match attribute size");
@@ -105,7 +105,7 @@ struct convolution
        }
        else
        {
-            return fixed_compute_shape(x_shape, w_shape);
+            return static_compute_shape(x_shape, w_shape);
        }
    }
@@ -143,23 +143,10 @@ struct convolution
    shape dynamic_compute_shape(shape x_shape, shape w_shape) const
    {
        std::vector<shape::dynamic_dimension> output_dyn_dims = {};
+        output_dyn_dims.push_back(x_shape.to_dynamic().dyn_dims().at(0));
+        output_dyn_dims.push_back(w_shape.to_dynamic().dyn_dims().at(0));
-        auto dynamic_shape_push_back = [&](const shape& input_shape) {
+        const size_t num_spatial_dims = x_shape.ndim() - 2;
-            if(input_shape.dynamic())
-            {
-                output_dyn_dims.push_back(input_shape.dyn_dims().at(0));
-            }
-            else
-            {
-                auto l = input_shape.lens().at(0);
-                output_dyn_dims.push_back({l, l});
-            }
-        };
-        dynamic_shape_push_back(x_shape);
-        dynamic_shape_push_back(w_shape);
-        const size_t num_spatial_dims = x_shape.max_lens().size() - 2;
        if(padding_mode != default_)
        {
            for(std::size_t i = 0; i < num_spatial_dims; ++i)
@@ -198,7 +185,7 @@ struct convolution
        return shape{x_shape.type(), output_dyn_dims};
    }
-    shape fixed_compute_shape(shape x_shape, shape w_shape) const
+    shape static_compute_shape(shape x_shape, shape w_shape) const
    {
        std::vector<size_t> output_lens{x_shape.lens()[0], w_shape.lens()[0]};
        auto spatial_lens = calc_conv_lens(x_shape.lens(), w_shape.lens());

--- a/src/include/migraphx/op/deconvolution.hpp
+++ b/src/include/migraphx/op/deconvolution.hpp
@@ -21,9 +21,11 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_OPERATORS_DECONVOLUTION_HPP
+#ifndef MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_BACKWARDS_HPP
-#define MIGRAPHX_GUARD_OPERATORS_DECONVOLUTION_HPP
+#define MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_BACKWARDS_HPP
+#include <cmath>
+#include <utility>
 #include <migraphx/op/common.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/config.hpp>
@@ -31,14 +33,13 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/par_dfor.hpp>
 #include <migraphx/shape_for_each.hpp>
-#include <cmath>
+#include <migraphx/dyn_output.hpp>
-#include <utility>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
-struct deconvolution
+struct convolution_backwards
 {
    std::vector<std::size_t> padding  = {0, 0};
    std::vector<std::size_t> stride   = {1, 1};
@@ -57,45 +58,91 @@ struct deconvolution
                    f(self.group, "group"));
    }
-    std::string name() const { return "deconvolution"; }
+    std::string name() const { return "convolution_backwards"; }
    void check_attribute_size() const
    {
-        if((padding.size() != stride.size() and (padding.size() / 2) != stride.size()) or
+        if(padding.size() != stride.size() or stride.size() != dilation.size())
-           stride.size() != dilation.size())
        {
-            MIGRAPHX_THROW("deconvolution: inconsistent attribute sizes");
+            MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: inconsistent attribute sizes");
        }
    }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2).same_type().same_ndims().min_ndims(3);
+        check_shapes{inputs, *this, true}.has(2).same_type().same_ndims().min_ndims(3);
-        const shape& input   = inputs.at(0);
+        const shape& x_shape = inputs.at(0);
-        const shape& weights = inputs.at(1);
+        const shape& w_shape = inputs.at(1);
-        size_t kdims         = input.lens().size() - 2;
+        if(x_shape.ndim() - 2 != this->kdims())
-        if(kdims != this->kdims())
        {
-            MIGRAPHX_THROW("deconvolution: input k-dims does not match attribute size");
+            MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: input k-dims does not match attribute size");
        }
-        std::vector<size_t> output_lens{input.lens()[0], weights.lens()[1]};
+        if(not x_shape.dynamic() and not w_shape.dynamic() and
+           x_shape.lens().at(1) != (w_shape.lens().at(0) * group))
+        {
+            MIGRAPHX_THROW("CONVOLUTION_BACKWARDS: mismatched channel numbers");
+        }
-        for(size_t i = 0; i < kdims; i++)
+        if(x_shape.dynamic() or w_shape.dynamic())
        {
-            output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(
+            return dynamic_compute_shape(x_shape, w_shape);
+        }
+        else
+        {
+            return static_compute_shape(x_shape, w_shape);
+        }
+    }
+    std::vector<std::size_t> calc_spatial_lens(std::vector<std::size_t> x_lens,
+                                               std::vector<std::size_t> w_lens) const
+    {
+        std::vector<size_t> spatial_lens(x_lens.size() - 2);
+        // stride * (input - 1) + output_padding + ((kernel - 1) * dilation + 1) - padding_L -
+        // padding_R. This assumes padding_L = padding_R and output_padding handled in parser.
+        for(size_t i = 0; i < spatial_lens.size(); i++)
+        {
+            spatial_lens.at(i) = (std::size_t(std::max<std::ptrdiff_t>(
                1,
-                stride[i] * (input.lens()[i + 2] - 1) +
+                stride[i] * (x_lens[i + 2] - 1) + ((w_lens[i + 2] - 1) * dilation[i] + 1) -
-                    ((weights.lens()[i + 2] - 1) * dilation[i] + 1) - 2 * padding[i])));
+                    2 * padding[i])));
        }
-        return inputs[0].with_lens(output_lens);
+        return spatial_lens;
+    }
+    shape dynamic_compute_shape(shape x_shape, shape w_shape) const
+    {
+        std::vector<shape::dynamic_dimension> output_dyn_dims = {};
+        output_dyn_dims.push_back(x_shape.to_dynamic().dyn_dims().at(0));
+        output_dyn_dims.push_back(w_shape.to_dynamic().dyn_dims().at(1));
+        const std::size_t num_spatial_dims = x_shape.ndim() - 2;
+        // Does not compute optimals
+        auto min_spatial_dims = calc_spatial_lens(x_shape.min_lens(), w_shape.min_lens());
+        auto max_spatial_dims = calc_spatial_lens(x_shape.max_lens(), w_shape.max_lens());
+        for(size_t i = 0; i < num_spatial_dims; ++i)
+        {
+            output_dyn_dims.push_back(
+                shape::dynamic_dimension{min_spatial_dims[i], max_spatial_dims[i], {}});
+        }
+        return shape{x_shape.type(), output_dyn_dims};
+    }
+    shape static_compute_shape(shape x_shape, shape w_shape) const
+    {
+        std::vector<size_t> output_lens{x_shape.lens()[0], w_shape.lens()[1]};
+        auto spatial_lens = calc_spatial_lens(x_shape.lens(), w_shape.lens());
+        std::for_each(spatial_lens.begin(), spatial_lens.end(), [&output_lens](auto x) {
+            output_lens.push_back(x);
+        });
+        return x_shape.with_lens(output_lens);
    }
-    argument compute(shape output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
-        auto kdims = this->kdims();
+        auto num_spatial_dims = this->kdims();
        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
            using type = typename decltype(output)::value_type;
@@ -109,22 +156,22 @@ struct deconvolution
            auto wei_n = wei[0];
            auto wei_c = wei[1];
-            auto out_lens = output_shape.lens();
+            auto out_lens = dyn_out.computed_shape.lens();
            std::vector<std::size_t> win_size{in_c};
            std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
            std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
-            shape win_shape{output_shape.type(), win_size};
+            shape win_shape{dyn_out.computed_shape.type(), win_size};
            par_dfor(in_n, wei_c)([&](int o, int k) {
                shape_for_each(win_shape, [&](auto idx_win) {
                    const int w = idx_win[0];
                    auto input_dims_start = idx_win.begin() + 1;
-                    auto wei_dims_start   = idx_win.begin() + kdims + 1;
+                    auto wei_dims_start   = idx_win.begin() + num_spatial_dims + 1;
                    std::vector<std::ptrdiff_t> win_start;
-                    for(std::size_t n = 0; n < kdims; ++n)
+                    for(std::size_t n = 0; n < num_spatial_dims; ++n)
                    {
                        win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * stride[n]) -
                                            std::ptrdiff_t(padding[n]));
@@ -135,7 +182,7 @@ struct deconvolution
                    std::vector<std::ptrdiff_t> idx_out{o, in_ch};
-                    for(size_t n = 0; n < kdims; n++)
+                    for(size_t n = 0; n < num_spatial_dims; n++)
                    {
                        idx_out.push_back(win_start[n] + *(wei_dims_start + n) * dilation[n]);
                    }

--- a/src/include/migraphx/op/pooling.hpp
+++ b/src/include/migraphx/op/pooling.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -42,16 +42,43 @@ namespace op {
 struct pooling
 {
-    pooling_mode mode                = {pooling_mode::average};
+    pooling_mode mode = {pooling_mode::average};
+    // Padding along each spatial input dimension
+    // Can be ndim or 2*ndim values where ndim is size of lengths
+    // ndim values means pad the same before and after each dimension
+    // 2*ndim values contains n pre and then n post padding values
    std::vector<std::size_t> padding = {0, 0};
-    std::vector<std::size_t> stride  = {1, 1};
+    // Size of stride to take from one placement of the pooling kernel to the next.
+    // This is distinct from the strides used by the shape class.  Must be the same
+    // ndim as lengths.
+    std::vector<std::size_t> stride = {1, 1};
+    // Spatial dimensions of the pooling kernel or window,
+    // 2 smaller than the input tensor rank (NCHW layout)
    std::vector<std::size_t> lengths = {1, 1};
-    bool ceil_mode                   = false;
-    int lp_order                     = 2;
+    // Dilations are not supported at this time.
+    // ceiling mode is a flag affecting output size
+    // or equivalently, placements of the pooling kernel.
+    // When true, round the size upwards, possibly
+    // including partial placements where the kernel extends beyond the edge
+    // of input and even padding.  When false, round down so that all
+    // kernel placements fit but some input values may be dropped.
+    bool ceil_mode = false;
+    int lp_order   = 2;
    // Global pooling with dynamic shape input
    bool dyn_global = false;
+    // an attribute of the Onnx pooling operator, not currently enabled here because MIOpen can't
+    // support it. We currently implement padding for average pooling by inserting a Padding
+    // operator during Onnx parsing. But to support dynamic shape inputs and count_include_pad
+    // together, it would be necessary to do this calculation at runtime in MIOpen.
+    bool count_include_pad = false;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
@@ -68,11 +95,29 @@ struct pooling
    void check_attribute_size() const
    {
-        if((padding.size() != stride.size() and (padding.size() / 2) != stride.size()) or
+        if(dyn_global)
-           (not dyn_global and stride.size() != lengths.size()))
+            return;
+        if((padding.size() != stride.size() and (padding.size()) != stride.size() * 2) or
+           stride.size() != lengths.size())
        {
            MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
        }
+        if(std::any_of(lengths.begin(), lengths.end(), [&](auto i) { return (i == 0); }) or
+           std::any_of(stride.begin(), stride.end(), [&](auto i) { return (i == 0); }))
+        {
+            MIGRAPHX_THROW("POOLING: size 0 pooling kernel or stride");
+        }
+        // TODO:  update lowering to run the reference
+        // code when OneDNN can't execute pooling for a CPU
+        // OneDNN has a limitation on padding size for pooling.  see
+        // https://oneapi-src.github.io/oneDNN/dev_guide_convolution.html#doxid-dev-guide-convolution
+        // padding = {2}; stride = {1}; lengths = {3} succeeds in oneDNN but
+        // padding = {2}; stride = {1}; lengths = {2} fails.
+        // Also, the referenced documentation contains a max. dimension size of 14 for the kernel
+        // ("weights tensor") that MIGraphX doesn't enforce.
    }
    size_t kdims() const
@@ -112,7 +157,11 @@ struct pooling
        const shape& input = inputs.at(0);
        auto padding_size  = padding.size();
        size_t kdims       = input.ndim() - 2;
-        if(input.ndim() != padding_size / 2 + 2 and input.ndim() != padding_size + 2)
+        if(input.ndim() < 3)
+        {
+            MIGRAPHX_THROW("POOLING: input must have 3 or more dimensions and be nonempty");
+        }
+        if(input.ndim() * 2 != padding_size + 4 and input.ndim() != padding_size + 2)
        {
            MIGRAPHX_THROW("POOLING: input and attribute size mismatch!");
        }
@@ -132,7 +181,7 @@ struct pooling
            }
            else
            {
-                // does not compute for optimals
+                // does not compute optimals
                auto min_spatial_dims = calc_spatial_dim_out(input.min_lens(), kdims);
                auto max_spatial_dims = calc_spatial_dim_out(input.max_lens(), kdims);
                for(size_t i = 0; i < kdims; ++i)
@@ -149,7 +198,7 @@ struct pooling
            std::vector<std::size_t> output_lens(input_lens.begin(), input_lens.begin() + 2);
            // Used for when normalize_compute_shape() is called again at model eval time
-            // for an originally dynamic shape. Since kernel shape is not used with dyn_global.
+            // for an originally dynamic shape. Kernel shape is not used with dyn_global.
            if(dyn_global)
            {
                for(size_t i = 0; i < kdims; ++i)
@@ -184,7 +233,7 @@ struct pooling
        double operator()(double x, double y) const { return x + std::pow(std::abs(y), p); }
-        double final(double x, std::size_t) const { return std::pow(x, 1. / p); }
+        double final(double x, std::size_t) const { return (p == 0) ? 1 : std::pow(x, 1. / p); }
    };
    struct avg_pool
@@ -222,37 +271,82 @@ struct pooling
    {
        auto in_s    = input.get_shape();
        auto in_lens = in_s.lens();
+        // For each element of output; i.e., for each placement of pooling kernel...
        par_for(output_shape.elements(), [&](auto i) {
            auto idx_o = output_shape.multi(i);
            auto n_dim = idx_o.size();
-            std::vector<std::size_t> win_start;
+            // starting offset of the pooling window
+            std::vector<int> win_start;
            std::vector<std::size_t> win_size;
+            // For each spatial dimension, find starting and ending index of pooling kernel
            for(std::size_t dim = 2; dim < n_dim; ++dim)
            {
                auto d_2 = dim - 2;
                int start =
                    static_cast<int>(idx_o[dim] * stride[d_2]) - static_cast<int>(padding[d_2]);
-                int end = std::min(start + kernel_dims[d_2], in_lens[dim]);
+                int end;
-                start   = std::max(start, 0);
+                // NOLINT
+                if(count_include_pad and ceil_mode and (mode != pooling_mode::max))
+                {
+                    // TODO: this block can't execute until we enable count_include_pad
+                    // Even when using padding, if in ceil_mode a window
+                    // could extend beyond the end of both input and
+                    // padding.  Clip out-of-bounds indexes but not padding.
+                    // Check if this kernel extends beyond the padding at end of dimension
+                    end = std::min(start + kernel_dims[d_2],
+                                   in_lens[dim] + static_cast<int>(padding[d_2]));
+                }
+                else
+                {
+                    // In non-ceiling mode, when
+                    // count_include_pad is false, or for max pooling, clip off padding.
+                    end   = std::min(start + kernel_dims[d_2], in_lens[dim]);
+                    start = std::max(start, 0);
+                }
                win_start.push_back(start);
+                if(end < start)
+                {
+                    // This error can be caused by misc. bad input combinations
+                    MIGRAPHX_THROW("POOLING:  invalid attributes");
+                }
                win_size.push_back(end - start);
            }
            shape win_shape{output_shape.type(), win_size};
            auto pool_size    = win_shape.elements();
            double output_val = op.template init<Type>();
+            // for each element in the window...
            shape_for_each(win_shape, [&](auto idx_w) {
+                // the coordinates of this element
                auto idx = idx_o;
+                // Add the kernel location idx_w and the offset win_start, for each dimension.
+                // Negative results are cast to very large unsigned integers.
                std::transform(idx_w.begin(),
                               idx_w.end(),
                               win_start.begin(),
                               idx.begin() + 2,
                               [](auto ii, auto jj) { return ii + jj; });
-                if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
+                // Check whether any of the coordinates are outside the input tensor's range
-                   idx < in_lens)
+                if(std::mismatch(idx.begin() + 2,
+                                 idx.end(),
+                                 in_lens.begin() + 2,
+                                 in_lens.end(),
+                                 std::less<>{}) == std::make_pair(idx.end(), in_lens.end()))
                {
                    output_val = op(output_val, input[in_s.index(idx)]);
                }
+                else
+                {
+                    // This is a padding element.  Padding locations
+                    // don't contribute to the average or max pooling total but can
+                    // play a role in lpnorm pooling.
+                    output_val = op(output_val, 0);
+                }
            });
            output[i] = Type(op.final(output_val, pool_size));
        });

--- a/src/include/migraphx/operation.hpp
+++ b/src/include/migraphx/operation.hpp
@@ -499,7 +499,7 @@ lifetime get_lifetime_op(const T&)
 #ifdef TYPE_ERASED_DECLARATION
 // Type-erased interface for:
-struct operation
+struct MIGRAPHX_EXPORT operation
 {
    //
    std::string name() const;
@@ -1390,8 +1390,8 @@ bool has_finalize(const T& x)
    return detail::has_finalize_op(x);
 }
-void migraphx_to_value(value& v, const operation& op);
+MIGRAPHX_EXPORT void migraphx_to_value(value& v, const operation& op);
-void migraphx_from_value(const value& v, operation& op);
+MIGRAPHX_EXPORT void migraphx_from_value(const value& v, operation& op);
 #endif

--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -45,9 +45,9 @@
 #include <migraphx/op/contiguous.hpp>
 #include <migraphx/op/convert.hpp>
 #include <migraphx/op/convolution.hpp>
+#include <migraphx/op/convolution_backwards.hpp>
 #include <migraphx/op/cosh.hpp>
 #include <migraphx/op/cos.hpp>
-#include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/div.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/elu.hpp>

--- a/src/include/migraphx/optimize_module.hpp
+++ b/src/include/migraphx/optimize_module.hpp
@@ -36,7 +36,7 @@ struct module_pass_manager;
 /**
 * Runs several passes in a loop
 */
-struct optimize_module
+struct MIGRAPHX_EXPORT optimize_module
 {
    std::string name() const { return "optimize_module"; }
    void apply(module_pass_manager& mpm) const;

--- a/src/include/migraphx/pad_calc.hpp
+++ b/src/include/migraphx/pad_calc.hpp
@@ -32,6 +32,7 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
+MIGRAPHX_EXPORT
 void calculate_padding(int64_t idx,
                       std::vector<int64_t>& pads,
                       int64_t input_dim,
@@ -45,6 +46,7 @@ void calculate_padding(int64_t idx,
 * where the padding calculation must be done at evaluation time.
 * \return padding in the form of {x0_begin, x1_begin, ... x0_end , x1_end, ...}
 */
+MIGRAPHX_EXPORT
 std::vector<std::size_t> calc_dyn_auto_pad(const std::vector<std::size_t>& input_lens,
                                           const std::vector<std::size_t>& wei_lens,
                                           const std::vector<std::size_t>& strides,
@@ -53,6 +55,7 @@ std::vector<std::size_t> calc_dyn_auto_pad(const std::vector<std::size_t>& input
 // Used for dynamic auto padding of convolution operators since padding needs to be computed at
 // evaluation time.
+MIGRAPHX_EXPORT
 shape compute_padded_shape(const shape& input,
                           const shape& weights,
                           const std::vector<std::size_t>& padding,

--- a/src/include/migraphx/pass.hpp
+++ b/src/include/migraphx/pass.hpp
@@ -57,7 +57,7 @@ struct pass
 #else
-module& get_module(module_pass_manager& mpm);
+MIGRAPHX_EXPORT module& get_module(module_pass_manager& mpm);
 namespace detail {
@@ -84,7 +84,7 @@ void module_pass_manager_apply(const T& x, module_pass_manager& mpm)
 #ifdef TYPE_ERASED_DECLARATION
 // Type-erased interface for:
-struct pass
+struct MIGRAPHX_EXPORT pass
 {
    //
    std::string name() const;

--- a/src/include/migraphx/pass_manager.hpp
+++ b/src/include/migraphx/pass_manager.hpp
@@ -47,12 +47,14 @@ struct module_pass_manager
    virtual ~module_pass_manager() {}
 };
-void run_passes(program& prog,
+MIGRAPHX_EXPORT void run_passes(program& prog,
-                module_ref root_mod,
+                                module_ref root_mod,
-                const std::vector<pass>& passes,
+                                const std::vector<pass>& passes,
-                tracer trace = tracer{});
+                                tracer trace = tracer{});
-void run_passes(module& mod, const std::vector<pass>& passes, tracer trace = tracer{});
+MIGRAPHX_EXPORT void
-void run_passes(program& prog, const std::vector<pass>& passes, tracer trace = tracer{});
+run_passes(module& mod, const std::vector<pass>& passes, tracer trace = tracer{});
+MIGRAPHX_EXPORT void
+run_passes(program& prog, const std::vector<pass>& passes, tracer trace = tracer{});
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/permutation.hpp
+++ b/src/include/migraphx/permutation.hpp
@@ -43,7 +43,7 @@ inline Vector reorder_dims(const Vector& dims, const std::vector<int64_t>& permu
    return result;
 }
-shape reorder_shape(const shape& s, const std::vector<int64_t>& permutation);
+MIGRAPHX_EXPORT shape reorder_shape(const shape& s, const std::vector<int64_t>& permutation);
 template <class Vector, class Op>
 inline std::vector<int64_t> sort_permutation(const Vector& data, Op op)
@@ -58,13 +58,17 @@ inline std::vector<int64_t> sort_permutation(const Vector& data, Op op)
 /*!
 * Returns the inverse permutation that could be applied to undo the inputted permutation
 */
-std::vector<int64_t> invert_permutation(const std::vector<int64_t>& permutation);
+MIGRAPHX_EXPORT std::vector<int64_t> invert_permutation(const std::vector<int64_t>& permutation);
 /*!
 * Finds the permutation that would make the shape not transposed (referring to shape.transposed())
 */
-std::vector<int64_t> find_permutation(const shape& s);
+MIGRAPHX_EXPORT std::vector<int64_t> find_permutation(const shape& s);
-std::vector<int64_t> find_permutation(const std::vector<shape>& shapes);
+MIGRAPHX_EXPORT std::vector<int64_t> find_permutation(const std::vector<shape>& shapes);
+/// Normalize the shapes so the order of dimensions will be in the order it is
+/// in memory as much as possible.
+MIGRAPHX_EXPORT std::vector<shape> normalize_permutation(const std::vector<shape>& shapes);
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx