Commit f303624d authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into refactor_dyn_fixed_compare
parents f4a3a182 48cc33e4
@@ -87,7 +87,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
-RUN cget -p /usr/local install ROCmSoftwarePlatform/llvm-project-mlir@c0723a7e50043d973cb73ae51dc30d36679ee7e5 -DBUILD_MIXR_TARGET=On
+RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@0f38fb33f518b53b94b541feb9b079668c5518e8 -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
 ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
 ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db
......
@@ -29,6 +29,7 @@ See below for a comprehensive list of commands and option arguments, as well as
 | --tf | Load file as a tensorflow graph |
 | --migraphx | Load file as a migraphx graph |
 | --migraphx-json | Load file as a migraphx JSON graph |
+| --batch | Set batch size for the model |
 | --nhwc | Treat tensorflow format as nhwc |
 | --nchw | Treat tensorflow format as nchw |
 | --skip-unknown-operators | Skip unknown operators when parsing and continue to parse |
......
@@ -21,6 +21,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-tensorflow==2.7.2
+tensorflow==2.9.3
 onnxruntime
 tokenizers
\ No newline at end of file
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_INT_DIVIDE_HPP
#define MIGRAPHX_GUARD_RTGLIB_INT_DIVIDE_HPP
#include <migraphx/config.hpp>
#include <cmath>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
template <class R, class T, class U>
R floor_divide(T x, U y)
{
return R(std::floor(double(x) / double(y)));
}
template <class R, class T, class U>
R ceil_divide(T x, U y)
{
return R(std::ceil(double(x) / double(y)));
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
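
A minimal standalone sketch (not from this commit) of how these helpers behave. The stand-in definitions below mirror the header above; the real ones live in namespace migraphx. Because they round through double with std::floor/std::ceil, negative operands round toward negative infinity rather than truncating toward zero.

#include <cassert>
#include <cmath>

// Stand-ins for the helpers above, copied for illustration.
template <class R, class T, class U>
R floor_divide(T x, U y) { return R(std::floor(double(x) / double(y))); }
template <class R, class T, class U>
R ceil_divide(T x, U y) { return R(std::ceil(double(x) / double(y))); }

int main()
{
    assert(floor_divide<int>(7, 2) == 3);   // 3.5 rounded down
    assert(ceil_divide<int>(7, 2) == 4);    // 3.5 rounded up
    assert(floor_divide<int>(-7, 2) == -4); // floor, not truncation (which would give -3)
    return 0;
}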
@@ -80,6 +80,7 @@ struct literal : raw_data<literal>
         fill(start, end);
     }
 
+    // Directly copies buffer of x
     template <class T, MIGRAPHX_REQUIRES(sizeof(T) == 1)>
     literal(const shape& s, T* x) : buffer(make_shared_array<char>(s.bytes())), m_shape(s)
     {
@@ -107,25 +108,15 @@ struct literal : raw_data<literal>
     std::shared_ptr<char> buffer;
     shape m_shape;
 
+    // Keeps the same data ordering as the given container
     template <class Iterator>
     void fill(Iterator start, Iterator end)
     {
         assert(std::distance(start, end) == m_shape.elements());
-        if(m_shape.standard())
-        {
-            m_shape.visit_type([&](auto as) { std::copy(start, end, as.from(buffer.get())); });
-        }
-        else
-        {
-            auto it = start;
-            m_shape.visit_type([&](auto as) {
-                auto output = make_view(m_shape, as.from(buffer.get()));
-                shape_for_each(output.get_shape(), [&](const auto& idx) {
-                    output(idx.begin(), idx.end()) = *it; // NOLINT(bugprone-signed-char-misuse)
-                    it++;
-                });
-            });
-        }
+        m_shape.visit_type([&](auto as) {
+            auto output = make_view(m_shape, as.from(buffer.get()));
+            std::copy(start, end, output.begin());
+        });
     }
 };
......
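
For a non-standard (strided) shape, fill() writes through a tensor view, so the buffer layout follows the shape's strides while iteration order follows the container. A standalone sketch of that index-to-offset mapping, using the same {1, 3, 2, 2} / {12, 1, 6, 3} example as the literal_nstd_shape_vector test added further down:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    std::vector<std::size_t> lens    = {1, 3, 2, 2};
    std::vector<std::size_t> strides = {12, 1, 6, 3};
    std::vector<float> data(12);
    for(std::size_t i = 0; i < data.size(); i++)
        data[i] = static_cast<float>(i);

    // Walk logical indices in row-major order (what the tensor view iteration does)
    // and scatter each element to its strided offset in the underlying buffer.
    std::vector<float> buffer(12);
    std::size_t element = 0;
    for(std::size_t n = 0; n < lens[0]; n++)
        for(std::size_t c = 0; c < lens[1]; c++)
            for(std::size_t h = 0; h < lens[2]; h++)
                for(std::size_t w = 0; w < lens[3]; w++)
                {
                    std::size_t offset =
                        n * strides[0] + c * strides[1] + h * strides[2] + w * strides[3];
                    buffer[offset] = data[element++];
                }

    std::vector<float> expected = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
    assert(buffer == expected);
    return 0;
}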
@@ -31,7 +31,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/par_for.hpp>
 #include <migraphx/shape_for_each.hpp>
-#include <migraphx/int_divide.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <cmath>
 #include <utility>
@@ -49,6 +49,9 @@ struct pooling
     bool ceil_mode = false;
     int lp_order   = 2;
+    // Global pooling with dynamic shape input
+    bool dyn_global = false;
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
@@ -57,7 +60,8 @@ struct pooling
                     f(self.stride, "stride"),
                     f(self.lengths, "lengths"),
                     f(self.ceil_mode, "ceil_mode"),
-                    f(self.lp_order, "lp_order"));
+                    f(self.lp_order, "lp_order"),
+                    f(self.dyn_global, "dyn_global"));
     }
 
     std::string name() const { return "pooling"; }
@@ -65,51 +69,111 @@ struct pooling
     void check_attribute_size() const
     {
         if((padding.size() != stride.size() and (padding.size() / 2) != stride.size()) or
-           stride.size() != lengths.size())
+           (not dyn_global and stride.size() != lengths.size()))
         {
             MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
         }
     }
 
+    size_t kdims() const
+    {
+        check_attribute_size();
+        return stride.size();
+    }
+
     value attributes() const { return {{"normalize_padding", "padding"}}; }
 
+    std::vector<std::size_t> calc_spatial_dim_out(const std::vector<std::size_t>& input_lens,
+                                                  std::size_t kdims) const
+    {
+        std::vector<std::size_t> output_lens{};
+        for(size_t i = 0; i < kdims; ++i)
+        {
+            if(input_lens[i + 2] == 0)
+            {
+                // handle opt = 0
+                output_lens.push_back(0);
+            }
+            else
+            {
+                std::size_t padding_factor = 2 * padding[i];
+                if(padding.size() == 2 * kdims)
+                    padding_factor = padding[i] + padding[i + kdims];
+                assert(input_lens[i + 2] + padding_factor >= lengths[i]);
+                std::size_t dim_size = input_lens[i + 2] + padding_factor - lengths[i];
+                std::size_t len =
+                    (ceil_mode)
+                        ? dim_size / stride[i] +
+                              static_cast<std::size_t>((dim_size % stride[i] != 0)) // ceil uint divide
+                        : dim_size / stride[i]; // floor divide
+                output_lens.push_back(len + 1);
+            }
+        }
+        return output_lens;
+    }
+
     shape normalize_compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1);
+        check_shapes{inputs, *this, true}.has(1);
+        check_attribute_size();
         const shape& input = inputs.at(0);
-        auto input_lens    = input.lens();
-        size_t kdims       = input_lens.size() - 2;
-        auto input_size    = inputs[0].lens().size();
         auto padding_size  = padding.size();
-        if(input_size != padding_size / 2 + 2 and input_size != padding_size + 2)
+        size_t kdims       = input.ndim() - 2;
+        if(input.ndim() != padding_size / 2 + 2 and input.ndim() != padding_size + 2)
         {
             MIGRAPHX_THROW("POOLING: input and attribute size mismatch!");
         }
 
-        std::vector<std::size_t> output_lens(input_lens.begin(), input_lens.begin() + 2);
-        for(size_t i = 0; i < kdims; i++)
-        {
-            std::ptrdiff_t dim_size;
-            auto padding_factor = 2 * padding[i];
-            if(padding_size == 2 * kdims)
-                padding_factor = padding[i] + padding[i + kdims];
-            dim_size = input_lens[i + 2] + padding_factor - lengths[i];
-            assert(dim_size >= 0);
-            std::size_t len = (ceil_mode) ? ceil_divide<std::ptrdiff_t>(dim_size, stride[i])
-                                          : floor_divide<std::ptrdiff_t>(dim_size, stride[i]);
-            output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(1, len + 1)));
-        }
-        return inputs[0].with_lens(output_lens);
-    }
-
-    size_t kdims() const
-    {
-        check_attribute_size();
-        return stride.size();
+        if(input.dynamic())
+        {
+            auto input_dyn_dims = input.dyn_dims();
+            std::vector<shape::dynamic_dimension> output_dyn_dims(input_dyn_dims.begin(),
+                                                                  input_dyn_dims.begin() + 2);
+            if(dyn_global)
+            {
+                for(size_t i = 0; i < kdims; ++i)
+                {
+                    output_dyn_dims.push_back(shape::dynamic_dimension{1, 1, 1});
+                }
+                return {input.type(), output_dyn_dims};
+            }
+            else
+            {
+                auto min_spatial_dims = calc_spatial_dim_out(input.min_lens(), kdims);
+                auto max_spatial_dims = calc_spatial_dim_out(input.max_lens(), kdims);
+                auto opt_spatial_dims = calc_spatial_dim_out(input.opt_lens(), kdims);
+                for(size_t i = 0; i < kdims; ++i)
+                {
+                    output_dyn_dims.push_back(shape::dynamic_dimension{
+                        min_spatial_dims[i], max_spatial_dims[i], opt_spatial_dims[i]});
+                }
+                return {input.type(), output_dyn_dims};
+            }
+        }
+        else
+        {
+            auto input_lens = input.lens();
+            std::vector<std::size_t> output_lens(input_lens.begin(), input_lens.begin() + 2);
+            // Used for when normalize_compute_shape() is called again at model eval time
+            // for an originally dynamic shape. Since kernel shape is not used with dyn_global.
+            if(dyn_global)
+            {
+                for(size_t i = 0; i < kdims; ++i)
+                {
+                    output_lens.push_back(1);
+                }
+                return {input.type(), output_lens};
+            }
+            else
+            {
+                auto output_spatial_lens = calc_spatial_dim_out(input_lens, kdims);
+                output_lens.insert(
+                    output_lens.end(), output_spatial_lens.begin(), output_spatial_lens.end());
+                return inputs[0].with_lens(output_lens);
+            }
+        }
     }
 
     struct lpnorm_pool
@@ -158,7 +222,11 @@ struct pooling
     };
 
     template <class Type, class Out, class In, class Op>
-    void calc_pooling(const shape& output_shape, Out& output, const In& input, Op op) const
+    void calc_pooling(const shape& output_shape,
+                      Out& output,
+                      const In& input,
+                      const std::vector<std::size_t>& kernel_dims,
+                      Op op) const
     {
         auto in_s    = input.get_shape();
         auto in_lens = in_s.lens();
@@ -172,7 +240,7 @@ struct pooling
             auto d_2 = dim - 2;
             int start =
                 static_cast<int>(idx_o[dim] * stride[d_2]) - static_cast<int>(padding[d_2]);
-            int end = std::min(start + lengths[d_2], in_lens[dim]);
+            int end = std::min(start + kernel_dims[d_2], in_lens[dim]);
             start = std::max(start, 0);
             win_start.push_back(start);
             win_size.push_back(end - start);
@@ -198,21 +266,32 @@ struct pooling
         });
     }
 
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
+        auto input_lens = args[0].get_shape().lens();
+        std::vector<std::size_t> kernel_dims;
+        if(dyn_global)
+        {
+            kernel_dims.insert(kernel_dims.end(), input_lens.begin() + 2, input_lens.end());
+        }
+        else
+        {
+            kernel_dims = this->lengths;
+        }
         visit_all(result, args[0])([&](auto output, auto input) {
             using type = typename decltype(output)::value_type;
             switch(mode)
             {
             case migraphx::op::pooling_mode::average:
-                calc_pooling<type>(output_shape, output, input, avg_pool{});
+                calc_pooling<type>(dyn_out.computed_shape, output, input, kernel_dims, avg_pool{});
                 break;
             case migraphx::op::pooling_mode::max:
-                calc_pooling<type>(output_shape, output, input, max_pool{});
+                calc_pooling<type>(dyn_out.computed_shape, output, input, kernel_dims, max_pool{});
                 break;
             case migraphx::op::pooling_mode::lpnorm:
-                calc_pooling<type>(output_shape, output, input, lpnorm_pool{lp_order});
+                calc_pooling<type>(
+                    dyn_out.computed_shape, output, input, kernel_dims, lpnorm_pool{lp_order});
                 break;
             }
         });
......
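
A small standalone check of the per-dimension arithmetic that calc_spatial_dim_out performs with unsigned integer math (the old code routed it through floor_divide/ceil_divide): out = (in + pad_total - kernel) / stride + 1, with the division rounded up when ceil_mode is set. The helper name below is made up for the sketch.

#include <cassert>
#include <cstddef>

// Mirrors the per-dimension formula from pooling::calc_spatial_dim_out above.
std::size_t pool_out_dim(
    std::size_t in, std::size_t pad_total, std::size_t kernel, std::size_t stride, bool ceil_mode)
{
    std::size_t dim_size = in + pad_total - kernel;
    std::size_t len      = ceil_mode
                               ? dim_size / stride + static_cast<std::size_t>(dim_size % stride != 0)
                               : dim_size / stride;
    return len + 1;
}

int main()
{
    // 7-wide input, 3-wide kernel, stride 2, padding 1 per side: (7 + 2 - 3) / 2 + 1 = 4
    assert(pool_out_dim(7, 2, 3, 2, false) == 4);
    // ceil_mode only matters when the windows do not tile evenly:
    assert(pool_out_dim(7, 0, 2, 2, false) == 3); // floor(5 / 2) + 1
    assert(pool_out_dim(7, 0, 2, 2, true) == 4);  // ceil(5 / 2) + 1
    return 0;
}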
@@ -69,14 +69,10 @@ struct squeeze
         std::vector<shape::dynamic_dimension> dyn_dims = {};
         if(axes.empty())
         {
-            for(auto i : range(input_shape.ndim()))
-            {
-                auto dd = input_shape.dyn_dims()[i];
-                if(dd != 1)
-                {
-                    dyn_dims.push_back(dd);
-                }
-            }
+            std::copy_if(input_shape.dyn_dims().cbegin(),
+                         input_shape.dyn_dims().cend(),
+                         std::back_inserter(dyn_dims),
+                         [&](auto dd) { return dd != 1; });
         }
         else
         {
......
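
The loop-to-copy_if change is behavior preserving; a minimal sketch of the same pattern on plain integers, standing in for the dynamic dimensions:

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main()
{
    // Keep every dimension that is not fixed at 1, as squeeze does for dyn_dims().
    std::vector<int> dims = {1, 3, 1, 224, 224};
    std::vector<int> kept;
    std::copy_if(dims.cbegin(), dims.cend(), std::back_inserter(kept),
                 [](auto dd) { return dd != 1; });
    assert((kept == std::vector<int>{3, 224, 224}));
    return 0;
}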
@@ -29,6 +29,7 @@
 #include <migraphx/config.hpp>
 #include <migraphx/value.hpp>
 #include <migraphx/op/normalize_attribute.hpp>
+#include <migraphx/dyn_output.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -45,17 +46,15 @@ struct transpose
     }
     std::string name() const { return "transpose"; }
 
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1);
+        check_shapes{inputs, *this, true}.has(1);
         auto input = inputs.at(0);
-        auto input_lens    = input.lens();
-        auto input_strides = input.strides();
-        auto t             = input.type();
 
-        if(dims.size() != input_lens.size())
+        if(dims.size() != input.ndim())
         {
-            MIGRAPHX_THROW("Permutation has wrong number of axes");
+            MIGRAPHX_THROW("TRANSPOSE: Permutation has wrong number of axes");
         }
         std::vector<int64_t> axes(dims.size());
         std::iota(axes.begin(), axes.end(), 0);
@@ -63,19 +62,36 @@ struct transpose
         {
             MIGRAPHX_THROW("TRANSPOSE: Invalid permutation");
         }
-        std::vector<size_t> output_lens(input_lens.size());
-        std::vector<size_t> output_strides(input_lens.size());
-        for(std::size_t i = 0; i < output_lens.size(); i++)
+
+        if(input.dynamic())
+        {
+            std::vector<shape::dynamic_dimension> output_dyn_dims(input.ndim());
+            std::transform(dims.cbegin(), dims.cend(), output_dyn_dims.begin(), [&](auto dim) {
+                return input.dyn_dims()[dim];
+            });
+            return {input.type(), output_dyn_dims};
+        }
+        else
+        {
+            auto input_lens    = input.lens();
+            auto input_strides = input.strides();
+            std::vector<size_t> output_lens(input.ndim());
+            std::vector<size_t> output_strides(input.ndim());
+            for(std::size_t i = 0; i < input.ndim(); i++)
             {
                 output_lens[i]    = input_lens[dims[i]];
                 output_strides[i] = input_strides[dims[i]];
             }
-        return {t, output_lens, output_strides};
+            return {input.type(), output_lens, output_strides};
+        }
     }
 
-    argument compute(shape output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        return args[0].reshape(output_shape);
+        return args[0].reshape(dyn_out.computed_shape);
     }
     std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
 };
......
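
In the static branch, transpose only permutes lengths and strides and then aliases the input buffer (compute() just reshapes). A standalone sketch of that permutation, with hypothetical numbers for an NCHW to NHWC reordering:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // Standard-layout {1, 2, 3, 4} shape, permuted with perm = {0, 2, 3, 1}.
    std::vector<std::size_t> lens    = {1, 2, 3, 4};
    std::vector<std::size_t> strides = {24, 12, 4, 1};
    std::vector<std::size_t> perm    = {0, 2, 3, 1};

    std::vector<std::size_t> out_lens(perm.size());
    std::vector<std::size_t> out_strides(perm.size());
    for(std::size_t i = 0; i < perm.size(); i++)
    {
        out_lens[i]    = lens[perm[i]];
        out_strides[i] = strides[perm[i]];
    }

    assert((out_lens == std::vector<std::size_t>{1, 3, 4, 2}));
    assert((out_strides == std::vector<std::size_t>{24, 4, 1, 12}));
    return 0;
}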
@@ -31,6 +31,9 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 
+/**
+ * Iterates the given function over the indices from the shape in order.
+ */
 template <class F>
 void shape_for_each(const migraphx::shape& s, F f)
 {
@@ -51,7 +54,6 @@ void shape_for_each(const migraphx::shape& s, F f)
             call(indices);
         }
     }
-
 }
 
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
......
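
A hedged usage sketch of the documented helper: it invokes the callback once per multi-index of the shape, in row-major order. The callback argument is assumed here to be an index container (as in the literal fill code above); this is a sketch, not part of the commit.

#include <migraphx/shape.hpp>
#include <migraphx/shape_for_each.hpp>
#include <iostream>

int main()
{
    migraphx::shape s{migraphx::shape::float_type, {2, 3}};
    // Expected to print (0,0) (0,1) (0,2) (1,0) (1,1) (1,2)
    migraphx::shape_for_each(s, [](const auto& idx) {
        std::cout << "(" << idx[0] << "," << idx[1] << ") ";
    });
    std::cout << std::endl;
    return 0;
}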
@@ -77,14 +77,14 @@ static void update_pooling(const instruction_ref& input, const instruction_ref&
     {
         return;
     }
-    auto kdims = input->get_shape().lens().size() - 2;
+    auto kdims = input->get_shape().ndim() - 2;
     if(std::equal(op.padding.begin(),
                   op.padding.begin() + kdims,
                   op.padding.begin() + kdims,
                   op.padding.end()))
         return;
 
-    std::vector<int64_t> padding(input->get_shape().lens().size() * 2, 0);
+    std::vector<int64_t> padding(input->get_shape().ndim() * 2, 0);
     std::vector<size_t> pads_l(op.padding.begin(), op.padding.begin() + kdims);
     std::vector<size_t> pads_r(op.padding.begin() + kdims, op.padding.end());
     op.padding = std::vector<size_t>(kdims * 2, 0);
......
@@ -47,52 +47,42 @@ struct parse_pooling : op_parser<parse_pooling>
                 {"GlobalLpPool", "lpnorm"}};
     }
 
-    instruction_ref parse(const op_desc& opd,
-                          const onnx_parser& /*parser*/,
-                          onnx_parser::node_info info,
-                          std::vector<instruction_ref> args) const
+    value handle_values(const op_desc& opd,
+                        onnx_parser::node_info info,
+                        const shape& in_shape,
+                        value values) const
     {
-        const std::unordered_map<std::string, op::pooling_mode> mode_map = {
-            {"max", op::pooling_mode::max},
-            {"average", op::pooling_mode::average},
-            {"lpnorm", op::pooling_mode::lpnorm}};
-        std::string mode = opd.op_name;
-        if(not contains(mode_map, mode))
-        {
-            MIGRAPHX_THROW("onnx pooling mode must be [\"max\", \"average\", \"lpnorm\"]");
-        }
-        operation op = make_op("pooling", {{"mode", mode_map.at(mode)}});
-        value values = op.to_value();
-        auto l0      = args[0];
-        auto in_lens = l0->get_shape().lens();
-        assert(in_lens.size() > 2);
-        auto kdims = in_lens.size() - 2;
+        auto kdims = in_shape.ndim() - 2;
         if(starts_with(opd.onnx_name, "Global"))
         {
-            values["lengths"] = std::vector<size_t>(in_lens.begin() + 2, in_lens.end());
+            // if spatial dimensions are dynamic use dyn_global flag
+            if(in_shape.dynamic() and std::any_of(in_shape.dyn_dims().cbegin() + 2,
+                                                  in_shape.dyn_dims().cend(),
+                                                  [](auto dd) { return not dd.is_fixed(); }))
+            {
+                values["dyn_global"] = true;
+                values["lengths"]    = std::vector<size_t>();
+            }
+            else
+            {
+                // works with static and fixed dynamic shape
+                auto m_lens       = in_shape.max_lens();
+                values["lengths"] = std::vector<size_t>(m_lens.begin() + 2, m_lens.end());
+            }
         }
 
+        // does not support ceil_mode
         if(contains(info.attributes, "ceil_mode"))
         {
             values["ceil_mode"] = static_cast<bool>(info.attributes.at("ceil_mode").i());
         }
 
-        // count include padding, if count include pad is 1, we always use
-        // explicit pad
-        int count_include_pad = 0;
-        if(contains(info.attributes, "count_include_pad"))
-        {
-            count_include_pad = info.attributes.at("count_include_pad").i();
-        }
-
         if(contains(info.attributes, "strides"))
         {
             values["stride"].clear();
             copy(info.attributes["strides"].ints(), std::back_inserter(values["stride"]));
             check_attr_sizes(kdims, values["stride"].size(), "PARSE_POOLING: inconsistent strides");
         }
         if(contains(info.attributes, "kernel_shape"))
         {
             values["lengths"].clear();
@@ -110,6 +100,46 @@ struct parse_pooling : op_parser<parse_pooling>
         // ensure pads availabe only when auto_pad is "NOT_SET"
         check_padding_mode(info, "POOLING");
 
+        return values;
+    }
+
+    instruction_ref parse(const op_desc& opd,
+                          const onnx_parser& /*parser*/,
+                          onnx_parser::node_info info,
+                          std::vector<instruction_ref> args) const
+    {
+        std::string mode = opd.op_name;
+        const std::unordered_map<std::string, op::pooling_mode> mode_map = {
+            {"max", op::pooling_mode::max},
+            {"average", op::pooling_mode::average},
+            {"lpnorm", op::pooling_mode::lpnorm}};
+        if(not contains(mode_map, mode))
+        {
+            MIGRAPHX_THROW(
+                "PARSE_POOLING: onnx pooling mode must be [\"max\", \"average\", \"lpnorm\"]");
+        }
+        operation op = make_op("pooling", {{"mode", mode_map.at(mode)}});
+        value values = op.to_value();
+        auto l0       = args[0];
+        auto in_shape = l0->get_shape();
+        assert(in_shape.ndim() > 2);
+        auto kdims = in_shape.ndim() - 2;
+
+        values = handle_values(opd, info, in_shape, values);
+
+        // count include padding, if count include pad is 1, we always use
+        // explicit pad
+        int count_include_pad = 0;
+        if(contains(info.attributes, "count_include_pad"))
+        {
+            if(in_shape.dynamic())
+            {
+                MIGRAPHX_THROW("PARSE_POOLING: count_include_pad attribute is not supported for "
+                               "dynamic input shape");
+            }
+            count_include_pad = info.attributes.at("count_include_pad").i();
+        }
+
         std::vector<int64_t> paddings;
         float pad_val = ((mode == "max") ? std::numeric_limits<float>::lowest() : 0.0f);
@@ -122,6 +152,13 @@ struct parse_pooling : op_parser<parse_pooling>
         }
 
         if(contains(info.attributes, "auto_pad"))
+        {
+            if(in_shape.dynamic())
+            {
+                MIGRAPHX_THROW(
+                    "PARSE_POOLING: Auto padding pooling with dynamic input shape not supported");
+            }
+            else
             {
                 values["padding"].clear();
                 // return paddings could be empty, then setting to 0 for no padding
@@ -129,9 +166,10 @@ struct parse_pooling : op_parser<parse_pooling>
                                values,
                                values["lengths"].to_vector<std::size_t>(),
                                {1, 1},
-                               in_lens,
+                               in_shape.lens(),
                                paddings);
             }
+        }
 
         if(paddings.size() != 2 * kdims)
         {
@@ -150,6 +188,7 @@ struct parse_pooling : op_parser<parse_pooling>
             values["stride"].resize(kdims);
             std::fill_n(values["stride"].begin(), kdims, 1);
         }
 
+        // used to calculate the supposed output shape
         std::vector<int64_t> orig_padding = paddings;
@@ -159,6 +198,11 @@ struct parse_pooling : op_parser<parse_pooling>
 
         if(not slice_start.empty())
         {
+            if(in_shape.dynamic())
+            {
+                MIGRAPHX_THROW(
+                    "PARSE_POOLING: asymmetric padding not supported for dynamic input shape");
+            }
             // calculate expected output shape
             orig_padding.insert(orig_padding.begin() + kdims, 2, 0);
             orig_padding.insert(orig_padding.begin(), 2, 0);
......
@@ -47,7 +47,7 @@ struct parse_transpose : op_parser<parse_transpose>
         }
 
         // if perm is empty, use the default value
-        auto n_dim = args.front()->get_shape().lens().size();
+        auto n_dim = args.front()->get_shape().ndim();
         if(perm.empty())
         {
             perm.resize(n_dim);
......
@@ -24,7 +24,6 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/mlir.hpp>
 
 namespace migraphx {
......
@@ -32,7 +32,13 @@
 #include <mlir-c/Dialect/MIGraphX.h>
 #include <mlir-c/IntegerSet.h>
 #include <mlir-c/Pass.h>
-#include <mlir-c/Registration.h>
+#include <mutex>
+#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3
+#warning "Incompatible version of rocMLIR library used, disabling"
+#undef MIGRAPHX_MLIR
+#else
+#include <mlir-c/RegisterRocMLIR.h>
+#endif
 #endif
 
 #include <migraphx/env.hpp>
@@ -50,10 +56,6 @@
 #include <deque>
 #include <variant>
 
-#if defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) && MLIR_MIGRAPHX_DIALECT_API_VERSION >= 2
-#define MIGRAPHX_MLIR_BARE_POINTER
-#endif
-
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
@@ -168,9 +170,11 @@ struct mlir_program
           location(mlirLocationUnknownGet(ctx.get())),
           mmodule(mlirModuleCreateEmpty(location))
     {
-        MlirDialectHandle mixr_handle = mlirGetDialectHandle__migraphx__();
-        mlirDialectHandleRegisterDialect(mixr_handle, ctx.get());
-        mlirRegisterAllDialects(ctx.get());
+        MlirDialectRegistry registry = mlirDialectRegistryCreate();
+        mlirRegisterRocMLIRDialects(registry);
+        mlirContextAppendDialectRegistry(ctx.get(), registry);
+        mlirContextLoadAllAvailableDialects(ctx.get());
+        mlirDialectRegistryDestroy(registry);
         mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
     }
@@ -452,7 +456,8 @@ struct mlir_program
         auto ops = create_operation_state("func.func");
         ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
                             {"sym_name", std::string("main")},
-                            {"kernel", std::string("mixr")}});
+                            {"kernel", std::string("mixr")},
+                            {"arch", target_arch}});
         ops.add_region(std::move(region));
         insert(body, std::move(ops));
@@ -512,7 +517,8 @@ struct mlir_program
         pp =
             problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
         // check if HW supports xdlops
-        bool xdlops = contains(get_xdlops_archs(), target_name);
+        auto target_chip = trim(split_string(target_arch, ':').front());
+        bool xdlops      = contains(get_xdlops_archs(), target_chip);
         std::string tuned = get_tune_params(xdlops);
         if(not tuned.empty())
             ops.add_attributes({{"perf_config", tuned}});
@@ -540,7 +546,7 @@ struct mlir_program
         // 1st pipeline to call
         mlirMIGraphXAddHighLevelPipeline(pm.get());
         // 2nd pipeline to call
-        mlirMIGraphXAddBackendPipeline(pm.get(), target_name.c_str(), "amdgcn-amd-amdhsa", "");
+        mlirMIGraphXAddBackendPipeline(pm.get(), target_arch.c_str());
         mlirPassManagerRun(pm.get(), mmodule.get());
 
         code_object_op op{};
@@ -550,16 +556,7 @@ struct mlir_program
         return op;
     }
 
-    void find_target()
-    {
-        std::string tname = get_device_name();
-        // HACK: Since MLIR can't handle the full target name
-        target_name = trim(split_string(tname, ':').front());
-        if(tname.size() != target_name.size())
-            std::cout
-                << "*************** WARNING: MLIR may not compile the correct target features for: "
-                << tname << std::endl;
-    }
+    void find_target() { target_arch = get_device_name(); }
 
     std::pair<std::size_t, std::size_t> get_launch_params() const
     {
@@ -588,7 +585,7 @@ struct mlir_program
     mlir_module mmodule;
     problem_params pp;
     std::deque<std::string> strings{};
-    std::string target_name;
+    std::string target_arch;
 };
 
 std::string dump_mlir(const module& m)
@@ -650,6 +647,10 @@ code_object_op compile_mlir(const context&, module m, const std::vector<instruct
     const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
     if(trace)
         std::cout << m << std::endl;
+
+    // set mutex while llvm thread support is disabled.
+    static std::mutex g_mlirc_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
     mlir_program mp;
     mp.find_target();
     mp.parse(m);
@@ -669,46 +670,9 @@ instruction_ref insert_mlir(module& m,
 
     std::vector<instruction_ref> refs;
     std::size_t last = 0;
-#ifdef MIGRAPHX_MLIR_BARE_POINTER
     refs.reserve(inputs.size());
     std::copy(inputs.begin(), inputs.end(), std::back_inserter(refs));
     last = refs.size() - 1;
-#else
-    refs.reserve(inputs.size() * 15);
-    std::unordered_map<uint64_t, instruction_ref> literal_map{};
-    auto get_literal = [&](uint64_t value) {
-        auto fi = literal_map.find(value);
-        if(fi != literal_map.end())
-            return fi->second;
-        auto lit = m.add_literal(value);
-        literal_map.emplace(value, lit);
-        return lit;
-    };
-
-    for(auto input : inputs)
-    {
-        const size_t offset = 0;
-        auto s = input->get_shape();
-        last   = refs.size();
-        refs.push_back(input);
-        refs.push_back(input);
-        refs.push_back(get_literal(offset)); // offset
-
-        // dim sizes
-        std::transform(s.lens().begin(),
-                       s.lens().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-        // dim strides
-        std::transform(s.strides().begin(),
-                       s.strides().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-    }
-#endif
 
     co.expected_inputs = to_shapes(refs);
     co.output_arg      = last;
     return m.insert_instruction(ins, co, refs);
......
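
The MLIR backend now keeps the full device architecture string and only splits off the chip name when checking xdlops support, via trim(split_string(target_arch, ':').front()). A standalone sketch of that split, with a hypothetical arch string of the form returned by get_device_name():

#include <cassert>
#include <string>

// Stand-in for the chip-name extraction used in mlir.cpp.
std::string chip_name(const std::string& arch)
{
    return arch.substr(0, arch.find(':'));
}

int main()
{
    // Hypothetical full target string: chip name followed by feature flags.
    assert(chip_name("gfx90a:sramecc+:xnack-") == "gfx90a");
    assert(chip_name("gfx1030") == "gfx1030"); // no features attached
    return 0;
}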
@@ -27,6 +27,7 @@
 #include <migraphx/stringutils.hpp>
 #include <migraphx/permutation.hpp>
 #include <fstream>
+#include <mutex>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -88,6 +89,9 @@ std::string generate_miopen_config(const problem_params& pp)
 
 auto query_miopen_db(const std::string& query)
 {
+    static std::mutex g_db_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_db_mutex);
+
     // TODO: Store db as a static variable
     const auto dbpath = fs::path{"/opt"} / "rocm" / "share" / "miopen" / "db" / "miopen.db";
     // Check if db file exists.
......
@@ -146,8 +146,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         pack_int8_args{},
         dead_code_elimination{},
-        adjust_allocation{gpu_allocation_model{}},
-        dead_code_elimination{},
         fuse_ops{&ctx, options.fast_math},
         dead_code_elimination{},
         replace_allocate{gpu_allocation_model{}, options.offload_copy},
......
@@ -140,7 +140,7 @@ TEST_CASE(conv)
 {
     const std::string mlir_output = R"__migraphx__(
 module {
-  func.func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+  func.func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr"} {
     %0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
     return %0 : tensor<1x2x2x2xf32>
   }
@@ -163,7 +163,7 @@ TEST_CASE(conv_add_relu)
 {
     const std::string mlir_output = R"__migraphx__(
 module {
-  func.func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+  func.func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr"} {
     %0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
     %1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
     %2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
......
@@ -49,6 +49,25 @@ TEST_CASE(literal_test)
     EXPECT(l4.empty());
 }
 
+TEST_CASE(literal_nstd_shape_vector)
+{
+    migraphx::shape nstd_shape{migraphx::shape::float_type, {1, 3, 2, 2}, {12, 1, 6, 3}};
+    std::vector<float> data(12);
+    std::iota(data.begin(), data.end(), 0);
+    auto l0 = migraphx::literal{nstd_shape, data};
+
+    // check data buffer is read in correctly
+    std::vector<float> expected_buffer = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+    const auto* start = reinterpret_cast<const float*>(l0.data());
+    std::vector<float> l0_data{start, start + 12};
+    EXPECT(l0_data == expected_buffer);
+
+    // check that using visit() (that uses a tensor view) gives data in correct order
+    std::vector<float> results_vector(12);
+    l0.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
+    EXPECT(results_vector == data);
+}
+
 TEST_CASE(literal_os1)
 {
     migraphx::literal l{1};
......