gaoqiong / MIGraphX · Commits · 913ae362

Commit 913ae362 (Unverified)
Merge branch 'develop' into optimize

Authored Dec 13, 2022 by Chris Austen; committed by GitHub on Dec 13, 2022.
Parents: f1e16656, b8c8d09b

Showing 20 changed files (of 66 in this commit) with 247 additions and 181 deletions (+247 −181).
src/include/migraphx/shape_for_each.hpp                         +3  −1
src/insert_pad.cpp                                              +2  −2
src/instruction.cpp                                             +18 −0
src/onnx/onnx_parser.cpp                                        +18 −5
src/onnx/parse_pooling.cpp                                      +82 −38
src/onnx/parse_transpose.cpp                                    +1  −1
src/rewrite_rnn.cpp                                             +19 −19
src/shape.cpp                                                   +8  −0
src/targets/gpu/CMakeLists.txt                                  +10 −7
src/targets/gpu/compile_hip.cpp                                 +2  −2
src/targets/gpu/hip.cpp                                         +15 −6
src/targets/gpu/include/migraphx/gpu/hip.hpp                    +3  −3
src/targets/gpu/jit/mlir.cpp                                    +0  −1
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp  +1  −0
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp  +0  −32
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp        +32 −0
src/targets/gpu/mlir.cpp                                        +24 −60
src/targets/gpu/perfdb.cpp                                      +4  −0
src/targets/gpu/prefuse_ops.cpp                                 +5  −2
src/targets/gpu/target.cpp                                      +0  −2
src/include/migraphx/shape_for_each.hpp — view file @ 913ae362

@@ -31,6 +31,9 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 
+/**
+ * Iterates the given function over the indices from the shape in order.
+ */
 template <class F>
 void shape_for_each(const migraphx::shape& s, F f)
 {
@@ -51,7 +54,6 @@ void shape_for_each(const migraphx::shape& s, F f)
         call(indices);
     }
 }
-
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/insert_pad.cpp — view file @ 913ae362

@@ -77,14 +77,14 @@ static void update_pooling(const instruction_ref& input, const instruction_ref&
     {
         return;
     }
-    auto kdims = input->get_shape().lens().size() - 2;
+    auto kdims = input->get_shape().ndim() - 2;
     if(std::equal(op.padding.begin(),
                   op.padding.begin() + kdims,
                   op.padding.begin() + kdims,
                   op.padding.end()))
         return;
-    std::vector<int64_t> padding(input->get_shape().lens().size() * 2, 0);
+    std::vector<int64_t> padding(input->get_shape().ndim() * 2, 0);
     std::vector<size_t> pads_l(op.padding.begin(), op.padding.begin() + kdims);
     std::vector<size_t> pads_r(op.padding.begin() + kdims, op.padding.end());
     op.padding = std::vector<size_t>(kdims * 2, 0);
src/instruction.cpp — view file @ 913ae362

@@ -302,6 +302,24 @@ void instruction::replace_mod_argument(module_ref old, module_ref new_mod)
     std::replace(module_args.begin(), module_args.end(), old, new_mod);
 }
 
+bool instruction::is_undefined() const
+{
+    if(op.name() == "undefined")
+    {
+        return true;
+    }
+    else if(this->inputs().empty())
+    {
+        return false;
+    }
+    else
+    {
+        return std::all_of(this->inputs().begin(), this->inputs().end(), [](auto arg) {
+            return arg->is_undefined();
+        });
+    }
+}
+
 bool instruction::can_eval() const
 {
     if(op.name() == "@literal")
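Reviewer note: the new instruction::is_undefined treats an instruction as undefined not only when its operator is literally "undefined", but also when every one of its inputs is recursively undefined; an input-less instruction with any other operator is defined. A minimal sketch of the same recursion over a hypothetical node type (illustrative stand-ins, not the MIGraphX classes):

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical stand-in for migraphx::instruction, for illustration only.
struct node
{
    std::string op_name;
    std::vector<const node*> inputs;

    bool is_undefined() const
    {
        if(op_name == "undefined")
            return true;
        if(inputs.empty())
            return false;
        // A node fed exclusively by undefined nodes carries no defined value either.
        return std::all_of(
            inputs.begin(), inputs.end(), [](const node* n) { return n->is_undefined(); });
    }
};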
src/onnx/onnx_parser.cpp — view file @ 913ae362

@@ -393,18 +393,31 @@ literal onnx_parser::parse_value(const onnx::AttributeProto& attr) const
 literal onnx_parser::parse_tensor(const onnx::TensorProto& t) const
 {
     std::vector<std::size_t> dims(t.dims().begin(), t.dims().end());
-    if(not t.external_data().empty())
+    auto type = get_type(t.data_type());
+    shape tensor_shape(type, dims);
+    auto external_data = t.external_data();
+    if(not external_data.empty())
     {
-        const std::string& data_file = t.external_data().at(0).value();
-        auto raw_buffer              = read_buffer(path + "/" + data_file);
+        const std::string& data_file = external_data.at(0).value();
+        size_t num_data_fields       = external_data.size();
+        size_t offset                = 0;
+        size_t nbytes                = tensor_shape.bytes();
+        if(num_data_fields > 1) // if offset field is present
+        {
+            offset = std::stoul(t.external_data().at(1).value());
+        }
+        if(num_data_fields > 2) // if nbytes field is present
+        {
+            nbytes = std::stoul(t.external_data().at(2).value());
+        }
+        auto raw_buffer = read_buffer(path + "/" + data_file, offset, nbytes);
         std::string s(raw_buffer.begin(), raw_buffer.end());
-        auto type = get_type(t.data_type());
         return create_literal(type, dims, s.data());
     }
     if(t.has_raw_data())
     {
         const std::string& s = t.raw_data();
-        auto type            = get_type(t.data_type());
         return create_literal(type, dims, s.data());
     }
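Reviewer note: the parser now honors the optional offset and length entries that ONNX external data can carry alongside the file name, reading only the requested byte range of a shared weight file; when absent it falls back to offset 0 and the tensor's own byte size. A sketch of a range-limited read under those assumptions (a hypothetical helper, not the MIGraphX read_buffer):

#include <cstddef>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical sketch: read nbytes starting at offset from a weight file.
std::vector<char> read_range(const std::string& filename, std::size_t offset, std::size_t nbytes)
{
    std::ifstream in(filename, std::ios::binary);
    if(not in)
        throw std::runtime_error("cannot open " + filename);
    in.seekg(static_cast<std::streamoff>(offset));
    std::vector<char> buffer(nbytes);
    in.read(buffer.data(), static_cast<std::streamsize>(nbytes));
    if(in.gcount() != static_cast<std::streamsize>(nbytes))
        throw std::runtime_error("short read from " + filename);
    return buffer;
}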
src/onnx/parse_pooling.cpp — view file @ 913ae362

@@ -47,52 +47,42 @@ struct parse_pooling : op_parser<parse_pooling>
         {"GlobalLpPool", "lpnorm"}};
     }
 
-    instruction_ref parse(const op_desc& opd,
-                          const onnx_parser& /*parser*/,
-                          onnx_parser::node_info info,
-                          std::vector<instruction_ref> args) const
+    value handle_values(const op_desc& opd,
+                        onnx_parser::node_info info,
+                        const shape& in_shape,
+                        value values) const
     {
-        const std::unordered_map<std::string, op::pooling_mode> mode_map = {
-            {"max", op::pooling_mode::max},
-            {"average", op::pooling_mode::average},
-            {"lpnorm", op::pooling_mode::lpnorm}};
-        std::string mode = opd.op_name;
-        if(not contains(mode_map, mode))
-        {
-            MIGRAPHX_THROW("onnx pooling mode must be [\"max\", \"average\", \"lpnorm\"]");
-        }
-        operation op = make_op("pooling", {{"mode", mode_map.at(mode)}});
-        value values = op.to_value();
-        auto l0      = args[0];
-        auto in_lens = l0->get_shape().lens();
-        assert(in_lens.size() > 2);
-        auto kdims = in_lens.size() - 2;
+        auto kdims = in_shape.ndim() - 2;
         if(starts_with(opd.onnx_name, "Global"))
         {
-            values["lengths"] = std::vector<size_t>(in_lens.begin() + 2, in_lens.end());
+            // if spatial dimensions are dynamic use dyn_global flag
+            if(in_shape.dynamic() and std::any_of(in_shape.dyn_dims().cbegin() + 2,
+                                                  in_shape.dyn_dims().cend(),
+                                                  [](auto dd) { return not dd.is_fixed(); }))
+            {
+                values["dyn_global"] = true;
+                values["lengths"]    = std::vector<size_t>();
+            }
+            else
+            {
+                // works with static and fixed dynamic shape
+                auto m_lens       = in_shape.max_lens();
+                values["lengths"] = std::vector<size_t>(m_lens.begin() + 2, m_lens.end());
+            }
         }
 
         // does not support ceil_mode
         if(contains(info.attributes, "ceil_mode"))
         {
             values["ceil_mode"] = static_cast<bool>(info.attributes.at("ceil_mode").i());
         }
 
-        // count include padding, if count include pad is 1, we always use
-        // explicit pad
-        int count_include_pad = 0;
-        if(contains(info.attributes, "count_include_pad"))
-        {
-            count_include_pad = info.attributes.at("count_include_pad").i();
-        }
-
         if(contains(info.attributes, "strides"))
         {
             values["stride"].clear();
             copy(info.attributes["strides"].ints(), std::back_inserter(values["stride"]));
             check_attr_sizes(kdims, values["stride"].size(), "PARSE_POOLING: inconsistent strides");
         }
 
         if(contains(info.attributes, "kernel_shape"))
         {
             values["lengths"].clear();
@@ -110,6 +100,46 @@ struct parse_pooling : op_parser<parse_pooling>
         // ensure pads availabe only when auto_pad is "NOT_SET"
         check_padding_mode(info, "POOLING");
 
+        return values;
+    }
+
+    instruction_ref parse(const op_desc& opd,
+                          const onnx_parser& /*parser*/,
+                          onnx_parser::node_info info,
+                          std::vector<instruction_ref> args) const
+    {
+        std::string mode = opd.op_name;
+        const std::unordered_map<std::string, op::pooling_mode> mode_map = {
+            {"max", op::pooling_mode::max},
+            {"average", op::pooling_mode::average},
+            {"lpnorm", op::pooling_mode::lpnorm}};
+        if(not contains(mode_map, mode))
+        {
+            MIGRAPHX_THROW(
+                "PARSE_POOLING: onnx pooling mode must be [\"max\", \"average\", \"lpnorm\"]");
+        }
+        operation op  = make_op("pooling", {{"mode", mode_map.at(mode)}});
+        value values  = op.to_value();
+        auto l0       = args[0];
+        auto in_shape = l0->get_shape();
+        assert(in_shape.ndim() > 2);
+        auto kdims = in_shape.ndim() - 2;
+
+        values = handle_values(opd, info, in_shape, values);
+
+        // count include padding, if count include pad is 1, we always use
+        // explicit pad
+        int count_include_pad = 0;
+        if(contains(info.attributes, "count_include_pad"))
+        {
+            if(in_shape.dynamic())
+            {
+                MIGRAPHX_THROW("PARSE_POOLING: count_include_pad attribute is not supported for "
+                               "dynamic input shape");
+            }
+            count_include_pad = info.attributes.at("count_include_pad").i();
+        }
+
         std::vector<int64_t> paddings;
         float pad_val = ((mode == "max") ? std::numeric_limits<float>::lowest() : 0.0f);
@@ -123,14 +153,22 @@ struct parse_pooling : op_parser<parse_pooling>
         if(contains(info.attributes, "auto_pad"))
         {
-            values["padding"].clear();
-            // return paddings could be empty, then setting to 0 for no padding
-            cal_auto_padding_size(info,
-                                  values,
-                                  values["lengths"].to_vector<std::size_t>(),
-                                  {1, 1},
-                                  in_lens,
-                                  paddings);
+            if(in_shape.dynamic())
+            {
+                MIGRAPHX_THROW(
+                    "PARSE_POOLING: Auto padding pooling with dynamic input shape not supported");
+            }
+            else
+            {
+                values["padding"].clear();
+                // return paddings could be empty, then setting to 0 for no padding
+                cal_auto_padding_size(info,
+                                      values,
+                                      values["lengths"].to_vector<std::size_t>(),
+                                      {1, 1},
+                                      in_shape.lens(),
+                                      paddings);
+            }
         }
 
         if(paddings.size() != 2 * kdims)
@@ -150,6 +188,7 @@ struct parse_pooling : op_parser<parse_pooling>
             values["stride"].resize(kdims);
             std::fill_n(values["stride"].begin(), kdims, 1);
         }
+
         // used to calculate the supposed output shape
         std::vector<int64_t> orig_padding = paddings;
@@ -159,6 +198,11 @@ struct parse_pooling : op_parser<parse_pooling>
         if(not slice_start.empty())
         {
+            if(in_shape.dynamic())
+            {
+                MIGRAPHX_THROW(
+                    "PARSE_POOLING: asymmetric padding not supported for dynamic input shape");
+            }
             // calculate expected output shape
             orig_padding.insert(orig_padding.begin() + kdims, 2, 0);
             orig_padding.insert(orig_padding.begin(), 2, 0);
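Reviewer note: for global pooling, dyn_global is only set when some spatial dimension is dynamic and not fixed; a dynamic dimension whose bounds coincide still takes the static path through max_lens(). A sketch of that predicate over a simplified dimension type (illustrative, not the MIGraphX shape class):

#include <algorithm>
#include <cstddef>
#include <vector>

// Simplified stand-in for shape::dynamic_dimension.
struct dyn_dim
{
    std::size_t min;
    std::size_t max;
    bool is_fixed() const { return min == max; }
};

// True when any spatial dimension (everything past batch and channel)
// can still vary at evaluation time.
bool needs_dyn_global(const std::vector<dyn_dim>& dims)
{
    return std::any_of(
        dims.cbegin() + 2, dims.cend(), [](const dyn_dim& dd) { return not dd.is_fixed(); });
}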
src/onnx/parse_transpose.cpp — view file @ 913ae362

@@ -47,7 +47,7 @@ struct parse_transpose : op_parser<parse_transpose>
     }
 
     // if perm is empty, use the default value
-    auto n_dim = args.front()->get_shape().lens().size();
+    auto n_dim = args.front()->get_shape().ndim();
     if(perm.empty())
     {
         perm.resize(n_dim);
src/rewrite_rnn.cpp — view file @ 913ae362

@@ -92,7 +92,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // process sequence length
     instruction_ref seq_lens = m.end();
-    if((args.size() >= 5) && args[4]->name() != "undefined")
+    if((args.size() >= 5) and not args[4]->is_undefined())
     {
         seq_lens = args[4];
     }
@@ -117,7 +117,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // process bias
     instruction_ref bias_forward = m.end();
     instruction_ref bias_reverse = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[3]);
@@ -129,7 +129,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // or the 5th one (if the sequence len argument is ignored)
     instruction_ref ih_forward{};
     instruction_ref ih_reverse{};
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[5]);
@@ -195,14 +195,14 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
     // process bias and initial hidden state
     instruction_ref bias = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias = args[3];
     }
 
     // process intial hidden state
     instruction_ref ih;
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih = args[5];
     }
@@ -398,7 +398,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // process sequence length
     instruction_ref seq_lens = m.end();
-    if((args.size() >= 5) && args[4]->name() != "undefined")
+    if((args.size() >= 5) and not args[4]->is_undefined())
     {
         seq_lens = args[4];
     }
@@ -423,7 +423,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // bias
     instruction_ref bias_forward = m.end();
     instruction_ref bias_reverse = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[3]);
@@ -434,7 +434,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // intial hidden state
     instruction_ref ih_forward{};
     instruction_ref ih_reverse{};
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[5]);
@@ -501,14 +501,14 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
     // bias
     instruction_ref bias = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias = args[3];
     }
 
     // intial hidden state
     instruction_ref ih{};
-    if(args.size() == 6 && args[5]->name() != "undefined")
+    if(args.size() == 6 and not args[5]->is_undefined())
     {
         ih = args[5];
     }
@@ -784,7 +784,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process sequence length
     instruction_ref seq_lens = m.end();
-    if((args.size() >= 5) && args[4]->name() != "undefined")
+    if((args.size() >= 5) and not args[4]->is_undefined())
     {
         seq_lens = args[4];
     }
@@ -813,7 +813,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process bias
     instruction_ref bias_forward = m.end();
     instruction_ref bias_reverse = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[3]);
@@ -824,7 +824,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process intial hidden state, it is the 6th argument
     instruction_ref ih_forward{};
     instruction_ref ih_reverse{};
-    if(args.size() >= 6 && args[5]->name() != "undefined")
+    if(args.size() >= 6 and not args[5]->is_undefined())
    {
         ih_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[5]);
@@ -840,7 +840,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process initial cell value
     instruction_ref ic_forward{};
     instruction_ref ic_reverse{};
-    if(args.size() >= 7 && args[6]->name() != "undefined")
+    if(args.size() >= 7 and not args[6]->is_undefined())
     {
         ic_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[6]);
@@ -856,7 +856,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process weight of the peephole
     instruction_ref pph_forward = m.end();
     instruction_ref pph_reverse = m.end();
-    if(args.size() == 8 && args[7]->name() != "undefined")
+    if(args.size() == 8 and not args[7]->is_undefined())
     {
         pph_forward = m.insert_instruction(
             ins,
             make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}),
             args[7]);
@@ -940,14 +940,14 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // bias
     instruction_ref bias = m.end();
-    if(args.size() >= 4 && args[3]->name() != "undefined")
+    if(args.size() >= 4 and not args[3]->is_undefined())
     {
         bias = args[3];
     }
 
     // initial hidden state
     instruction_ref ih{};
-    if(args.size() >= 6 && args[5]->name() != "undefined")
+    if(args.size() >= 6 and not args[5]->is_undefined())
     {
         ih = args[5];
     }
@@ -958,7 +958,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // initial cell value
     instruction_ref ic{};
-    if(args.size() >= 7 && args[6]->name() != "undefined")
+    if(args.size() >= 7 and not args[6]->is_undefined())
     {
         ic = args[6];
     }
@@ -969,7 +969,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
     // process weight of the peephole
     instruction_ref pph = m.end();
-    if(args.size() == 8 && args[7]->name() != "undefined")
+    if(args.size() == 8 and not args[7]->is_undefined())
     {
         pph = args[7];
     }
src/shape.cpp — view file @ 913ae362

@@ -521,6 +521,14 @@ std::ostream& operator<<(std::ostream& os, const shape::dynamic_dimension& x)
     return os;
 }
 
+bool operator==(const shape::dynamic_dimension& x, const std::size_t& y)
+{
+    return x.min == y and x.max == y;
+}
+bool operator==(const std::size_t& x, const shape::dynamic_dimension& y) { return y == x; }
+bool operator!=(const shape::dynamic_dimension& x, const std::size_t& y) { return not(x == y); }
+bool operator!=(const std::size_t& x, const shape::dynamic_dimension& y) { return not(x == y); }
+
 bool operator==(const shape& x, const shape& y)
 {
     if(x.dynamic() and y.dynamic())
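Reviewer note: these overloads let a dynamic_dimension compare directly against a plain extent; equality holds only when both bounds collapse to that value, and the mirrored and negated forms are defined in terms of it. The intended semantics, shown on a simplified type (illustrative):

#include <cassert>
#include <cstddef>

struct dyn_dim
{
    std::size_t min;
    std::size_t max;
};

bool operator==(const dyn_dim& x, std::size_t y) { return x.min == y and x.max == y; }
bool operator!=(const dyn_dim& x, std::size_t y) { return not(x == y); }

int main()
{
    assert((dyn_dim{4, 4} == 4)); // a fixed dimension equals its single extent
    assert((dyn_dim{1, 4} != 4)); // a genuine range equals no single extent
    return 0;
}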
src/targets/gpu/CMakeLists.txt — view file @ 913ae362

@@ -231,14 +231,17 @@ endif()
 include(CheckLibraryExists)
 get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
-# check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
-# if(HAS_FIND_2_API)
-#     target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
-#     message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
-# else()
-#     message(STATUS "MIOpen does not have Find-2.0 API")
-# endif()
+check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
+
+# TODO: Set default to HAS_FIND_2_API
+set(MIGRAPHX_USE_FIND_2_API OFF CACHE BOOL "")
+if(MIGRAPHX_USE_FIND_2_API)
+    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
+else()
+    message(STATUS "MIGraphx is using legacy Find API in MIOpen")
+endif()
 
 if(HAS_FIND_MODE_API)
     target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API)
src/targets/gpu/compile_hip.cpp — view file @ 913ae362

@@ -185,7 +185,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     options.push_back("-fno-gpu-rdc");
     options.push_back(" -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
     options.push_back("-Wno-cuda-compat");
-    options.push_back("--cuda-gpu-arch=" + arch);
+    options.push_back("--offload-arch=" + arch);
     prog.compile(options);
     return {prog.get_code_obj()};
 }
@@ -237,7 +237,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
     }
     else if(is_hip_clang_compiler())
     {
-        params += " --cuda-gpu-arch=" + arch;
+        params += " --offload-arch=" + arch;
         params += " --cuda-device-only";
         params += " -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3") + " ";
     }
src/targets/gpu/hip.cpp — view file @ 913ae362

@@ -196,12 +196,21 @@ argument to_gpu(const argument& arg, bool host)
 argument from_gpu(const argument& arg)
 {
     argument result;
-    arg.visit([&](auto x) {
-        using type = typename decltype(x)::value_type;
-        auto v     = read_from_gpu<type>(arg.data(), x.get_shape().bytes() / sizeof(type));
-        // cppcheck-suppress returnDanglingLifetime
-        result = {x.get_shape(), [v]() mutable { return v.data(); }};
-    });
+    arg.visit(
+        [&](auto x) {
+            using type = typename decltype(x)::value_type;
+            auto v = read_from_gpu<type>(arg.data(), x.get_shape().bytes() / sizeof(type));
+            // cppcheck-suppress returnDanglingLifetime
+            result = {x.get_shape(), [v]() mutable { return v.data(); }};
+        },
+        [&](const auto& xs) {
+            std::vector<argument> args;
+            std::transform(xs.begin(), xs.end(), std::back_inserter(args), [&](auto x) {
+                return from_gpu(x);
+            });
+            result = argument{args};
+        });
     return result;
 }
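Reviewer note: from_gpu gains a second visitor so tuple-typed arguments round-trip too; each sub-argument is copied back recursively and the results repacked. The shape of that recursion, sketched with standard types (hypothetical stand-ins, not the MIGraphX argument class):

#include <algorithm>
#include <iterator>
#include <variant>
#include <vector>

// Hypothetical model: a value is either a flat byte buffer or a tuple of values.
struct value;
using tuple = std::vector<value>;
struct value
{
    std::variant<std::vector<char>, tuple> data;
};

// Copy-back recurses into tuples, mirroring the new from_gpu overload.
value copy_back(const value& v)
{
    if(const auto* t = std::get_if<tuple>(&v.data))
    {
        tuple out;
        std::transform(t->begin(), t->end(), std::back_inserter(out), copy_back);
        return value{out};
    }
    return v; // leaf buffer: the device-to-host copy would happen here
}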
src/targets/gpu/include/migraphx/gpu/hip.hpp — view file @ 913ae362

@@ -105,7 +105,7 @@ struct hip_copy_to_gpu
     std::string name() const { return "hip::copy_to_gpu"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1, 2);
+        check_shapes{inputs, *this}.has(1, 2).same_type();
         return inputs.at(0);
     }
     argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
@@ -131,7 +131,7 @@ struct hip_copy_from_gpu
     std::string name() const { return "hip::copy_from_gpu"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1, 2);
+        check_shapes{inputs, *this}.has(1, 2).same_type();
         return inputs.at(0);
     }
     argument
@@ -159,7 +159,7 @@ struct hip_copy
     std::string name() const { return "hip::copy"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(2);
+        check_shapes{inputs, *this}.has(2).same_type();
         return inputs.at(1);
     }
     argument compute(context& ctx, const shape&, std::vector<argument> args) const
src/targets/gpu/jit/mlir.cpp — view file @ 913ae362

@@ -24,7 +24,6 @@
 #include <migraphx/gpu/compiler.hpp>
-#include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/mlir.hpp>
 
 namespace migraphx {
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp — view file @ 913ae362

@@ -25,6 +25,7 @@
 #define MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
 #include <migraphx/kernels/reduce.hpp>
 #include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/vec.hpp>
 #include <migraphx/kernels/print.hpp>
 
 namespace migraphx {
...
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
View file @
913ae362
...
@@ -33,38 +33,6 @@
...
@@ -33,38 +33,6 @@
namespace
migraphx
{
namespace
migraphx
{
template
<
class
T
>
struct
implicit_conversion_op
{
T
x
;
template
<
index_int
N
,
class
U
>
constexpr
operator
vec
<
U
,
N
>
()
const
{
if
constexpr
(
vec_size
<
T
>
()
==
0
)
{
return
x
;
}
else
{
static_assert
(
vec_size
<
T
>
()
==
N
,
"Vector mismatch size"
);
return
__builtin_convertvector
(
x
,
vec
<
U
,
N
>
);
}
}
template
<
class
U
>
constexpr
operator
U
()
const
{
return
x
;
}
};
template
<
class
T
>
constexpr
implicit_conversion_op
<
T
>
implicit_conversion
(
T
x
)
{
return
{
x
};
}
template
<
class
F
,
class
T
,
class
...
Ts
>
template
<
class
F
,
class
T
,
class
...
Ts
>
__device__
void
pointwise_tensor
(
index
idx
,
F
f
,
T
out
,
Ts
...
xs
)
__device__
void
pointwise_tensor
(
index
idx
,
F
f
,
T
out
,
Ts
...
xs
)
{
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp — view file @ 913ae362

@@ -185,5 +185,37 @@ constexpr auto vec_reduce(T x, Op op)
     }
 }
 
+template <class T>
+struct implicit_conversion_op
+{
+    T x;
+
+    template <index_int N, class U>
+    constexpr operator vec<U, N>() const
+    {
+        if constexpr(vec_size<T>() == 0)
+        {
+            return x;
+        }
+        else
+        {
+            static_assert(vec_size<T>() == N, "Vector mismatch size");
+            return __builtin_convertvector(x, vec<U, N>);
+        }
+    }
+
+    template <class U>
+    constexpr operator U() const
+    {
+        return x;
+    }
+};
+
+template <class T>
+constexpr implicit_conversion_op<T> implicit_conversion(T x)
+{
+    return {x};
+}
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
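Reviewer note: moving implicit_conversion_op from pointwise.hpp into vec.hpp (unchanged) puts the width-checked conversion next to the vec machinery it uses; element type may change across the conversion, but the lane count must already match. A reduced, Clang-specific sketch of the underlying conversion (illustrative; assumes ext_vector_type, as the kernel headers use):

// Minimal sketch using Clang vector extensions.
template <class T, int N>
using vec_t = T __attribute__((ext_vector_type(N)));

int main()
{
    vec_t<int, 4> iv = {1, 2, 3, 4};
    // Same lane count, different element type: lanes convert elementwise.
    vec_t<float, 4> fv = __builtin_convertvector(iv, vec_t<float, 4>);
    return static_cast<int>(fv[0]) - 1; // 0
}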
src/targets/gpu/mlir.cpp — view file @ 913ae362

@@ -32,7 +32,13 @@
 #include <mlir-c/Dialect/MIGraphX.h>
 #include <mlir-c/IntegerSet.h>
 #include <mlir-c/Pass.h>
-#include <mlir-c/Registration.h>
+#include <mutex>
+#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3
+#warning "Incompatible version of rocMLIR library used, disabling"
+#undef MIGRAPHX_MLIR
+#else
+#include <mlir-c/RegisterRocMLIR.h>
+#endif
 #endif
 #include <migraphx/env.hpp>
@@ -50,10 +56,6 @@
 #include <deque>
 #include <variant>
 
-#if defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) && MLIR_MIGRAPHX_DIALECT_API_VERSION >= 2
-#define MIGRAPHX_MLIR_BARE_POINTER
-#endif
-
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
@@ -168,9 +170,11 @@ struct mlir_program
       location(mlirLocationUnknownGet(ctx.get())),
       mmodule(mlirModuleCreateEmpty(location))
 {
-    MlirDialectHandle mixr_handle = mlirGetDialectHandle__migraphx__();
-    mlirDialectHandleRegisterDialect(mixr_handle, ctx.get());
-    mlirRegisterAllDialects(ctx.get());
+    MlirDialectRegistry registry = mlirDialectRegistryCreate();
+    mlirRegisterRocMLIRDialects(registry);
+    mlirContextAppendDialectRegistry(ctx.get(), registry);
+    mlirContextLoadAllAvailableDialects(ctx.get());
+    mlirDialectRegistryDestroy(registry);
     mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
 }
@@ -452,7 +456,8 @@ struct mlir_program
     auto ops = create_operation_state("func.func");
     ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
                         {"sym_name", std::string("main")},
-                        {"kernel", std::string("mixr")}});
+                        {"kernel", std::string("mixr")},
+                        {"arch", target_arch}});
     ops.add_region(std::move(region));
     insert(body, std::move(ops));
@@ -512,7 +517,8 @@ struct mlir_program
         pp = problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
         // check if HW supports xdlops
-        bool xdlops       = contains(get_xdlops_archs(), target_name);
+        auto target_chip  = trim(split_string(target_arch, ':').front());
+        bool xdlops       = contains(get_xdlops_archs(), target_chip);
         std::string tuned = get_tune_params(xdlops);
         if(not tuned.empty())
             ops.add_attributes({{"perf_config", tuned}});
@@ -540,7 +546,7 @@ struct mlir_program
         // 1st pipeline to call
         mlirMIGraphXAddHighLevelPipeline(pm.get());
         // 2nd pipeline to call
-        mlirMIGraphXAddBackendPipeline(pm.get(), target_name.c_str(), "amdgcn-amd-amdhsa", "");
+        mlirMIGraphXAddBackendPipeline(pm.get(), target_arch.c_str());
         mlirPassManagerRun(pm.get(), mmodule.get());
 
         code_object_op op{};
@@ -550,16 +556,7 @@ struct mlir_program
         return op;
     }
 
-    void find_target()
-    {
-        std::string tname = get_device_name();
-        // HACK: Since MLIR can't handle the full target name
-        target_name = trim(split_string(tname, ':').front());
-        if(tname.size() != target_name.size())
-            std::cout << "*************** WARNING: MLIR may not compile the correct target features for: "
-                      << tname << std::endl;
-    }
+    void find_target() { target_arch = get_device_name(); }
 
     std::pair<std::size_t, std::size_t> get_launch_params() const
     {
@@ -588,7 +585,7 @@ struct mlir_program
     mlir_module mmodule;
     problem_params pp;
     std::deque<std::string> strings{};
-    std::string target_name;
+    std::string target_arch;
 };
 
 std::string dump_mlir(const module& m)
@@ -650,6 +647,10 @@ code_object_op compile_mlir(const context&, module m, const std::vector<instruct
     const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
     if(trace)
         std::cout << m << std::endl;
+
+    // set mutex while llvm thread support is disabled.
+    static std::mutex g_mlirc_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
     mlir_program mp;
     mp.find_target();
     mp.parse(m);
@@ -669,46 +670,9 @@ instruction_ref insert_mlir(module& m,
     std::vector<instruction_ref> refs;
     std::size_t last = 0;
-#ifdef MIGRAPHX_MLIR_BARE_POINTER
     refs.reserve(inputs.size());
     std::copy(inputs.begin(), inputs.end(), std::back_inserter(refs));
     last = refs.size() - 1;
-#else
-    refs.reserve(inputs.size() * 15);
-    std::unordered_map<uint64_t, instruction_ref> literal_map{};
-    auto get_literal = [&](uint64_t value) {
-        auto fi = literal_map.find(value);
-        if(fi != literal_map.end())
-            return fi->second;
-        auto lit = m.add_literal(value);
-        literal_map.emplace(value, lit);
-        return lit;
-    };
-
-    for(auto input : inputs)
-    {
-        const size_t offset = 0;
-        auto s              = input->get_shape();
-        last                = refs.size();
-        refs.push_back(input);
-        refs.push_back(input);
-        refs.push_back(get_literal(offset)); // offset
-        // dim sizes
-        std::transform(s.lens().begin(),
-                       s.lens().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-        // dim strides
-        std::transform(s.strides().begin(),
-                       s.strides().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-    }
-#endif
     co.expected_inputs = to_shapes(refs);
     co.output_arg      = last;
     return m.insert_instruction(ins, co, refs);
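Reviewer note: two related changes here — compilation through rocMLIR is serialized behind a static mutex while LLVM thread support is off, and the program now keeps the full device string (for example "gfx90a:sramecc+:xnack-") in target_arch, deriving the bare chip name only where the xdlops lookup needs it. A sketch of that derivation (a hypothetical helper standing in for trim(split_string(arch, ':').front())):

#include <iostream>
#include <string>

// Hypothetical equivalent of trim(split_string(arch, ':').front()).
std::string chip_of(const std::string& arch)
{
    // find() returning npos means no ':' and the whole string is the chip name.
    return arch.substr(0, arch.find(':'));
}

int main()
{
    std::cout << chip_of("gfx90a:sramecc+:xnack-") << '\n'; // prints "gfx90a" (example string)
}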
...
src/targets/gpu/perfdb.cpp
View file @
913ae362
...
@@ -27,6 +27,7 @@
...
@@ -27,6 +27,7 @@
#include <migraphx/stringutils.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/permutation.hpp>
#include <fstream>
#include <fstream>
#include <mutex>
namespace
migraphx
{
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
@@ -88,6 +89,9 @@ std::string generate_miopen_config(const problem_params& pp)
...
@@ -88,6 +89,9 @@ std::string generate_miopen_config(const problem_params& pp)
auto
query_miopen_db
(
const
std
::
string
&
query
)
auto
query_miopen_db
(
const
std
::
string
&
query
)
{
{
static
std
::
mutex
g_db_mutex
;
// NOLINT
const
std
::
lock_guard
<
std
::
mutex
>
lock
(
g_db_mutex
);
// TODO: Store db as a static variable
// TODO: Store db as a static variable
const
auto
dbpath
=
fs
::
path
{
"/opt"
}
/
"rocm"
/
"share"
/
"miopen"
/
"db"
/
"miopen.db"
;
const
auto
dbpath
=
fs
::
path
{
"/opt"
}
/
"rocm"
/
"share"
/
"miopen"
/
"db"
/
"miopen.db"
;
// Check if db file exists.
// Check if db file exists.
...
...
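Reviewer note: the function-local static mutex is the standard C++11 pattern for serializing access to a shared resource — magic statics make the first initialization thread-safe, and every caller holds the guard for the whole query. The pattern in isolation:

#include <mutex>

int shared_counter = 0; // stands in for the shared database handle

int serialized_query()
{
    static std::mutex g_mutex; // initialized once, thread-safely (magic statics)
    const std::lock_guard<std::mutex> lock(g_mutex);
    return ++shared_counter; // only one thread at a time executes past the guard
}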
src/targets/gpu/prefuse_ops.cpp — view file @ 913ae362

@@ -51,17 +51,20 @@ struct layernorm_base
         }
         check_shapes{inputs, static_cast<const Derived&>(*this)}.has(nargs + N);
         auto s = inputs.at(0);
+        auto t = s.type();
+        if(not mods.empty())
+            t = mods.front()->get_output_shapes().front().type();
         if(s.scalar())
         {
             return s;
         }
         else if(s.broadcasted())
         {
-            return {s.type(), s.lens()};
+            return {t, s.lens()};
         }
         else
         {
-            return s.with_lens(s.lens());
+            return s.with_lens(t, s.lens());
         }
     }
 };
src/targets/gpu/target.cpp — view file @ 913ae362

@@ -139,8 +139,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         pack_int8_args{},
         dead_code_elimination{},
-        adjust_allocation{gpu_allocation_model{}},
-        dead_code_elimination{},
         fuse_ops{&ctx, options.fast_math},
         dead_code_elimination{},
         replace_allocate{gpu_allocation_model{}, options.offload_copy},