Commit 9b929d4e authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test

parents c4b1102e 4394e9b3
@@ -83,9 +83,10 @@ struct miopen_convolution
     inline shape compute_shape(const std::vector<shape>& inputs) const
     {
-        check_shapes{inputs, op}.has(4).standard();
+        check_shapes{inputs, op}.has(4);
         std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
-        check_shapes{conv_inputs, op}.max_ndims(5);
+        check_shapes{conv_inputs, *this}.max_ndims(5).packed_layouts(
+            {{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}});
         return migraphx::compute_shape<Op>(op, conv_inputs);
     }
@@ -144,13 +145,10 @@ struct miopen_convolution
 #endif
     }
-    inline void set_conv_descriptor()
+    void set_conv_descriptor()
     {
-        if(cd == nullptr)
-        {
-            cd = (op.name() == "deconvolution") ? make_deconv(op) : make_conv(op);
-        }
+        cd = (op.name() == "deconvolution") ? make_deconv(op) : make_conv(op);
     }
     value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
     {
@@ -239,7 +237,6 @@ struct miopen_convolution
         if(status != miopenStatusSuccess)
             MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed");
         algo = perf.fwd_algo;
-        size_t solution_count;
         status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
...
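The `compute_shape` change above is the heart of the layout work: `.standard()` rejected everything but the default packed layout, while `.packed_layouts(...)` whitelists the packed dimension orders MIOpen can consume, including `{0, 2, 3, 1}` (NHWC). For reference, a minimal standalone sketch of this chained-validator pattern, with stub types rather than MIGraphX's actual `check_shapes`:

// Editorial sketch, not part of the commit: stub types showing the
// builder-style validation chain used by check_shapes above.
#include <cstddef>
#include <stdexcept>
#include <vector>

struct shape_stub
{
    int ndims;
};

struct check_shapes_stub
{
    std::vector<shape_stub> shapes;

    // Each check throws on failure and returns *this, so calls chain:
    // check_shapes_stub{...}.has(2).max_ndims(5);
    const check_shapes_stub& has(std::size_t n) const
    {
        if(shapes.size() != n)
            throw std::runtime_error("wrong number of inputs");
        return *this;
    }

    const check_shapes_stub& max_ndims(int n) const
    {
        for(const auto& s : shapes)
            if(s.ndims > n)
                throw std::runtime_error("too many dimensions");
        return *this;
    }
};

int main()
{
    check_shapes_stub{{{4}, {4}}}.has(2).max_ndims(5); // both checks pass
}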
@@ -105,7 +105,7 @@ struct hip_copy_to_gpu
     std::string name() const { return "hip::copy_to_gpu"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1, 2);
+        check_shapes{inputs, *this}.has(1, 2).same_type();
         return inputs.at(0);
     }
     argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
@@ -131,7 +131,7 @@ struct hip_copy_from_gpu
     std::string name() const { return "hip::copy_from_gpu"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1, 2);
+        check_shapes{inputs, *this}.has(1, 2).same_type();
         return inputs.at(0);
     }
     argument
@@ -159,7 +159,7 @@ struct hip_copy
     std::string name() const { return "hip::copy"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(2);
+        check_shapes{inputs, *this}.has(2).same_type();
         return inputs.at(1);
     }
     argument compute(context& ctx, const shape&, std::vector<argument> args) const
...
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/config.hpp>
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -38,16 +38,19 @@ using namespace migraphx::gpu::gen; // NOLINT
 static const char* const concat_kernel = R"__migraphx__(
 #include <migraphx/kernels/concat.hpp>
 #include <migraphx/kernels/vectorize.hpp>
+#include <migraphx/kernels/ops.hpp>
 #include <args.hpp>
 namespace migraphx {
+${preamble}
 extern "C" {
 __global__ void ${kernel}(${params})
 {
-    transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, auto... xs) {
-        concat<${axis}>(y, xs...);
+    transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, ${concat_params}, auto... xs) {
+        concat<${axis}>(${concat_args})(${post}, y, xs...);
     });
 }
@@ -68,28 +71,42 @@ struct concat_compiler : compiler<concat_compiler>
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        // TODO: Use reduce_dims
+        auto num_of_concat_inputs = v.get("concat_inputs", inputs.size() - 1);
         hip_compile_options options;
         options.inputs = inputs;
         options.output = inputs.back();
         options.params = "-Wno-float-equal";
+        options.kernel_name = v.get("kernel", "concat_kernel");
         auto axis = find_fast_axis(options.inputs);
         auto vec  = vectorize::elements(ctx, axis, options.inputs);
-        options.kernel_name = v.get("kernel", "concat_kernel");
         options.set_launch_params(
             v, compute_global_for(ctx, get_concat_elements(options.inputs) / vec.size, 256));
-        auto src = interpolate_string(concat_kernel,
+        auto src = interpolate_string(
+            concat_kernel,
             {{"kernel", options.kernel_name},
              {"params", enum_params(inputs.size(), "void * private_p")},
              {"args", enum_params(inputs.size(), "private_p")},
+             {"concat_params", enum_params(num_of_concat_inputs, "auto concat_x")},
+             {"concat_args", enum_params(num_of_concat_inputs, "concat_x")},
+             {"post", v.get("post", std::string{"op::id{}"})},
              {"transformers", make_transformer_args(vec)},
+             {"preamble", v.get("preamble", std::string{})},
             {"axis", v.at("axis").to<std::string>()}});
         return compile_hip_code_object(src, options);
     }
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
     {
-        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+        auto v = op.to_value();
+        if(not ins->module_inputs().empty())
+        {
+            auto* pm           = ins->module_inputs().front();
+            v["concat_inputs"] = ins->inputs().size() - pm->get_parameter_names().size();
+            v["preamble"]      = generate_pointwise(*pm, "post_concat");
+            v["post"]          = "MIGRAPHX_LIFT(post_concat)";
+            v["kernel"]        = "concat_" + generate_name_from_ops(*pm) + "_kernel";
+        }
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
     }
 };
...
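`compile_op` above assembles the kernel source by substituting the `${...}` placeholders in `concat_kernel` and then compiles the result as a HIP code object. A standalone sketch of that substitution step, assuming a plain `${key}` syntax (hypothetical `interpolate` helper, not MIGraphX's actual `interpolate_string`):

// Editorial sketch, not part of the commit: ${key} template substitution.
#include <iostream>
#include <map>
#include <string>

std::string interpolate(std::string text, const std::map<std::string, std::string>& vars)
{
    for(const auto& [key, value] : vars)
    {
        const std::string token = "${" + key + "}";
        // Advance past the replacement so a value containing the token
        // cannot cause an infinite loop.
        for(std::size_t pos = text.find(token); pos != std::string::npos;
            pos = text.find(token, pos + value.size()))
            text.replace(pos, token.size(), value);
    }
    return text;
}

int main()
{
    // Prints: __global__ void concat_kernel(void * private_p0, void * private_p1)
    std::cout << interpolate("__global__ void ${kernel}(${params})",
                             {{"kernel", "concat_kernel"},
                              {"params", "void * private_p0, void * private_p1"}})
              << "\n";
}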
@@ -24,7 +24,6 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/mlir.hpp>
 namespace migraphx {
...
@@ -58,7 +58,7 @@ __global__ void ${kernel}(${params})
 struct pointwise_compiler : compiler<pointwise_compiler>
 {
-    std::vector<std::string> names() const { return {"pointwise", "contiguous"}; }
+    std::vector<std::string> names() const { return {"pointwise", "contiguous", "layout"}; }
     static std::size_t oversubscribe_if(bool b)
     {
@@ -91,12 +91,12 @@ struct pointwise_compiler : compiler<pointwise_compiler>
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
     {
-        if(op.name() == "contiguous")
+        if(contains({"layout", "contiguous"}, op.name()))
         {
             return replace(compile_op(
                 ctx,
                 to_shapes(ins->inputs()),
-                {{"lambda", "[](auto x) { return x; }"}, {"kernel", "contiguous_kernel"}}));
+                {{"lambda", "[](auto x) { return x; }"}, {"kernel", op.name() + "_kernel"}}));
         }
         else
         {
...
@@ -41,7 +41,15 @@ constexpr auto concat_slice(Output out, Input, Start)
         return Start{} * output_shape.strides[Axis];
     });
     constexpr auto s = make_shape(lens, strides);
-    return make_tensor_view(&out[offset], s);
+    MIGRAPHX_ASSERT(offset < out.get_shape().element_space());
+    MIGRAPHX_ASSERT((s.element_space() + offset) <= out.get_shape().element_space());
+    return make_tensor_view(out.data() + offset, s);
+}
+
+template <index_int Axis, class Input, class Start, class... Ts>
+constexpr auto concat_slices(Input input, Start start, Ts... xs)
+{
+    return [=](auto f) { f(concat_slice<Axis>(xs, input, start)...); };
 }
 template <index_int Axis, class Input>
@@ -51,15 +59,19 @@ constexpr auto concat_ends(Input)
     return _c<lens[Axis]>;
 }
-template <index_int Axis, class Output, class... Inputs>
-__device__ void concat(Output output, Inputs... inputs)
+template <index_int Axis, class... Inputs>
+__device__ auto concat(Inputs... inputs)
 {
+    return [=](auto f, auto... ts) {
         auto idx = make_index();
         fold([&](auto start, auto input) {
-            auto y = concat_slice<Axis>(output, input, start);
-            idx.global_stride(input.get_shape().elements(), [&](auto i) { y[i] = input[i]; });
+            concat_slices<Axis>(input, start, ts...)([&](auto y, auto... xs) {
+                idx.global_stride(input.get_shape().elements(),
+                                  [&](auto i) { y[i] = f(input[i], xs[i]...); });
+            });
            return start + concat_ends<Axis>(input);
        })(_c<0>, inputs...);
+    };
 }
 } // namespace migraphx
...
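`concat` is now a higher-order device function: instead of writing the inputs directly, it returns a lambda that takes a post-op `f` (plus any extra fused tensors) and applies `f` to every element it copies, which is what lets the compiler fuse a trailing pointwise module into the concat kernel. A host-side C++ analogue of the pattern, using plain vectors instead of tensor views:

// Editorial sketch, not part of the commit: host-side analogue of the
// fused concat above; each input is copied at a running offset with a
// post-op f applied to every element.
#include <cassert>
#include <vector>

template <class F, class... Inputs>
std::vector<int> concat_with_post(F f, const Inputs&... inputs)
{
    std::vector<int> out;
    // Left fold over the inputs, like fold(...)(_c<0>, inputs...) in the kernel.
    (..., [&](const std::vector<int>& in) {
        for(int x : in)
            out.push_back(f(x));
    }(inputs));
    return out;
}

int main()
{
    auto r = concat_with_post([](int x) { return 2 * x; },
                              std::vector<int>{1, 2},
                              std::vector<int>{3});
    assert((r == std::vector<int>{2, 4, 6}));
}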
@@ -25,6 +25,7 @@
 #define MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
 #include <migraphx/kernels/reduce.hpp>
 #include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/vec.hpp>
 #include <migraphx/kernels/print.hpp>
 namespace migraphx {
...
@@ -33,38 +33,6 @@
 namespace migraphx {
-
-template <class T>
-struct implicit_conversion_op
-{
-    T x;
-
-    template <index_int N, class U>
-    constexpr operator vec<U, N>() const
-    {
-        if constexpr(vec_size<T>() == 0)
-        {
-            return x;
-        }
-        else
-        {
-            static_assert(vec_size<T>() == N, "Vector mismatch size");
-            return __builtin_convertvector(x, vec<U, N>);
-        }
-    }
-
-    template <class U>
-    constexpr operator U() const
-    {
-        return x;
-    }
-};
-
-template <class T>
-constexpr implicit_conversion_op<T> implicit_conversion(T x)
-{
-    return {x};
-}
 template <class F, class T, class... Ts>
 __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
 {
...
@@ -185,5 +185,37 @@ constexpr auto vec_reduce(T x, Op op)
     }
 }
+
+template <class T>
+struct implicit_conversion_op
+{
+    T x;
+
+    template <index_int N, class U>
+    constexpr operator vec<U, N>() const
+    {
+        if constexpr(vec_size<T>() == 0)
+        {
+            return x;
+        }
+        else
+        {
+            static_assert(vec_size<T>() == N, "Vector mismatch size");
+            return __builtin_convertvector(x, vec<U, N>);
+        }
+    }
+
+    template <class U>
+    constexpr operator U() const
+    {
+        return x;
+    }
+};
+
+template <class T>
+constexpr implicit_conversion_op<T> implicit_conversion(T x)
+{
+    return {x};
+}
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
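The `implicit_conversion_op` block moved out of the pointwise header into vec.hpp so other kernels can reuse it: it wraps a value and converts it to either a scalar or a size-matched vector at the point of assignment. A small standalone illustration of the same trick; it requires clang (as hipcc is), since it relies on the `ext_vector_type` attribute and `__builtin_convertvector` seen above:

// Editorial sketch, not part of the commit; compile with clang.
#include <cassert>

template <class T, int N>
using vec = T __attribute__((ext_vector_type(N)));

// Wrapper whose templated conversion operator lets the *destination*
// type pick the element type, as implicit_conversion_op does above.
template <class T>
struct convert_on_assign
{
    T x;

    template <class U, int N>
    operator vec<U, N>() const
    {
        return __builtin_convertvector(x, vec<U, N>);
    }
};

int main()
{
    vec<int, 4> iv   = {1, 2, 3, 4};
    vec<float, 4> fv = convert_on_assign<vec<int, 4>>{iv}; // int4 -> float4
    assert(fv[2] == 3.0f);
}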
@@ -29,19 +29,14 @@
 #include <migraphx/instruction_ref.hpp>
 #include <migraphx/stringutils.hpp>
-#include <migraphx/op/convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/if_op.hpp>
 #include <migraphx/op/reshape.hpp>
-#include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/quant_dot.hpp>
 #include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/gpu/int8_conv_pack.hpp>
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/compiler.hpp>
@@ -109,9 +104,9 @@ struct miopen_apply
         add_extend_op("scatter_none");
         add_extend_op("topk");
-        add_convolution_op<op::convolution>("convolution");
-        add_convolution_op<op::deconvolution>("deconvolution");
-        add_convolution_op<op::quant_convolution>("quant_convolution");
+        add_convolution_op("convolution");
+        add_convolution_op("deconvolution");
+        add_convolution_op("quant_convolution");
         add_gemm_op<op::dot>("dot");
         add_gemm_op<op::quant_dot>("quant_dot");
         add_if_op();
@@ -238,34 +233,19 @@ struct miopen_apply
         });
     }

-    template <typename Op>
     void add_convolution_op(const std::string& name)
     {
         apply_map.emplace(name, [=](instruction_ref ins) {
-            operation conv =
-                miopen_convolution<Op>{any_cast<Op>(ins->get_operator()), int8_x4_format};
-            migraphx::context ctx = get_context();
-            size_t ws_bytes       = 0;
-            auto compile_conv_with_format = [&](bool format) {
-                conv     = miopen_convolution<Op>{any_cast<Op>(ins->get_operator()), format};
-                auto ws  = conv.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));
-                ws_bytes = ws.get("workspace", 0);
-            };
-            try
-            { // for the regular convolution and deconvolution, this try would always succeed
-                compile_conv_with_format(int8_x4_format);
-            }
-            catch(migraphx::exception&)
-            {
-                // In case no solver supports the default format, retry using the other format.
-                compile_conv_with_format(not int8_x4_format);
-            }
-            auto args = ins->inputs();
+            operation conv = make_op(
+                "gpu::" + name,
+                {{"op", ins->get_operator().to_value()}, {"int8_x4_format", int8_x4_format}});

             auto output = insert_allocation(ins, ins->get_shape());
-            auto workspace = insert_allocation(ins, shape{shape::int8_type, {ws_bytes}});
-            return mod->replace_instruction(ins, conv, args[0], args[1], workspace, output);
+            return mod->replace_instruction(ins,
+                                            make_op("gpu::miopen_op", {{"op", to_value(conv)}}),
+                                            ins->inputs().at(0),
+                                            ins->inputs().at(1),
+                                            output);
         });
     }
...
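The net effect of this hunk: lowering no longer compiles the convolution eagerly (the try/catch format fallback and the workspace allocation are gone), it only records a `gpu::*` operation built from a value dictionary; the new `compile_miopen` pass added to target.cpp below finalizes it and allocates the workspace. A sketch of the name-plus-value construction, assuming only the `make_op`/`to_value` calls already used in the hunk:

// Editorial sketch, not part of the commit: building a gpu op from a
// value dictionary instead of instantiating a C++ template per op type.
#include <migraphx/make_op.hpp>
#include <migraphx/operation.hpp>

migraphx::operation make_gpu_conv(const migraphx::operation& conv, bool int8_x4)
{
    // "gpu::convolution" carries the CPU op's attributes in its "op" field,
    // mirroring make_op("gpu::" + name, ...) in the hunk above.
    return migraphx::make_op(
        "gpu::convolution", {{"op", conv.to_value()}, {"int8_x4_format", int8_x4}});
}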
@@ -32,7 +32,13 @@
 #include <mlir-c/Dialect/MIGraphX.h>
 #include <mlir-c/IntegerSet.h>
 #include <mlir-c/Pass.h>
-#include <mlir-c/Registration.h>
+#include <mutex>
+#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3
+#warning "Incompatible version of rocMLIR library used, disabling"
+#undef MIGRAPHX_MLIR
+#else
+#include <mlir-c/RegisterRocMLIR.h>
+#endif
 #endif
 #include <migraphx/env.hpp>
@@ -50,10 +56,6 @@
 #include <deque>
 #include <variant>
-#if defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) && MLIR_MIGRAPHX_DIALECT_API_VERSION >= 2
-#define MIGRAPHX_MLIR_BARE_POINTER
-#endif
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
@@ -101,7 +103,10 @@ struct mlir_handle
     mlir_handle(T p) : handle(ptr{p}) {}

-    T get() const { return handle.get().get(); }
+    T get() const
+    {
+        return handle.get().get(); // NOLINT(readability-redundant-smartptr-get)
+    }

     T release() { return handle.release().get(); }
@@ -165,9 +170,11 @@ struct mlir_program
         location(mlirLocationUnknownGet(ctx.get())),
         mmodule(mlirModuleCreateEmpty(location))
     {
-        MlirDialectHandle mixr_handle = mlirGetDialectHandle__migraphx__();
-        mlirDialectHandleRegisterDialect(mixr_handle, ctx.get());
-        mlirRegisterAllDialects(ctx.get());
+        MlirDialectRegistry registry = mlirDialectRegistryCreate();
+        mlirRegisterRocMLIRDialects(registry);
+        mlirContextAppendDialectRegistry(ctx.get(), registry);
+        mlirContextLoadAllAvailableDialects(ctx.get());
+        mlirDialectRegistryDestroy(registry);
         mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
     }
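Dialect registration now goes through rocMLIR's single entry point instead of per-dialect handles. The registry is a short-lived staging object: the context copies its contents on append, so destroying it right away is safe. A sketch of that C-API lifecycle, reusing the calls from the hunk (assumes rocMLIR's `RegisterRocMLIR.h` is available):

// Editorial sketch, not part of the commit: registry lifecycle used above.
#include <mlir-c/IR.h>
#include <mlir-c/RegisterRocMLIR.h>

void register_rocmlir_dialects(MlirContext ctx)
{
    MlirDialectRegistry registry = mlirDialectRegistryCreate();
    mlirRegisterRocMLIRDialects(registry);
    mlirContextAppendDialectRegistry(ctx, registry); // context copies the registry
    mlirContextLoadAllAvailableDialects(ctx);        // load now instead of lazily
    mlirDialectRegistryDestroy(registry);            // safe: context owns a copy
}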
@@ -449,7 +456,8 @@ struct mlir_program
         auto ops = create_operation_state("func.func");
         ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
                             {"sym_name", std::string("main")},
-                            {"kernel", std::string("mixr")}});
+                            {"kernel", std::string("mixr")},
+                            {"arch", target_arch}});
         ops.add_region(std::move(region));
         insert(body, std::move(ops));
@@ -509,7 +517,8 @@ struct mlir_program
         pp =
             problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
         // check if HW supports xdlops
-        bool xdlops = contains(get_xdlops_archs(), target_name);
+        auto target_chip = trim(split_string(target_arch, ':').front());
+        bool xdlops      = contains(get_xdlops_archs(), target_chip);
         std::string tuned = get_tune_params(xdlops);
         if(not tuned.empty())
             ops.add_attributes({{"perf_config", tuned}});
@@ -537,7 +546,7 @@ struct mlir_program
         // 1st pipeline to call
         mlirMIGraphXAddHighLevelPipeline(pm.get());
         // 2nd pipeline to call
-        mlirMIGraphXAddBackendPipeline(pm.get(), target_name.c_str(), "amdgcn-amd-amdhsa", "");
+        mlirMIGraphXAddBackendPipeline(pm.get(), target_arch.c_str());
         mlirPassManagerRun(pm.get(), mmodule.get());
         code_object_op op{};
@@ -547,16 +556,7 @@ struct mlir_program
         return op;
     }

-    void find_target()
-    {
-        std::string tname = get_device_name();
-        // HACK: Since MLIR can't handle the full target name
-        target_name = trim(split_string(tname, ':').front());
-        if(tname.size() != target_name.size())
-            std::cout
-                << "*************** WARNING: MLIR may not compile the correct target features for: "
-                << tname << std::endl;
-    }
+    void find_target() { target_arch = get_device_name(); }

     std::pair<std::size_t, std::size_t> get_launch_params() const
     {
@@ -585,7 +585,7 @@ struct mlir_program
     mlir_module mmodule;
     problem_params pp;
     std::deque<std::string> strings{};
-    std::string target_name;
+    std::string target_arch;
 };
 std::string dump_mlir(const module& m)
@@ -647,6 +647,10 @@ code_object_op compile_mlir(const context&, module m, const std::vector<instruct
     const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
     if(trace)
         std::cout << m << std::endl;
+    // set mutex while llvm thread support is disabled.
+    static std::mutex g_mlirc_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
     mlir_program mp;
     mp.find_target();
     mp.parse(m);
@@ -666,46 +670,9 @@ instruction_ref insert_mlir(module& m,
     std::vector<instruction_ref> refs;
     std::size_t last = 0;
-#ifdef MIGRAPHX_MLIR_BARE_POINTER
     refs.reserve(inputs.size());
     std::copy(inputs.begin(), inputs.end(), std::back_inserter(refs));
     last = refs.size() - 1;
-#else
-    refs.reserve(inputs.size() * 15);
-    std::unordered_map<uint64_t, instruction_ref> literal_map{};
-    auto get_literal = [&](uint64_t value) {
-        auto fi = literal_map.find(value);
-        if(fi != literal_map.end())
-            return fi->second;
-        auto lit = m.add_literal(value);
-        literal_map.emplace(value, lit);
-        return lit;
-    };
-    for(auto input : inputs)
-    {
-        const size_t offset = 0;
-        auto s              = input->get_shape();
-        last                = refs.size();
-        refs.push_back(input);
-        refs.push_back(input);
-        refs.push_back(get_literal(offset)); // offset
-        // dim sizes
-        std::transform(s.lens().begin(),
-                       s.lens().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-        // dim strides
-        std::transform(s.strides().begin(),
-                       s.strides().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-    }
-#endif
     co.expected_inputs = to_shapes(refs);
     co.output_arg      = last;
     return m.insert_instruction(ins, co, refs);
...
@@ -27,6 +27,7 @@
 #include <migraphx/stringutils.hpp>
 #include <migraphx/permutation.hpp>
 #include <fstream>
+#include <mutex>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -88,6 +89,9 @@ std::string generate_miopen_config(const problem_params& pp)
 auto query_miopen_db(const std::string& query)
 {
+    static std::mutex g_db_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_db_mutex);
+
     // TODO: Store db as a static variable
     const auto dbpath = fs::path{"/opt"} / "rocm" / "share" / "miopen" / "db" / "miopen.db";
     // Check if db file exists.
...
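Both this hunk and the `compile_mlir` one above use the same idiom: a function-local static mutex plus a scoped `lock_guard`, so every caller of the function is serialized without introducing a global. A standalone sketch:

// Editorial sketch, not part of the commit: function-local static mutex.
#include <mutex>
#include <thread>
#include <vector>

int counter = 0; // stands in for the non-thread-safe db/MLIR work

void serialized_work()
{
    // Initialized once, shared by all callers; same pattern as
    // g_db_mutex and g_mlirc_mutex in the hunks above.
    static std::mutex m;
    const std::lock_guard<std::mutex> lock(m);
    ++counter; // critical section
}

int main()
{
    std::vector<std::thread> threads;
    for(int i = 0; i < 4; ++i)
        threads.emplace_back(serialized_work);
    for(auto& t : threads)
        t.join();
    return counter == 4 ? 0 : 1;
}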
@@ -51,17 +51,20 @@ struct layernorm_base
         }
         check_shapes{inputs, static_cast<const Derived&>(*this)}.has(nargs + N);
         auto s = inputs.at(0);
+        auto t = s.type();
+        if(not mods.empty())
+            t = mods.front()->get_output_shapes().front().type();
         if(s.scalar())
         {
             return s;
         }
         else if(s.broadcasted())
         {
-            return {s.type(), s.lens()};
+            return {t, s.lens()};
         }
         else
         {
-            return s.with_lens(s.lens());
+            return s.with_lens(t, s.lens());
         }
     }
 };
...
@@ -35,6 +35,7 @@
 #include <migraphx/fuse_pointwise.hpp>
 #include <migraphx/inline_module.hpp>
 #include <migraphx/insert_pad.hpp>
+#include <migraphx/layout_nhwc.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/normalize_ops.hpp>
 #include <migraphx/preallocate_param.hpp>
@@ -50,6 +51,7 @@
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/gpu/allocation_model.hpp>
+#include <migraphx/gpu/compile_miopen.hpp>
 #include <migraphx/gpu/compile_ops.hpp>
 #include <migraphx/gpu/concat_gpu_opt.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -70,6 +72,7 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)

 struct id_pass
 {
@@ -120,6 +123,9 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         simplify_algebra{},
         simplify_reshapes{},
+        enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
+        dead_code_elimination{},
+        simplify_reshapes{},
         simplify_algebra{},
         prefuse_ops{},
         dead_code_elimination{},
@@ -136,6 +142,8 @@
         dead_code_elimination{},
         eliminate_concat{concat_gpu_optimization{}},
         dead_code_elimination{},
+        compile_miopen{&gctx},
+        dead_code_elimination{},
         pack_int8_args{},
         dead_code_elimination{},
         fuse_ops{&ctx, options.fast_math},
...
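The NHWC rewrite is opt-in: `layout_nhwc` only runs when the `MIGRAPHX_ENABLE_NHWC` environment variable is set, and the extra `dead_code_elimination`/`simplify_reshapes` passes clean up the layout transposes it inserts. A standalone sketch of the kind of truthy-env check `enabled(...)` performs (hypothetical helper, not MIGraphX's implementation):

// Editorial sketch, not part of the commit: env-var gate for a pass.
#include <cstdlib>
#include <string>

bool env_enabled(const char* name)
{
    const char* v = std::getenv(name);
    return v != nullptr && std::string{v} != "0";
}

int main() { return env_enabled("MIGRAPHX_ENABLE_NHWC") ? 0 : 1; }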
@@ -383,9 +383,9 @@ struct ref_gemm
     std::string name() const { return "ref::dot"; }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    argument compute(context&, const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
         migemm(result, args[0], args[1], 1.0f, 0.0f);
         return result;
@@ -449,10 +449,10 @@ struct ref_softmax : auto_register_op<ref_softmax<Op>>
     {
         return op.normalize_compute_shape(inputs);
     }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    argument compute(context&, const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
-        auto batch_lens = output_shape.lens();
+        argument result{dyn_out.computed_shape};
+        auto batch_lens    = dyn_out.computed_shape.lens();
         int64_t tuned_axis = tune_axis(args[0].get_shape().lens().size(), op.axis, op.name());
         std::size_t n_dims = batch_lens[tuned_axis];
         batch_lens[tuned_axis] = 1;
@@ -475,7 +475,7 @@ struct ref_softmax : auto_register_op<ref_softmax<Op>>
         for(std::size_t j = 0; j < n_dims; ++j)
         {
             idx[tuned_axis] = j;
-            std::size_t index = output_shape.index(idx);
+            std::size_t index = dyn_out.computed_shape.index(idx);
             output[index] = std::exp(input[index] - batch_max[i]);
         }
...
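These ref ops now receive a `dyn_output` instead of a static output `shape`, so shapes that are only known at run time (dynamic dimensions) reach `compute` through `dyn_out.computed_shape`. A minimal standalone sketch of the calling convention, with stub types standing in for MIGraphX's:

// Editorial sketch, not part of the commit: stub dyn_output convention.
#include <cassert>
#include <vector>

struct shape_stub
{
    std::vector<int> lens;
};

// Mirrors dyn_output above: carries the shape computed for this call.
struct dyn_output_stub
{
    shape_stub computed_shape;
};

int element_count(const dyn_output_stub& dyn_out)
{
    int n = 1;
    for(int l : dyn_out.computed_shape.lens)
        n *= l;
    return n;
}

int main()
{
    assert(element_count({{{2, 3}}}) == 6);
}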
@@ -272,6 +272,35 @@ TEST_CASE(contiguous_input)
     EXPECT(p1 == p2);
 }

+TEST_CASE(contiguous_boolean_input)
+{
+    migraphx::shape s{migraphx::shape::bool_type, {2, 3}};
+    migraphx::shape s_lit{migraphx::shape::bool_type, {1}, {0}};
+    migraphx::program p1;
+    {
+        auto* mm = p1.get_main_module();
+        auto x   = mm->add_parameter("x", s);
+        auto one = mm->add_literal(migraphx::literal(s_lit, {1.0}));
+        auto yb  = mm->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), one);
+        auto y    = mm->add_instruction(migraphx::make_op("contiguous"), yb);
+        auto xor1 = mm->add_instruction(migraphx::make_op("logical_xor"), x, y);
+        mm->add_return({xor1});
+    }
+    run_pass(p1);
+
+    migraphx::program p2;
+    {
+        auto* mm  = p2.get_main_module();
+        auto x    = mm->add_parameter("x", s);
+        auto xor1 = add_pointwise(p2, "main:pointwise0", {x}, [=](auto* pm, const auto& inputs) {
+            auto y = pm->add_literal(migraphx::literal(s_lit, {1}));
+            return pm->add_instruction(migraphx::make_op("logical_xor"), inputs[0], y);
+        });
+        mm->add_return({xor1});
+    }
+    EXPECT(p1 == p2);
+}

 TEST_CASE(all_scalar_input)
 {
     migraphx::shape s{migraphx::shape::float_type};
...
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <test.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/gpu/target.hpp>
TEST_CASE(tuple_to_from_gpu)
{
migraphx::shape s1{migraphx::shape::float_type, {2, 3}};
migraphx::shape s2{migraphx::shape::int32_type, {2, 4}};
std::vector<float> p1_data = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6};
std::vector<int> p2_data = {1, 2, 3, 4, 5, 6, 7, 8};
auto p1 = migraphx::argument{s1, p1_data.data()};
auto p2 = migraphx::argument{s2, p2_data.data()};
auto p1_gpu = migraphx::gpu::to_gpu(p1);
auto p2_gpu = migraphx::gpu::to_gpu(p2);
auto p_tuple = migraphx::gpu::from_gpu(migraphx::argument({p1_gpu, p2_gpu}));
std::vector<migraphx::argument> results = p_tuple.get_sub_objects();
std::vector<float> result1;
results[0].visit([&](auto output) { result1.assign(output.begin(), output.end()); });
std::vector<int> result2;
results[1].visit([&](auto output) { result2.assign(output.begin(), output.end()); });
EXPECT(result1 == p1_data);
EXPECT(result2 == p2_data);
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
@@ -140,7 +140,7 @@ TEST_CASE(conv)
 {
     const std::string mlir_output = R"__migraphx__(
 module {
-  func.func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+  func.func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr"} {
     %0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
     return %0 : tensor<1x2x2x2xf32>
   }
@@ -163,7 +163,7 @@ TEST_CASE(conv_add_relu)
 {
     const std::string mlir_output = R"__migraphx__(
 module {
-  func.func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+  func.func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr"} {
     %0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
     %1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
     %2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
...
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/make_op.hpp>
#include "test.hpp"
TEST_CASE(check_undefined)
{
migraphx::module m;
auto und = m.add_instruction(migraphx::make_op("undefined"));
auto cov = m.add_instruction(
migraphx::make_op("convert", {{"target_type", migraphx::shape::half_type}}), und);
auto abs = m.add_instruction(migraphx::make_op("abs"), cov);
migraphx::shape xs{migraphx::shape::float_type, {2, 3}};
std::vector<float> datax = {1, 2, 3, 4, 5, 6};
auto lit = m.add_literal(migraphx::literal(xs, datax));
auto mul = m.add_instruction(migraphx::make_op("mul"), lit, lit);
EXPECT(und->is_undefined());
EXPECT(cov->is_undefined());
EXPECT(abs->is_undefined());
EXPECT(not lit->is_undefined());
EXPECT(not mul->is_undefined());
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }