Unverified commit 23cb7917, authored by Brian Pickrell, committed by GitHub

Merge branch 'develop' into blas_tuning

parents b5fcc0bc ea32ca70
@@ -28,33 +28,45 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-auto& compiler_map()
+namespace {
+struct compiler_handle
 {
-    static std::unordered_map<std::string, compiler_compile> m; // NOLINT
-    return m;
-}
+    compiler_compile compile;
+    compiler_compile_op compile_op;
+    compiler_tuning_config get_tuning_config;
+};
+} // namespace

-auto& compiler_op_map()
+auto& compiler_map()
 {
-    static std::unordered_map<std::string, compiler_compile_op> m; // NOLINT
+    static std::unordered_map<std::string, compiler_handle> m; // NOLINT
     return m;
 }

-void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop)
+void register_compiler(const std::string& name,
+                       compiler_compile c,
+                       compiler_compile_op cop,
+                       compiler_tuning_config ctg)
 {
-    compiler_map()[name] = std::move(c);
-    compiler_op_map()[name] = std::move(cop);
+    compiler_map()[name] = {std::move(c), std::move(cop), std::move(ctg)};
 }

 bool has_compiler_for(const std::string& name) { return compiler_map().count(name) > 0; }

-compiler_replace compile(context& ctx, instruction_ref ins, const operation& op)
+compiler_replace
+compile(context& ctx, instruction_ref ins, const operation& op, const value& solution)
 {
-    return compiler_map().at(op.name())(ctx, ins, op);
+    return compiler_map().at(op.name()).compile(ctx, ins, op, solution);
 }

 operation
 compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
 {
-    return compiler_op_map().at(name)(ctx, inputs, v);
+    return compiler_map().at(name).compile_op(ctx, inputs, v);
 }

+optional<tuning_config>
+get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive)
+{
+    return compiler_map().at(op.name()).get_tuning_config(ctx, ins, op, exhaustive);
+}
+
 } // namespace gpu
......
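Review note: the hunk above folds the two parallel registries (compile and compile_op) plus the new tuning-config callback into a single map of handles, so the three callbacks registered for one operator can no longer drift out of sync. A minimal sketch of the pattern with stand-in callback types (the int(int) signatures below are placeholders, not MIGraphX's real compiler_compile types):

    // Sketch of the handle-bundling pattern; signatures are simplified stand-ins.
    #include <functional>
    #include <string>
    #include <unordered_map>

    struct handle
    {
        std::function<int(int)> compile;       // stand-in for compiler_compile
        std::function<int(int)> compile_op;    // stand-in for compiler_compile_op
        std::function<int(int)> tuning_config; // stand-in for compiler_tuning_config
    };

    std::unordered_map<std::string, handle>& registry()
    {
        static std::unordered_map<std::string, handle> m; // one map instead of parallel maps
        return m;
    }

    void register_handle(const std::string& name, handle h) { registry()[name] = std::move(h); }

    int main()
    {
        register_handle("gpu::pointwise",
                        {[](int x) { return x + 1; },
                         [](int x) { return x + 2; },
                         [](int x) { return x + 3; }});
        // A single lookup now finds all three callbacks together:
        const auto& h = registry().at("gpu::pointwise");
        return h.compile(41) == 42 ? 0 : 1;
    }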
@@ -41,7 +41,7 @@ struct index
     __device__ index_int nglobal() const { return blockDim.x * gridDim.x; } // NOLINT
     __device__ index_int nlocal() const { return blockDim.x; } // NOLINT

     template <class F>
     __device__ void global_stride(index_int n, F f) const
@@ -81,6 +81,12 @@ inline auto launch(hipStream_t stream, index_int global, index_int local)
         dim3 nthreads(local);
         // cppcheck-suppress UseDeviceLaunch
         hipLaunchKernelGGL((launcher<f_type>), nblocks, nthreads, 0, stream, f);
+        hipError_t kernel_launch_status = hipGetLastError();
+        if(kernel_launch_status != hipSuccess)
+        {
+            MIGRAPHX_THROW("MIGraphX device kernel failed to launch with error: " +
+                           std::string(hipGetErrorString(kernel_launch_status)));
+        }
     };
 }
......
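Review note: hipLaunchKernelGGL itself returns void, so configuration errors (bad grid/block sizes, unloadable code objects) only surface through hipGetLastError() queried immediately after the launch, which is exactly what the added check does. A self-contained HIP sketch of the same pattern outside MIGraphX:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    __global__ void fill(float* p, float v) { p[blockIdx.x * blockDim.x + threadIdx.x] = v; }

    int main()
    {
        float* d = nullptr;
        (void)hipMalloc(&d, 256 * sizeof(float));
        hipStream_t stream = nullptr; // default stream
        hipLaunchKernelGGL(fill, dim3(1), dim3(256), 0, stream, d, 1.0f);
        // The launch is asynchronous; hipGetLastError() surfaces
        // invalid-configuration errors that would otherwise stay silent.
        hipError_t status = hipGetLastError();
        if(status != hipSuccess)
        {
            std::printf("launch failed: %s\n", hipGetErrorString(status));
            return 1;
        }
        (void)hipDeviceSynchronize(); // errors during execution show up here
        (void)hipFree(d);
        return 0;
    }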
@@ -124,7 +124,7 @@ void nary_broadcast_vec_impl(
             buffer[i] = binput.data()[i];
         }
         __syncthreads();
-        auto* bp = as_pointer(buffer);
+        const auto* bp = as_pointer(buffer);
         // Process the data
         for(size_t i = idx.global; i < nelements; i += nglobal)
         {
@@ -219,7 +219,7 @@ void nary_double_broadcast_vec_impl(
             buffer[i + bdim_vec_len] = binput2.data()[i];
         }
         __syncthreads();
-        auto* bp = as_pointer(buffer);
+        const auto* bp = as_pointer(buffer);
         // Process the data
         for(size_t i = idx.global; i < nelements; i += nglobal)
         {
......
@@ -94,6 +94,10 @@ template <>
 struct is_hip_type<std::uint8_t> : std::true_type
 {
 };
+template <>
+struct is_hip_type<std::int32_t> : std::true_type
+{
+};

 template <class T, class V, MIGRAPHX_REQUIRES(is_hip_type<typename T::type>{})>
 void hip_visitor_invoke(T as, V&& v)
@@ -120,12 +124,10 @@ void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
     if(not std::all_of(
            types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); }))
         MIGRAPHX_THROW("Types must be the same");
-    std::initializer_list<index_int> ranks = {
-        static_cast<index_int>(get_shape(xs).lens().size())...};
-    if(not std::all_of(
-           ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
+    std::initializer_list<index_int> ranks = {static_cast<index_int>(get_shape(xs).ndim())...};
+    if(not std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.ndim(); }))
         MIGRAPHX_THROW("Ranks must be the same");
-    visit_tensor_size(s.lens().size(), [&](auto ndim) {
+    visit_tensor_size(s.ndim(), [&](auto ndim) {
         s.visit_type(hip_visitor([&](auto as) { v(f(xs, ndim, as)...); }));
     });
 }
@@ -133,12 +135,10 @@ void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
 template <class V, class F, class... Ts>
 void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
 {
-    std::initializer_list<index_int> ranks = {
-        static_cast<index_int>(get_shape(xs).lens().size())...};
-    if(not std::all_of(
-           ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
+    std::initializer_list<index_int> ranks = {static_cast<index_int>(get_shape(xs).ndim())...};
+    if(not std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.ndim(); }))
         MIGRAPHX_THROW("Ranks must be the same");
-    visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
+    visit_tensor_size(s.ndim(), [&](auto ndim) { v(f(xs, ndim)...); });
 }

 template <class F>
......
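Review note: lens().size() is replaced by the equivalent ndim() accessor, which also lets the pack expansion fit on one line. A standalone sketch of the rank-check idiom used here, with a toy shape type standing in for migraphx::shape:

    #include <algorithm>
    #include <cstddef>
    #include <initializer_list>
    #include <stdexcept>
    #include <vector>

    // Toy stand-in for migraphx::shape: only the rank matters here.
    struct shape_like
    {
        std::vector<std::size_t> lens;
        std::size_t ndim() const { return lens.size(); }
    };

    template <class... Ts>
    void check_same_rank(const shape_like& s, const Ts&... xs)
    {
        // Pack expansion collects every argument's rank into one list,
        // mirroring the pattern in hip_visit_all_impl above.
        std::initializer_list<std::size_t> ranks = {xs.ndim()...};
        if(not std::all_of(ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.ndim(); }))
            throw std::runtime_error("Ranks must be the same");
    }

    int main()
    {
        shape_like a{{2, 3}}, b{{4, 5}};
        check_same_rank(a, b); // passes: both rank 2
    }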
@@ -67,18 +67,19 @@ void multinomial(hipStream_t stream,
     size_t class_size  = arg0.get_shape().lens().back();
     size_t sample_size = result.get_shape().lens().back();

-    hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
-        result.visit([&](auto out) {
-            hip_visit_views(out)([&](auto output) {
-                gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ {
-                    auto idx       = output.get_shape().multi(i);
-                    auto cdf_begin = cdf.begin() + (idx.front() * class_size);
-                    auto cdf_end   = cdf_begin + class_size;
-                    auto sample_iter =
-                        upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
-                    output[i] = std::distance(cdf_begin, sample_iter);
+    visit_all(arg0, arg1)([&](auto cdf_host, auto dist_host) {
+        result.visit([&](auto output_host) {
+            hip_visit_views(cdf_host, dist_host, output_host)(
+                [&](auto cdf, auto dist, auto output) {
+                    gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ {
+                        auto idx       = output.get_shape().multi(i);
+                        auto cdf_begin = cdf.begin() + (idx.front() * class_size);
+                        auto cdf_end   = cdf_begin + class_size;
+                        auto* sample_iter =
+                            upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
+                        output[i] = std::distance(cdf_begin, sample_iter);
+                    });
                 });
-            });
         });
     });
 }
......
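Review note: the kernel performs inverse-CDF sampling; each uniform draw is scaled by the row's total mass (the last CDF entry, so the CDF never has to be normalized) and then located with upper_bound. The same idea on the host, using only the standard library:

    #include <algorithm>
    #include <cstddef>
    #include <iterator>
    #include <vector>

    // Given an inclusive prefix-sum of (unnormalized) class weights, map a
    // uniform draw u in [0, 1) to a class index, as the device kernel does.
    std::size_t sample_class(const std::vector<float>& cdf, float u)
    {
        float scaled = u * cdf.back(); // scale the draw instead of normalizing the CDF
        auto it      = std::upper_bound(cdf.begin(), cdf.end(), scaled);
        return static_cast<std::size_t>(std::distance(cdf.begin(), it));
    }

    int main()
    {
        std::vector<float> cdf = {0.1f, 0.4f, 1.0f};       // weights 0.1, 0.3, 0.6
        return sample_class(cdf, 0.5f) == 2 ? 0 : 1;       // 0.5 lands in the third bucket
    }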
@@ -37,22 +37,26 @@ argument scatter(
     hipStream_t stream, argument result, argument arg0, argument arg1, argument arg2, int64_t axis)
 {
     auto ds            = arg0.get_shape();
-    auto inds          = arg1.get_shape();
+    auto s1            = arg1.get_shape();
     auto axis_dim_size = ds.lens()[axis];
-    hip_visit_all(result, arg0, inds)([&](auto output, auto data, auto s1) {
+    hip_visit_all(result, arg0, arg2)([&](auto output, auto data, auto update) {
         auto* output_ptr     = device_cast(output.data());
         const auto* data_ptr = device_cast(data.data());
         gs_launch(stream, ds.elements())([=](auto i) __device__ { output_ptr[i] = data_ptr[i]; });
-        hip_visit_all(arg1, arg2)([&](auto indices, auto update) {
-            const auto* upd_ptr     = device_cast(update.data());
-            const auto* indices_ptr = device_cast(indices.data());
-            gs_launch(stream, inds.elements())([=](auto i) __device__ {
-                auto out_idx  = s1.multi(i);
-                auto index    = indices_ptr[i];
-                index         = index < 0 ? index + axis_dim_size : index;
-                out_idx[axis] = index;
-                output[out_idx] = upd_ptr[i];
-            });
+        hip_visit_all(arg1)([&](auto indices) {
+            if constexpr(indices.get_shape().lens.size() == output.get_shape().lens.size())
+            {
+                const auto* upd_ptr     = device_cast(update.data());
+                const auto* indices_ptr = device_cast(indices.data());
+                gs_launch(stream, s1.elements())([=](auto i) __device__ {
+                    auto out_idx  = indices.get_shape().multi(i);
+                    auto index    = indices_ptr[i];
+                    index         = index < 0 ? index + axis_dim_size : index;
+                    out_idx[axis] = index;
+                    output[out_idx] = upd_ptr[i];
+                });
+            }
         });
     });
......
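Review note: the rewritten scatter wraps negative indices by adding the axis length, and the if constexpr guard only instantiates the device lambda when the index and output ranks agree. A 1-D host-side analogue of the index handling (a hypothetical helper, for illustration only):

    #include <cstdint>
    #include <vector>

    // 1-D scatter: copy data, then write updates at (possibly negative) indices,
    // wrapping negatives the same way the device kernel above does.
    std::vector<float> scatter1d(std::vector<float> data,
                                 const std::vector<int64_t>& indices,
                                 const std::vector<float>& updates)
    {
        auto n = static_cast<int64_t>(data.size());
        for(std::size_t i = 0; i < indices.size(); ++i)
        {
            int64_t idx = indices[i];
            if(idx < 0)
                idx += n; // -1 addresses the last element along the axis
            data[static_cast<std::size_t>(idx)] = updates[i];
        }
        return data;
    }

    int main()
    {
        auto out = scatter1d({0, 0, 0, 0}, {1, -1}, {5.f, 7.f});
        return out[3] == 7.f ? 0 : 1; // -1 wrapped to index 3
    }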
@@ -72,12 +72,12 @@ struct hip_heap_vector
         index_int l = 2 * index + 1;
         index_int r = 2 * index + 2;

-        if(l < n && compare(data[data_index(l)], data[data_index(index)]))
+        if(l < n and compare(data[data_index(l)], data[data_index(index)]))
         {
             index = l;
         }

-        if(r < n && compare(data[data_index(r)], data[data_index(index)]))
+        if(r < n and compare(data[data_index(r)], data[data_index(index)]))
         {
             index = r;
             if(compare(data[data_index(l)], data[data_index(r)]))
......
@@ -31,18 +31,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-template <class HipDeviceProp>
-std::string get_arch_name(rank<0>, const HipDeviceProp& props)
-{
-    return "gfx" + std::to_string(props.gcnArch);
-}
-
-template <class HipDeviceProp>
-auto get_arch_name(rank<1>, const HipDeviceProp& props) -> decltype(std::string(props.gcnArchName))
-{
-    return std::string(props.gcnArchName);
-}
-
 int get_device_id()
 {
     int device;
@@ -58,7 +46,7 @@ std::string get_device_name()
     auto status = hipGetDeviceProperties(&props, get_device_id());
     if(status != hipSuccess)
         MIGRAPHX_THROW("Failed to get device properties");
-    return get_arch_name(rank<1>{}, props);
+    return props.gcnArchName;
 }

 } // namespace gpu
......
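Review note: the rank<0>/rank<1> overload pair existed to fall back to the integer gcnArch field on HIP versions lacking gcnArchName; dropping it assumes gcnArchName (a string that may also carry feature flags such as ":sramecc+:xnack-") is always available. Querying it directly looks like:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main()
    {
        int device = 0;
        (void)hipGetDevice(&device);
        hipDeviceProp_t props{};
        if(hipGetDeviceProperties(&props, device) != hipSuccess)
            return 1;
        // gcnArchName is a char array, e.g. "gfx908" or "gfx90a:sramecc+:xnack-"
        std::printf("arch: %s\n", props.gcnArchName);
        return 0;
    }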
@@ -22,7 +22,7 @@
 # THE SOFTWARE.
 #####################################################################################

-file(GLOB GPU_DRIVER_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+file(GLOB GPU_DRIVER_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 add_executable(gpu-driver
     ${GPU_DRIVER_SRCS}
 )
......
@@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */
 #include <migraphx/gpu/driver/action.hpp>
-#include <migraphx/gpu/driver/perf.hpp>
+#include <migraphx/gpu/time_op.hpp>
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -22,7 +22,7 @@
 * THE SOFTWARE.
 */
 #include <migraphx/gpu/driver/action.hpp>
-#include <migraphx/gpu/driver/perf.hpp>
+#include <migraphx/gpu/time_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/make_op.hpp>
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/fuse_ck.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct ck_gemm
{
operation op = make_op("dot");
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"));
}
std::string name() const { return "gpu::ck_gemm"; }
void check_gemm_shape(const shape& s) const
{
if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1))
MIGRAPHX_THROW("Invalid shape for ck_gemm");
}
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
check_shapes{inputs, *this}.same_ndims();
if(inputs.size() < 2)
MIGRAPHX_THROW("should have at least two inputs.");
auto a = inputs[0];
auto b = inputs[1];
for(const auto& input : inputs)
check_gemm_shape(input);
auto r = op.compute_shape({a, b});
if(mods.empty())
return r;
return r.with_type(mods.front()->get_output_shapes().front().type());
}
};
MIGRAPHX_REGISTER_OP(ck_gemm);
namespace {
bool is_ck_supported_type(shape::type_t t)
{
return contains({shape::half_type, shape::int8_type, shape::int32_type}, t);
}
MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
{
if(ins->name() != "dot" and ins->name() != "quant_dot")
return false;
if(not is_ck_supported_type(ins->get_shape().type()))
return false;
auto a = ins->inputs().front()->get_shape();
auto b = ins->inputs().back()->get_shape();
auto m = a.lens()[a.lens().size() - 2];
auto n = b.lens().back();
auto k = a.lens().back();
// Integer gemms must be divisible by 4 in ck
if(contains({shape::int8_type, shape::int32_type}, ins->get_shape().type()))
{
if(m % 4 != 0)
return false;
if(n % 4 != 0)
return false;
if(k % 4 != 0)
return false;
}
// Skipping GEMMs with a K dimension greater than 2048 is a coarse-grained strategy
// to avoid poor-performing GEMM kernels from CK
// To-do: Investigate a more precise strategy
return k <= 2048;
}
struct find_ck_gemm_pointwise
{
// Find a gemm followed by a pointwise operation.
auto matcher() const
{
auto gemm = match::skip(match::name("contiguous"))(
match::name("dot", "quant_dot")(is_ck_gemm().bind("gemm")));
return match::name("pointwise")(match::any_of[match::inputs()](gemm.bind("x")));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
auto gemm_ins = r.instructions["gemm"];
auto x_ins = r.instructions["x"]; // input after contiguous
auto* pm = ins->module_inputs().front();
auto names = pm->get_parameter_names();
std::sort(names.begin(), names.end());
auto inputs = ins->inputs();
auto gemm_it = std::find(inputs.begin(), inputs.end(), x_ins);
auto gemm_idx = gemm_it - inputs.begin();
if(gemm_ins->get_shape().type() != shape::int32_type and
ins->get_shape().type() != gemm_ins->get_shape().type())
return;
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) {
return not is_ck_supported_type(input->get_shape().type());
}))
return;
assert(gemm_it != inputs.end());
if(gemm_idx != 0)
{
auto first_param = pm->get_parameter(names[0]);
auto gemm_param = pm->get_parameter(names[gemm_idx]);
auto new_gemm_param = pm->add_parameter(names[0] + "_0", gemm_param->get_shape());
auto new_first_param =
pm->add_parameter(names[gemm_idx] + "_0", first_param->get_shape());
pm->replace_instruction(gemm_param, new_gemm_param);
pm->replace_instruction(first_param, new_first_param);
pm->remove_instruction(first_param);
pm->remove_instruction(gemm_param);
}
inputs.erase(gemm_it);
inputs.insert(inputs.begin(), gemm_ins->inputs().begin(), gemm_ins->inputs().end());
mpm.get_module().replace_instruction(ins, ck_gemm{gemm_ins->get_operator()}, inputs, {pm});
}
};
struct find_ck_gemm
{
auto matcher() const { return match::name("dot")(is_ck_gemm().bind("gemm")); }
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
mpm.get_module().replace_instruction(ins, ck_gemm{ins->get_operator()}, ins->inputs());
}
};
} // namespace
void fuse_ck::apply(module_pass_manager& mpm) const
{
match::find_matches(mpm, find_ck_gemm_pointwise{});
match::find_matches(mpm, find_ck_gemm{});
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
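Review note on fuse_ck: is_ck_gemm gates which dot/quant_dot instructions are handed to CK, requiring M, N, K all divisible by 4 for integer GEMMs and capping K at 2048. A standalone restatement of just that predicate (a hypothetical free function, for illustration):

    #include <cstddef>

    // Restates the gating logic in is_ck_gemm above: integer GEMMs need
    // M, N, K divisible by 4, and K is capped at 2048 as a coarse-grained
    // filter for poor-performing CK kernels.
    bool ck_gemm_allowed(std::size_t m, std::size_t n, std::size_t k, bool integer_gemm)
    {
        if(integer_gemm and (m % 4 != 0 or n % 4 != 0 or k % 4 != 0))
            return false;
        return k <= 2048;
    }

    int main()
    {
        bool a = ck_gemm_allowed(64, 64, 1024, true);  // true
        bool b = ck_gemm_allowed(62, 64, 1024, true);  // false: M not divisible by 4
        bool c = ck_gemm_allowed(64, 64, 4096, false); // false: K too large
        return (a and not b and not c) ? 0 : 1;
    }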
@@ -38,6 +38,27 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR);

+bool mlir_enabled()
+{
+#ifdef MIGRAPHX_MLIR
+    const bool mlir_enabled = enabled(MIGRAPHX_ENABLE_MLIR{});
+    if(mlir_enabled)
+    {
+        return true;
+    }
+    else
+    {
+        std::cerr << "WARNING: MIGraphX built with MLIR but it is not enabled. Please set the env "
+                     "var MIGRAPHX_ENABLE_MLIR to use MLIR kernel generator."
+                  << std::endl;
+        return false;
+    }
+#else
+    return false;
+#endif
+}
+
 #ifdef MIGRAPHX_MLIR

 struct mlir_op
@@ -58,8 +79,41 @@ struct mlir_op
             MIGRAPHX_THROW("should have one submodule.");
         if(inputs.size() < 2)
             MIGRAPHX_THROW("should have at least two inputs.");
-        auto n = inputs.size();
-        return op.compute_shape({inputs[n - 2], inputs[n - 1]});
+        module_ref mod = mods[0];
+        auto type      = mod->get_output_shapes().front().type();
+        std::unordered_map<instruction_ref, shape> ins_shapes;
+        size_t param_cnt               = 0;
+        std::vector<std::string> names = mod->get_parameter_names();
+        std::sort(names.begin(), names.end());
+        for(const std::string& param_name : names)
+        {
+            ins_shapes[mod->get_parameter(param_name)] = inputs[param_cnt++];
+        }
+        for(auto ins : iterator_for(*mod))
+        {
+            if(ins->name() == "@param")
+            {
+                continue;
+            }
+            if(ins->name() == "@literal")
+            {
+                ins_shapes[ins] = ins->get_shape();
+                continue;
+            }
+            if(ins->name() == "@return")
+            {
+                return ins_shapes[ins->inputs().at(0)].with_type(type);
+            }
+            std::vector<shape> input_shapes;
+            input_shapes.resize(ins->inputs().size());
+            std::transform(ins->inputs().begin(),
+                           ins->inputs().end(),
+                           input_shapes.begin(),
+                           [&](auto in) { return ins_shapes[in]; });
+            ins_shapes[ins] = ins->get_operator().compute_shape(input_shapes);
+        }
+        MIGRAPHX_THROW("No return found in the submodule");
     }
 };

 MIGRAPHX_REGISTER_OP(mlir_op);
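Review note: instead of assuming the submodule is exactly gemm-plus-pointwise, compute_shape now seeds the parameter shapes from the fused inputs and propagates shapes instruction by instruction until @return. A toy version of that walk over a simplified IR (all types here are stand-ins, not MIGraphX's):

    #include <functional>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Toy IR: each node names its input nodes and knows how to compute its
    // output "shape" (here just an int) from theirs, mimicking the walk above.
    struct node
    {
        std::string name;
        std::vector<int> inputs;                              // indices of producer nodes
        std::function<int(const std::vector<int>&)> compute;  // shape rule
    };

    int propagate(const std::vector<node>& mod, std::unordered_map<int, int> shapes)
    {
        for(int i = 0; i < static_cast<int>(mod.size()); ++i)
        {
            const auto& ins = mod[i];
            if(ins.name == "@param")
                continue; // already seeded from the call-site inputs
            if(ins.name == "@return")
                return shapes.at(ins.inputs.front());
            std::vector<int> in_shapes;
            for(int j : ins.inputs)
                in_shapes.push_back(shapes.at(j));
            shapes[i] = ins.compute(in_shapes);
        }
        throw std::runtime_error("No return found in the submodule");
    }

    int main()
    {
        std::vector<node> mod = {{"@param", {}, {}},
                                 {"relu", {0}, [](const std::vector<int>& s) { return s[0]; }},
                                 {"@return", {1}, {}}};
        return propagate(mod, {{0, 4}}) == 4 ? 0 : 1;
    }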
@@ -68,7 +122,7 @@ namespace {
 MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
 {
-    if(ins->name() != "convolution")
+    if(ins->name() != "convolution" and ins->name() != "quant_convolution")
         return false;
     value v    = ins->get_operator().to_value();
     auto group = v.at("group").to<int>();
@@ -85,10 +139,126 @@ struct find_mlir_op
     auto matcher() const
     {
         auto dot_or_conv = match::skip(match::name("contiguous"))(
-            match::any_of(match::name("dot"), is_mlir_conv()).bind("gemm_based_op"));
+            match::any_of(match::name("dot"), match::name("quant_dot"), is_mlir_conv())
+                .bind("gemm_based_op"));
         return match::name("pointwise")(match::any_of[match::inputs()](dot_or_conv.bind("x")));
     }
std::unordered_map<instruction_ref, instruction_ref>
create_param_map_with_literals(module_ref mm, const module* pm, const shape& shape) const
{
std::unordered_map<instruction_ref, instruction_ref> ins_map;
for(auto ins : iterator_for(*pm))
{
if(ins->name() != "@literal")
{
continue;
}
literal r = ins->get_literal();
instruction_ref literal = mm->add_literal(r);
instruction_ref mbcast = mm->add_instruction(
make_op("multibroadcast", {{"out_lens", shape.lens()}}), literal);
ins_map[ins] = mbcast;
}
return ins_map;
}
std::tuple<instruction_ref, std::vector<instruction_ref>>
fuse_input_ops_and_gemm_based_op(module_ref mm, instruction_ref gemm_based_op) const
{
std::vector<instruction_ref> top_inputs;
std::vector<instruction_ref> imm_inputs;
size_t input_cnt = 0;
for(instruction_ref input : gemm_based_op->inputs())
{
std::vector<operation> op_stream;
while(contains({"slice", "transpose", "contiguous", "reshape"}, input->name()))
{
op_stream.push_back(input->get_operator());
input = input->inputs().at(0);
}
top_inputs.push_back(input);
instruction_ref prev_input =
mm->add_parameter("y" + std::to_string(input_cnt++), input->get_shape());
for(const auto& op : reverse(op_stream))
{
prev_input = mm->add_instruction(op, {prev_input});
}
imm_inputs.push_back(prev_input);
}
instruction_ref new_gemm_based_op =
mm->add_instruction(gemm_based_op->get_operator(), imm_inputs);
return {new_gemm_based_op, top_inputs};
}
// Whitelist supported fusion options, including imposing type constraints
// for cases where MLIR only supports an operation (usually a pointwise function)
// on particular types.
bool is_pointwise_op_supported_by_mlir(const instruction& i) const
{
using type_t = shape::type_t;
const auto& name = i.name();
const auto result_type = i.get_shape().type();
const std::initializer_list<type_t> allowed_types = {type_t::float_type,
type_t::half_type,
type_t::int8_type,
type_t::int32_type,
type_t::bool_type};
// Preliminary type check.
if(not contains(allowed_types, result_type))
{
return false;
}
const std::initializer_list<std::string> any_type_ops = {"@literal", "@param", "@return"};
const std::initializer_list<std::string> no_bool_ops = {
"convolution",
"quant_convolution",
"dot",
"quant_dot",
"add",
"clip",
"relu",
"sub",
"mul",
"div",
"pow",
"where",
"quantizelinear",
"dequantizelinear",
"abs",
"neg",
};
const std::initializer_list<std::string> fp_only_ops = {
"ceil",
"erf",
"exp",
"floor",
"log",
"recip",
"rsqrt",
// There are bugs in MLIR right now for models using sigmoid so disable it for now
// "sigmoid",
"softmax",
"tanh",
};
bool is_float = contains({type_t::float_type, type_t::half_type}, result_type);
if(contains(any_type_ops, name))
return true;
if(result_type != type_t::bool_type and contains(no_bool_ops, name))
return true;
if(is_float and contains(fp_only_ops, name))
return true;
// Only conversions between floating types are known to be unambiguously
// supported.
if(is_float and name == "convert")
{
return std::all_of(i.inputs().begin(), i.inputs().end(), [](const auto& arg) {
return contains({type_t::float_type, type_t::half_type}, arg->get_shape().type());
});
}
return false;
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto ins = r.result;
@@ -96,35 +266,25 @@
         auto x_ins = r.instructions["x"]; // input after contiguous
         auto* pm   = ins->module_inputs().front();
         auto names = pm->get_parameter_names();
-        // Whitelist pointwise operators
-        if(std::any_of(pm->begin(), pm->end(), [](const auto& i) {
-               return not contains(
-                   {"@literal", "@param", "@return", "convolution", "dot", "add", "relu"},
-                   i.name());
-           }))
-            return;
-        // Only fuse with fp32/fp16
-        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
-               return not contains({shape::type_t::float_type, shape::type_t::half_type},
-                                   i->get_shape().type());
+        // Whitelist pointwise operators.
+        if(std::any_of(pm->begin(), pm->end(), [&](const auto& i) {
+               return not is_pointwise_op_supported_by_mlir(i);
           }))
            return;
         std::sort(names.begin(), names.end());
         module_ref mm = mpm.create_module("mlir_" + pm->name());
         mm->set_bypass();
-        std::unordered_map<instruction_ref, instruction_ref> param_map;
-        auto x    = mm->add_parameter("x" + std::to_string(names.size()),
-                                   gemm_based_op->inputs().at(0)->get_shape());
-        auto w    = mm->add_parameter("x" + std::to_string(names.size() + 1),
-                                   gemm_based_op->inputs().at(1)->get_shape());
-        auto conv = mm->add_instruction(gemm_based_op->get_operator(), {x, w});
+        std::unordered_map<instruction_ref, instruction_ref> param_map =
+            create_param_map_with_literals(mm, pm, gemm_based_op->get_shape());
+        auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op(mm, gemm_based_op);
         std::transform(names.begin(),
                        names.end(),
                        ins->inputs().begin(),
                        std::inserter(param_map, param_map.end()),
-                       [&](auto name, auto input) {
+                       [&, &anchor_op = anchor_op](auto name, auto input) {
                            if(input == x_ins)
-                               return std::make_pair(pm->get_parameter(name), conv);
+                               return std::make_pair(pm->get_parameter(name), anchor_op);
                            return std::make_pair(pm->get_parameter(name),
                                                  mm->add_parameter(name, input->get_shape()));
                        });
@@ -135,7 +295,7 @@
                        ins->inputs().end(),
                        std::back_inserter(inputs),
                        [&](auto input) { return input != gemm_based_op; });
-        inputs.insert(inputs.end(), gemm_based_op->inputs().begin(), gemm_based_op->inputs().end());
+        inputs.insert(inputs.end(), top_inputs.begin(), top_inputs.end());
         mpm.get_module().replace_instruction(
             ins, mlir_op{gemm_based_op->get_operator()}, inputs, {mm});
     }
@@ -148,17 +308,7 @@
 void fuse_mlir::apply(module_pass_manager& mpm) const
 {
 #ifdef MIGRAPHX_MLIR
-    const bool mlir_enabled = enabled(MIGRAPHX_ENABLE_MLIR{});
-    if(mlir_enabled)
-    {
-        match::find_matches(mpm, find_mlir_op{});
-    }
-    else
-    {
-        std::cerr << "WARNING: MIGraphX built with MLIR but it is not enabled. Please set the env "
-                     "var MIGRAPHX_ENABLE_MLIR to use MLIR kernel generator."
-                  << std::endl;
-    }
+    match::find_matches(mpm, find_mlir_op{});
 #else
     (void)mpm;
 #endif
......
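Review note: the env-var check and the warning move out of fuse_mlir::apply into the shared mlir_enabled() helper, so other call sites can branch on the same answer. A simplified sketch of the gating pattern (MIGraphX's enabled()/MIGRAPHX_DECLARE_ENV_VAR machinery accepts more spellings and caches the lookup; this is an illustrative reduction):

    #include <cstdlib>
    #include <cstring>
    #include <iostream>

    // Simplified stand-in for migraphx::enabled(MIGRAPHX_ENABLE_MLIR{}):
    // here only the exact value "1" counts as enabled.
    bool env_flag_enabled(const char* name)
    {
        const char* v = std::getenv(name);
        return v != nullptr and std::strcmp(v, "1") == 0;
    }

    bool mlir_enabled_sketch()
    {
    #ifdef MIGRAPHX_MLIR
        if(env_flag_enabled("MIGRAPHX_ENABLE_MLIR"))
            return true;
        std::cerr << "WARNING: MIGraphX built with MLIR but it is not enabled.\n";
        return false;
    #else
        return false; // not built with MLIR support at all
    #endif
    }

    int main() { return mlir_enabled_sketch() ? 0 : 1; }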
@@ -165,7 +165,8 @@ struct fusion
 const std::unordered_set<std::string>& get_supported_archs()
 {
-    static std::unordered_set<std::string> supported_archs{"gfx900", "gfx906", "gfx908", "gfx1030"};
+    static std::unordered_set<std::string> supported_archs{
+        "gfx900", "gfx906", "gfx908", "gfx1030", "gfx940"};
     return supported_archs;
 }
......
@@ -158,7 +158,15 @@ struct gemm_impl
         {
             beta = 0;
         }
+        if(arg_type == rocblas_datatype_f16_r)
+            compute_type = rocblas_datatype_f32_r;
     }
+
+    rocblas_gemm_flags flag = rocblas_gemm_flags_none;
+#if ROCBLAS_VERSION_MAJOR < 3
+    if(int8_x4_format)
+        flag = rocblas_gemm_flags_pack_int8x4;
+#endif
     // Create lambdas that will cast alpha, beta to the output shape's type
     // and retain the values being pointed to
     output_shape.visit_type([&](auto as) {
......
@@ -146,7 +146,11 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
     gpu_sync();
     std::vector<T> result(sz);
     assert(not is_device_ptr(result.data()));
-    assert(is_device_ptr(x));
+    if(not is_device_ptr(x))
+    {
+        MIGRAPHX_THROW(
+            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
+    }
     auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
     if(status != hipSuccess)
         MIGRAPHX_THROW("Copy from gpu failed: " + hip_error(status)); // NOLINT
......
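Review note: turning the assert into MIGRAPHX_THROW keeps the guard alive in release builds, where NDEBUG compiles asserts away. One plausible way an is_device_ptr-style helper can be written is via hipPointerGetAttributes (an assumption about the implementation, not MIGraphX's actual code; note the attribute field is renamed from memoryType to type in newer HIP releases):

    #include <hip/hip_runtime.h>

    // One plausible device-pointer check, via pointer attributes.
    bool points_to_device(const void* p)
    {
        hipPointerAttribute_t attrs{};
        if(hipPointerGetAttributes(&attrs, p) != hipSuccess)
            return false; // unknown to the runtime, e.g. a plain host pointer
        // hipMemoryTypeDevice covers hipMalloc'd memory; in newer HIP the
        // field is attrs.type rather than attrs.memoryType.
        return attrs.memoryType == hipMemoryTypeDevice;
    }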
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_GPU_ALLOCATION_MODEL_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/operation.hpp>
 #include <migraphx/instruction_ref.hpp>
 #include <string>
@@ -33,7 +33,7 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-struct gpu_allocation_model
+struct MIGRAPHX_GPU_EXPORT gpu_allocation_model
 {
     std::string name() const;
     std::string copy() const;
......
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_ANALYZE_STREAMS_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/analyze_streams.hpp>

 namespace migraphx {
@@ -34,7 +34,7 @@ struct module;
 namespace gpu {

-std::vector<stream_race> analyze_streams(const module& m);
+MIGRAPHX_GPU_EXPORT std::vector<stream_race> analyze_streams(const module& m);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
......
@@ -24,9 +24,10 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP
 #define MIGRAPHX_GUARD_RTGLIB_COMPILE_HIP_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/filesystem.hpp>
 #include <migraphx/compile_src.hpp>
+#include <migraphx/env.hpp>
 #include <migraphx/functional.hpp>
 #include <string>
 #include <utility>
@@ -36,6 +37,11 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+#ifdef MIGRAPHX_USE_HIPRTC
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);
+#endif
+
 struct hiprtc_src_file
 {
     hiprtc_src_file() = default;
@@ -52,14 +58,13 @@ struct hiprtc_src_file
     }
 };

-std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_src_file> srcs,
-                                                           std::string params,
-                                                           const std::string& arch);
+MIGRAPHX_GPU_EXPORT std::vector<std::vector<char>> compile_hip_src_with_hiprtc(
+    std::vector<hiprtc_src_file> srcs, std::string params, const std::string& arch);

-std::vector<std::vector<char>>
+MIGRAPHX_GPU_EXPORT std::vector<std::vector<char>>
 compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);

-std::string enum_params(std::size_t count, std::string param);
+MIGRAPHX_GPU_EXPORT std::string enum_params(std::size_t count, std::string param);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
......
@@ -24,8 +24,9 @@
 #ifndef MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP
 #define MIGRAPHX_GUARD_GPU_COMPILE_HIP_CODE_OBJECT_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <migraphx/operation.hpp>
+#include <migraphx/compile_src.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -39,9 +40,10 @@ struct hip_compile_options
     std::size_t local;
     std::vector<shape> inputs;
     shape output;
-    std::string kernel_name           = "kernel";
-    std::string params                = "";
-    std::vector<shape> virtual_inputs = {};
+    std::string kernel_name                    = "kernel";
+    std::string params                         = "";
+    std::vector<shape> virtual_inputs          = {};
+    std::vector<src_file> additional_src_files = {};

     /**
      * @brief Set the launch parameters but allow v to override the values
@@ -64,14 +66,16 @@ struct hip_compile_options
 };

 /// Compute global for n elements, but max out on target-specific upper limit
-std::function<std::size_t(std::size_t local)>
+MIGRAPHX_GPU_EXPORT std::function<std::size_t(std::size_t local)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);

-operation compile_hip_code_object(const std::string& content, hip_compile_options options);
+MIGRAPHX_GPU_EXPORT operation compile_hip_code_object(const std::string& content,
+                                                      hip_compile_options options);

-std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
+MIGRAPHX_GPU_EXPORT std::size_t compute_block_size(std::size_t n,
+                                                   std::size_t max_block_size = 1024);

-std::string generate_make_shape(const shape& s);
+MIGRAPHX_GPU_EXPORT std::string generate_make_shape(const shape& s);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
......
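Review note: the MIGRAPHX_GPU_EXPORT annotations added across these headers mark the gpu target's public symbols for shared-library visibility. Such macros are typically generated (for example by CMake's GenerateExportHeader); a hand-rolled equivalent, shown as an assumption about the general shape of migraphx/gpu/config.hpp rather than its verbatim contents:

    // Typical shape of an export-macro header (illustrative only).
    #ifndef SKETCH_GPU_EXPORT_HPP
    #define SKETCH_GPU_EXPORT_HPP

    #if defined(_WIN32)
        #ifdef BUILDING_GPU_LIB // defined when compiling the DLL itself
            #define GPU_EXPORT __declspec(dllexport)
        #else
            #define GPU_EXPORT __declspec(dllimport)
        #endif
    #else
        // With -fvisibility=hidden, only annotated symbols are exported.
        #define GPU_EXPORT __attribute__((visibility("default")))
    #endif

    #endif // SKETCH_GPU_EXPORT_HPP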