manual_merge

ac04f3cc · Khalique Ahmed · d39c3343 · d8011adf · ac04f3cc · ac04f3cc
Commit ac04f3cc authored Nov 10, 2023 by Khalique Ahmed
20 changed files
--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -55,7 +55,7 @@ bool is_device_ptr(const void* ptr)
    auto status = hipPointerGetAttributes(&attr, ptr);
    if(status != hipSuccess)
        return false;
-    return attr.memoryType == hipMemoryTypeDevice;
+    return attr.type == hipMemoryTypeDevice;
 }

 std::size_t get_available_gpu_memory()

--- a/src/targets/gpu/hiprtc/main.cpp
+++ b/src/targets/gpu/hiprtc/main.cpp
@@ -27,6 +27,7 @@
 #include <migraphx/msgpack.hpp>
 #include <migraphx/file_buffer.hpp>
 #include <migraphx/ranges.hpp>
+#include <array>
 #include <iostream>
 #include <cstring>


--- a/src/targets/gpu/include/migraphx/gpu/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_GPU_CK_HPP
+#define MIGRAPHX_GUARD_GPU_CK_HPP
+
+#include <migraphx/compile_src.hpp>
+#include <migraphx/env.hpp>
+#include <migraphx/shape.hpp>
+#include <migraphx/stringutils.hpp>
+#include <string_view>
+
+#include "ck/host/device_gemm_multiple_d.hpp"
+#include "ck/host/device_batched_gemm_softmax_gemm.hpp"
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+#ifndef _WIN32
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK);
+#endif
+
+// Wrapper template used by ck_disable_warnings(): the ${content} placeholder is replaced with
+// the text being wrapped, so that it sits between a clang diagnostic push/pop pair that
+// ignores -Weverything.
+// NOLINTNEXTLINE
+const char* const disable_warning_pragma = R"__migraphx__(
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Weverything"
+${content}
+#pragma clang diagnostic pop
+)__migraphx__";
+
+// Wraps the characters [p.data(), p.data() + p.size()) in disable_warning_pragma.
+// P may be any type exposing data() and size(); the result is the pragma template with its
+// ${content} placeholder substituted by those characters.
+template <class P>
+std::string ck_disable_warnings(P p)
+{
+    const std::string content{p.data(), p.size()};
+    return interpolate_string(disable_warning_pragma, {{"content", content}});
+}
+
+// Builds a map with one entry per pair returned by ck::host::GetHeaders(): the key is the
+// pair's first member and the value is its second member run through ck_disable_warnings().
+static std::unordered_map<std::string, std::string> create_ck_header_strings()
+{
+    std::unordered_map<std::string, std::string> wrapped;
+    auto ck_headers = ck::host::GetHeaders();
+
+    for(auto& header : ck_headers)
+    {
+        wrapped.insert(
+            std::pair<std::string, std::string>(header.first, ck_disable_warnings(header.second)));
+    }
+    return wrapped;
+}
+
+// Converts every entry of the wrapped-header map into a src_file.
+static std::vector<src_file> create_ck_headers()
+{
+    // The map is built on the first call and kept alive afterwards.
+    // NOTE(review): presumably src_file refers to this storage instead of copying it, which
+    // would be why it must outlive the returned vector -- confirm against src_file.
+    static const std::unordered_map<std::string, std::string> header_strings =
+        create_ck_header_strings();
+    std::vector<src_file> srcs;
+    for(auto& entry : header_strings)
+        srcs.push_back(src_file{entry});
+    return srcs;
+}
+
+// Returns the list produced by create_ck_headers(), building it on the first call and handing
+// out a reference to the same object on every later call.
+static inline const std::vector<src_file>& ck_headers()
+{
+    static const std::vector<src_file> cached = create_ck_headers();
+    return cached;
+}
+
+// True when the last stride of s is anything other than 1.
+inline bool transposed_matrix(const shape& s)
+{
+    const auto& strides = s.strides();
+    return not(strides.back() == 1);
+}
+
+// Maps the element type of s onto the matching ck::host::DataType.
+// Raises via MIGRAPHX_THROW for any type other than half, float, int8 or int32.
+inline ck::host::DataType get_type(const shape& s)
+{
+    const auto type = s.type();
+    if(type == shape::half_type)
+        return ck::host::DataType::Half;
+    if(type == shape::float_type)
+        return ck::host::DataType::Float;
+    if(type == shape::int8_type)
+        return ck::host::DataType::Int8;
+    if(type == shape::int32_type)
+        return ck::host::DataType::Int32;
+    MIGRAPHX_THROW("Unsupported ck type");
+}
+
+// Product of every dimension of s except the last two.
+// Returns 1 when s has two or fewer dimensions (there is nothing to multiply).
+inline std::size_t get_batch_count(const shape& s)
+{
+    const auto& lens = s.lens();
+    // rbegin() + 2 is only a valid iterator when at least two dimensions exist, so shapes
+    // with fewer dimensions are answered here instead of reaching the accumulate call.
+    if(lens.size() <= 2)
+        return 1;
+    return std::accumulate(
+        lens.rbegin() + 2, lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
+}
+
+// Replaces s with a two-dimensional shape of the same type in which the batch count has been
+// multiplied into one of the two trailing dimensions.  Shapes with two or fewer dimensions
+// are left untouched.
+inline void fold_batch_dims(shape& s)
+{
+    const auto lens = s.lens();
+    const auto ndim = lens.size();
+    if(ndim <= 2)
+        return;
+    const auto batch_count = get_batch_count(s);
+    auto rows              = lens.at(ndim - 2);
+    auto cols              = lens.at(ndim - 1);
+    // A transposed matrix absorbs the batch into its last dimension, any other matrix into
+    // its second-to-last dimension.
+    if(transposed_matrix(s))
+        cols *= batch_count;
+    else
+        rows *= batch_count;
+    s = shape{s.type(), {rows, cols}};
+}
+
+// Replaces s with a shape of the same type made of its last two dimensions only.
+// Shapes with two or fewer dimensions are left untouched.
+inline void remove_batch_dims(shape& s)
+{
+    const auto lens = s.lens();
+    const auto ndim = lens.size();
+    if(ndim > 2)
+    {
+        const auto rows = lens.at(ndim - 2);
+        const auto cols = lens.at(ndim - 1);
+        s               = shape{s.type(), {rows, cols}};
+    }
+}
+
+// Checks the batch dimensions of s (everything but the last two dimensions).
+// The batch strides are divided by the product of the last two lens, and the result reports
+// whether a shape built from the batch lens and those scaled strides is standard.
+// Shapes with fewer than three dimensions always yield true.
+inline bool standard_batch(const shape& s)
+{
+    const auto& all_lens = s.lens();
+    if(all_lens.size() < 3)
+        return true;
+    const auto& all_strides = s.strides();
+    const auto base         = *(all_lens.end() - 2) * *(all_lens.end() - 1);
+    std::vector<std::size_t> lens(all_lens.begin(), all_lens.end() - 2);
+    std::vector<std::size_t> strides(all_strides.begin(), all_strides.end() - 2);
+    for(auto& stride : strides)
+        stride = stride / base;
+    return shape{s.type(), lens, strides}.standard();
+}
+
+// Decides whether the batch dimensions of a set of inputs can be folded.
+// Returns true when every input from index 2 up to (but excluding) the last one satisfies
+// standard_batch() and every stride of inputs[1] except its last two is zero.
+// Precondition: inputs holds at least three shapes and inputs[1] has at least two
+// dimensions; the iterator arithmetic below is not valid otherwise.
+inline bool can_fold_batch(const std::vector<shape>& inputs)
+{
+    const auto& b_shape = inputs[1];
+    // Take each shape by const reference so the predicate does not copy it per element.
+    if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](const auto& input) {
+           return not standard_batch(input);
+       }))
+        return false;
+    const auto& b_strides = b_shape.strides();
+    return std::all_of(
+        b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; });
+}
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_CK_HPP
--- a/src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip.hpp
@@ -45,10 +45,7 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);
 struct hiprtc_src_file
 {
    hiprtc_src_file() = default;
-    hiprtc_src_file(const src_file& s)
-        : path(s.path.string()), content(s.content.first, s.content.second)
-    {
-    }
+    hiprtc_src_file(const src_file& s) : path(s.path.string()), content(s.content) {}
    std::string path;
    std::string content;
    template <class Self, class F>
@@ -58,6 +55,8 @@ struct hiprtc_src_file
    }
 };

+MIGRAPHX_GPU_EXPORT bool hip_has_flags(const std::vector<std::string>& flags);
+
 MIGRAPHX_GPU_EXPORT std::vector<std::vector<char>> compile_hip_src_with_hiprtc(
    std::vector<hiprtc_src_file> srcs, std::string params, const std::string& arch);


--- a/src/targets/gpu/include/migraphx/gpu/compile_miopen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_miopen.hpp
@@ -42,7 +42,7 @@ struct compile_miopen
    context* ctx = nullptr;
    std::string name() const { return "gpu::compile_miopen"; }
    void apply(module& m) const;
-    std::size_t compile(operation& op, instruction_ref ins, bool format) const;
+    std::size_t compile(operation& op, instruction_ref ins) const;
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -299,23 +299,6 @@ struct context

    any_ptr get_queue() { return get_stream().get(); }

-    void enable_perf_measurement(bool b = true)
-    {
-        if(b)
-        {
-            start_event = create_event_for_timing();
-            stop_event  = create_event_for_timing();
-            get_stream().record(start_event.get());
-            get_stream().record(stop_event.get());
-        }
-        else
-        {
-            start_event = nullptr;
-            stop_event  = nullptr;
-        }
-        measure_perf = b;
-    }
-
    std::pair<hipEvent_t, hipEvent_t> get_perf_events() const
    {
        if(measure_perf)
@@ -323,12 +306,12 @@ struct context
        return std::make_pair(nullptr, nullptr);
    }

-    float get_elapsed_ms() const
+    static float get_elapsed_ms(hipEvent_t start, hipEvent_t stop)
    {
        float result = 0;
-        if(start_event != nullptr and stop_event != nullptr)
+        if(start != nullptr and stop != nullptr)
        {
-            auto status = hipEventElapsedTime(&result, start_event.get(), stop_event.get());
+            auto status = hipEventElapsedTime(&result, start, stop);
            if(status != hipSuccess)
                MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status));
        }

--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -57,7 +57,6 @@ template <class Op>
 struct miopen_convolution
 {
    Op op;
-    bool int8_x4_format               = false;
    shared<convolution_descriptor> cd = nullptr;
    miopenConvFwdAlgorithm_t algo{};
 #ifdef MIGRAPHX_HAS_FIND_2_API
@@ -74,7 +73,6 @@ struct miopen_convolution
                    f(self.solution_object, "solution_object"),
 #endif
                    f(self.algo, "algo"),
-                    f(self.int8_x4_format, "int8_x4_format"),
                    f(self.solution_id, "solution_id"));
    }

@@ -84,17 +82,19 @@ struct miopen_convolution
    {
        check_shapes{inputs, op}.has(4);
        std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
-        check_shapes{conv_inputs, *this}.max_ndims(5).packed_layouts(
-            {{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}});
+        check_shapes{conv_inputs, *this}
+            .max_ndims(5)
+            .packed_layouts({{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}})
+            .same_layout();
        return migraphx::compute_shape<Op>(op, conv_inputs);
    }

    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
-        auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
-        auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
-        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto x_desc                = make_tensor(reshape_if_1d(args[0].get_shape()));
+        auto w_desc                = make_tensor(reshape_if_1d(args[1].get_shape()));
+        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
        auto workspace_size        = args[2].get_shape().bytes();

@@ -160,8 +160,8 @@ struct miopen_convolution
    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        shape workspace_shape{};
-        auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-        auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+        auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
        auto y_desc = make_tensor(reshape_if_1d(output_shape));

        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
@@ -177,13 +177,8 @@ struct miopen_convolution

        workspace_shape = shape{shape::int8_type, {workspace_size}};

-        auto x_shape = inputs[0];
-        auto w_shape = inputs[1];
-        if(int8_x4_format)
-        {
-            x_shape = pack_int8_shape(x_shape);
-            w_shape = pack_int8_shape(w_shape);
-        }
+        const auto& x_shape = inputs[0];
+        const auto& w_shape = inputs[1];

 #ifdef MIGRAPHX_HAS_FIND_2_API
        {
@@ -197,9 +192,9 @@ struct miopen_convolution
            // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
            preallocate = true;
 #endif
-            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
-            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
-            auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : argument{inputs[0]};
+            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : argument{inputs[1]};
+            auto y = preallocate ? allocate_gpu(output_shape) : argument{inputs[2]};
            auto workspace =
                preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);

@@ -325,8 +320,8 @@ struct miopen_convolution
                                   ": workspace has changed during finalization.");
            }

-            auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-            auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+            auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+            auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
            auto y_desc = make_tensor(reshape_if_1d(output_shape));

            auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
@@ -345,21 +340,6 @@ struct miopen_convolution
    {
        return shapes.size() - 1;
    }
-
-    inline shape pack_int8_shape(const shape& s) const
-    {
-        if(s.type() != shape::int8_type)
-        {
-            return s;
-        }
-
-        auto lens    = s.lens();
-        auto strides = s.strides();
-        lens[1]      = (lens[1] + 3) / 4 * 4;
-        strides[0]   = strides[1] * lens[1];
-
-        return {s.type(), lens, strides};
-    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -55,7 +55,7 @@ MIGRAPHX_DEVICE_CONSTEXPR val_index<T> make_val_index(T v, int64_t i)
    return {v, i};
 }

-struct argmax_op
+struct argmax_op_first_index
 {
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
@@ -73,7 +73,25 @@ struct argmax_op
    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest(); }
 };

-struct argmin_op
+// Binary functor over val_index pairs for argmax in which equal values resolve to the
+// larger index.
+struct argmax_op_last_index
+{
+    // Returns x when its val is greater than y's and y when x's val is smaller.  When
+    // neither comparison holds, the operand with the larger index is returned (y if the
+    // indices are equal).
+    template <class T>
+    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
+    {
+        const bool keep_x = (x.val > y.val) or (not(x.val < y.val) and x.index > y.index);
+        return keep_x ? x : y;
+    }
+
+    // Forwards lowest(); presumably the starting value of the reduction.
+    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest(); }
+};
+
+struct argmin_op_first_index
 {
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
@@ -91,6 +109,24 @@ struct argmin_op
    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return highest(); }
 };

+// Binary functor over val_index pairs for argmin in which equal values resolve to the
+// larger index.
+struct argmin_op_last_index
+{
+    // Returns x when its val is smaller than y's and y when x's val is greater.  When
+    // neither comparison holds, the operand with the larger index is returned (y if the
+    // indices are equal).
+    template <class T>
+    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
+    {
+        const bool keep_x = (x.val < y.val) or (not(x.val > y.val) and x.index > y.index);
+        return keep_x ? x : y;
+    }
+
+    // Forwards highest(); presumably the starting value of the reduction.
+    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return highest(); }
+};
+
 template <class Op>
 void arg_op(Op op, hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
 {

--- a/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -36,7 +36,8 @@ namespace device {
 void MIGRAPHX_DEVICE_EXPORT argmax(hipStream_t stream,
                                   const argument& result,
                                   const argument& arg,
-                                   int64_t axis);
+                                   int64_t axis,
+                                   bool select_last_index);

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -36,7 +36,8 @@ namespace device {
 void MIGRAPHX_DEVICE_EXPORT argmin(hipStream_t stream,
                                   const argument& result,
                                   const argument& arg,
-                                   int64_t axis);
+                                   int64_t axis,
+                                   bool select_last_index);

 } // namespace device
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/fuse_mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/fuse_mlir.hpp
@@ -38,6 +38,7 @@ MIGRAPHX_GPU_EXPORT bool mlir_enabled();
 struct MIGRAPHX_GPU_EXPORT fuse_mlir
 {
    context* ctx = nullptr;
+    bool enable_extra = false;
    std::string name() const { return "gpu::fuse_mlir"; }
    void apply(module_pass_manager& mpm) const;
 };

--- a/src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
@@ -24,7 +24,6 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_FUSE_OPS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_FUSE_OPS_HPP

-#include <migraphx/config.hpp>
 #include <migraphx/gpu/context.hpp>

 namespace migraphx {
@@ -34,7 +33,7 @@ struct module;

 namespace gpu {

-struct fuse_ops
+struct MIGRAPHX_GPU_EXPORT fuse_ops
 {
    context* ctx   = nullptr;
    bool fast_math = true;

--- a/src/targets/gpu/include/migraphx/gpu/gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -40,9 +40,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 struct context;
-
-void blas_shape(const shape& s);
 shape transpose_batch(const shape& s, unsigned trans_batch);
+void blas_shape(const shape& s);

 template <class Op>
 struct rocblas_gemm
@@ -50,9 +49,9 @@ struct rocblas_gemm
    Op op;
    float alpha          = 1;
    float beta           = 0;
-    bool int8_x4_format  = true;
    bool compute_fp32    = false;
    unsigned trans_batch = 0;
+    int32_t solution_idx = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -60,9 +59,9 @@ struct rocblas_gemm
        return pack_join(migraphx::reflect(self.op, f),
                         pack(f(self.alpha, "alpha"),
                              f(self.beta, "beta"),
-                              f(self.int8_x4_format, "int8_x4_format"),
                              f(self.compute_fp32, "compute_fp32"),
-                              f(self.trans_batch, "trans_batch")));
+                              f(self.trans_batch, "trans_batch"),
+                              f(self.solution_idx, "solution_idx")));
    }

    std::string name() const
@@ -78,6 +77,8 @@ struct rocblas_gemm
    {
        std::vector<shape> in_shapes(inputs);
        in_shapes.pop_back();
+        // When input shapes are A, B, C the GEMM equation is C = αAB + βC, where α and β are
+        // scalars
        check_shapes{in_shapes, *this}.has(2, 3);
        blas_shape(inputs[0]);
        blas_shape(inputs[1]);
@@ -113,17 +114,12 @@ struct rocblas_gemm
    {
        if(this->name() == "gpu::gemm")
        {
-            gemm(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
+            gemm_compute(ctx, output_shape, args, alpha, beta, compute_fp32, solution_idx);
        }
        else
        {
-            gemm(ctx,
-                 output_shape,
-                 args,
-                 int32_t(alpha),
-                 int32_t(beta),
-                 int8_x4_format,
-                 compute_fp32);
+            gemm_compute(
+                ctx, output_shape, args, int32_t(alpha), int32_t(beta), compute_fp32, solution_idx);
        }
        return args.back();
    }
@@ -132,6 +128,33 @@ struct rocblas_gemm
    {
        return shapes.size() - 1;
    }
+
+    // Runs GEMM tuning (when compiled in and requested) and stores the result in solution_idx.
+    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& input_shapes)
+    {
+#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
+        // Tune only if MIGRAPHX_ENABLE_GEMM_TUNING is enabled or the context asks for it.
+        if(not enabled(MIGRAPHX_ENABLE_GEMM_TUNING{}) and not ctx.get_exhaustive_tune_flag())
+            return;
+        // "gpu::gemm" forwards alpha/beta as stored; any other name converts them to int32_t.
+        if(this->name() == "gpu::gemm")
+            solution_idx = gemm_finalize(
+                ctx, output_shape, input_shapes, alpha, beta, compute_fp32, solution_idx);
+        else
+            solution_idx = gemm_finalize(ctx,
+                                         output_shape,
+                                         input_shapes,
+                                         int32_t(alpha),
+                                         int32_t(beta),
+                                         compute_fp32,
+                                         solution_idx);
+#else
+        // Without the tuning API there is nothing to do; mark the parameters as used.
+        (void)ctx;
+        (void)output_shape;
+        (void)input_shapes;
+#endif
+    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -24,28 +24,64 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP

+#include <iterator>
 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/context.hpp>

+// Set this environment variable to "true" to perform GEMM tuning even when the
+// --exhaustive-tune option isn't set.  Can be used to skip slow convolution tuning.
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_GEMM_TUNING);
+
+using milliseconds = std::chrono::duration<double, std::milli>;
+using microseconds = std::chrono::duration<double, std::micro>;
+
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-void gemm(context& ctx,
-          const shape& output_shape,
-          const std::vector<argument>& args,
-          float alpha,
-          float beta,
-          bool int8_x4_format,
-          bool compute_fp32);
-void gemm(context& ctx,
-          const shape& output_shape,
-          const std::vector<argument>& args,
-          int32_t alpha,
-          int32_t beta,
-          bool int8_x4_format,
-          bool compute_fp32);
+/**
+ * @brief Templated implementations of the compute() and finalize() methods of the Gemm operator.
+ *        For each function there are overloads using either float or int32_t for the arguments
+ *        alpha and beta.
+ *
+ * @param ctx GPU context the call is made with.
+ * @param output_shape Shape of the GEMM output.
+ * @param args Arguments of the operator's compute() call (gemm_compute only); gemm_finalize
+ *             takes the input shapes instead.
+ * @param alpha Scalar α in C = αAB + βC.
+ * @param beta Scalar β in C = αAB + βC.
+ * @param compute_fp32 Value of rocblas_gemm::compute_fp32; presumably selects fp32
+ *                     computation -- confirm against the implementation.
+ * @param solution_idx Value of rocblas_gemm::solution_idx (0 by default); gemm_finalize returns
+ *                     the value that rocblas_gemm::finalize() stores back into it.
+ */
+void gemm_compute(context& ctx,
+                  const shape& output_shape,
+                  const std::vector<argument>& args,
+                  float alpha,
+                  float beta,
+                  bool compute_fp32,
+                  int32_t solution_idx);
+
+void gemm_compute(context& ctx,
+                  const shape& output_shape,
+                  const std::vector<argument>& args,
+                  int32_t alpha,
+                  int32_t beta,
+                  bool compute_fp32,
+                  int32_t solution_idx);
+
+// Float overload of gemm_finalize; returns the solution index that rocblas_gemm::finalize()
+// stores in its solution_idx member.
+// solution_idx mirrors the int32_t overload below.  rocblas_gemm::finalize() passes it for
+// both alpha/beta types; without this parameter a seven-argument call with float alpha/beta
+// could only bind to the int32_t overload, implicitly converting both scalars to int32_t.
+// The default keeps six-argument calls compiling.
+int32_t gemm_finalize(context& ctx,
+                      const shape& output_shape,
+                      const std::vector<shape>& input_shapes,
+                      float alpha,
+                      float beta,
+                      bool compute_fp32,
+                      int32_t solution_idx = 0);
+
+int32_t gemm_finalize(context& ctx,
+                      const shape& output_shape,
+                      const std::vector<shape>& input_shapes,
+                      int32_t alpha,
+                      int32_t beta,
+                      bool compute_fp32,
+                      int32_t solution_idx);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
+#define MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
+
+#include <migraphx/make_op.hpp>
+#include <migraphx/check_shapes.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// Operator named "gpu::gemm_softmax_gemm".  Its output shape is obtained by applying `op` to
+// inputs 0 and 1 and then applying `op` again to that result and input 2.
+struct gemm_softmax_gemm
+{
+    // Operation used to derive the output shape; "dot" by default.
+    operation op = make_op("dot");
+    // Reflected attribute; not used by the shape computation in this struct.
+    float scale  = 1.0;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.op, "op"), f(self.scale, "scale"));
+    }
+
+    std::string name() const { return "gpu::gemm_softmax_gemm"; }
+
+    // Throws unless one of the (up to) three last strides of s equals 1.
+    void check_gemm_shape(const shape& s) const
+    {
+        const auto& strides = s.strides();
+        // With fewer than three dimensions rbegin() + 3 would step past rend(), so the
+        // inspected range is clamped to the strides that actually exist.
+        const auto last = strides.size() < 3 ? strides.rend() : strides.rbegin() + 3;
+        if(not contains(range(strides.rbegin(), last), 1))
+            MIGRAPHX_THROW("Invalid shape for " + name());
+    }
+
+    // Validates the inputs and returns the shape of op(op(inputs[0], inputs[1]), inputs[2]).
+    // Throws when the inputs differ in number of dimensions, when fewer than three inputs
+    // are given, or when any input fails check_gemm_shape().
+    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>&) const
+    {
+        check_shapes{inputs, *this}.same_ndims();
+        if(inputs.size() < 3)
+            MIGRAPHX_THROW(name() + ": Expected 3 inputs but got " + to_string(inputs.size()));
+        const auto& a  = inputs[0];
+        const auto& b  = inputs[1];
+        const auto& b1 = inputs[2];
+        for(const auto& input : inputs)
+        {
+            check_gemm_shape(input);
+        }
+        return op.compute_shape({op.compute_shape({a, b}), b1});
+    }
+
+    // Only half precision is listed as supported.
+    static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); }
+};
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
--- a/src/targets/gpu/include/migraphx/gpu/miopen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/miopen.hpp
@@ -127,7 +127,7 @@ inline void set_tensor_descriptor(miopenTensorArgumentId_t name,
 }
 #endif

-inline tensor_descriptor make_tensor(const migraphx::shape& os, bool pack = false)
+inline tensor_descriptor make_tensor(const migraphx::shape& os)
 {
    auto s = os.normalize_standard();
    auto t = make_obj<tensor_descriptor>(&miopenCreateTensorDescriptor);
@@ -142,23 +142,9 @@ inline tensor_descriptor make_tensor(const migraphx::shape& os, bool pack = fals
    else if(s.type() == shape::int32_type)
        d = miopenInt32;
    else if(s.type() == shape::int8_type)
-    {
-        if(pack)
-        {
-            // update the lens and corresponding strides
-            d          = miopenInt8x4;
-            lens[1]    = ((lens[1] + 3) / 4) * 4;
-            strides[0] = strides[1] * lens[1];
-        }
-        else
-        {
-            d = miopenInt8;
-        }
-    }
+        d = miopenInt8;
    else
-    {
        MIGRAPHX_THROW("MAKE_TENSOR: unsupported type");
-    }
    miopenSetTensorDescriptor(t.get(), d, s.lens().size(), lens.data(), strides.data());

    return t;

--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
@@ -49,7 +49,8 @@ MIGRAPHX_GPU_EXPORT instruction_ref insert_mlir(module& m,

 MIGRAPHX_GPU_EXPORT tuning_config get_tuning_config_mlir(const context& migraphx_ctx,
                                                         module m,
-                                                         const std::vector<shape>& inputs);
+                                                         const std::vector<shape>& inputs,
+                                                         bool exhaustive);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
 #define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <string>

 namespace migraphx {
@@ -34,7 +34,7 @@ struct module_pass_manager;

 namespace gpu {

-struct prefuse_ops
+struct MIGRAPHX_GPU_EXPORT prefuse_ops
 {
    std::string name() const { return "gpu::prefuse_ops"; }
    void apply(module_pass_manager& mpm) const;

--- a/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -40,8 +40,6 @@ struct context;

 MIGRAPHX_GPU_EXPORT bool get_compute_fp32_flag();

-MIGRAPHX_GPU_EXPORT bool get_int8_x4_format(context& ctx);
-
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/time_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/time_op.hpp
@@ -32,7 +32,7 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-MIGRAPHX_GPU_EXPORT std::pair<double, double>
+MIGRAPHX_GPU_EXPORT double
 time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);

 } // namespace gpu