Merge branch 'develop' into dyn_resize_gather

0662a9a3 · Brian Pickrell · GitHub · b74d3a8f · 35e5298e · 0662a9a3
Unverified Commit 0662a9a3 authored Nov 09, 2023 by Brian Pickrell Committed by GitHub Nov 09, 2023
20 changed files
--- a/src/onnx/parse_multinomial.cpp
+++ b/src/onnx/parse_multinomial.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -41,6 +41,9 @@ struct parse_multinomial : op_parser<parse_multinomial>
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
+        if(args.empty())
+            MIGRAPHX_THROW("PARSE_MULTINOMIAL: no arguments given");
        int dtype = 6;
        if(contains(info.attributes, "dtype"))
            dtype = info.attributes.at("dtype").i();
@@ -49,35 +52,90 @@ struct parse_multinomial : op_parser<parse_multinomial>
        size_t sample_size = 1;
        if(contains(info.attributes, "sample_size"))
            sample_size = info.attributes.at("sample_size").i();
+        else
+            MIGRAPHX_THROW("PARSE_MULTINOMIAL: sample_size not given");
+        // Use logarithmic math to scale probabilities while avoiding division by very
+        // small numbers.  Scaling by the maximum makes very tiny ranges more
+        // tractable; any constant factor gives an equivalent distribution, since the
+        // Multinomial op normalizes at runtime.
        // Subtract the per-batch maximum log-probability, making the per-batch max 0
        auto maxes =
            info.add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), args[0]);
-        auto mb_maxes = info.add_instruction(
+        auto cdf = info.add_common_op("sub", args[0], maxes);
-            migraphx::make_op("multibroadcast", {{"out_lens", args[0]->get_shape().lens()}}),
-            maxes);
-        auto cdf = info.add_instruction(migraphx::make_op("sub"), args[0], mb_maxes);
        // Take the element-wise exponent to get probabilities in the range (0, 1]
        cdf = info.add_instruction(migraphx::make_op("exp"), cdf);
-        // Compute the cumulative density function
+        // Compute the cumulative distribution function
        cdf = info.add_instruction(
            migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);
-        // Pre-compute random distribution
+        instruction_ref seed_input;
-        std::mt19937 gen(std::chrono::high_resolution_clock::now().time_since_epoch().count());
        if(contains(info.attributes, "seed"))
-            gen.seed(info.attributes.at("seed").f());
+        {
+            float seed = info.attributes.at("seed").f();
+            migraphx::shape s{migraphx::shape::float_type, {1}};
+            std::vector<float> data = {seed};
+            seed_input              = info.add_literal(migraphx::literal(s, data));
+        }
+        else
+        {
+            seed_input = info.add_instruction(migraphx::make_op("random_seed"));
+        }
+        instruction_ref randoms;
+        shape s0 = args[0]->get_shape();
+        if(s0.dynamic())
+        {
+            //  Dynamic batch_size will be taken from args[0].  The input argument to this should
+            // have a second dimension of sample_size.
+            std::vector<shape::dynamic_dimension> dyn_dim_set;
+            dyn_dim_set.emplace_back(s0.dyn_dims().front());
+            dyn_dim_set.emplace_back(shape::dynamic_dimension{sample_size, sample_size});
+            // read the input dimensions
+            auto dim_of =
+                info.add_instruction(migraphx::make_op("dimensions_of", {{"end", 2}}), args[0]);
+            // The next two operations insert the value sample_size into the second array position
+            // make an argument of (1, 0)
+            shape s(shape::int64_type, {2});
+            std::vector<int64_t> data1{1, 0};
+            auto l1        = info.add_literal(s, data1);
+            auto batch_arg = info.add_instruction(migraphx::make_op("mul"), dim_of, l1);
+            std::vector<int64_t> data2(2, 0);
+            // make an argument of (0, sample_size)
+            data2[1]         = sample_size;
+            auto l2          = info.add_literal(s, data2);
+            auto alloc_shape = info.add_instruction(migraphx::make_op("add"), batch_arg, l2);
+            // alloc_shape should contain the input-based shape dimensions as its values at runtime,
+            // and its own shape is {2}
+            // compile_shape is the shape used when compiling the Allocate op, and may be dynamic
+            migraphx::shape compile_shape =
+                migraphx::shape(s0.type(), {s0.dyn_dims().front(), {sample_size, sample_size}});
-        std::uniform_real_distribution<> dis(0.0, 1.0);
+            // Allocate on-device storage for the random values
-        size_t batch_size = args[0]->get_shape().lens().front();
+            auto alloc = info.add_instruction(
-        migraphx::shape dist_shape{migraphx::shape::float_type, {batch_size, sample_size}};
+                migraphx::make_op("allocate", {{"shape", to_value(compile_shape)}}), alloc_shape);
+            randoms = info.add_instruction(migraphx::make_op("random_uniform"), seed_input, alloc);
+        }
+        else
+        {
+            // Use a literal.  The array populated by random_uniform may have any shape, as long
+            // as its number of elements is batch_size * sample_size.
+            size_t batch_size = s0.lens().front();
+            auto rand_dummy   = info.add_literal(
+                migraphx::literal{migraphx::shape::float_type, {batch_size * sample_size}});
-        std::vector<float> random_dist(batch_size * sample_size);
+            randoms =
-        std::generate(random_dist.begin(), random_dist.end(), [&]() { return dis(gen); });
+                info.add_instruction(migraphx::make_op("random_uniform"), seed_input, rand_dummy);
-        auto dist_lit = info.add_literal(migraphx::literal{dist_shape, random_dist});
+        }
        return info.add_instruction(
-            migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, dist_lit);
+            migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, randoms);
    }
 };

--- a/src/onnx/parse_resize.cpp
+++ b/src/onnx/parse_resize.cpp
@@ -181,6 +181,76 @@ static std::string get_nearest_mode(const onnx_parser::attribute_map& attr)
    return nearest_mode;
 }
+static std::vector<double> get_scales(const onnx_parser::attribute_map& attr) // reads the optional "scales" attribute as doubles
+{
+    std::vector<double> scales; // returned empty when attr has no "scales" entry
+    if(contains(attr, "scales"))
+    {
+        copy(attr.at("scales").floats(), std::back_inserter(scales)); // append every value of the attribute's float list
+    }
+    return scales;
+}
+static void parse_args(const std::vector<instruction_ref>& args, // inputs to scan; "undefined" entries and args.front() are skipped
+                       const std::vector<size_t>& in_lens, // compared against out_lens; divisor for the computed scales
+                       const std::string& op_name, // inserted into error messages after "PARSE_"
+                       std::vector<double>& vec_scale, // output: overwritten with scale values when found
+                       std::vector<std::size_t>& out_lens) // output: overwritten when an int64 input is found
+{
+    for(const auto& arg : args)
+    {
+        if(arg->name() == "undefined" or arg == args.front()) // skip placeholder inputs and the first argument
+        {
+            continue;
+        }
+        // skip inputs whose shape has no dimensions
+        auto lens = arg->get_shape().lens();
+        if(lens.empty())
+        {
+            continue;
+        }
+        auto type = arg->get_shape().type();
+        // an int64 input is treated as the requested output size
+        if(type == shape::int64_type)
+        {
+            auto arg_out_s = arg->eval(); // presumably empty when the input is not a parse-time constant -- confirm
+            check_arg_empty(arg_out_s,
+                            "PARSE_" + op_name + ": dynamic output size is not supported!");
+            arg_out_s.visit([&](const auto& ol) { out_lens.assign(ol.begin(), ol.end()); }); // copy the evaluated values into out_lens
+            if(out_lens.size() != in_lens.size()) // output size must have one entry per input dimension
+            {
+                MIGRAPHX_THROW("PARSE_" + op_name +
+                               ": specified output size does not match input size");
+            }
+            // per-dimension scale = output length / input length
+            vec_scale.resize(in_lens.size());
+            std::transform(in_lens.begin(),
+                           in_lens.end(),
+                           out_lens.begin(),
+                           vec_scale.begin(),
+                           [](auto iss, auto oss) { return 1.0 * oss / iss; }); // 1.0 * forces floating-point division
+        }
+        else
+        {
+            // any other type is treated as a scales input
+            if(lens[0] == in_lens.size()) // used only when its first dimension equals the number of input dimensions
+            {
+                auto arg_scale = arg->eval(); // presumably empty when the input is not a parse-time constant -- confirm
+                check_arg_empty(arg_scale,
+                                "PARSE_" + op_name + ": dynamic input scale is not supported!");
+                arg_scale.visit([&](const auto& v) { vec_scale.assign(v.begin(), v.end()); }); // replaces any previous contents of vec_scale
+            }
+        }
+    }
+}
 struct parse_resize : op_parser<parse_resize>
 {
    std::vector<op_desc> operators() const { return {{"Resize"}, {"Upsample"}}; }
@@ -309,7 +379,7 @@ struct parse_resize : op_parser<parse_resize>
        std::vector<size_t> out_lens(in_s.ndim());
        // scale
-        std::vector<double> vec_scale;
+        std::vector<double> vec_scale = get_scales(info.attributes);
        // Look at inputs and infer either output size or scale, depending on input type
        for(const auto& arg : args)
@@ -381,8 +451,6 @@ struct parse_resize : op_parser<parse_resize>
            }
        }
-        // Dynamic batch:  Only args[0] can have a dynamic shape, only the 0'th
-        // dimension--batch size--can be non-fixed, and the only resize mode allowed is "nearest"
        if(args[0]->get_shape().dynamic())
        {
            return dynamic_nearest_parse(out_lens, vec_scale, opd, info, args);

--- a/src/onnx/parse_slice.cpp
+++ b/src/onnx/parse_slice.cpp
@@ -46,6 +46,9 @@ struct parse_slice : op_parser<parse_slice>
        void always_insert(instruction_ref arg) { op_args.insert(op_args.begin(), arg); }
+        /**
+         * Either insert argument into `this->op_args` or return the constant value of the argument
+         */
        std::vector<int64_t> insert(instruction_ref arg)
        {
            std::vector<int64_t> result;
@@ -137,23 +140,22 @@ struct parse_slice : op_parser<parse_slice>
        sd.always_insert(args.at(0));
        // If axes arg is not given, the default is all of them.
-        if(sd.op.axes.empty() and sd.op_args.size() < 3)
+        if(sd.op.axes.empty() and sd.op_args.size() <= 3)
        {
            std::vector<int64_t> axes(args[0]->get_shape().ndim());
            std::iota(axes.begin(), axes.end(), int64_t{0});
            sd.op.axes = axes;
        }
-        if(not sd.steps.empty())
+        if(std::any_of(sd.steps.begin(), sd.steps.end(), [](auto s) { return s != 1; }))
        {
            if(sd.op.starts.empty() or sd.op.ends.empty())
-                MIGRAPHX_THROW("PARSE_SLICE: steps and variable starts and ends is not supported");
+                MIGRAPHX_THROW(
+                    "PARSE_SLICE: steps and variable starts and/or ends is not supported");
            if(sd.op.axes.empty())
                MIGRAPHX_THROW("PARSE_SLICE: steps and variable axes is not supported");
        }
-        assert(sd.steps.empty() or sd.steps.size() == sd.op.axes.size());
        // If any axes have negative step, prepare to add a "reverse" op
        for(auto i : range(sd.steps.size()))
        {

--- a/src/onnx/parse_split.cpp
+++ b/src/onnx/parse_split.cpp
@@ -68,13 +68,34 @@ struct parse_split : op_parser<parse_split>
        // no split attribute, input is equally divided
        else
        {
-            if((lens[tuned_axis] % info.num_outputs) != 0)
+            std::size_t num_outputs = info.num_outputs;
+            // the num_outputs attribute seems to be redundant since we already have
+            // node_info::num_outputs, but we can still perform an error check
+            if(contains(info.attributes, "num_outputs"))
            {
-                MIGRAPHX_THROW("PARSE_SPLIT: input cannot be equally divided into " +
+                num_outputs =
-                               std::to_string(info.num_outputs) + " splits!");
+                    parser.parse_value(info.attributes.at("num_outputs")).at<std::size_t>();
+                if(num_outputs != info.num_outputs)
+                {
+                    MIGRAPHX_THROW("PARSE_SPLIT: num_outputs attribute " +
+                                   std::to_string(num_outputs) +
+                                   " doesn't match actual number of outputs " +
+                                   std::to_string(info.num_outputs) + "!");
+                }
+            }
+            if(lens[tuned_axis] % num_outputs == 0)
+            {
+                std::size_t chunk_size = lens[tuned_axis] / num_outputs;
+                vec_splits.resize(num_outputs, chunk_size);
+            }
+            else
+            {
+                std::size_t chunk_size      = lens[tuned_axis] / num_outputs + 1;
+                std::size_t last_chunk_size = lens[tuned_axis] - chunk_size * (num_outputs - 1);
+                vec_splits.resize(num_outputs - 1, chunk_size);
+                vec_splits.push_back(last_chunk_size);
            }
-            auto dl = lens[tuned_axis] / info.num_outputs;
-            vec_splits.resize(info.num_outputs, dl);
        }
        if(std::accumulate(vec_splits.begin(), vec_splits.end(), int64_t(0)) !=

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -936,7 +936,7 @@ void program::perf_report(std::ostream& os,
    os << std::endl;
    os << "Batch size: " << batch << std::endl;
-    os << "Rate: " << rate * batch << "inferences/sec" << std::endl;
+    os << "Rate: " << rate * batch << " inferences/sec" << std::endl;
    os << "Total time: " << total_time << "ms" << std::endl;
    os << "Total instructions time: " << total_instruction_time << "ms" << std::endl;
    os << "Overhead time: " << overhead_time << "ms"

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -472,7 +472,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
               map_dyn_input_dims,
           bool skip_unknown_operators,
           bool print_program_on_error,
-           int64_t max_loop_iterations) {
+           int64_t max_loop_iterations,
+           int64_t limit_max_iterations) {
            migraphx::onnx_options options;
            options.default_dim_value      = default_dim_value;
            options.default_dyn_dim_value  = default_dyn_dim_value;
@@ -481,6 +482,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
            options.skip_unknown_operators = skip_unknown_operators;
            options.print_program_on_error = print_program_on_error;
            options.max_loop_iterations    = max_loop_iterations;
+            options.limit_max_iterations   = limit_max_iterations;
            return migraphx::parse_onnx(filename, options);
        },
        "Parse onnx file",
@@ -492,7 +494,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
            std::unordered_map<std::string, std::vector<migraphx::shape::dynamic_dimension>>(),
        py::arg("skip_unknown_operators") = false,
        py::arg("print_program_on_error") = false,
-        py::arg("max_loop_iterations")    = 10);
+        py::arg("max_loop_iterations")    = 10,
+        py::arg("limit_max_iterations")   = std::numeric_limits<uint16_t>::max());
    m.def(
        "parse_onnx_buffer",

--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -147,8 +147,8 @@ void quantize_int8(program& prog,
    run_passes(prog,
               {quantize_int8_pass{ins_names, *int8_quant_params},
-                optimize_module{},
                simplify_qdq{},
+                optimize_module{},
                dead_code_elimination{}});
 }

--- a/src/rewrite_quantization.cpp
+++ b/src/rewrite_quantization.cpp
@@ -47,7 +47,7 @@ void apply_quantizelinear(module& m, instruction_ref ins)
            ins, make_op("convert", {{"target_type", y_scale->get_shape().type()}}), x);
    }
    auto div            = m.insert_instruction(ins, make_op("div"), x, y_scale);
-    auto add_zero_point = m.insert_instruction(ins, make_op("round"), div);
+    auto add_zero_point = m.insert_instruction(ins, make_op("nearbyint"), div);
    if(ins->inputs().size() == 3)
    {

--- a/src/simplify_dyn_ops.cpp
+++ b/src/simplify_dyn_ops.cpp
@@ -24,6 +24,7 @@
 #include <migraphx/simplify_dyn_ops.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/literal.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -131,10 +132,53 @@ struct find_const_4in_slice
    }
 };
+/**
+ * Simplify dimensions_of to a literal when the input argument has a static shape
+ * or the dynamic dimensions from `start` to `end` are fixed.
+ */
+struct find_static_dimensions_of
+{
+    auto matcher() const { return match::name("dimensions_of")(); } // matches instructions named "dimensions_of"
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto ins                 = mr.result;
+        auto input               = ins->inputs().at(0);
+        auto dimensions_of_value = ins->get_operator().to_value(); // operator attributes as a value
+        auto start               = dimensions_of_value.at("start").to<std::size_t>();
+        auto end                 = dimensions_of_value.at("end").to<std::size_t>();
+        if(input->get_shape().dynamic())
+        {
+            // bail out unless every dynamic dimension in [start, end) is fixed
+            auto dds = input->get_shape().dyn_dims();
+            if(std::any_of(dds.begin() + start, dds.begin() + end, [](auto dd) {
+                   return not dd.is_fixed();
+               }))
+            {
+                return; // leave the instruction unchanged
+            }
+        }
+        std::size_t output_ndim = end - start; // NOTE(review): unsigned subtraction wraps if start > end -- presumably the op guarantees start <= end; confirm
+        std::vector<int64_t> vec_shape(output_ndim);
+        migraphx::shape s(migraphx::shape::int64_type, {output_ndim}); // NOTE(review): `s` appears unused; output_shape below is built from the same values
+        std::vector<std::size_t> input_lens = input->get_shape().to_static(1).lens(); // presumably fixed dims keep their fixed value regardless of the argument -- confirm
+        std::transform(input_lens.begin() + start,
+                       input_lens.begin() + end,
+                       vec_shape.begin(),
+                       [](auto i) { return int64_t(i); }); // convert each selected length to int64_t
+        migraphx::shape output_shape{migraphx::shape::int64_type, {end - start}}; // 1-D int64 shape with end - start elements
+        auto lit_ins = m.add_literal(migraphx::literal{output_shape, vec_shape});
+        m.replace_instruction(ins, lit_ins); // substitute the literal for the dimensions_of instruction
+    }
+};
 void simplify_dyn_ops::apply(module& m) const
 {
-    match::find_matches(
+    match::find_matches(m,
-        m, find_static_2in_broadcasts{}, find_const_3in_slice{}, find_const_4in_slice{});
+                        find_static_2in_broadcasts{},
+                        find_static_dimensions_of{},
+                        find_const_3in_slice{},
+                        find_const_4in_slice{});
 }
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -647,8 +647,8 @@ struct find_broadcast_transpose
    {
        auto transpose      = r.result;
        auto transpose_lens = transpose->get_shape().lens();
-        auto bcast_ins = r.instructions["bcast_ins"];
+        auto bcast_ins      = r.instructions["bcast_ins"];
-        auto input     = bcast_ins->inputs().front();
+        auto input          = bcast_ins->inputs().front();
        // scalar transformation does not need extra transpose
        if(not input->get_shape().scalar())
        {

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
 # ####################################################################################
 # The MIT License (MIT)
 #
-# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -37,8 +37,7 @@ if(NOT TARGET MIOpen)
    message(SEND_ERROR "Cant find miopen")
 endif()
-if(NOT WIN32)
+if(MIGRAPHX_USE_COMPOSABLEKERNEL)
-    # TODO: re-enable when CK is ported to Windows
    find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library)
 endif()
@@ -52,10 +51,10 @@ file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
 message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
-if(WIN32)
+if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
-    # TODO: re-enable when CK is ported to Windows
    list(REMOVE_ITEM KERNEL_FILES
        ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm.hpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
        ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck.hpp)
 endif()
@@ -103,9 +102,10 @@ rocm_clang_tidy_check(kernel_file_check)
 file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
-if(WIN32)
+if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
-    # TODO: re-enable when CK is ported to Windows
+    list(REMOVE_ITEM JIT_GPU_SRCS
-    list(REMOVE_ITEM JIT_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp)
+            ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp
+            ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm_softmax_gemm.cpp)
 endif()
 add_library(migraphx_gpu
@@ -128,8 +128,6 @@ add_library(migraphx_gpu
    gather.cpp
    gemm_impl.cpp
    hip.cpp
-    int8_conv_pack.cpp
-    int8_gemm_pack.cpp
    kernel.cpp
    lowering.cpp
    logsoftmax.cpp
@@ -140,7 +138,6 @@ add_library(migraphx_gpu
    no_device.cpp
    nonzero.cpp
    pack_args.cpp
-    pack_int8_args.cpp
    prefuse_ops.cpp
    pad.cpp
    perfdb.cpp
@@ -184,7 +181,6 @@ register_migraphx_gpu_ops(hip_
 register_migraphx_gpu_ops(miopen_
    abs
    contiguous
-    int8_conv_pack
    lrn
    pooling
 )
@@ -192,10 +188,6 @@ register_op(migraphx_gpu
    HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
    OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
    INCLUDES migraphx/gpu/context.hpp)
-register_op(migraphx_gpu
-    HEADER migraphx/gpu/int8_gemm_pack.hpp
-    OPERATORS gpu::hip_int8_gemm_pack_a gpu::hip_int8_gemm_pack_b
-    INCLUDES migraphx/gpu/context.hpp)
 register_op(migraphx_gpu
    HEADER migraphx/gpu/gemm.hpp
    OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
@@ -239,24 +231,28 @@ else()
        string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
    endforeach()
-    message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
+    message(STATUS "Hip compiler flags: \"${HIP_COMPILER_FLAGS}\"")
    target_compile_definitions(migraphx_gpu PRIVATE
-        "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
+        -DMIGRAPHX_HIP_COMPILER="${CMAKE_CXX_COMPILER}"
-        "-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}"
+        -DMIGRAPHX_HIP_COMPILER_FLAGS="${HIP_COMPILER_FLAGS}"
    )
    if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
        execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
        string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
-        target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
+        target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_HIP_COMPILER_LAUNCHER="${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
    endif()
 endif()
 # Check miopen find mode api
 include(CheckLibraryExists)
 get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
+get_target_property(ROCBLAS_LOCATION roc::rocblas LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
 check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
+# Beta API for automated GEMM tuning
+check_library_exists(roc::rocblas "rocblas_gemm_ex_get_solutions" "${ROCBLAS_LOCATION}" HAS_ROCBLAS_TUNING_BETA_FEATURE_API)
 set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")
@@ -279,10 +275,16 @@ else()
    message(STATUS "MIOpen does not have find mode api")
 endif()
+if(HAS_ROCBLAS_TUNING_BETA_FEATURE_API)
+    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_TUNING_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS)
+    message(STATUS "MIGraphx is using Beta API of rocBLAS")
+else()
+    message(STATUS "rocBLAS does not have User Tuning Beta API")
+endif()
 target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
 target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
-if(NOT WIN32)
+if(MIGRAPHX_USE_COMPOSABLEKERNEL)
-    # TODO: re-enable when CK is ported to Windows
    target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library)
 endif()

--- a/src/targets/gpu/compile_hip.cpp
+++ b/src/targets/gpu/compile_hip.cpp
@@ -284,16 +284,20 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
 bool is_hip_clang_compiler()
 {
-    static const auto result = ends_with(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER), "clang++");
+    static const auto result = fs::path{MIGRAPHX_HIP_COMPILER}.stem() == "clang++";
    return result;
 }
+#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
 bool has_compiler_launcher()
 {
-    static const auto result = fs::exists(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER));
+    static const auto result = fs::exists(MIGRAPHX_HIP_COMPILER_LAUNCHER);
    return result;
 }
+#endif
 src_compiler assemble(src_compiler compiler)
 {
    compiler.out_ext = ".S";
@@ -306,8 +310,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 {
    assert(not srcs.empty());
    if(not is_hip_clang_compiler())
-        MIGRAPHX_THROW("Unknown hip compiler: " +
+        MIGRAPHX_THROW("Unknown hip compiler: " MIGRAPHX_HIP_COMPILER);
-                       std::string(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER)));
    if(params.find("-std=") == std::string::npos)
        params += " --std=c++17";
@@ -323,14 +326,14 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
        params += " -DMIGRAPHX_DEBUG";
    params += " -Wno-unused-command-line-argument -Wno-cuda-compat ";
-    params += MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_FLAGS);
+    params += MIGRAPHX_HIP_COMPILER_FLAGS;
    src_compiler compiler;
    compiler.flags    = params;
-    compiler.compiler = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER);
+    compiler.compiler = MIGRAPHX_HIP_COMPILER;
 #ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
    if(has_compiler_launcher())
-        compiler.launcher = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER);
+        compiler.launcher = MIGRAPHX_HIP_COMPILER_LAUNCHER;
 #endif
    if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
    {
@@ -354,7 +357,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 bool hip_has_flags(const std::vector<std::string>& flags)
 {
    src_compiler compiler;
-    compiler.compiler = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER);
+    compiler.compiler = MIGRAPHX_HIP_COMPILER;
    compiler.flags =
        join_strings(flags, " ") + " -x hip -c --offload-arch=gfx900 --cuda-device-only";

--- a/src/targets/gpu/compile_miopen.cpp
+++ b/src/targets/gpu/compile_miopen.cpp
@@ -60,9 +60,8 @@ struct miopen_op
 };
 MIGRAPHX_REGISTER_OP(miopen_op);
-std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool format) const
+std::size_t compile_miopen::compile(operation& op, instruction_ref ins) const
 {
-    op.from_value({{"int8_x4_format", format}});
    auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs()));
    return v.get<std::size_t>("workspace", 0);
 }
@@ -70,25 +69,15 @@ std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool for
 void compile_miopen::apply(module& m) const
 {
    assert(ctx);
-    const bool int8_x4_format = get_int8_x4_format(any_cast<migraphx::gpu::context>(*ctx));
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "gpu::miopen_op")
            continue;
        auto op        = any_cast<miopen_op>(ins->get_operator()).op;
        std::size_t ws = 0;
-        try
+        ws             = compile(op, ins);
-        {
+        auto inputs    = ins->inputs();
-            // for the regular convolution and convolution_backwards, this try would always succeed
+        auto alloc     = m.insert_instruction(
-            ws = compile(op, ins, int8_x4_format);
-        }
-        catch(migraphx::exception&)
-        {
-            // In case no solver supports the default format, retry using the other format.
-            ws = compile(op, ins, not int8_x4_format);
-        }
-        auto inputs = ins->inputs();
-        auto alloc  = m.insert_instruction(
            ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}}));
        inputs.insert(std::prev(inputs.end()), alloc);

--- a/src/targets/gpu/compile_ops.cpp
+++ b/src/targets/gpu/compile_ops.cpp
@@ -168,6 +168,7 @@ struct compile_plan
    }
    const compiled_result& benchmark(problem_cache& pc) const
    {
+        const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{});
        if(results.empty())
            MIGRAPHX_THROW("No configs to tune");
        if(results.size() == 1)
@@ -178,9 +179,10 @@ struct compile_plan
        }
        if(not config)
            MIGRAPHX_THROW("Multiple kernels without config");
-        std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
+        if(trace_level > 0)
-                  << std::endl;
+            std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
-        if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+                      << std::endl;
+        if(trace_level > 1)
            std::cout << "Problem: " << config->problem << std::endl;
        std::vector<double> times;
        times.reserve(results.size());
@@ -189,22 +191,23 @@ struct compile_plan
                       config->solutions.begin(),
                       std::back_inserter(times),
                       [&](const auto& cr, const auto& solution) {
-                           if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+                           if(trace_level > 1)
                               std::cout << "Benchmarking solution: " << solution << std::endl;
                           if(not cr.has_value())
                           {
-                               if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+                               if(trace_level > 1)
                                   std::cout << "No binary" << std::endl;
                               return std::numeric_limits<double>::max();
                           }
                           auto t = time_op(
                               *ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20);
-                           if(enabled(MIGRAPHX_TRACE_BENCHMARKING{}))
+                           if(trace_level > 1)
                               std::cout << t << "ms" << std::endl;
                           return t;
                       });
        auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
-        std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;
+        if(trace_level > 0)
+            std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;
        pc.insert(preop.name(), config->problem, config->solutions.at(i));
        if(not results[i].has_value())
            MIGRAPHX_THROW("No valid tuned compilation.");

--- a/src/targets/gpu/device/int8_gemm_pack.cpp
+++ b/src/targets/gpu/device/int8_gemm_pack.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/shape.hpp>
-#include <migraphx/argument.hpp>
-#include <migraphx/gpu/device/int8_gemm_pack.hpp>
-#include <migraphx/gpu/device/launch.hpp>
-#include <migraphx/gpu/device/types.hpp>
-#include <migraphx/gpu/device/tensor.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
-{
-    auto comp_shape    = arg.get_shape();
-    auto out_lens      = comp_shape.lens();
-    auto dim_0         = out_lens.size() - 2;
-    auto dim_1         = out_lens.size() - 1;
-    std::size_t lda    = comp_shape.strides()[dim_0];
-    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
-    visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = comp_shape.elements();
-        auto* out_ptr         = device_cast(output.data());
-        auto* in_ptr          = device_cast(input.data());
-        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
-            hip_tensor_descriptor<out_dim> desc(comp_shape);
-            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-                const size_t nb    = 4;
-                auto idx           = desc.multi(ii);
-                std::size_t i_m    = idx[dim_1];
-                std::size_t i_k    = idx[dim_0];
-                std::size_t offset = ii / m_size * m_size;
-                out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
-                    in_ptr[i_m + i_k * lda + offset];
-            });
-        });
-    });
-}
-void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
-{
-    auto trans_shape = arg.get_shape();
-    auto out_lens    = trans_shape.lens();
-    auto dim_0       = trans_shape.lens().size() - 2;
-    auto dim_1       = trans_shape.lens().size() - 1;
-    std::size_t ldb  = trans_shape.strides()[dim_1];
-    auto wrap_lens = out_lens;
-    std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
-    shape comp_shape{trans_shape.type(), wrap_lens};
-    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
-    visit_all(result, arg)([&](auto output, auto input) {
-        std::size_t nelements = comp_shape.elements();
-        auto* out_ptr         = device_cast(output.data());
-        auto* in_ptr          = device_cast(input.data());
-        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
-            hip_tensor_descriptor<out_dim> desc(comp_shape);
-            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
-                const size_t nb    = 4;
-                auto idx           = desc.multi(ii);
-                std::size_t i_n    = idx[dim_1];
-                std::size_t i_k    = idx[dim_0];
-                std::size_t offset = ii / m_size * m_size;
-                out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
-                    in_ptr[i_n + i_k * ldb + offset];
-            });
-        });
-    });
-}
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/gemm_impl.cpp
+++ b/src/targets/gpu/gemm_impl.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,15 +21,20 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
 #include <rocblas/rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
 #include <migraphx/reduce_dims.hpp>
-#include <migraphx/permutation.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/time.hpp>
+using microseconds = std::chrono::duration<double, std::micro>;
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
+// Convert MIGraphX data types to the equivalent rocBLAS datatypes
 rocblas_datatype get_type(shape::type_t type)
 {
    switch(type)
@@ -81,196 +86,508 @@ shape transpose_batch(const shape& s, unsigned trans_batch)
    return shape::from_permutation(s.type(), s.lens(), perm);
 }
-template <class R, class... Ts, class... Us>
+/**
-R rocblas_invoke(R (*f)(Ts...), Us... xs)
+ * Calls f with the arguments packed in p followed by xs.  Returns rocblas_status_success,
+ * rocblas_status_perf_degraded (after printing a warning), or rocblas_status_invalid_value;
+ * caller is expected to check for invalid index.  Any other result causes an exception.
+ *
+ */
+template <class F, class Pack, class... Ts>
+auto rocblas_invoke(F f, Pack p, Ts... xs)
 {
-    if constexpr(sizeof...(Ts) == sizeof...(Us))
+    return p([=](auto... ws) {
-        return f(xs...);
+        auto status = f(ws..., xs...);
-    else
+        if(status != rocblas_status_success and status != rocblas_status_invalid_value)
-        return f(xs..., nullptr, nullptr);
+        {
+            if(status == rocblas_status_perf_degraded)
+            {
+                std::cerr << "WARNING: degraded perf. in rocBLAS call" << std::endl;
+            }
+            else
+                MIGRAPHX_THROW("rocblas_invoke: rocBLAS call failed with status " +
+                               std::to_string(status));
+        }
+        return status;
+    });
 }
-static bool is_transposed(const shape& s)
+static bool is_transposed(const shape& s) { return s.transposed() and s.strides().back() != 1; }
-{
-    if(not s.transposed())
-        return false;
-    return s.strides().back() != 1;
-}
-static rocblas_int get_batch_stride(const argument& a)
+static rocblas_int get_batch_stride(const shape& s)
 {
-    return a.get_shape().strides()[a.get_shape().strides().size() - 3];
+    // This value is not needed for non-strided inputs
+    if(s.strides().size() < 3)
+        return 0;
+    else
+        return s.strides()[s.strides().size() - 3];
 }
-template <class T>
+/**
-void gemm_impl(context& ctx,
+ * Wrapper for multiple rocBLAS calls.  The constructor creates parameters for
-               const shape& output_shape,
+ * these calls based on data shapes and other values contained in the associated
-               const std::vector<argument>& args,
+ * instruction and operation.
-               T alpha,
+ *
-               T beta,
+ * The template parameter T is not the type of the matrix data but of the weighting
-               bool int8_x4_format,
+ * coefficients alpha and beta (these are float in rocBLAS internals)
-               bool compute_fp32)
+ */
+template <typename T>
+struct gemm_impl
 {
-    const bool is_3inputs = (args.size() == 4);
+    gemm_impl(const shape& output_shape,
-    if(not is_3inputs)
+              const std::vector<shape>& input_shapes,
+              T alpha_param,
+              T beta_param,
+              bool compute_fp32_flag)
+        : alpha(alpha_param),
+          beta(beta_param),
+          is_3inputs(input_shapes.size() == 4),
+          compute_fp32(compute_fp32_flag)
    {
-        beta = 0;
+        if(not is_3inputs)
-    }
+        {
+            beta = 0;
-    bool transa     = is_transposed(args[0].get_shape());
+        }
-    bool transb     = is_transposed(args[1].get_shape());
-    auto n_dim      = output_shape.lens().size();
-    auto dim_1      = n_dim - 1;
-    auto dim_0      = n_dim - 2;
-    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
-    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
-    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
-    rocblas_int ldd = is_3inputs ? args[3].get_shape().strides()[dim_0] : ldc;
-    rocblas_datatype arg_type = get_type(args[0].get_shape().type());
-    auto output_type          = arg_type;
-    if(output_type == rocblas_datatype_i8_r)
-    {
-        output_type = rocblas_datatype_i32_r;
-    }
-    auto compute_type = output_type;
-    if(compute_fp32)
-    {
-        if(arg_type == rocblas_datatype_f16_r)
-            compute_type = rocblas_datatype_f32_r;
-    }
-    rocblas_gemm_flags flag = rocblas_gemm_flags_none;
-#if ROCBLAS_VERSION_MAJOR < 3
-    if(int8_x4_format)
-        flag = rocblas_gemm_flags_pack_int8x4;
-#endif
-    auto a_lens = args[0].get_shape().lens();
+        // Create lambdas that will cast alpha, beta to the output shape's type
-    auto b_lens = args[1].get_shape().lens();
+        // and retain the values being pointed to
-    output_shape.visit_type([&](auto as) {
+        output_shape.visit_type([&](auto as) {
-        auto alpha_r = as(alpha);
+            auto alpha_r = as(alpha);
-        auto beta_r  = as(beta);
+            auto beta_r  = as(beta);
+            if(compute_fp32)
+            {
+                get_alpha = [=] { return &alpha; };
+                get_beta  = [=] { return &beta; };
+            }
+            else
+            {
+                get_alpha = [=] { return &alpha_r; };
+                get_beta  = [=] { return &beta_r; };
+            }
+        });
-        // use void pointer to select different data type if using fp32 mode
+        transa     = is_transposed(input_shapes[0]);
-        void* alpha_v = &alpha_r;
+        transb     = is_transposed(input_shapes[1]);
-        void* beta_v  = &beta_r;
+        auto n_dim = output_shape.lens().size();
+        auto dim_0 = n_dim - 2;
+        auto dim_1 = n_dim - 1;
+        // Leading dimensions of matrices
+        lda = input_shapes[0].strides()[transa ? dim_1 : dim_0];
+        ldb = input_shapes[1].strides()[transb ? dim_1 : dim_0];
+        ldc = input_shapes[2].strides()[dim_0];
+        ldd = is_3inputs ? input_shapes[3].strides()[dim_0] : ldc;
-        if(compute_fp32)
+        arg_type    = get_type(input_shapes[0].type());
+        output_type = arg_type;
+        if(output_type == rocblas_datatype_i8_r)
        {
-            alpha_v = &alpha;
+            output_type = rocblas_datatype_i32_r;
-            beta_v  = &beta;
        }
+        compute_type = output_type;
-        auto out_lens   = output_shape.lens();
+        if(compute_fp32)
-        rocblas_int m   = out_lens[dim_0];
-        rocblas_int n   = out_lens[dim_1];
-        rocblas_int k   = args[0].get_shape().lens()[dim_1];
-        auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
-        if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
        {
-            MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
+            if(arg_type == rocblas_datatype_f16_r)
+                compute_type = rocblas_datatype_f32_r;
        }
-        auto num_matrices = std::accumulate(
+        auto a_lens = input_shapes[0].lens();
+        auto b_lens = input_shapes[1].lens();
+        auto out_lens = output_shape.lens();
+        m             = out_lens[dim_0];
+        n             = out_lens[dim_1];
+        k             = input_shapes[0].lens()[dim_1];
+        a_stride     = get_batch_stride(input_shapes[0]);
+        b_stride     = get_batch_stride(input_shapes[1]);
+        c_stride     = get_batch_stride(input_shapes[2]);
+        d_stride     = is_3inputs ? get_batch_stride(input_shapes[3]) : c_stride;
+        num_matrices = std::accumulate(
            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
-        if(num_matrices == 1 or (num_matrices > 1 and get_batch_stride(args[1]) == 0))
+        strided_batched = num_matrices > 1;
+        if(strided_batched and b_stride == 0 and input_shapes[0].standard())
        {
            // If the batch dimension of B is broadcasted, then we can
            // multiply m by the batch_size and use rocblas_gemm_ex
            // instead of rocblas_gemm_strided_batched_ex.
            m *= num_matrices;
+            strided_batched = false;
+        }
+    }
-            // the rocblas_gemm API handles inputs and output matrices as
+    void run(context& ctx, const std::vector<argument>& input_args, int32_t solution_idx = 0) const
-            // column-major format. When doing a C = A * B, we actually do
+    {
-            // C^T = (B^T) * (A^T). That is the reason we input args[1] as
+        if(strided_batched)
-            // A and args[0] as B in calling the rocblas_gemm.
+        {
+            auto common_args = create_strided_batched_args_common(ctx, input_args);
+            rocblas_invoke(&rocblas_gemm_strided_batched_ex,
+                           common_args,
+                           rocblas_gemm_algo_solution_index,
+                           solution_idx,
+                           gemm_flags);
+        }
+        else
+        {
+            auto common_args = create_gemm_ex_args_common(ctx, input_args);
            rocblas_invoke(&rocblas_gemm_ex,
-                           ctx.get_stream().get_rocblas(),
+                           common_args,
-                           transb ? rocblas_operation_transpose : rocblas_operation_none,
+                           rocblas_gemm_algo_solution_index,
-                           transa ? rocblas_operation_transpose : rocblas_operation_none,
+                           solution_idx,
-                           n,
+                           gemm_flags);
-                           m,
+        }
-                           k,
+    }
-                           alpha_v,
-                           to_pointer(args.at(1)),
+#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
-                           arg_type,
+    auto validate(context& ctx, const std::vector<shape>& input_shapes, int32_t solution_idx) const
-                           ldb,
+    {
-                           to_pointer(args.at(0)),
+        // Create dummy arguments for the shapes, and call the overloaded method
-                           arg_type,
+        std::vector<argument> input_args;
-                           lda,
+        std::transform(input_shapes.begin(),
-                           beta_v,
+                       input_shapes.end(),
-                           to_pointer(args[2]),
+                       std::back_inserter(input_args),
-                           output_type,
+                       [](const shape& x) { return to_gpu(generate_argument(x)); });
-                           ldc,
-                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
+        return validate(ctx, input_args, solution_idx);
-                           output_type,
+    }
-                           ldd,
-                           compute_type,
+    /**
-                           rocblas_gemm_algo_standard,
+     * Checks a particular solution for validity by running it with the flag
-                           0,
+     * rocblas_gemm_flags_check_solution_index (could be invalid if this model was
-                           flag);
+     * tuned with a different rocBLAS version)
+     *
+     * @return Returns either solution_idx if valid, or else the default value 0
+     * if not.  The default does not mean list index 0, but tells the picker
+     * to choose a solution.
+     */
+    int32_t
+    validate(context& ctx, const std::vector<argument>& input_args, int32_t solution_idx) const
+    {
+        rocblas_status_ check_valid(rocblas_status_success);
+        if(strided_batched)
+        {
+            auto common_args = create_strided_batched_args_common(ctx, input_args);
+            check_valid      = rocblas_invoke(&rocblas_gemm_strided_batched_ex,
+                                         common_args,
+                                         rocblas_gemm_algo_solution_index,
+                                         solution_idx,
+                                         rocblas_gemm_flags_check_solution_index);
        }
        else
        {
-            auto a_stride = get_batch_stride(args[0]);
+            auto common_args = create_gemm_ex_args_common(ctx, input_args);
-            auto b_stride = get_batch_stride(args[1]);
+            check_valid      = rocblas_invoke(&rocblas_gemm_ex,
-            auto c_stride = get_batch_stride(args[2]);
+                                         common_args,
-            auto d_stride = is_3inputs ? get_batch_stride(args[3]) : c_stride;
+                                         rocblas_gemm_algo_solution_index,
-            rocblas_invoke(&rocblas_gemm_strided_batched_ex,
+                                         solution_idx,
-                           ctx.get_stream().get_rocblas(),
+                                         rocblas_gemm_flags_check_solution_index);
-                           transb ? rocblas_operation_transpose : rocblas_operation_none,
-                           transa ? rocblas_operation_transpose : rocblas_operation_none,
-                           n,
-                           m,
-                           k,
-                           alpha_v,
-                           to_pointer(args.at(1)),
-                           arg_type,
-                           ldb,
-                           b_stride,
-                           to_pointer(args.at(0)),
-                           arg_type,
-                           lda,
-                           a_stride,
-                           beta_v,
-                           to_pointer(args[2]),
-                           output_type,
-                           ldc,
-                           c_stride,
-                           is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
-                           output_type,
-                           ldd,
-                           d_stride,
-                           num_matrices,
-                           compute_type,
-                           rocblas_gemm_algo_standard,
-                           0,
-                           flag);
        }
-    });
+        if(check_valid == rocblas_status_invalid_value)
+        {
+            std::cerr << "WARNING:  tuned solution is invalid; reverting to default" << std::endl;
+            return 0;
+        }
+        return solution_idx;
+    }
+#endif
+    /**
+     * Helper method to create that subset of a long rocBLAS argument list that is common
+     * to multiple "...strided_batched..." calls.
+     *
+     * The rocblas_gemm API handles inputs and output matrices as
+     *  column-major format. When doing a C = A * B, we actually do
+     *  C^T = (B^T) * (A^T). That is the reason we input args[1] as
+     *   A and args[0] as B in calling the rocblas_gemm.
+     *
+     */
+    auto create_strided_batched_args_common(context& ctx, const std::vector<argument>& args) const
+    {
+        return pack(ctx.get_stream().get_rocblas(),
+                    transb ? rocblas_operation_transpose : rocblas_operation_none,
+                    transa ? rocblas_operation_transpose : rocblas_operation_none,
+                    n,
+                    m,
+                    k,
+                    get_alpha(),
+                    args[1].data(),
+                    arg_type,
+                    ldb,
+                    b_stride,
+                    args[0].data(),
+                    arg_type,
+                    lda,
+                    a_stride,
+                    get_beta(),
+                    args[2].data(),
+                    output_type,
+                    ldc,
+                    c_stride,
+                    is_3inputs ? args[3].data() : args[2].data(),
+                    output_type,
+                    ldd,
+                    d_stride,
+                    num_matrices,
+                    compute_type);
+    }
+    /**
+     * Helper method to create that subset of a long rocBLAS argument list that is common
+     * to multiple "gemm_ex..." calls.
+     *
+     * The rocblas_gemm API handles inputs and output matrices as
+     *  column-major format. When doing a C = A * B, we actually do
+     *   C^T = (B^T) * (A^T). That is the reason we input args[1] as
+     *   A and args[0] as B in calling the rocblas_gemm.
+     *
+     * */
+    auto create_gemm_ex_args_common(context& ctx, const std::vector<argument>& args) const
+    {
+        return pack(ctx.get_stream().get_rocblas(),
+                    transb ? rocblas_operation_transpose : rocblas_operation_none,
+                    transa ? rocblas_operation_transpose : rocblas_operation_none,
+                    n,
+                    m,
+                    k,
+                    get_alpha(),
+                    args[1].data(),
+                    arg_type,
+                    ldb,
+                    args[0].data(),
+                    arg_type,
+                    lda,
+                    get_beta(),
+                    args[2].data(),
+                    output_type,
+                    ldc,
+                    is_3inputs ? args[3].data() : args[2].data(),
+                    output_type,
+                    ldd,
+                    compute_type);
+    }
+#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
+    /**
+     * Find best rocBLAS solution:  Get list of solutions and try them all, returning the index
+     * of the fastest one.
+     */
+    int tune(context& ctx, const std::vector<shape>& input_shapes) const
+    {
+        // tuning meta parameters
+        const int hot_calls = 40;
+        std::vector<argument> input_args;
+        std::transform(input_shapes.begin(),
+                       input_shapes.end(),
+                       std::back_inserter(input_args),
+                       [](const shape& x) { return to_gpu(generate_argument(x)); });
+        // Get the solutions list in 2 rocBLAS steps:
+        // 1.  Find out how many solutions there are and allocate the array
+        // 2.  Get the solutions
+        //
+        rocblas_int list_size = 0;
+        std::vector<rocblas_int> solution_indices;
+        if(strided_batched)
+        {
+            auto common_args = create_strided_batched_args_common(ctx, input_args);
+            rocblas_invoke(&rocblas_gemm_strided_batched_ex_get_solutions,
+                           common_args,
+                           rocblas_gemm_algo_solution_index,
+                           gemm_flags,
+                           nullptr,
+                           &list_size);
+            solution_indices.resize(list_size);
+            auto common_sol_args = create_strided_batched_args_common(ctx, input_args);
+            rocblas_invoke(&rocblas_gemm_strided_batched_ex_get_solutions,
+                           common_sol_args,
+                           rocblas_gemm_algo_solution_index,
+                           gemm_flags,
+                           solution_indices.data(),
+                           &list_size);
+        }
+        else
+        {
+            auto common_args = create_gemm_ex_args_common(ctx, input_args);
+            rocblas_invoke(&rocblas_gemm_ex_get_solutions,
+                           common_args,
+                           rocblas_gemm_algo_solution_index,
+                           gemm_flags,
+                           nullptr,
+                           &list_size);
+            solution_indices.resize(list_size);
+            auto common_sol_args = create_gemm_ex_args_common(ctx, input_args);
+            rocblas_invoke(&rocblas_gemm_ex_get_solutions,
+                           common_sol_args,
+                           rocblas_gemm_algo_solution_index,
+                           gemm_flags,
+                           solution_indices.data(),
+                           &list_size);
+        }
+        double best_time  = std::numeric_limits<double>::max();
+        double first_time = -1;
+        // Initialize to default solution index
+        rocblas_int best_sol = 0;
+        for(auto sol : solution_indices)
+        {
+            // Warmup: the first call to an op. may not be representative since there is
+            // more time taken initializing caches, etc. so we won't time it.
+            run(ctx, input_args, sol);
+            double host_time = time<milliseconds>([&] {
+                for([[maybe_unused]] int hc : range(hot_calls))
+                    run(ctx, input_args, sol);
+                ctx.finish();
+            });
+            host_time /= hot_calls;
+            // dev/evaluation only: track time for first solution.
+            if(first_time < 0)
+                first_time = host_time;
+            // track current best
+            if(host_time < best_time)
+            {
+                best_sol  = sol;
+                best_time = host_time;
+            }
+        }
+        std::cout << "Winning GEMM solution: " << best_sol << " in " << best_time << " ms, beats "
+                  << first_time << "ms" << std::endl;
+        return best_sol;
+    }
+#endif
+    private:
+    size_t num_matrices = 0;
+    rocblas_int m       = 0;
+    rocblas_int n       = 0;
+    rocblas_int k       = 0;
+    bool transa         = false;
+    bool transb         = false;
+    T alpha             = 0;
+    T beta              = 0;
+    std::function<const void*()> get_alpha{};
+    std::function<const void*()> get_beta{};
+    rocblas_gemm_flags gemm_flags = rocblas_gemm_flags_none;
+    rocblas_int lda               = 0;
+    rocblas_int ldb               = 0;
+    rocblas_int ldc               = 0;
+    rocblas_int ldd               = 0;
+    rocblas_int a_stride          = 0;
+    rocblas_int b_stride          = 0;
+    rocblas_int c_stride          = 0;
+    rocblas_int d_stride          = 0;
+    rocblas_datatype compute_type = rocblas_datatype_f32_r;
+    rocblas_datatype arg_type     = rocblas_datatype_f32_r;
+    rocblas_datatype output_type  = rocblas_datatype_f32_r;
+    bool strided_batched          = true;
+    bool is_3inputs               = true;
+    bool compute_fp32             = true;
+}; // gemm_impl
+void gemm_compute(context& ctx,
+                  const shape& output_shape,
+                  const std::vector<argument>& args,
+                  float alpha,
+                  float beta,
+                  bool compute_fp32,
+                  int32_t solution_idx)
+{
+    std::vector<shape> input_shapes;
+    std::transform(args.begin(),
+                   args.end(),
+                   std::back_inserter(input_shapes),
+                   [](const argument& x) { return x.get_shape(); });
+    auto gemm_item = gemm_impl<float>(output_shape, input_shapes, alpha, beta, compute_fp32);
+    gemm_item.run(ctx, args, solution_idx);
+}
+void gemm_compute(context& ctx,
+                  const shape& output_shape,
+                  const std::vector<argument>& args,
+                  int32_t alpha,
+                  int32_t beta,
+                  bool compute_fp32,
+                  int32_t solution_idx)
+{
+    std::vector<shape> input_shapes;
+    std::transform(args.begin(),
+                   args.end(),
+                   std::back_inserter(input_shapes),
+                   [](const argument& x) { return x.get_shape(); });
+    auto gemm_item = gemm_impl<int32_t>(output_shape, input_shapes, alpha, beta, compute_fp32);
+    gemm_item.run(ctx, args, solution_idx);
 }
-void gemm(context& ctx,
+/**
-          const shape& output_shape,
+ * Decides if the tune() or validate() method is appropriate and calls it.
-          const std::vector<argument>& args,
+ * Return value is the chosen solution index, or 0 to let picker choose it.
-          float alpha,
+ */
-          float beta,
+int32_t gemm_finalize(context& ctx,
-          bool int8_x4_format,
+                      const shape& output_shape,
-          bool compute_fp32)
+                      const std::vector<shape>& input_shapes,
+                      float alpha,
+                      float beta,
+                      bool compute_fp32,
+                      int32_t solution_idx)
 {
-    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
+#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
+    // This code should be called only if either the environment var.
+    // MIGRAPHX_ENABLE_GEMM_TUNING, or option --exhaustive-tune, is set
+    if(solution_idx == 0)
+    {
+        auto gemm_item = gemm_impl<float>(output_shape, input_shapes, alpha, beta, compute_fp32);
+        solution_idx   = gemm_item.tune(ctx, input_shapes);
+    }
+    else
+    {
+        // If a tuned solution index is already given, don't tune again but validate
+        // in case the data was tuned with a different rocBLAS version
+        auto gemm_item = gemm_impl<float>(output_shape, input_shapes, alpha, beta, compute_fp32);
+        solution_idx   = gemm_item.validate(ctx, input_shapes, solution_idx);
+    }
+#else
+    (void)ctx, (void)output_shape, (void)input_shapes;
+    (void)alpha, (void)beta, (void)compute_fp32;
+#endif
+    return solution_idx;
 }
-void gemm(context& ctx,
+/**
-          const shape& output_shape,
+ * Decides if the tune() or validate() method is appropriate and calls it.
-          const std::vector<argument>& args,
+ * Return value is the chosen solution index, or 0 to let picker choose it.
-          int32_t alpha,
+ */
-          int32_t beta,
+int32_t gemm_finalize(context& ctx,
-          bool int8_x4_format,
+                      const shape& output_shape,
-          bool compute_fp32)
+                      const std::vector<shape>& input_shapes,
+                      int32_t alpha,
+                      int32_t beta,
+                      bool compute_fp32,
+                      int32_t solution_idx)
 {
-    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
+#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
+    if(solution_idx == 0)
+    {
+        auto gemm_item = gemm_impl<int32_t>(output_shape, input_shapes, alpha, beta, compute_fp32);
+        solution_idx   = gemm_item.tune(ctx, input_shapes);
+    }
+    else
+    {
+        // If a tuned solution index is already given, don't tune again but validate
+        // in case the data was tuned with a different rocBLAS version
+        auto gemm_item = gemm_impl<int32_t>(output_shape, input_shapes, alpha, beta, compute_fp32);
+        solution_idx   = gemm_item.validate(ctx, input_shapes, solution_idx);
+    }
+#else
+    (void)ctx, (void)output_shape, (void)input_shapes;
+    (void)alpha, (void)beta, (void)compute_fp32;
+#endif
+    return solution_idx;
 }
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/compile_miopen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_miopen.hpp
@@ -42,7 +42,7 @@ struct compile_miopen
    context* ctx = nullptr;
    std::string name() const { return "gpu::compile_miopen"; }
    void apply(module& m) const;
-    std::size_t compile(operation& op, instruction_ref ins, bool format) const;
+    std::size_t compile(operation& op, instruction_ref ins) const;
 };
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -57,7 +57,6 @@ template <class Op>
 struct miopen_convolution
 {
    Op op;
-    bool int8_x4_format               = false;
    shared<convolution_descriptor> cd = nullptr;
    miopenConvFwdAlgorithm_t algo{};
 #ifdef MIGRAPHX_HAS_FIND_2_API
@@ -74,7 +73,6 @@ struct miopen_convolution
                    f(self.solution_object, "solution_object"),
 #endif
                    f(self.algo, "algo"),
-                    f(self.int8_x4_format, "int8_x4_format"),
                    f(self.solution_id, "solution_id"));
    }
@@ -94,9 +92,9 @@ struct miopen_convolution
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
-        auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
+        auto x_desc                = make_tensor(reshape_if_1d(args[0].get_shape()));
-        auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
+        auto w_desc                = make_tensor(reshape_if_1d(args[1].get_shape()));
-        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
        auto workspace_size        = args[2].get_shape().bytes();
@@ -162,8 +160,8 @@ struct miopen_convolution
    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        shape workspace_shape{};
-        auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+        auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
-        auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
        auto y_desc = make_tensor(reshape_if_1d(output_shape));
        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
@@ -179,13 +177,8 @@ struct miopen_convolution
        workspace_shape = shape{shape::int8_type, {workspace_size}};
-        auto x_shape = inputs[0];
+        const auto& x_shape = inputs[0];
-        auto w_shape = inputs[1];
+        const auto& w_shape = inputs[1];
-        if(int8_x4_format)
-        {
-            x_shape = pack_int8_shape(x_shape);
-            w_shape = pack_int8_shape(w_shape);
-        }
 #ifdef MIGRAPHX_HAS_FIND_2_API
        {
@@ -327,8 +320,8 @@ struct miopen_convolution
                                   ": workspace has changed during finalization.");
            }
-            auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+            auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
-            auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+            auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
            auto y_desc = make_tensor(reshape_if_1d(output_shape));
            auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
@@ -347,21 +340,6 @@ struct miopen_convolution
    {
        return shapes.size() - 1;
    }
-    inline shape pack_int8_shape(const shape& s) const
-    {
-        if(s.type() != shape::int8_type)
-        {
-            return s;
-        }
-        auto lens    = s.lens();
-        auto strides = s.strides();
-        lens[1]      = (lens[1] + 3) / 4 * 4;
-        strides[0]   = strides[1] * lens[1];
-        return {s.type(), lens, strides};
-    }
 };
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
-#define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
-#include <migraphx/argument.hpp>
-#include <migraphx/gpu/device/config.hpp>
-#include <hip/hip_runtime_api.h>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_a(hipStream_t stream,
-                                             const argument& result,
-                                             const argument& arg);
-void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_b(hipStream_t stream,
-                                             const argument& result,
-                                             const argument& arg);
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -40,9 +40,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 struct context;
-void blas_shape(const shape& s);
 shape transpose_batch(const shape& s, unsigned trans_batch);
+void blas_shape(const shape& s);
 template <class Op>
 struct rocblas_gemm
@@ -50,9 +49,9 @@ struct rocblas_gemm
    Op op;
    float alpha          = 1;
    float beta           = 0;
-    bool int8_x4_format  = true;
    bool compute_fp32    = false;
    unsigned trans_batch = 0;
+    int32_t solution_idx = 0;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -60,9 +59,9 @@ struct rocblas_gemm
        return pack_join(migraphx::reflect(self.op, f),
                         pack(f(self.alpha, "alpha"),
                              f(self.beta, "beta"),
-                              f(self.int8_x4_format, "int8_x4_format"),
                              f(self.compute_fp32, "compute_fp32"),
-                              f(self.trans_batch, "trans_batch")));
+                              f(self.trans_batch, "trans_batch"),
+                              f(self.solution_idx, "solution_idx")));
    }
    std::string name() const
@@ -78,6 +77,8 @@ struct rocblas_gemm
    {
        std::vector<shape> in_shapes(inputs);
        in_shapes.pop_back();
+        // When input shapes are A, B, C the GEMM equation is C = α(AB) + βC, where α and β are
+        // scalars
        check_shapes{in_shapes, *this}.has(2, 3);
        blas_shape(inputs[0]);
        blas_shape(inputs[1]);
@@ -113,17 +114,12 @@ struct rocblas_gemm
    {
        if(this->name() == "gpu::gemm")
        {
-            gemm(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
+            gemm_compute(ctx, output_shape, args, alpha, beta, compute_fp32, solution_idx);
        }
        else
        {
-            gemm(ctx,
+            gemm_compute(
-                 output_shape,
+                ctx, output_shape, args, int32_t(alpha), int32_t(beta), compute_fp32, solution_idx);
-                 args,
-                 int32_t(alpha),
-                 int32_t(beta),
-                 int8_x4_format,
-                 compute_fp32);
        }
        return args.back();
    }
@@ -132,6 +128,33 @@ struct rocblas_gemm
    {
        return shapes.size() - 1;
    }
+    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& input_shapes)
+    {
+#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
+        if(enabled(MIGRAPHX_ENABLE_GEMM_TUNING{}) or ctx.get_exhaustive_tune_flag())
+        {
+            if(this->name() == "gpu::gemm")
+            {
+                solution_idx = gemm_finalize(
+                    ctx, output_shape, input_shapes, alpha, beta, compute_fp32, solution_idx);
+            }
+            else
+            {
+                solution_idx = gemm_finalize(ctx,
+                                             output_shape,
+                                             input_shapes,
+                                             int32_t(alpha),
+                                             int32_t(beta),
+                                             compute_fp32,
+                                             solution_idx);
+            }
+        }
+#else
+        // tuning API not compiled in: nothing to do, only silence unused-parameter warnings
+        (void)ctx, (void)output_shape, (void)input_shapes;
+#endif
+    }
 };
 } // namespace gpu