Commit 6b850e98 authored by Manupa Karunaratne

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into mlir-attention

parents a34429e9 22bb777f
......@@ -2,6 +2,49 @@
Full documentation for MIGraphX is available at [MIGraphX Documentation](https://rocmdocs.amd.com/projects/AMDMIGraphX/en/latest/).
## MIGraphX 2.8 for ROCm 6.0.0
### Added
- Support for MI300 GPUs
- Support for TorchMIGraphX via PyTorch
- Boosted overall performance by integrating rocMLIR
- INT8 support for ONNX Runtime
- Support for ONNX version 1.14.1
- Added the QLinearAdd, QLinearGlobalAveragePool, QLinearConv, Shrink, CastLike, and RandomUniform operators
- Added an error message for when gpu_targets is not set while compiling MIGraphX
- Added a parameter for setting tolerances with migraphx-driver verify
- Added support for MXR files larger than 4 GB
- Added the MIGRAPHX_TRACE_MLIR flag
- BETA: added the capability to use the ROCm Composable Kernel library via the MIGRAPHX_ENABLE_CK=1 environment variable
### Optimizations
- Improved INT8 performance support
- Improved timing precision when benchmarking candidate kernels from CK or MLIR
- Removed contiguous from reshape parsing
- Updated the ConstantOfShape operator to support dynamic batch sizes
- Simplified dynamic-shape-related operators to their static versions where possible
- Improved debugging tools for accuracy issues
- Print a warning about miopen_fusion when generating MXR files
- General reduction in system memory usage during model compilation
- Created additional fusion opportunities during model compilation
- Improved debugging for matchers
- Improved general debug messages
### Fixed
- Fixed the scatter operator for nonstandard shapes in some ONNX Model Zoo models
- Provided a compile option to improve the accuracy of some models by disabling Fast-Math
- Improved layernorm + pointwise fusion matching to ignore argument order
- Fixed accuracy issue with ROIAlign operator
- Fixed Trilu operator computation logic
- Fixed support for the DETR model
### Changed
- Changed the MIGraphX version to 2.8
- Extracted the test packages into a separate .deb file when building MIGraphX from source
### Removed
- Removed building of the Python 2.7 bindings
## MIGraphX 2.7 for ROCm 5.7.0
### Added
- Enabled hipRTC so that dev packages are not required for the MIGraphX runtime, and allowed the ROCm install directory to differ from the one used at build time
......
......@@ -57,6 +57,12 @@ else()
option(MIGRAPHX_ENABLE_PYTHON "Enable python bindings" ON)
endif()
if(WIN32) # CK is not yet ported to Windows
option(MIGRAPHX_USE_COMPOSABLEKERNEL "Enable MIGraphX to use composable kernel JIT library" OFF)
else()
option(MIGRAPHX_USE_COMPOSABLEKERNEL "Enable MIGraphX to use composable kernel JIT library" ON)
endif()
find_path(HALF_INCLUDE_DIR half.hpp PATH_SUFFIXES half)
if (NOT HALF_INCLUDE_DIR)
message(FATAL_ERROR "Could not find half.hpp - Please check that the install path of half.hpp has been added to CMAKE_PREFIX_PATH")
......@@ -75,8 +81,9 @@ include(ROCMSetupVersion)
option(BUILD_DEV "Build for development purpose only" OFF)
rocm_setup_version(VERSION 2.8.0)
set(MIGRAPHX_SO_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR})
rocm_setup_version(VERSION 2.9.0)
math(EXPR MIGRAPHX_SO_MAJOR_VERSION "(${PROJECT_VERSION_MAJOR} * 1000 * 1000) + (${PROJECT_VERSION_MINOR} * 1000) + ${PROJECT_VERSION_PATCH}")
set(MIGRAPHX_SO_VERSION ${MIGRAPHX_SO_MAJOR_VERSION}.0)
option( BUILD_SHARED_LIBS "Build as a shared library" ON )
......
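The math(EXPR ...) change above replaces the MAJOR.MINOR SOVERSION with a single encoded integer. A minimal sketch of that arithmetic, written in C++ purely for illustration (the encode_so_major helper is hypothetical and not part of the repository; the real computation is the CMake line above):

```cpp
// Illustrates the MIGRAPHX_SO_MAJOR_VERSION encoding computed by the
// math(EXPR ...) call above, using the 2.9.0 version set by rocm_setup_version.
#include <cstdio>

constexpr long encode_so_major(long major, long minor, long patch)
{
    return major * 1000 * 1000 + minor * 1000 + patch;
}

int main()
{
    // 2.9.0 encodes to 2009000, so MIGRAPHX_SO_VERSION becomes 2009000.0
    std::printf("MIGRAPHX_SO_VERSION = %ld.0\n", encode_so_major(2, 9, 0));
    return 0;
}
```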
......@@ -30,7 +30,7 @@ def rocmtestnode(Map conf) {
rm -rf build
mkdir build
cd build
cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DBUILD_DEV=On -DCMAKE_EXECUTE_PROCESS_COMMAND_ECHO=STDOUT ${flags} ..
cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DBUILD_DEV=On -DCMAKE_EXECUTE_PROCESS_COMMAND_ECHO=STDOUT -DMIGRAPHX_DISABLE_VIRTUAL_ENV=ON ${flags} ..
git diff
git diff-index --quiet HEAD || (echo "Git repo is not clean after running cmake." && exit 1)
make -j\$(nproc) generate VERBOSE=1
......
......@@ -32,7 +32,7 @@
#define MIGRAPHX_MIOPEN_ASSERT(x) (assert((x) == miopenStatusSuccess))
#define MIGRAPHX_HIP_ASSERT(x) (assert((x) == hipSuccess))
inline miopenTensorDescriptor_t make_miopen_tensor(const migraphx::shape& s, bool pack = false)
inline miopenTensorDescriptor_t make_miopen_tensor(const migraphx::shape& s)
{
miopenTensorDescriptor_t t;
MIGRAPHX_MIOPEN_ASSERT(miopenCreateTensorDescriptor(&t));
......@@ -49,23 +49,9 @@ inline miopenTensorDescriptor_t make_miopen_tensor(const migraphx::shape& s, boo
else if(s.type() == migraphx_shape_int32_type)
d = miopenInt32;
else if(s.type() == migraphx_shape_int8_type)
{
if(pack)
{
// update the lens and corresponding strides
d = miopenInt8x4;
lens[1] = ((lens[1] + 3) / 4) * 4;
strides[0] = strides[1] * lens[1];
}
else
{
d = miopenInt8;
}
}
d = miopenInt8;
else
{
throw("MAKE_TENSOR: unsupported type");
}
miopenSetTensorDescriptor(t, d, s_lens.size(), lens.data(), strides.data());
return t;
}
......
......@@ -149,9 +149,6 @@ gpu::gelu
gpu::gelu_new
gpu::gemm
gpu::greater
gpu::int8_conv_pack
gpu::int8_gemm_pack_a
gpu::int8_gemm_pack_b
gpu::layernorm
gpu::leaky_relu
gpu::less
......
......@@ -21,12 +21,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
google/protobuf@v3.11.0 -DCMAKE_POSITION_INDEPENDENT_CODE=On -X subdir -Dprotobuf_BUILD_TESTS=Off
google/protobuf@v3.19.0 -DCMAKE_POSITION_INDEPENDENT_CODE=On -X subdir -Dprotobuf_BUILD_TESTS=Off
nlohmann/json@v3.8.0
live-clones/blaze@v3.8 -X header -DHEADER_DIR=blaze -H sha256:d0ff011f47538285178908ea5f2cab46bb6a8f55b1edb6e03224a82dbc1a3212
ROCmSoftwarePlatform/half@rocm-5.6.0
pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
sqlite3@3.43.2 -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCmSoftwarePlatform/rocMLIR@507bb94ce7873786486d296ec81d2eadaab49003 -DBUILD_FAT_LIBROCKCOMPILER=On
\ No newline at end of file
......@@ -261,9 +261,8 @@ find_package(nlohmann_json 3.8.0 REQUIRED)
target_link_libraries(migraphx PRIVATE nlohmann_json::nlohmann_json)
migraphx_generate_export_header(migraphx)
find_package(PkgConfig)
pkg_check_modules(SQLITE3 REQUIRED IMPORTED_TARGET sqlite3)
target_link_libraries(migraphx PRIVATE PkgConfig::SQLITE3)
find_package(SQLite3 REQUIRED)
target_link_libraries(migraphx PRIVATE SQLite::SQLite3)
find_package(msgpackc-cxx QUIET)
if(NOT msgpackc-cxx_FOUND)
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
......@@ -41,16 +41,16 @@ namespace op {
* Dynamic allocate:
* One input: `allocate(output_dims)`
* `output_dims` are the output buffer dimensions and has a static shape.
* Either `this.s` or `this.buf_type` must be set to calculate the dynamic output shape at compute
* time. If `this.buf_type` is set, the compute_shape() of allocate at compile time will have
* dynamic_dimensions from {0, max_int} with rank = output_dims.ndim(). If `this.s` is set then the
* compute_shape() will output `this.s`; `this.s` should be a dynamic shape.
* Either `this.s` or `this.buf_type` (but not both) must be set to calculate the dynamic output
* shape at compute time. If `this.buf_type` is set, the compute_shape() of allocate at compile time
* will have dynamic_dimensions from {0, max_int} with rank = output_dims.ndim(). If `this.s` is set
* then the compute_shape() will output `this.s`; `this.s` should be a dynamic shape.
*/
struct allocate
{
shape s{};
optional<shape> s;
// for dynamic allocate to set the buffer type
shape::type_t buf_type = shape::half_type;
optional<shape::type_t> buf_type;
template <class Self, class F>
static auto reflect(Self& self, F f)
......@@ -62,26 +62,38 @@ struct allocate
shape compute_shape(const std::vector<shape>& inputs) const
{
if(s != shape())
if(s.has_value())
{
if(buf_type.has_value())
{
MIGRAPHX_THROW("ALLOCATE: shape and buf_type attributes both set");
}
if(inputs.size() == 1)
{
migraphx::check_shapes{inputs, *this, false}.only_dims(1);
}
else
{
if(s->dynamic())
{
MIGRAPHX_THROW("ALLOCATE: dynamic shape attribute and no input");
}
migraphx::check_shapes{inputs, *this, false}.has(0);
}
return s;
return s.value();
}
else
{
if(not buf_type.has_value())
{
MIGRAPHX_THROW("ALLOCATE: shape and buf_type attributes both not set");
}
migraphx::check_shapes{inputs, *this, false}.has(1).only_dims(1);
const auto& out_dims = inputs.at(0);
std::size_t max_val = std::numeric_limits<std::size_t>::max();
std::vector<shape::dynamic_dimension> dyn_dims(out_dims.lens().at(0),
shape::dynamic_dimension{0, max_val});
return {buf_type, dyn_dims};
return {buf_type.value(), dyn_dims};
}
}
argument compute(const shape& output_shape, const std::vector<argument>& args) const
......@@ -94,7 +106,11 @@ struct allocate
{
std::vector<std::size_t> output_dims(output_shape.ndim());
args.at(0).visit([&](auto a) { output_dims.assign(a.begin(), a.end()); });
return argument{shape{buf_type, output_dims}};
if(s)
{
return argument{shape{s->type(), output_dims}};
}
return argument{shape{buf_type.value(), output_dims}};
}
}
};
......
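For context on the allocate changes above, a minimal sketch (not part of this commit) of the two mutually exclusive ways the op can now be configured. The static form reuses the make_op "shape" pattern that compile_miopen uses later in this diff; the function name add_allocates and the m, ins, and output_dims arguments are illustrative, and setting "buf_type" through make_op assumes the usual value conversion for the optional attribute.

```cpp
#include <migraphx/instruction_ref.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/module.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/value.hpp>

// Sketch only: shows the two mutually exclusive attribute sets of the updated
// allocate op. The caller supplies the module, an anchor instruction, and an
// output_dims instruction whose static, rank-1 shape holds the buffer dims.
void add_allocates(migraphx::module& m,
                   migraphx::instruction_ref ins,
                   migraphx::instruction_ref output_dims)
{
    using migraphx::shape;

    // Static allocate: the "shape" attribute fully describes the buffer and no
    // input is needed (same pattern as the compile_miopen workspace allocation
    // elsewhere in this commit).
    m.insert_instruction(
        ins,
        migraphx::make_op("allocate",
                          {{"shape", migraphx::to_value(shape{shape::int8_type, {1024}})}}));

    // Dynamic allocate: only "buf_type" is set; the single output_dims input
    // supplies the buffer dimensions at compute time, so compute_shape reports
    // {0, max} dynamic dimensions with matching rank.
    m.insert_instruction(
        ins,
        migraphx::make_op("allocate", {{"buf_type", migraphx::to_value(shape::half_type)}}),
        output_dims);
}
```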
......@@ -30,6 +30,7 @@
#include <migraphx/rank.hpp>
#include <migraphx/requires.hpp>
#include <migraphx/config.hpp>
#include <migraphx/optional.hpp>
#include <vector>
namespace migraphx {
......@@ -68,6 +69,19 @@ auto stream_write_value_impl(rank<1>, std::ostream& os, const T& x) -> decltype(
os << x;
}
template <class T>
auto stream_write_value_impl(rank<1>, std::ostream& os, const optional<T>& x)
{
if(x.has_value())
{
os << *x;
}
else
{
os << "nullopt";
}
}
template <class T>
void stream_write_value_impl(rank<1>, std::ostream& os, const std::vector<T>& r)
{
......
......@@ -137,7 +137,7 @@ struct parse_slice : op_parser<parse_slice>
sd.always_insert(args.at(0));
// If axes arg is not given, the default is all of them.
if(sd.op.axes.empty() and sd.op_args.size() < 3)
if(sd.op.axes.empty() and sd.op_args.size() <= 3)
{
std::vector<int64_t> axes(args[0]->get_shape().ndim());
std::iota(axes.begin(), axes.end(), int64_t{0});
......
......@@ -936,7 +936,7 @@ void program::perf_report(std::ostream& os,
os << std::endl;
os << "Batch size: " << batch << std::endl;
os << "Rate: " << rate * batch << "inferences/sec" << std::endl;
os << "Rate: " << rate * batch << " inferences/sec" << std::endl;
os << "Total time: " << total_time << "ms" << std::endl;
os << "Total instructions time: " << total_instruction_time << "ms" << std::endl;
os << "Overhead time: " << overhead_time << "ms"
......
......@@ -37,8 +37,7 @@ if(NOT TARGET MIOpen)
message(SEND_ERROR "Cant find miopen")
endif()
if(NOT WIN32)
# TODO: re-enable when CK is ported to Windows
if(MIGRAPHX_USE_COMPOSABLEKERNEL)
find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library)
endif()
......@@ -52,10 +51,10 @@ file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
if(WIN32)
# TODO: re-enable when CK is ported to Windows
if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
list(REMOVE_ITEM KERNEL_FILES
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm.hpp
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/ck.hpp)
endif()
......@@ -103,9 +102,10 @@ rocm_clang_tidy_check(kernel_file_check)
file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
if(WIN32)
# TODO: re-enable when CK is ported to Windows
list(REMOVE_ITEM JIT_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp)
if(NOT MIGRAPHX_USE_COMPOSABLEKERNEL)
list(REMOVE_ITEM JIT_GPU_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm_softmax_gemm.cpp)
endif()
add_library(migraphx_gpu
......@@ -128,8 +128,6 @@ add_library(migraphx_gpu
gather.cpp
gemm_impl.cpp
hip.cpp
int8_conv_pack.cpp
int8_gemm_pack.cpp
kernel.cpp
lowering.cpp
logsoftmax.cpp
......@@ -140,7 +138,6 @@ add_library(migraphx_gpu
no_device.cpp
nonzero.cpp
pack_args.cpp
pack_int8_args.cpp
prefuse_ops.cpp
pad.cpp
perfdb.cpp
......@@ -184,7 +181,6 @@ register_migraphx_gpu_ops(hip_
register_migraphx_gpu_ops(miopen_
abs
contiguous
int8_conv_pack
lrn
pooling
)
......@@ -192,10 +188,6 @@ register_op(migraphx_gpu
HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
INCLUDES migraphx/gpu/context.hpp)
register_op(migraphx_gpu
HEADER migraphx/gpu/int8_gemm_pack.hpp
OPERATORS gpu::hip_int8_gemm_pack_a gpu::hip_int8_gemm_pack_b
INCLUDES migraphx/gpu/context.hpp)
register_op(migraphx_gpu
HEADER migraphx/gpu/gemm.hpp
OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
......@@ -281,8 +273,7 @@ endif()
target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
if(NOT WIN32)
# TODO: re-enable when CK is ported to Windows
if(MIGRAPHX_USE_COMPOSABLEKERNEL)
target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library)
endif()
......
......@@ -60,9 +60,8 @@ struct miopen_op
};
MIGRAPHX_REGISTER_OP(miopen_op);
std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool format) const
std::size_t compile_miopen::compile(operation& op, instruction_ref ins) const
{
op.from_value({{"int8_x4_format", format}});
auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs()));
return v.get<std::size_t>("workspace", 0);
}
......@@ -70,25 +69,15 @@ std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool for
void compile_miopen::apply(module& m) const
{
assert(ctx);
const bool int8_x4_format = get_int8_x4_format(any_cast<migraphx::gpu::context>(*ctx));
for(auto ins : iterator_for(m))
{
if(ins->name() != "gpu::miopen_op")
continue;
auto op = any_cast<miopen_op>(ins->get_operator()).op;
std::size_t ws = 0;
try
{
// for the regular convolution and convolution_backwards, this try would always succeed
ws = compile(op, ins, int8_x4_format);
}
catch(migraphx::exception&)
{
// In case no solver supports the default format, retry using the other format.
ws = compile(op, ins, not int8_x4_format);
}
auto inputs = ins->inputs();
auto alloc = m.insert_instruction(
ws = compile(op, ins);
auto inputs = ins->inputs();
auto alloc = m.insert_instruction(
ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}}));
inputs.insert(std::prev(inputs.end()), alloc);
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
auto comp_shape = arg.get_shape();
auto out_lens = comp_shape.lens();
auto dim_0 = out_lens.size() - 2;
auto dim_1 = out_lens.size() - 1;
std::size_t lda = comp_shape.strides()[dim_0];
std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
visit_all(result, arg)([&](auto output, auto input) {
std::size_t nelements = comp_shape.elements();
auto* out_ptr = device_cast(output.data());
auto* in_ptr = device_cast(input.data());
visit_tensor_size(out_lens.size(), [&](auto out_dim) {
hip_tensor_descriptor<out_dim> desc(comp_shape);
gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
const size_t nb = 4;
auto idx = desc.multi(ii);
std::size_t i_m = idx[dim_1];
std::size_t i_k = idx[dim_0];
std::size_t offset = ii / m_size * m_size;
out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
in_ptr[i_m + i_k * lda + offset];
});
});
});
}
void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
auto trans_shape = arg.get_shape();
auto out_lens = trans_shape.lens();
auto dim_0 = trans_shape.lens().size() - 2;
auto dim_1 = trans_shape.lens().size() - 1;
std::size_t ldb = trans_shape.strides()[dim_1];
auto wrap_lens = out_lens;
std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
shape comp_shape{trans_shape.type(), wrap_lens};
std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
visit_all(result, arg)([&](auto output, auto input) {
std::size_t nelements = comp_shape.elements();
auto* out_ptr = device_cast(output.data());
auto* in_ptr = device_cast(input.data());
visit_tensor_size(out_lens.size(), [&](auto out_dim) {
hip_tensor_descriptor<out_dim> desc(comp_shape);
gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
const size_t nb = 4;
auto idx = desc.multi(ii);
std::size_t i_n = idx[dim_1];
std::size_t i_k = idx[dim_0];
std::size_t offset = ii / m_size * m_size;
out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
in_ptr[i_n + i_k * ldb + offset];
});
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -365,7 +365,7 @@ struct find_mlir_standalone_op
void rewrite(module_pass_manager& mpm, instruction_ref top_ins) const
{
static size_t counter = 0;
module_ref mm = mpm.create_module("mlir_" + std::to_string(counter++));
module_ref mm = mpm.create_module("mlir_" + top_ins->name() + std::to_string(counter++));
mm->set_bypass();
auto [anchor_op, top_inputs] = fuse_input_ops_and_gemm_based_op(mm, top_ins);
mm->add_return({anchor_op});
......
......@@ -108,7 +108,6 @@ void gemm_impl(context& ctx,
const std::vector<argument>& args,
T alpha,
T beta,
bool int8_x4_format,
bool compute_fp32)
{
const bool is_3inputs = (args.size() == 4);
......@@ -141,11 +140,6 @@ void gemm_impl(context& ctx,
}
rocblas_gemm_flags flag = rocblas_gemm_flags_none;
#if ROCBLAS_VERSION_MAJOR < 3
if(int8_x4_format)
flag = rocblas_gemm_flags_pack_int8x4;
#endif
auto a_lens = args[0].get_shape().lens();
auto b_lens = args[1].get_shape().lens();
output_shape.visit_type([&](auto as) {
......@@ -167,10 +161,6 @@ void gemm_impl(context& ctx,
rocblas_int n = out_lens[dim_1];
rocblas_int k = args[0].get_shape().lens()[dim_1];
auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
{
MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
}
auto num_matrices = std::accumulate(
out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
......@@ -256,10 +246,9 @@ void gemm(context& ctx,
const std::vector<argument>& args,
float alpha,
float beta,
bool int8_x4_format,
bool compute_fp32)
{
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
gemm_impl(ctx, output_shape, args, alpha, beta, compute_fp32);
}
void gemm(context& ctx,
......@@ -267,10 +256,9 @@ void gemm(context& ctx,
const std::vector<argument>& args,
int32_t alpha,
int32_t beta,
bool int8_x4_format,
bool compute_fp32)
{
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
gemm_impl(ctx, output_shape, args, alpha, beta, compute_fp32);
}
} // namespace gpu
......
......@@ -42,7 +42,7 @@ struct compile_miopen
context* ctx = nullptr;
std::string name() const { return "gpu::compile_miopen"; }
void apply(module& m) const;
std::size_t compile(operation& op, instruction_ref ins, bool format) const;
std::size_t compile(operation& op, instruction_ref ins) const;
};
} // namespace gpu
......
......@@ -57,7 +57,6 @@ template <class Op>
struct miopen_convolution
{
Op op;
bool int8_x4_format = false;
shared<convolution_descriptor> cd = nullptr;
miopenConvFwdAlgorithm_t algo{};
#ifdef MIGRAPHX_HAS_FIND_2_API
......@@ -74,7 +73,6 @@ struct miopen_convolution
f(self.solution_object, "solution_object"),
#endif
f(self.algo, "algo"),
f(self.int8_x4_format, "int8_x4_format"),
f(self.solution_id, "solution_id"));
}
......@@ -94,9 +92,9 @@ struct miopen_convolution
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
auto workspace_size = args[2].get_shape().bytes();
......@@ -162,8 +160,8 @@ struct miopen_convolution
shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
......@@ -179,13 +177,8 @@ struct miopen_convolution
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x_shape = inputs[0];
auto w_shape = inputs[1];
if(int8_x4_format)
{
x_shape = pack_int8_shape(x_shape);
w_shape = pack_int8_shape(w_shape);
}
const auto& x_shape = inputs[0];
const auto& w_shape = inputs[1];
#ifdef MIGRAPHX_HAS_FIND_2_API
{
......@@ -327,8 +320,8 @@ struct miopen_convolution
": workspace has changed during finalization.");
}
auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
......@@ -347,21 +340,6 @@ struct miopen_convolution
{
return shapes.size() - 1;
}
inline shape pack_int8_shape(const shape& s) const
{
if(s.type() != shape::int8_type)
{
return s;
}
auto lens = s.lens();
auto strides = s.strides();
lens[1] = (lens[1] + 3) / 4 * 4;
strides[0] = strides[1] * lens[1];
return {s.type(), lens, strides};
}
};
} // namespace gpu
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_a(hipStream_t stream,
const argument& result,
const argument& arg);
void MIGRAPHX_DEVICE_EXPORT int8_gemm_pack_b(hipStream_t stream,
const argument& result,
const argument& arg);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -50,7 +50,6 @@ struct rocblas_gemm
Op op;
float alpha = 1;
float beta = 0;
bool int8_x4_format = true;
bool compute_fp32 = false;
unsigned trans_batch = 0;
......@@ -60,7 +59,6 @@ struct rocblas_gemm
return pack_join(migraphx::reflect(self.op, f),
pack(f(self.alpha, "alpha"),
f(self.beta, "beta"),
f(self.int8_x4_format, "int8_x4_format"),
f(self.compute_fp32, "compute_fp32"),
f(self.trans_batch, "trans_batch")));
}
......@@ -113,17 +111,11 @@ struct rocblas_gemm
{
if(this->name() == "gpu::gemm")
{
gemm(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
gemm(ctx, output_shape, args, alpha, beta, compute_fp32);
}
else
{
gemm(ctx,
output_shape,
args,
int32_t(alpha),
int32_t(beta),
int8_x4_format,
compute_fp32);
gemm(ctx, output_shape, args, int32_t(alpha), int32_t(beta), compute_fp32);
}
return args.back();
}
......