Add ck_gemm

bfad5a5b · Paul · f7d987ba · bfad5a5b · bfad5a5b · bfad5a5b
Commit bfad5a5b authored Oct 07, 2022 by Paul
8 changed files
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,3 +28,4 @@ half,https://github.com/pfultz2/half/archive/1.12.0.tar.gz -X header -H sha256:0
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/composable_kernel@639147432b6922bd8e4051ba751e4e63dd4eb196 -X header
--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -89,6 +89,7 @@ add_library(migraphx_gpu
    deconvolution.cpp
    device_name.cpp
    elu.cpp
+    fuse_ck.cpp
    fuse_mlir.cpp
    fuse_ops.cpp
    gather.cpp
@@ -215,6 +216,10 @@ string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMP
 string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
 # Skip library paths since hip will incorrectly treat it as a source file
 string(APPEND HIP_COMPILER_FLAGS " ")
+# Add ck includes
+find_path(CK_INCLUDE_PATH ck/ck.hpp)
+message(STATUS "CK path: ${CK_INCLUDE_PATH}")
+string(APPEND HIP_COMPILER_FLAGS " -isystem ${CK_INCLUDE_PATH}")
 foreach(_unused RANGE 2)
    string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
 endforeach()

--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
+#include <migraphx/gpu/fuse_ck.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/register_op.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct module;
+
+namespace gpu {
+
+struct ck_gemm
+{
+    operation op = make_op("dot");
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.op, "op"));
+    }
+
+    std::string name() const { return "gpu::ck_gemm"; }
+    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
+    {
+        check_shapes{inputs, *this}.standard();
+        // if(mods.size() != 1)
+        //     MIGRAPHX_THROW("should have one submodule.");
+        if(inputs.size() < 2)
+            MIGRAPHX_THROW("should have at least two inputs.");
+        auto n = inputs.size();
+        auto a = inputs[n - 2];
+        auto b = inputs[n - 1];
+        return op.compute_shape({a, b});
+    }
+};
+MIGRAPHX_REGISTER_OP(ck_gemm);
+
+namespace {
+
+MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
+{
+    if(ins->name() != "dot")
+        return false;
+    auto a = ins->inputs().front()->get_shape();
+    auto b = ins->inputs().back()->get_shape();
+    return (a.lens()[0] % 8 == 0 and a.lens()[1] % 8 == 0 and
+            b.lens()[0] % 8 == 0 and b.lens()[1] % 8 == 0);
+}
+
+
+struct find_ck_gemm
+{
+    // Find a convolution followed by a pointwise operation.
+    auto matcher() const
+    {
+        return match::name("dot")(is_ck_gemm().bind("gemm"));
+    }
+
+    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        mpm.get_module().replace_instruction(ins, ck_gemm{ins->get_operator()}, ins->inputs());
+    }
+};
+
+} // namespace
+
+
+void fuse_ck::apply(module_pass_manager& mpm) const
+{
+    match::find_matches(mpm, find_ck_gemm{});
+}
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
--- a/src/targets/gpu/include/migraphx/gpu/fuse_ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/fuse_ck.hpp
+#ifndef MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
+#define MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
+
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/context.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct module_pass_manager;
+
+namespace gpu {
+
+struct fuse_ck
+{
+    context* ctx = nullptr;
+    std::string name() const { return "gpu::fuse_ck"; }
+    void apply(module_pass_manager& mpm) const;
+};
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_FUSE_CK_HPP
--- a/src/targets/gpu/jit/ck_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <fstream>
+#include <filesystem>
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/gpu/context.hpp>
+
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/eliminate_common_subexpression.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/env.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+namespace gpu {
+
+// NOLINTNEXTLINE
+static const char* const ck_gemm_kernel = R"__migraphx__(
+#include <args.hpp>
+#include <migraphx/kernels/ck_gemm.hpp>
+
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+
+using gemm_t = ${instance}, ${m}, ${k}, ${n}, ${sa}, ${sb}, ${sc}>;
+
+constexpr __device__ gemm_t ckdg{};
+using GridwiseGemm = decltype(ckdg.gridwisegemm);
+
+extern "C" {
+
+__global__ void ck_gemm_kernel(void* a_p, void* b_p, void* c_p)
+{
+    make_tensors()(a_p, b_p, c_p)([&](auto a_t, auto b_t, auto c_t) {
+        constexpr ck::index_t shared_block_size =
+            GridwiseGemm::GetSharedMemoryNumberOfByte();
+        __shared__ char p_shared_block[shared_block_size];
+        make_tensors()(p_shared_block)([&](auto p_t) {
+            ck_gemm<gemm_t>(a_t, b_t, c_t, p_t);
+        });
+    });
+}
+
+}
+
+} // namespace migraphx
+
+)__migraphx__";
+
+
+std::size_t int_div_ceil(std::size_t x, std::size_t y)
+{
+    return (x + y - 1) / y;
+}
+
+std::size_t get_grid_size(std::size_t m, std::size_t mpb, std::size_t n, std::size_t npb)
+{
+    return int_div_ceil(m, mpb) * int_div_ceil(n, npb);
+}
+
+struct block_settings
+{
+    int bs;
+    int mpb;
+    int npb;
+};
+
+namespace fs = std::filesystem;
+
+struct ck_gemm_compiler : compiler<ck_gemm_compiler>
+{
+    const std::vector<std::string> instances{
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8",
+        "       CKDeviceGemm< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8"};
+
+    const std::vector<block_settings> params {
+        {256, 256, 128},
+        {256, 256, 128},
+        {256, 128, 256},
+        {256, 128, 256},
+        {256, 128, 128},
+        {256, 128, 128},
+        {256, 128, 64},
+        {256, 128, 64},
+        {256, 64, 128},
+        {256, 64, 128},
+        {128, 128, 128},
+        {128, 128, 128},
+        {128, 128, 64},
+        {128, 128, 64},
+        {128, 64, 128},
+        {128, 64, 128}};
+
+    std::vector<std::string> names() const { return {"ck_gemm", "gpu::ck_gemm"}; }
+
+    operation compile_op(context& /* ctx */, const std::vector<shape>& inputs, const value& v) const
+    {
+        int i = 4;
+        if (contains(v, "tuning_val"))
+            i = v.at("tuning_val").to<int>();
+        assert(i >= 0 and i < instances.size());
+
+        hip_compile_options options;
+        auto out_s = inputs.back();
+
+        auto b_s = params[i];
+        auto block_size = b_s.bs;
+        auto m_per_block = b_s.mpb;
+        auto n_per_block = b_s.npb;
+        auto m = out_s.lens().front();
+        auto n = out_s.lens().back();
+        auto grid_size = get_grid_size(m, m_per_block, n, n_per_block);
+
+        options.set_launch_params(v, grid_size * block_size, block_size);
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "ck_gemm_kernel";
+        options.virtual_inputs = inputs;
+
+        auto k = inputs.front().lens().back();
+        auto sa = inputs.front().strides().front();
+        auto sb = inputs.at(1).strides().front();
+        auto sc = inputs.back().strides().front();
+        auto src = interpolate_string(ck_gemm_kernel, {{"instance", instances[i]},
+                                                       {"m", to_string(m)},
+                                                       {"k", to_string(k)},
+                                                       {"n", to_string(n)},
+                                                       {"sa", to_string(sa)},
+                                                       {"sb", to_string(sb)},
+                                                       {"sc", to_string(sc)}});
+        
+        return compile_hip_code_object(src, options);
+    }
+
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
+#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
+
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/ck_gemm_includes.hpp>
+
+namespace migraphx {
+
+template <class G, class T, class U, class V, class W>
+__device__ void ck_gemm(const T& a_t, const U& b_t, const V& c_t, W& p_t)
+{
+    constexpr G ckdg{};
+    using GridwiseGemm      = decltype(ckdg.gridwisegemm);
+
+    constexpr auto a_grid_desc_ak0_m_ak1 = ckdg.MakeAGridDescriptor_AK0_M_AK1();
+    constexpr auto b_grid_desc_bk0_n_bk1 = ckdg.MakeBGridDescriptor_BK0_N_BK1();
+    constexpr auto c_grid_desc_m_n = ckdg.MakeCGridDescriptor_M_N();
+    constexpr auto block_2_ctile_map = ckdg.MakeDefaultBlock2CTileMap(c_grid_desc_m_n);
+
+    // static_assert(GridwiseGemm::CheckValidity(
+    //         a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, c_grid_desc_m_n, block_2_ctile_map));
+    
+    constexpr auto c_grid_desc_mblock_mperblock_nblock_nperblock =
+        GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n);
+
+    constexpr auto K      = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2);
+    constexpr auto a_element_op = ckdg.a_element_op;
+    constexpr auto b_element_op = ckdg.b_element_op;
+    constexpr auto c_element_op = ckdg.c_element_op;
+
+    if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+    {
+        constexpr bool HasMainKBlockLoop = true;
+        GridwiseGemm::template Run<HasMainKBlockLoop>(a_t.data(),
+                                                      b_t.data(),
+                                                      c_t.data(),
+                                                      p_t.data(),
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      a_grid_desc_ak0_m_ak1,
+                                                      b_grid_desc_bk0_n_bk1,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      block_2_ctile_map);
+    }
+    else
+    {
+        constexpr bool HasMainKBlockLoop = false;
+        GridwiseGemm::template Run<HasMainKBlockLoop>(a_t.data(),
+                                                      b_t.data(),
+                                                      c_t.data(),
+                                                      p_t.data(),
+                                                      a_element_op,
+                                                      b_element_op,
+                                                      c_element_op,
+                                                      a_grid_desc_ak0_m_ak1,
+                                                      b_grid_desc_bk0_n_bk1,
+                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                      block_2_ctile_map);
+    }
+}
+
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_includes.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_includes.hpp
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -55,6 +55,7 @@
 #include <migraphx/gpu/concat_gpu_opt.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device_name.hpp>
+#include <migraphx/gpu/fuse_ck.hpp>
 #include <migraphx/gpu/fuse_mlir.hpp>
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/gpu/prefuse_ops.hpp>
@@ -134,6 +135,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        fuse_mlir{&ctx},
        dead_code_elimination{},
+        fuse_ck{&ctx},
+        dead_code_elimination{},
        lowering{&ctx, options.offload_copy},
        eliminate_contiguous{"gpu::contiguous"},
        dead_code_elimination{},