Merge remote-tracking branch 'origin/develop' into gemm_f16_int8

05fd7ff8 · Jakub Piasecki · 2784b516 · 84832fc4 · 05fd7ff8 · 05fd7ff8
Commit 05fd7ff8 authored Jan 30, 2024 by Jakub Piasecki
20 changed files
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
+* @zjing14 @asroy @junliume @illsilin @carlushuang @aosewski
 # Documentation files
 docs/* @saadrahim @LisaDelaney
 *.md  @saadrahim @LisaDelaney

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,53 +2,66 @@
 Full documentation for Composable Kernel is not yet available.
-## (Unreleased) CK for ROCm 6.0.0
+## (Unreleased) CK
 ### Fixes
- - Fixed a hazard associated with inline v_dot (#808)
+None
- - Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
+### Optimizations
+None
+### Additions
+* Introduced wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126)
+### Changes
+None
+## CK for ROCm 6.0.0
+### Fixes
+ * Fixed a hazard associated with inline v_dot (#808)
+ * Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
 ### Optimizations
 None
 ### Additions
- Added an image to a column kernel (#867)
+* Added an image to a column kernel (#867)
- Added a column to an image kernel (#930)
+* Added a column to an image kernel (#930)
- Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
+* Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
- Grouped convolution support for small K and C (#822 #879 #897)
+* Grouped convolution support for small K and C (#822 #879 #897)
- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
+* Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
+* Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
- Support for Batched Gemm DL (#732)
+* Support for Batched Gemm DL (#732)
- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108)
 ### Changes
- - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
+ * Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
 ## CK 0.2.0 for ROCm 5.7.0
 ### Fixes
- Fixed a bug in 6-dimensional kernels (#555)
+* Fixed a bug in 6-dimensional kernels (#555)
- Fixed a test case failure with grouped convolution backward weight (#524)
+* Fixed a test case failure with grouped convolution backward weight (#524)
 ### Optimizations
- Improved the performance of the normalization kernel
+* Improved the performance of the normalization kernel
 ### Additions
- New CMake flags:
+* New CMake flags:
-  - "DL_KERNELS"-- Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
+  * "DL_KERNELS" -- Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
-  - "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
+  * "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
-  - "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
+  * "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
- New feature: if GPU_TARGETS is not set in the CMake command line, CK will be built for all targets supported by the compiler
+* New feature: if GPU_TARGETS is not set in the CMake command line, CK will be built for all targets supported by the compiler
- Support for MI300A/MI300X
+* Support for MI300A/MI300X
- Support for AMD RDNA 3
+* Support for AMD RDNA 3
- New user tutorial (#563)
+* New user tutorial (#563)
- Additional instances for irregular GEMM sizes (#560)
+* Additional instances for irregular GEMM sizes (#560)
- New inter-wave consumer-producer programming model for GEMM kernels (#310)
+* New inter-wave consumer-producer programming model for GEMM kernels (#310)
- GEMM with support multiple elementwise fusions (multi-D) (#534)
+* GEMM with support multiple elementwise fusions (multi-D) (#534)
- Multi-embeddings support (#542)
+* Multi-embeddings support (#542)
- AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
+* AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
- AMD RDNA grouped convolution backward weight support (#505)
+* AMD RDNA grouped convolution backward weight support (#505)
- MaxPool and AvgPool forward (#815); MaxPool backward (#750)
+* MaxPool and AvgPool forward (#815); MaxPool backward (#750)
 ### Changes
 None
--- a/Dockerfile
+++ b/Dockerfile
@@ -74,7 +74,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
-#Install latest version of cmake
+#Install ninja build tracing tools
 RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
 RUN gunzip /usr/local/bin/ninja.gz
 RUN chmod a+x /usr/local/bin/ninja
@@ -82,6 +82,11 @@ RUN git clone https://github.com/nico/ninjatracing.git
 # Update the cmake to the latest version
 RUN pip install --upgrade cmake==3.27.5
+#Install latest cppcheck
+RUN git clone https://github.com/danmar/cppcheck.git && \
+    cd cppcheck && mkdir build && cd build && cmake .. && cmake --build .
+WORKDIR /
 # Setup ubsan environment to printstacktrace
 RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
 ENV UBSAN_OPTIONS=print_stacktrace=1

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -304,7 +304,7 @@ def buildHipClangJob(Map conf=[:]){
        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-                timeout(time: 5, unit: 'HOURS')
+                timeout(time: 48, unit: 'HOURS')
                {
                    cmake_build(conf)
                }
@@ -560,7 +560,7 @@ def Build_CK(Map conf=[:]){
                            sh """#!/bin/bash
                                mkdir -p build
                                ls -ltr
-                                CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="/opt/rocm;${env.WORKSPACE}/install"
+                                CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install"
                                cmake --build build -- -j
                            """
                        }
@@ -709,6 +709,10 @@ pipeline {
            name: "USE_SCCACHE",
            defaultValue: true,
            description: "Use the sccache for building CK (default: ON)")
+        booleanParam(
+            name: "RUN_CPPCHECK",
+            defaultValue: false,
+            description: "Run the cppcheck static analysis (default: OFF)")
    }
    environment{
        dbuser = "${dbuser}"
@@ -735,7 +739,39 @@ pipeline {
        }
        stage("Static checks") {
            parallel{
+                stage('Clang Format and Cppcheck') {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CPPCHECK.toBoolean() }
+                    }
+                    agent{ label rocmnode("nogpu") }
+                    environment{
+                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
+                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
+                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
+                                -o -iname \'*.h.in\' \
+                                -o -iname \'*.hpp.in\' \
+                                -o -iname \'*.cpp.in\' \
+                                -o -iname \'*.cl\' \
+                                | grep -v 'build/' \
+                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\' && \
+                                /cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
+                                -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 -D DL_KERNELS \
+                                -D __gfx908__ -D __gfx90a__ -D __gfx940__ -D __gfx941__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
+                                -U __gfx803__ -U __gfx900__ -U __gfx906__ -U CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 \
+                                --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+                        archiveArtifacts "build/ck_cppcheck.log"
+                        cleanWs()
+                    }
+                }
                stage('Clang Format') {
+                    when {
+                        beforeAgent true
+                        expression { !params.RUN_CPPCHECK.toBoolean() }
+                    }
                    agent{ label rocmnode("nogpu") }
                    environment{
                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \

--- a/LICENSE
+++ b/LICENSE
@@ -7,7 +7,7 @@ Copyright (c) 2020     , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
 Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
 SPDX-License-Identifier: MIT
-Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

--- a/client_example/05_layernorm/CMakeLists.txt
+++ b/client_example/05_layernorm/CMakeLists.txt
 add_executable(client_layernorm2d_bwd_data layernorm2d_bwd_data.cpp)
 target_link_libraries(client_layernorm2d_bwd_data PRIVATE composable_kernel::device_other_operations)
+add_executable(client_layernorm2d_bwd_gamma_beta layernorm2d_bwd_gamma_beta.cpp)
+target_link_libraries(client_layernorm2d_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations)
 add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp)
 target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_other_operations)

--- a/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
+++ b/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
+#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
+using DYDataType         = float;
+using XDataType          = float;
+using GammaDataType      = float;
+using MeanInvStdDataType = float;
+using DGammaDataType     = float;
+using DBetaDataType      = float;
+constexpr int Rank         = 2;
+constexpr int NumReduceDim = 1;
+// Minimal owning wrapper around a single HIP device allocation: the buffer is
+// requested with hipMalloc in the constructor and released with hipFree in the
+// destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    // explicit: a plain byte count must not convert silently into a buffer.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
+    {
+        // Best effort: the HIP status is deliberately discarded, so callers
+        // receive whatever pointer hipMalloc stored (initially nullptr).
+        (void)hipMalloc(&p_mem_, mem_size);
+    }
+    // This object frees p_mem_ in its destructor; a copy would lead to the same
+    // pointer being passed to hipFree twice, so copying is disabled.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Returns the raw device pointer held by this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+// Client example: times every registered layernorm backward gamma/beta instance
+// on an M x N float problem, reports the fastest one and re-runs it untimed.
+int main(int argc, char* argv[])
+{
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
+    SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
+    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
+    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
+    SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * N);
+    SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * N);
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
+                                                                      XDataType,
+                                                                      MeanInvStdDataType,
+                                                                      DGammaDataType,
+                                                                      DBetaDataType,
+                                                                      Rank,
+                                                                      NumReduceDim>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    // bytes covered by the buffers handed to the op: dy, x, mean, inv_std, dgamma, dbeta
+    std::size_t num_bytes = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
+                            sizeof(MeanInvStdDataType) * M * 2 + sizeof(DGammaDataType) * N +
+                            sizeof(DBetaDataType) * N;
+    // size_t index: avoids the signed/unsigned comparison against op_ptrs.size()
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
+                                                        {N, 1}, // dyStrides
+                                                        {N, 1}, // xStrides
+                                                        {1, 0}, // meanStrides
+                                                        {1, 0}, // invStdStrides
+                                                        {N},    // outLengths
+                                                        {1},    // dgammaStrides
+                                                        {1},    // dbetaStrides
+                                                        {0},    // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dgamma_dev.GetDeviceBuffer(),
+                                                        dbeta_dev.GetDeviceBuffer());
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // the workspace only has to live for this one timed run
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = static_cast<int>(i);
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    // run the best instance
+    if(found)
+    {
+        // Report the winner only when one exists; otherwise best_ave_time still
+        // holds its float-max sentinel and best_op_name is empty.
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
+                                                        {N, 1}, // dyStrides
+                                                        {N, 1}, // xStrides
+                                                        {1, 0}, // meanStrides
+                                                        {1, 0}, // invStdStrides
+                                                        {N},    // outLengths
+                                                        {1},    // dgammaStrides
+                                                        {1},    // dbetaStrides
+                                                        {0},    // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dgamma_dev.GetDeviceBuffer(),
+                                                        dbeta_dev.GetDeviceBuffer());
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/18_groupnorm/CMakeLists.txt
+++ b/client_example/18_groupnorm/CMakeLists.txt
 add_executable(client_groupnorm_bwd_data groupnorm_bwd_data.cpp)
 target_link_libraries(client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations)
+add_executable(client_groupnorm_bwd_gamma_beta groupnorm_bwd_gamma_beta.cpp)
+target_link_libraries(client_groupnorm_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations)
 add_executable(client_groupnorm_swish_fwd groupnorm_swish_fwd.cpp)
 target_link_libraries(client_groupnorm_swish_fwd PRIVATE composable_kernel::device_other_operations)
--- a/client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
+++ b/client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
+#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp"
+using DYDataType         = float;
+using XDataType          = float;
+using GammaDataType      = float;
+using MeanInvStdDataType = float;
+using DGammaDataType     = float;
+using DBetaDataType      = float;
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+// Minimal owning wrapper around a single HIP device allocation: the buffer is
+// requested with hipMalloc in the constructor and released with hipFree in the
+// destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    // explicit: a plain byte count must not convert silently into a buffer.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
+    {
+        // Best effort: the HIP status is deliberately discarded, so callers
+        // receive whatever pointer hipMalloc stored (initially nullptr).
+        (void)hipMalloc(&p_mem_, mem_size);
+    }
+    // This object frees p_mem_ in its destructor; a copy would lead to the same
+    // pointer being passed to hipFree twice, so copying is disabled.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Returns the raw device pointer held by this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+// Client example: times every registered groupnorm backward gamma/beta instance
+// on an N x H x W x G x C float problem, reports the fastest one and re-runs it
+// untimed.
+int main(int argc, char* argv[])
+{
+    ck::index_t N = 32;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 64;
+    ck::index_t C = 128;
+    // Widen before multiplying so the element count is not computed in
+    // ck::index_t arithmetic.
+    std::size_t length = static_cast<std::size_t>(N) * H * W * G * C;
+    std::vector<ck::index_t> strideDy         = {H * W * G * C, W * G * C, G * C, C, 1};
+    std::vector<ck::index_t> strideX          = strideDy;
+    std::vector<ck::index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
+    std::vector<ck::index_t> strideDGammaBeta = {C, 1};
+    SimpleDeviceMem dy_dev(sizeof(DYDataType) * length);
+    SimpleDeviceMem x_dev(sizeof(XDataType) * length);
+    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G);
+    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G);
+    SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * G * C);
+    SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * G * C);
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
+                                                                      XDataType,
+                                                                      MeanInvStdDataType,
+                                                                      DGammaDataType,
+                                                                      DBetaDataType,
+                                                                      Rank,
+                                                                      NumReduceDim>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    // Bytes covered by the buffers handed to the op: dy, x, mean, inv_std,
+    // dgamma, dbeta. No gamma buffer is allocated or passed, so it is not counted.
+    std::size_t num_bytes = sizeof(DYDataType) * length + sizeof(XDataType) * length +
+                            sizeof(MeanInvStdDataType) * N * G * 2 +
+                            sizeof(DGammaDataType) * G * C + sizeof(DBetaDataType) * G * C;
+    // size_t index: avoids the signed/unsigned comparison against op_ptrs.size()
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
+                                                        strideDy,
+                                                        strideX,
+                                                        strideMeanInvStd,
+                                                        strideMeanInvStd,
+                                                        {G, C},
+                                                        strideDGammaBeta,
+                                                        strideDGammaBeta,
+                                                        {0, 1, 2}, // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dgamma_dev.GetDeviceBuffer(),
+                                                        dbeta_dev.GetDeviceBuffer());
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // the workspace only has to live for this one timed run
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = static_cast<int>(i);
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    // run the best instance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
+                                                        strideDy,
+                                                        strideX,
+                                                        strideMeanInvStd,
+                                                        strideMeanInvStd,
+                                                        {G, C},
+                                                        strideDGammaBeta,
+                                                        strideDGammaBeta,
+                                                        {0, 1, 2}, // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dgamma_dev.GetDeviceBuffer(),
+                                                        dbeta_dev.GetDeviceBuffer());
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/25_tensor_transforms/tensor_transform.cpp
+++ b/client_example/25_tensor_transforms/tensor_transform.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <iostream>
-#include "ck/ck.hpp"
-#include "ck/utility/number.hpp"
-#include "ck/utility/tuple.hpp"
-#include "ck/utility/sequence.hpp"
-#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_description/multi_index_transform_helper.hpp"
-static constexpr auto I0 = ck::Number<0>{};
-static constexpr auto I1 = ck::Number<1>{};
-static constexpr auto I2 = ck::Number<2>{};
-using DataType = int;
-template <typename Desc>
-void Print1d(const Desc& desc)
-{
-    std::cout << "Print1d" << std::endl;
-    for(ck::index_t w = 0; w < desc.GetLength(I0); w++)
-    {
-        std::cout << desc.CalculateOffset(ck::make_tuple(w)) << " ";
-    }
-    std::cout << std::endl;
-}
-template <typename Desc>
-void Print2d(const Desc& desc)
-{
-    std::cout << "Print2d" << std::endl;
-    for(ck::index_t h = 0; h < desc.GetLength(I0); h++)
-    {
-        for(ck::index_t w = 0; w < desc.GetLength(I1); w++)
-        {
-            std::cout << desc.CalculateOffset(ck::make_tuple(h, w)) << " ";
-        }
-        std::cout << std::endl;
-    }
-}
-template <typename Desc>
-void Print3dCustom(const Desc& desc)
-{
-    std::cout << "Print3dCustom" << std::endl;
-    for(ck::index_t d = 0; d < desc.GetLength(I0); d++)
-    {
-        for(ck::index_t h = 0; h < desc.GetLength(I1); h++)
-        {
-            for(ck::index_t w = 0; w < desc.GetLength(I2); w++)
-            {
-                std::cout << desc.CalculateOffset(ck::make_tuple(d, h, w)) << " ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-    }
-}
-int main()
-{
-    // Tensor descriptor traverse in row-major (need to reverse dims)
-    std::cout << "Note: Tensor descriptor traverse in row-major" << std::endl;
-    // Basic descriptor 0, 1, 2, ... 30, 31
-    // (dims:4,8 strides:1,4)
-    const auto desc_4x8_s1x4 =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}),
-                                         ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}));
-    std::cout << "dims:4,8 strides:1,4" << std::endl;
-    Print2d(desc_4x8_s1x4);
-    using Cord1x1Type                = ck::Tuple<ck::Number<1>, ck::Number<1>>;
-    constexpr ck::index_t offset_1x1 = desc_4x8_s1x4.CalculateOffset(Cord1x1Type{});
-    std::cout << "Constexpr calculated [1, 1] offset:" << offset_1x1 << std::endl;
-    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
-    // dims:4,(2,4) strides:2,(1,8)
-    const auto desc_4x2x4_s2x1x8 =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(4, 2, 4), ck::make_tuple(2, 1, 8));
-    // Transform to 2d (column-major, need to to reverse dims)
-    const auto desc_4x2x4_s2x1x8_merged = ck::transform_tensor_descriptor(
-        desc_4x2x4_s2x1x8,
-        ck::make_tuple(ck::make_pass_through_transform(4),
-                       ck::make_merge_transform(ck::make_tuple(4, 2))),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<2, 1>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
-    std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
-    Print2d(desc_4x2x4_s2x1x8_merged);
-    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
-    // dims:(2,2),(2,4) strides:((1,4),(2,8)
-    const auto desc_2x2x2x4_s1x4x2x8 =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(2, 2, 2, 4), ck::make_tuple(1, 4, 2, 8));
-    // Transform to 2d
-    const auto desc_2x2x2x4_s1x4x2x8_double_merged_2d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 2)),
-                       ck::make_merge_transform(ck::make_tuple(4, 2))),
-        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
-    // Transform to 3d
-    const auto desc_2x2x2x4_s1x4x2x8_double_merged_3d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8,
-        ck::make_tuple(ck::make_pass_through_transform(2),
-                       ck::make_pass_through_transform(2),
-                       ck::make_merge_transform(ck::make_tuple(4, 2))),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<3, 2>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
-    std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
-    Print2d(desc_2x2x2x4_s1x4x2x8_double_merged_2d);
-    Print3dCustom(desc_2x2x2x4_s1x4x2x8_double_merged_3d);
-    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
-    // dims:((2,2),2),4 strides:((1,4),2),8
-    // Transform to 2d
-    const auto desc_2x2x2x4_s1x4x2x8_nested =
-        ck::make_naive_tensor_descriptor(ck::make_tuple(2, 2, 2, 4), ck::make_tuple(1, 4, 2, 8));
-    const auto desc_2x2x2x4_s1x4x2x8_nested_merged_3d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8_nested,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 2)),
-                       ck::make_pass_through_transform(2),
-                       ck::make_pass_through_transform(4)),
-        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
-    const auto desc_2x2x2x4_s1x4x2x8_nested_merged_1d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8_nested,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(4, 2, 2, 2))),
-        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
-        ck::make_tuple(ck::Sequence<0>{}));
-    const auto desc_2x2x2x4_s1x4x2x8_nested_merged_2d = ck::transform_tensor_descriptor(
-        desc_2x2x2x4_s1x4x2x8_nested_merged_3d,
-        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(2, 4)),
-                       ck::make_pass_through_transform(4)),
-        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
-        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
-    std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
-    Print1d(desc_2x2x2x4_s1x4x2x8_nested_merged_1d);
-    Print2d(desc_2x2x2x4_s1x4x2x8_nested_merged_2d);
-    Print3dCustom(desc_2x2x2x4_s1x4x2x8_nested_merged_3d);
-    return 0;
-}
--- a/client_example/25_tensor_transforms/CMakeLists.txt
+++ b/client_example/25_tensor_transforms/CMakeLists.txt
-add_executable(client_tensor_transform tensor_transform.cpp)
-target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations)
 add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
 target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
+add_executable(client_wrapper_img2col wrapper_img2col.cpp)
+target_link_libraries(client_wrapper_img2col PRIVATE composable_kernel::device_other_operations)
--- a/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp
+++ b/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>

--- a/client_example/25_wrapper/wrapper_img2col.cpp
+++ b/client_example/25_wrapper/wrapper_img2col.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <numeric>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/wrapper/layout.hpp"
+#include "ck/wrapper/tensor.hpp"
+#include "ck/wrapper/operations/copy.hpp"
+static constexpr ck::index_t NumDimSpatial = 3;
+using DataType                             = float;
+using InputLayout                          = ck::tensor_layout::convolution::NDHWGC;
+// Minimal owning wrapper around one hipMalloc'ed device buffer.
+// The buffer is released with hipFree in the destructor. Copying is disabled:
+// a copy would hold the same raw pointer and free it a second time.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    // Requests mem_size bytes of device memory. p_mem_ is value-initialized to
+    // null before the call; the hipMalloc status is intentionally discarded.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    // Single owner of p_mem_: no copies (this also suppresses the implicit moves).
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    // Returns the raw device pointer; ownership stays with this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+// Image-to-column copy kernel (no padding handling).
+// Each block selects the tile indexed by blockIdx.x from both tensors, each
+// thread selects its partition of that tile according to thread_layout, and
+// the input partition is copied into the output partition.
+template <typename InputTensor,
+          typename OutputTensor,
+          typename BlockShape,
+          typename ThreadLayoutShape>
+__global__ void DeviceImageToColumnPad0(InputTensor input_tensor,
+                                        OutputTensor output_tensor,
+                                        const BlockShape tile_shape,
+                                        const ThreadLayoutShape thread_layout)
+{
+    // Copy configuration handed to ck::wrapper::copy: dims visited in order
+    // (0, 1), with 4 scalars per vector along dim 1.
+    using DimAccessOrder                    = ck::Tuple<ck::Number<0>, ck::Number<1>>;
+    constexpr ck::index_t vector_dim        = 1;
+    constexpr ck::index_t scalar_per_vector = 4;
+    const ck::index_t tile_id = static_cast<ck::index_t>(blockIdx.x);
+    // Source: this block's tile of the input, then this thread's share of it
+    auto src_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, tile_id);
+    const auto src_partition =
+        ck::wrapper::make_local_partition(src_tile, thread_layout, threadIdx.x);
+    // Destination: same tile/partition selection applied to the output
+    auto dst_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, tile_id);
+    auto dst_partition =
+        ck::wrapper::make_local_partition(dst_tile, thread_layout, threadIdx.x);
+    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(src_partition,
+                                                                     dst_partition);
+}
+// Builds image-to-column input/output layouts for a grouped 3D problem,
+// allocates device buffers, launches DeviceImageToColumnPad0 once through
+// launch_and_time_kernel and prints the measured time and bandwidth.
+//
+// G, N, C          : groups, batch, channels per group
+// Di/Hi/Wi         : input spatial sizes; Do/Ho/Wo : output spatial sizes
+// Z/Y/X            : filter spatial sizes
+// filter_strides   : per-dimension strides, indexed [0]=D, [1]=H, [2]=W
+// filter_dilations : per-dimension dilations, same indexing
+//
+// Buffers are not initialized and results are not checked; this only times the copy.
+void PerformImageToColumnPad0(const ck::index_t G,
+                              const ck::index_t N,
+                              const ck::index_t Di,
+                              const ck::index_t Hi,
+                              const ck::index_t Wi,
+                              const ck::index_t Do,
+                              const ck::index_t Ho,
+                              const ck::index_t Wo,
+                              const ck::index_t C,
+                              const ck::index_t Z,
+                              const ck::index_t Y,
+                              const ck::index_t X,
+                              std::array<ck::index_t, NumDimSpatial> filter_strides,
+                              std::array<ck::index_t, NumDimSpatial> filter_dilations)
+{
+    const ck::index_t ZYXC = Z * Y * X * C;
+    const ck::index_t GC   = G * C;
+    // shape: ((G, (Wo, Ho, Do, N)), (C, X, Y, Z))
+    // The same nested shape is used for input and output; only the strides differ.
+    const auto shape = ck::make_tuple(ck::make_tuple(G, ck::make_tuple(Wo, Ho, Do, N)),
+                                      ck::make_tuple(C, X, Y, Z));
+    // Input strides: element offsets into a buffer of N * Di * Hi * Wi * G * C
+    // elements. Output positions step by filter stride, filter taps by dilation.
+    const auto in_strides =
+        ck::make_tuple(ck::make_tuple(C,
+                                      ck::make_tuple(filter_strides[2] * GC,
+                                                     filter_strides[1] * Wi * GC,
+                                                     filter_strides[0] * Hi * Wi * GC,
+                                                     Di * Hi * Wi * GC)),
+                       ck::make_tuple(1,
+                                      filter_dilations[2] * GC,
+                                      filter_dilations[1] * Wi * GC,
+                                      filter_dilations[0] * Hi * Wi * GC));
+    const auto in_layout = ck::wrapper::make_layout(shape, in_strides);
+    // Output strides: the (C, X, Y, Z) part is contiguous (ZYXC elements per
+    // group), followed by G, then Wo, Ho, Do, N.
+    const auto out_strides = ck::make_tuple(
+        ck::make_tuple(
+            ZYXC,
+            ck::make_tuple(ZYXC * G, Wo * ZYXC * G, Ho * Wo * ZYXC * G, Do * Ho * Wo * ZYXC * G)),
+        ck::make_tuple(1, C, X * C, Y * X * C));
+    const auto out_layout = ck::wrapper::make_layout(shape, out_strides);
+    const ck::index_t input_size = N * Di * Hi * Wi * GC;
+    // Global memory buffers
+    SimpleDeviceMem in_buf(input_size * sizeof(DataType));
+    SimpleDeviceMem out_buf(ck::wrapper::size(out_layout) * sizeof(DataType));
+    // User can choose appropriate number of threads and sizes per block
+    const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<16>{});
+    // This example doesn't support padding, user should select tile sizes
+    // which divides the shape completely
+    const auto tile_shape = ck::make_tuple(ck::Number<32>{}, ck::Number<64>{});
+    // Create buffers for global memory
+    auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
+        static_cast<const DataType*>(in_buf.GetDeviceBuffer()), in_layout);
+    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
+        static_cast<DataType*>(out_buf.GetDeviceBuffer()), out_layout);
+    // One block per tile: tiles along dim 0 times tiles along dim 1.
+    const ck::index_t grid_size = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout),
+                                                                ck::wrapper::size<0>(tile_shape)) *
+                                  ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout),
+                                                                ck::wrapper::size<1>(tile_shape));
+    const auto kernel    = DeviceImageToColumnPad0<decltype(input_tensor_global),
+                                                decltype(output_tensor_global),
+                                                decltype(tile_shape),
+                                                decltype(thread_layout)>;
+    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(ck::wrapper::size(thread_layout)),
+                                                  0,
+                                                  input_tensor_global,
+                                                  output_tensor_global,
+                                                  tile_shape,
+                                                  thread_layout);
+    // Bytes moved: every output element is read once and written once (factor 2).
+    // Widen to std::size_t before multiplying so the product is not evaluated in
+    // 32-bit ck::index_t, where it can overflow before sizeof() promotes it.
+    const std::size_t num_btype =
+        static_cast<std::size_t>(G) * N * Do * Ho * Wo * ZYXC * 2 * sizeof(DataType);
+    // bytes / 1e6 / ms == GB/s (avg_time is reported below as milliseconds)
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+              << std::endl;
+}
+// Entry point: runs the image-to-column example once on a fixed 3D problem
+// with unit filter strides and dilations. Command-line arguments are unused.
+int main(int argc, char* argv[])
+{
+    // Batch / group / channel sizes
+    constexpr ck::index_t G = 4;  // number of groups
+    constexpr ck::index_t N = 32; // batch
+    constexpr ck::index_t C = 64; // input channel (per group)
+    // Filter extent (D, H, W)
+    constexpr ck::index_t Z = 3;
+    constexpr ck::index_t Y = 3;
+    constexpr ck::index_t X = 3;
+    // Input extent (D, H, W)
+    constexpr ck::index_t Di = 9;
+    constexpr ck::index_t Hi = 9;
+    constexpr ck::index_t Wi = 7;
+    // Output extent (D, H, W)
+    constexpr ck::index_t Do = 7;
+    constexpr ck::index_t Ho = 7;
+    constexpr ck::index_t Wo = 5;
+    const std::array<ck::index_t, NumDimSpatial> unit_strides{1, 1, 1};
+    const std::array<ck::index_t, NumDimSpatial> unit_dilations{1, 1, 1};
+    PerformImageToColumnPad0(
+        G, N, Di, Hi, Wi, Do, Ho, Wo, C, Z, Y, X, unit_strides, unit_dilations);
+    return 0;
+}
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -34,6 +34,6 @@ The CK documentation is structured as follows:
  * :ref:`contributing-to`
-To contribute to the documentation refer to `Contributing to ROCm  <https://rocm.docs.amd.com/en/latest/contribute/index.md>`_.
+To contribute to the documentation refer to `Contributing to ROCm  <https://rocm.docs.amd.com/en/latest/contribute/index.html>`_.
-You can find licensing information at the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.md>`_ page.
+You can find licensing information on the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==0.30.3
+rocm-docs-core==0.33.0
-sphinxcontrib-bibtex==2.6.1
+sphinxcontrib-bibtex==2.6.2
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -113,7 +113,7 @@ requests==2.31.0
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.30.3
+rocm-docs-core==0.33.0
    # via -r requirements.in
 six==1.16.0
    # via
@@ -149,7 +149,7 @@ sphinx-notfound-page==0.8.3
    # via rocm-docs-core
 sphinxcontrib-applehelp==1.0.4
    # via sphinx
-sphinxcontrib-bibtex==2.6.1
+sphinxcontrib-bibtex==2.6.2
    # via -r requirements.in
 sphinxcontrib-devhelp==1.0.2
    # via sphinx

--- a/docs/wrapper.rst
+++ b/docs/wrapper.rst
@@ -18,8 +18,7 @@ Description
 The CK library provides a lightweight wrapper for more complex operations implemented in 
-the library. It allows indexing of nested layouts using a simple interface 
+the library.
-(avoiding complex descriptor transformations) and memory access (using Tensor).
 Example:
@@ -54,6 +53,11 @@ Output::
    1 5 9 13 17 21 25 29 
    2 6 10 14 18 22 26 30 
+Advanced examples:
+* `Image to column <https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_img2col.cpp>`_
 -------------------------------------
 Layout
 -------------------------------------

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -19,6 +19,9 @@ add_custom_target(example_gemm_xdl)
 add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
+add_example_executable(example_gemm_xdl_fp16_v2 gemm_xdl_fp16_v2.cpp)
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v2)
 add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)

--- a/example/01_gemm/gemm_dl_int4.cpp
+++ b/example/01_gemm/gemm_dl_int4.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-#error Should compile this file with ck::int4_t support
-#endif
 #include "common.hpp"
@@ -43,3 +41,4 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 #include "run_gemm_example.inc"
 int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+#endif
\ No newline at end of file
--- a/example/01_gemm/gemm_xdl_fp16_v2.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_v2.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v2.hpp"
+// Element types for this example: A, B, C and CShuffleDataType are half
+// precision; AccDataType is float.
+using ADataType        = ck::half_t;
+using BDataType        = ck::half_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using CDataType        = ck::half_t;
+// Short type names used in the instance definition below.
+using F16 = ck::half_t;
+using F32 = float;
+// All three matrices use the Row layout (Row is not declared in this file;
+// presumably it comes from common.hpp).
+using ALayout = Row;
+using BLayout = Row;
+using CLayout = Row;
+// No element-wise operation on A, B or C.
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// Device GEMM instance under test. Note that it spells its types as F16/F32 and
+// PassThrough directly rather than through the ADataType/.../CElementOp aliases
+// above; the two coincide here and must be kept in sync by hand.
+// NOTE(review): the numeric and S<...> arguments are presumably tuning
+// parameters; their meaning is fixed by the DeviceGemm_Xdl_CShuffleV2
+// declaration, which is not visible here - check that header before editing.
+// clang-format off
+using DeviceGemmInstance = 
+    ck::tensor_operation::device::DeviceGemm_Xdl_CShuffleV2<
+        ALayout,   BLayout,  CLayout,   
+        F16,   F16,  F16,  F32,  F16, 
+        PassThrough, PassThrough, PassThrough, GemmDefault, 
+        2,   256,
+        256, 256, 
+        32, 8, 4,
+        32,   32,
+        4,    4, 
+        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        2, 8, 8, 0,
+        S<8, 32, 1>,  S<0, 2, 1>,  S<0, 2, 1>,
+        1, 8, 4, 0,
+        1, 1, S<1, 32, 1, 8>, 8,
+        ck::LoopScheduler::Default, ck::PipelineVersion::v1>;
+// clang-format on
+// Host-side ReferenceGemm type built from the aliases above (presumably
+// consumed by run_gemm_example.inc - confirm there).
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+// Textually included; expected to provide run_gemm_example, which is not
+// defined in this file.
+#include "run_gemm_example.inc"
+// The result of run_gemm_example is negated, so the process exits with 0 when
+// it returns a truthy value and 1 otherwise.
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }