Merge branch 'develop' into bnorm_bwd_pr

d0b49a14 · Qianfeng Zhang · 29026b0e · 87fd1152 · d0b49a14 · d0b49a14
Commit d0b49a14 authored Oct 28, 2022 by Qianfeng Zhang
20 changed files
--- a/CITATION.cff
+++ b/CITATION.cff
+cff-version: 1.2.0
+title: Composable Kernel
+message: If you use this software, please cite using the following metadata.
+type: software
+authors:
+  - given-names: Chao
+    family-names: Liu
+    email: chao.liu2@amd.com
+    affiliation: AMD
+  - given-names: Jing
+    family-names: Zhang
+    email: jing.zhang3@amd.com
+    affiliation: AMD
+  - given-names: Letao
+    family-names: Qin
+    email: letao.qin@amd.com
+    affiliation: AMD
+  - given-names: Qianfeng
+    family-names: Zhang
+    email: qianfeng.zhang@amd.com
+    affiliation: AMD
+  - given-names: Liang
+    family-names: Huang
+    email: carlus.huang@amd.com
+    affiliation: AMD
+  - given-names: Shaojie
+    family-names: Wang
+    email: shaojie.wang@amd.com
+    affiliation: AMD
+  - given-names: Anthony
+    family-names: Chang
+    email: antc@amd.com
+    affiliation: AMD
+  - given-names: Chunyu
+    family-names: Lai
+    email: chunyu.lai@amd.com
+    affiliation: AMD
+  - given-names: Illia
+    family-names: Silin
+    email: illia.silin@amd.com
+    affiliation: AMD
+  - given-names: Adam
+    family-names: Osewski
+    email: adam.osewski@amd.com
+    affiliation: AMD
+  - given-names: Poyen
+    family-names: Chen
+    email: poyen.chen@amd.com
+    affiliation: AMD
+  - given-names: Rosty
+    family-names: Geyyer
+    email: rosty.geyyer@amd.com
+    affiliation: AMD
+  - given-names: Hanwen
+    family-names: Chen
+  - given-names: Tejash
+    family-names: Shah
+  - given-names: Xiaoyan
+    family-names: Zhou
+  - given-names: Jianfeng
+    family-names: Yan
+repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
+abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel progarmming languages, like HIP C++.
+keywords:
+  - 'CK, Composable Kernel, Tensor Coordinate Transformation'
+license: MIT
+license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -247,6 +247,16 @@ message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
 add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
+file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
+set(CK_DEVICE_INSTANCES)
+FOREACH(subdir_path ${dir_list})
+    IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
+       list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
+    ENDIF()
+ENDFOREACH()
+add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES}  SOURCES ${INSTANCE_FILES})
 rocm_package_setup_component(tests
        LIBRARY_NAME composablekernel
        PACKAGE_NAME tests # Prevent -static suffix on package name

--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
+# Composable Kernel Developers and Contributors
+This is the list of developers and contributors to Composable Kernel library
+## Developers
+[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022
+[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022
+[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), 2022
+Hanwen Chang, 2019-2021,
+Tejash Shah, 2019-2020
+Xiaoyan Zhou, 2020
+[Jianfeng Yan](https://github.com/j4yan), 2021-2022
+## Product Manager
+[Jun Liu](https://github.com/junliume)
+## Contributors
+[Dan Yao](https://github.com/danyao12), [Guangzhao Lu](https://github.com/guangzlu), [Raman Jana](https://github.com/ramjana), [Jehandad Khan](https://github.com/JehandadKhan), [Wen-Heng (Jack) Chung](https://github.com/whchung)
+## Acknowledgement
+CK team works closely with Meta [AITemplate](https://github.com/facebookincubator/AITemplate) team ([Bing Xu](https://github.com/antinucleon), [Hao Lu](https://github.com/hlu1), [Ying Zhang](https://github.com/ipiszy), etc). Most of the lucrative graph optimization opportunities in ML models were identified by AITemplate team, and we also co-designed many high performance fused kernels for AMD GPUs. Without this collaboration, CK would not reach its current potential.
--- a/Dockerfile
+++ b/Dockerfile
 FROM ubuntu:20.04
-ARG ROCMVERSION=5.2.3
+ARG ROCMVERSION=5.3
-ARG compiler_version
+ARG compiler_version="release"
-ARG compiler_commit
+ARG compiler_commit=""
 RUN set -xe
@@ -19,6 +19,7 @@ RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    apt-utils \
    build-essential \
+    ccache \
    cmake-data \
    cmake \
    curl \

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -19,10 +19,24 @@ def runShell(String command){
 }
 def getDockerImageName(){
-    def img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm5.2.3_${params.COMPILER_VERSION}"
+    def img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
    return img
 }
+def check_host() {
+    if ("${env.CK_CCACHE}" != "null"){
+        def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}"
+        echo "ccache server: ${CCACHE_SERVER}"
+        sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt'''
+        def output = readFile(file: "tmp.txt")
+        echo "tmp.txt contents: \$output"
+        return (output != "0")
+    }
+    else{
+        return 1
+    }
+}
 def build_compiler(){
    def compiler
    if (params.BUILD_COMPILER == "hipcc"){
@@ -43,21 +57,21 @@ def getDockerImage(Map conf=[:]){
    env.DOCKER_BUILDKIT=1
    def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
    def no_cache = conf.get("no_cache", false)
-    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' "
+    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if(env.CCACHE_HOST)
+    echo "ccache server: ${env.CK_CCACHE}"
+    if(env.CK_CCACHE)
    {
-        def check_host = sh(script:"""(printf "PING\r\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
+        if(check_host())
-        if(check_host == "+PONG")
        {
-            echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
+            echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
        }
        else 
        {
-            echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
+            echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
        }
-        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
+        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
        env.CCACHE_DIR = """/tmp/ccache_store"""
-        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
+        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
    }
    if(no_cache)
    {
@@ -86,21 +100,21 @@ def buildDocker(install_prefix){
    checkout scm
    def image_name = getDockerImageName()
    echo "Building Docker for ${image_name}"
-    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}'"
+    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-    if(env.CCACHE_HOST)
+    echo "ccache server: ${env.CK_CCACHE}"
+    if(env.CK_CCACHE)
    {
-        def check_host = sh(script:"""(printf "PING\\r\\n";) | nc  -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
+        if(check_host())
-        if(check_host == "+PONG")
        {
-            echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
+            echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
        }
        else 
        {
-            echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
+            echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
        }
-        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
+        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
        env.CCACHE_DIR = """/tmp/ccache_store"""
-        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
+        env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
    }
    echo "Build Args: ${dockerArgs}"
@@ -161,10 +175,11 @@ def cmake_build(Map conf=[:]){
    }else{
        setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
    }
-    if(env.CCACHE_HOST)
+    if(env.CK_CCACHE)
    {
        setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
    }
+    echo "ccache server: ${env.CK_CCACHE}"
    def pre_setup_cmd = """
            echo \$HSA_ENABLE_SDMA
@@ -210,7 +225,7 @@ def buildHipClangJob(Map conf=[:]){
        if (conf.get("enforce_xnack_on", false)) {
            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' "
+        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
        if (params.COMPILER_VERSION != "release"){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }
@@ -218,41 +233,9 @@ def buildHipClangJob(Map conf=[:]){
        def variant = env.STAGE_NAME
        def retimage
+        (retimage, image) = getDockerImage(conf)
        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
-            try {
-                (retimage, image) = getDockerImage(conf)
-                withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES'){
-                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
-                        if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
-                            throw new Exception ("GPU not found")
-                        }
-                        else{
-                            echo "GPU is OK"
-                        }
-                    }
-                }
-            }
-            catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
-                echo "The job was cancelled or aborted"
-                throw e
-            }
-            catch(Exception ex) {
-                retimage = docker.build("${image}", dockerArgs + " --no-cache .")
-                withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES'){
-                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
-                        if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
-                            throw new Exception ("GPU not found")
-                        }
-                        else{
-                            echo "GPU is OK"
-                        }
-                    }
-                }
-            }
            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
                timeout(time: 5, unit: 'HOURS')
                {
@@ -297,7 +280,7 @@ def runCKProfiler(Map conf=[:]){
        if (conf.get("enforce_xnack_on", false)) {
            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' "
+        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
        if (params.COMPILER_VERSION != "release"){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }
@@ -430,7 +413,7 @@ def Build_CK(Map conf=[:]){
        if (conf.get("enforce_xnack_on", false)) {
            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' "
+        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
        if (params.COMPILER_VERSION != "release"){
            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
        }
@@ -566,8 +549,9 @@ def process_results(Map conf=[:]){
    }
 }
-//launch develop branch daily at 23:00 in FULL_QA mode
+//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true''' : ""
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;COMPILER_VERSION=release
+                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open''' : ""
 pipeline {
    agent none
@@ -581,15 +565,19 @@ pipeline {
        booleanParam(
            name: "BUILD_DOCKER",
            defaultValue: false,
-            description: "Force building docker image (default: false)")
+            description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
+        string(
+            name: 'ROCMVERSION', 
+            defaultValue: '5.3', 
+            description: 'Specify which ROCM version to use: 5.2.3, or 5.3 (default), etc.')
        string(
            name: 'COMPILER_VERSION', 
-            defaultValue: 'amd-stg-open', 
+            defaultValue: 'release', 
-            description: 'Specify which version of compiler to use: ck-9110, release, or amd-stg-open (default).')
+            description: 'Specify which version of compiler to use: ck-9110, release (default), or amd-stg-open.')
        string(
            name: 'COMPILER_COMMIT', 
-            defaultValue: '8a82e4eb7ba28521ba9a9424a0315a8a16590424', 
+            defaultValue: '', 
-            description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit, or use 10738 commit (default).')
+            description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit (default), or use 8a82e4eb7ba28521ba9a9424a0315a8a16590424 commit of amd-stg-open branch.')
        string(
            name: 'BUILD_COMPILER', 
            defaultValue: 'hipcc', 
@@ -627,17 +615,6 @@ pipeline {
        }
        stage("Static checks") {
            parallel{
-                // enable after we move from hipcc to hip-clang
-                // stage('Tidy') {
-                //     agent{ label rocmnode("nogpu") }
-                //     environment{
-                //         // setup_cmd = "CXX='/opt/rocm/bin/hipcc' cmake -DBUILD_DEV=On .. "
-                //         build_cmd = "make -j\$(nproc) -k analyze"
-                //     }
-                //     steps{
-                //         buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
-                //     }
-                // }
                stage('Clang Format') {
                    agent{ label rocmnode("nogpu") }
                    environment{
@@ -676,28 +653,6 @@ pipeline {
            }
        }
- 		/*
-        //at present this stage only builds binaries. 
-        //we will now build all binaries in a separate stage.
-        //once we have some tests to run in this stage, we can enable it again.
-        stage("Client App")
-        {
-            parallel
-            {
-                stage("Run Client App")
-                {
-                    agent{ label rocmnode("gfx908")}
-                    environment{
-                        setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" """ : """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 " """ }"
-                        execute_args = "${params.COMPILER_VERSION == "ck-9110" ? """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ }"
-                    }
-                    steps{
-                        buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
-                    }
-                }
-            }
-        }
-        */
        stage("Performance Tests")
        {
            parallel
@@ -746,21 +701,5 @@ pipeline {
                }
            }
        }
-        /* enable after the cmake file supports packaging
-        stage("Packages") {
-            when {
-                expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
-            }
-            parallel {
-                stage("Package /opt/rocm") {
-                    agent{ label rocmnode("nogpu") }
-                    steps{
-                        buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
-                    }
-                }
-            }
-        }
-        */
    }
 }
--- a/README.md
+++ b/README.md
-## Docker script
+# Composable Kernel
+## Methodology
+Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
+CK utilizes two concepts to achieve performance portability and code maintainability:
+* A tile-based programming model
+* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
+![ALT](/doc/image/ck_component.png "CK Components")
+## Code Structure
+Current CK library are structured into 4 layers:
+* "Templated Tile Operators" layer
+* "Templated Kernel and Invoker" layer
+* "Instantiated Kernel and Invoker" layer
+* "Client API" layer
+![ALT](/doc/image/ck_layer.png "CK Layers")
+## Contributors
+The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
+## Citation
+If you use CK, please use following citations:
+* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
+* [CITATION.cff](/CITATION.cff)
+## License
+CK is released under the MIT license. [License File](/LICENSE)
+# Build CK
+## Build docker image
+```bash
+DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
+```
+## Launch docker
 ```bash
 docker run                                     \
 -it                                            \
@@ -6,47 +45,38 @@ docker run                                     \
 --group-add sudo                               \
 -w /root/workspace                             \
 -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
-rocm/tensorflow:rocm5.1-tf2.6-dev              \
+ck:latest                                      \
 /bin/bash
 ```
-# Install newer version of rocm-cmake
+## Build CK
-https://github.com/RadeonOpenCompute/rocm-cmake
-## Build
 ```bash
 mkdir build && cd build
-```
-```bash
+# Need to specify target ID, example below is for gfx908 and gfx90a
-# Need to specify target ID, example below is gfx908 and gfx90a
+cmake                                                                                             \
-cmake                                                                 \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
-D BUILD_DEV=OFF                                                      \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
-D CMAKE_BUILD_TYPE=Release                                           \
+-D CMAKE_CXX_FLAGS="-O3"                                                                          \
-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3" \
+-D CMAKE_BUILD_TYPE=Release                                                                       \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
-D CMAKE_PREFIX_PATH=/opt/rocm                                        \
-D CMAKE_INSTALL_PREFIX=${PATH_TO_CK_INSTALL_DIRECTORY}               \
 ..
 ```
-### Build and Run Examples
+### Build examples and tests
-```bash
- make -j examples
-```
-Instructions for running each individual examples are under ```example/```
-## Tests
 ```bash
 make -j examples tests
 make test
 ```
+Instructions for running each individual examples are under [example](/example)
 ## Build ckProfiler
 ```bash
 make -j ckProfiler
 ```
-Instructions for running ckProfiler are under ```profiler/```
+Instructions for running ckProfiler are under [profiler](/profiler)
 ## Install CK
 ```bash
@@ -54,13 +84,13 @@ make install
 ```
 ## Using CK as pre-built kernel library
-Instructions for using CK as a pre-built kernel library are under ```client_example/```
+Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
 ## Caveat
 ### Kernel Timing and Verification
 CK's own kernel timer will warn up kernel once, and then run it multiple times
 to get average kernel time. For some kernels that use atomic add, this will cause
-output buffer to be accumulated multiple times, causing verfication failure.
+output buffer to be accumulated multiple times, causing verification failure.
 To work around it, do not use CK's own timer and do verification at the same time.
 CK's own timer and verification in each example and ckProfiler can be enabled or
 disabled from command line.
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
@@ -8,7 +8,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp"

--- a/client_example/05_layernorm/layernorm2d.cpp
+++ b/client_example/05_layernorm/layernorm2d.cpp
@@ -10,7 +10,7 @@
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
 using XDataType     = ck::half_t;
 using GammaDataType = ck::half_t;
@@ -51,14 +51,14 @@ int main(int argc, char* argv[])
    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
-                                                                   GammaDataType,
+                                                                       GammaDataType,
-                                                                   BetaDataType,
+                                                                       BetaDataType,
-                                                                   AccDataType,
+                                                                       AccDataType,
-                                                                   YDataType,
+                                                                       YDataType,
-                                                                   PassThrough,
+                                                                       PassThrough,
-                                                                   Rank,
+                                                                       Rank,
-                                                                   NumReduceDim>;
+                                                                       NumReduceDim>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<

--- a/client_example/07_conv2d_fwd/CMakeLists.txt
+++ b/client_example/07_conv2d_fwd/CMakeLists.txt
+add_executable(client_conv2d_fwd conv2d_fwd.cpp)
+target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
--- a/client_example/07_conv2d_fwd/conv2d_fwd.cpp
+++ b/client_example/07_conv2d_fwd/conv2d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using InDataType  = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+using InLayout    = ck::tensor_layout::convolution::NHWC;
+using WeiLayout   = ck::tensor_layout::convolution::KYXC;
+using OutLayout   = ck::tensor_layout::convolution::NHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t N             = 16;
+static constexpr ck::index_t K             = 32;
+static constexpr ck::index_t C             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Hi            = 224;
+static constexpr ck::index_t Wi            = 224;
+static constexpr ck::index_t Ho            = 113;
+static constexpr ck::index_t Wo            = 113;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    std::vector<ck::index_t> in_spatial_lengths{Hi, Wi};
+    std::vector<ck::index_t> filter_spatial_lengths{Y, X};
+    std::vector<ck::index_t> out_spatial_lengths{Ho, Wo};
+    std::vector<ck::index_t> filter_strides{2, 2};
+    std::vector<ck::index_t> filter_dilations{1, 1};
+    std::vector<ck::index_t> input_left_pads{2, 2};
+    std::vector<ck::index_t> input_right_pads{2, 2};
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
+                                                                 InLayout,
+                                                                 WeiLayout,
+                                                                 OutLayout,
+                                                                 InDataType,
+                                                                 WeiDataType,
+                                                                 OutDataType,
+                                                                 PassThrough,
+                                                                 PassThrough,
+                                                                 PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        K,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        filter_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t flop      = 2 * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C +
+                                    sizeof(WeiDataType) * K * Y * X * C +
+                                    sizeof(OutDataType) * N * Ho * Wo * K;
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        wei.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        K,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        filter_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        PassThrough{},
+                                                        PassThrough{},
+                                                        PassThrough{});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
--- a/client_example/08_fused_attention/CMakeLists.txt
+++ b/client_example/08_fused_attention/CMakeLists.txt
+add_executable(client_fused_attention fused_attention.cpp)
+target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_operations)
--- a/client_example/08_fused_attention/fused_attention.cpp
+++ b/client_example/08_fused_attention/fused_attention.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
+using B0ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp    = ck::tensor_operation::element_wise::PassThrough;
+constexpr static auto MaskingSpec =
+    ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
+using ADataType   = ck::half_t;
+using B0DataType  = ck::half_t;
+using B1DataType  = ck::half_t;
+using CDataType   = ck::half_t;
+using AccDataType = float;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    int G0 = 48;
+    int G1 = 16;
+    int M  = 1024;
+    int N  = 1024;
+    int K  = 64;
+    int O  = 64;
+    // A layout [G0, M, G1, K]
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
+    // B0 layout [G0, N, G1, K]
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
+    // B1 layout [G0, N, G1, O]
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
+    // C layout [G0, M, G1, O]
+    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+    std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
+    SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
+    SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
+    SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2,
+                                                                          1,
+                                                                          1,
+                                                                          1,
+                                                                          1,
+                                                                          ADataType,
+                                                                          B0DataType,
+                                                                          B1DataType,
+                                                                          CDataType,
+                                                                          ck::Tuple<>,
+                                                                          ck::Tuple<>,
+                                                                          AElementOp,
+                                                                          B0ElementOp,
+                                                                          Acc0ElementOp,
+                                                                          B1ElementOp,
+                                                                          CElementOp,
+                                                                          MaskingSpec>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    // profile device op instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b0_device_buf.GetDeviceBuffer(),
+                                                        b1_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        {}, // p_acc0_biases
+                                                        {}, // p_acc1_biases
+                                                        a_gs_ms_ks_lengths,
+                                                        a_gs_ms_ks_strides,
+                                                        b0_gs_ns_ks_lengths,
+                                                        b0_gs_ns_ks_strides,
+                                                        b1_gs_os_ns_lengths,
+                                                        b1_gs_os_ns_strides,
+                                                        c_gs_ms_os_lengths,
+                                                        c_gs_ms_os_strides,
+                                                        {}, // acc0_biases_gs_ms_ns_lengths
+                                                        {}, // acc0_biases_gs_ms_ns_strides
+                                                        {}, // acc1_biases_gs_ms_os_lengths
+                                                        {}, // acc1_biases_gs_ms_os_strides
+                                                        AElementOp{},
+                                                        B0ElementOp{},
+                                                        Acc0ElementOp{1 / sqrtf(K)},
+                                                        B1ElementOp{},
+                                                        CElementOp{});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t flop      = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1;
+            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
+                                     sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
+                                    G0 * G1;
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b0_device_buf.GetDeviceBuffer(),
+                                                        b1_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        {}, // p_acc0_biases
+                                                        {}, // p_acc1_biases
+                                                        a_gs_ms_ks_lengths,
+                                                        a_gs_ms_ks_strides,
+                                                        b0_gs_ns_ks_lengths,
+                                                        b0_gs_ns_ks_strides,
+                                                        b1_gs_os_ns_lengths,
+                                                        b1_gs_os_ns_strides,
+                                                        c_gs_ms_os_lengths,
+                                                        c_gs_ms_os_strides,
+                                                        {}, // acc0_biases_gs_ms_ns_lengths
+                                                        {}, // acc0_biases_gs_ms_ns_strides
+                                                        {}, // acc1_biases_gs_ms_os_lengths
+                                                        {}, // acc1_biases_gs_ms_os_strides
+                                                        AElementOp{},
+                                                        B0ElementOp{},
+                                                        Acc0ElementOp{1 / sqrtf(K)},
+                                                        B1ElementOp{},
+                                                        CElementOp{});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
 ROCmSoftwarePlatform/rocm-recipes
 RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build
-# 1.90+
+danmar/cppcheck@2.9
-danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f
\ No newline at end of file
\ No newline at end of file
--- a/doc/image/ck_component.png
+++ b/doc/image/ck_component.png
--- a/doc/image/ck_layer.png
+++ b/doc/image/ck_layer.png
--- a/example/01_gemm/gemm_dl_fp16.cpp
+++ b/example/01_gemm/gemm_dl_fp16.cpp
@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 using ADataType   = ck::half_t;
 using BDataType   = ck::half_t;

--- a/example/01_gemm/gemm_dl_fp32.cpp
+++ b/example/01_gemm/gemm_dl_fp32.cpp
@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 using ADataType   = float;
 using BDataType   = float;

--- a/example/01_gemm/gemm_dl_int4.cpp
+++ b/example/01_gemm/gemm_dl_int4.cpp
@@ -7,7 +7,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 using ADataType       = ck::int4_t;
 using BDataType       = ck::int4_t;

--- a/example/01_gemm/gemm_dl_int8.cpp
+++ b/example/01_gemm/gemm_dl_int8.cpp
@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
 using ADataType   = int8_t;
 using BDataType   = int8_t;

--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
 using ADataType        = ck::bhalf_t;
 using BDataType        = ck::bhalf_t;