Commit 56de337f authored by Jun Liu

Merge branch 'amd-develop' into amd-master

parents 41b920e2 687d2b7e
* @zjing14 @junliume @illsilin @carlushuang @aosewski
# Documentation files
docs/* @ROCm/rocm-documentation
*.md @ROCm/rocm-documentation
*.rst @ROCm/rocm-documentation
# Header directory for Doxygen documentation
library/include/* @ROCm/rocm-documentation
@@ -2,20 +2,27 @@
Full documentation for Composable Kernel is not yet available.
## CK for ROCm 6.1.0
### Additions
* Added generic instances for GEMM XDL operations (#1161)
* Added gamma and beta parameters for the layernorm and groupnorm bwd operations (#1133)
* Introduced wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126)
* Added an option to vary the number of warm-up cycles and iterations for ckProfiler (#1124)
### Optimizations
* New performance optimizations for GEMM operations on MI200 and MI300 architectures (#1135)
### Fixes
* Reduced the build time for most GPU architectures (#1084)
* Fixed some conversion issues for fp8 data type (#1099)
### Changes
None
### Known issues
None
## CK for ROCm 6.0.0
### Fixes
@@ -32,7 +39,7 @@ None
* Grouped convolution support for small K and C (#822 #879 #897)
* Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
* Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
* Support for Batched GEMM DL (#732)
### Changes
* Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
@@ -48,7 +55,7 @@ None
### Additions
* New CMake flags (an example configure line follows this list):
  * "DL_KERNELS" -- Must be set to "ON" in order to build the GEMM DL and batched_gemm_multi_d_dl instances
  * "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances of only the specified data types
  * "INSTANCES_ONLY" -- Only builds the CK library and instances, without tests, examples, or the profiler
* New feature: if GPU_TARGETS is not set on the CMake command line, CK is built for all targets supported by the compiler
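As an illustration (these exact values are not from the release notes), a configure line combining the new flags might look like:

```bash
cmake -D CMAKE_BUILD_TYPE=Release \
      -D GPU_TARGETS="gfx90a" \
      -D DTYPES="fp16;fp32" \
      -D DL_KERNELS=ON \
      -D INSTANCES_ONLY=ON ..
```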
......
@@ -3,6 +3,7 @@ ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=6.0
ARG compiler_version=""
ARG compiler_commit=""
ARG CK_SCCACHE=""
RUN set -xe
@@ -16,29 +17,32 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN if [ "$ROCMVERSION" != "6.1" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb && \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.1" ] && [ "$compiler_version" = "rc2" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.1-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.1-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.1 rel-48 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1736298; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN amdgpu-install -y --usecase=rocm --no-dkms
## Sccache binary built from source for ROCm; only installed if CK_SCCACHE is defined
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
ENV CK_SCCACHE=$CK_SCCACHE
RUN if [ "$CK_SCCACHE" != "" ]; then \
mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache; \
fi
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@@ -73,6 +77,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# hipTensor requires rocm-llvm-dev for ROCm versions > 6.0.1
RUN if [ "$ROCMVERSION" = "6.1" ]; then \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \
fi
# Update cmake to version 3.27.5
RUN pip install --upgrade cmake==3.27.5
#Install latest ccache
RUN git clone https://github.com/ccache/ccache.git && \
cd ccache && mkdir build && cd build && cmake .. && make install
@@ -82,8 +93,6 @@ RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releas
RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git
#Install latest cppcheck
RUN git clone https://github.com/danmar/cppcheck.git && \
......
def rocmnode(name) {
return '(rocmtest || miopen) && (' + name + ')'
}
def show_node_info() {
@@ -7,6 +7,7 @@ def show_node_info() {
echo "NODE_NAME = \$NODE_NAME"
lsb_release -sd
uname -r
cat /sys/module/amdgpu/version
ls /opt/ -la
"""
}
@@ -33,7 +34,11 @@ def runShell(String command){
def getDockerImageName(){
def img
if (params.USE_CUSTOM_DOCKER != ""){
img = "${params.USE_CUSTOM_DOCKER}"
}
else{
if (params.ROCMVERSION != "6.1"){
if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
}
@@ -61,6 +66,7 @@ def getDockerImageName(){
}
}
}
}
return img
}
@@ -98,7 +104,7 @@ def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm")
def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if(no_cache)
{
dockerArgs = dockerArgs + " --no-cache "
@@ -111,7 +117,9 @@ def getDockerImage(Map conf=[:]){
{
echo "Pulling down image: ${image}"
retimage = docker.image("${image}")
withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) {
retimage.pull()
}
}
catch(Exception ex)
{
@@ -126,7 +134,7 @@ def buildDocker(install_prefix){
checkout scm
def image_name = getDockerImageName()
echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
echo "Build Args: ${dockerArgs}"
try{
@@ -258,18 +266,24 @@ def cmake_build(Map conf=[:]){
""")
sh cmd3
}
// reduce parallelism when compiling, clang uses too much memory
def nt = nthreads()
def cmd
def execute_cmd = conf.get("execute_cmd", "")
if(!setup_args.contains("NO_CK_BUILD")){
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}")
cmd = conf.get("cmd", """
${setup_cmd}
${build_cmd}
${execute_cmd}
""")
}
else{
cmd = conf.get("cmd", """
${execute_cmd}
""")
}
echo cmd
@@ -297,7 +311,7 @@ def buildHipClangJob(Map conf=[:]){
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
}
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
@@ -353,9 +367,6 @@ def runCKProfiler(Map conf=[:]){
dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
}
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def variant = env.STAGE_NAME
def retimage
@@ -365,8 +376,8 @@
(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'rocminfo | tee rocminfo.log'
if ( !runShell('grep -n "gfx" rocminfo.log') ){
throw new Exception ("GPU not found")
}
else{
@@ -379,20 +390,6 @@
echo "The job was cancelled or aborted"
throw e
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS')
{
@@ -408,7 +405,7 @@
dir("script"){
if (params.RUN_FULL_QA){
sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts "perf_gemm.log"
archiveArtifacts "perf_resnet50_N256.log"
archiveArtifacts "perf_resnet50_N4.log"
@@ -418,9 +415,9 @@
archiveArtifacts "perf_conv_bwd_data.log"
archiveArtifacts "perf_gemm_bilinear.log"
archiveArtifacts "perf_reduction.log"
archiveArtifacts "perf_splitK_gemm.log"
archiveArtifacts "perf_onnx_gemm.log"
archiveArtifacts "perf_mixed_gemm.log"
// stash perf files to master
stash name: "perf_gemm.log"
stash name: "perf_resnet50_N256.log"
@@ -433,6 +430,7 @@
stash name: "perf_reduction.log"
stash name: "perf_splitK_gemm.log"
stash name: "perf_onnx_gemm.log"
stash name: "perf_mixed_gemm.log"
//we will process results on the master node
}
else{
@@ -473,6 +471,7 @@ def Build_CK(Map conf=[:]){
show_node_info()
env.HSA_ENABLE_SDMA=0
env.DOCKER_BUILDKIT=1
checkout scm
def image = getDockerImageName()
@@ -487,26 +486,25 @@
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
try {
(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'rocminfo | tee rocminfo.log'
if ( !runShell('grep -n "gfx" rocminfo.log') ){
throw new Exception ("GPU not found")
}
else{
echo "GPU is OK"
}
}
}
}
@@ -514,43 +512,38 @@
echo "The job was cancelled or aborted"
throw e
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS')
{
//check whether running on a Navi or MI300 node
def navi_node = 0
def mi300_node = 0
sh 'rocminfo | tee rocminfo.log'
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') ){
navi_node = 1
echo "This is a Navi node"
}
if ( runShell('grep -n "gfx942" rocminfo.log') ){
mi300_node = 1
echo "This is an MI300 node"
}
cmake_build(conf)
dir("build"){
//run tests and examples
sh 'make -j check'
if (params.RUN_PERFORMANCE_TESTS && navi_node == 0 && mi300_node == 0 ){
//we only need the ckProfiler to run the performance tests, so we pack and stash it
//do not stash profiler on Navi or MI300 nodes
sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
stash name: "ckProfiler.tar.gz"
}
if (params.RUN_FULL_QA && mi300_node == 0 ){
// build deb packages for all MI100/200/300 targets and prepare to export
sh 'make -j package'
archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
stash name: "ckprofiler_0.2.0_amd64.deb"
}
}
if (params.hipTensor_test && navi_node == 0 ){
@@ -610,7 +603,7 @@ def process_results(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
try {
(retimage, image) = getDockerImage(conf)
}
@@ -637,6 +630,7 @@ def process_results(Map conf=[:]){
unstash "perf_reduction.log"
unstash "perf_splitK_gemm.log"
unstash "perf_onnx_gemm.log"
unstash "perf_mixed_gemm.log"
sh "./process_qa_data.sh"
unstash "ckprofiler_0.2.0_amd64.deb"
sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
@@ -678,6 +672,10 @@ pipeline {
name: "BUILD_DOCKER",
defaultValue: false,
description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
string(
name: 'USE_CUSTOM_DOCKER',
defaultValue: '',
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string(
name: 'ROCMVERSION',
defaultValue: '6.0',
@@ -720,8 +718,12 @@ pipeline {
description: "Run the cppcheck static analysis (default: OFF)")
booleanParam(
name: "RUN_PERFORMANCE_TESTS",
defaultValue: true,
description: "Run the performance tests (default: ON)")
booleanParam(
name: "RUN_CODEGEN_TESTS",
defaultValue: true,
description: "Run the codegen tests (default: ON)")
}
environment{
dbuser = "${dbuser}"
@@ -800,7 +802,34 @@ pipeline {
}
}
}
stage("Run Codegen Tests")
{
parallel
{
stage("Run Codegen Tests on MI100/MI200")
{
when {
beforeAgent true
expression { params.RUN_CODEGEN_TESTS.toBoolean() }
}
options { retry(2) }
agent{ label rocmnode("gfx908 || gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ cd ../codegen && rm -rf build && mkdir build && cd build && \
cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j check"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Build CK and run Tests") stage("Build CK and run Tests")
{ {
parallel parallel
...@@ -828,6 +857,26 @@ pipeline { ...@@ -828,6 +857,26 @@ pipeline {
cleanWs() cleanWs()
} }
} }
stage("Build CK and run Tests on MI300")
{
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx942" -DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-DGPU_TARGETS="gfx942" \
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
}
}
stage("Build CK and run Tests on MI100/MI200") stage("Build CK and run Tests on MI100/MI200")
{ {
when { when {
......
@@ -7,6 +7,9 @@ endif()
if((DTYPES MATCHES "fp8") OR NOT DEFINED DTYPES)
add_executable(client_conv3d_fwd_fp16_comp_fp8 conv3d_fwd_fp16_comp_fp8.cpp)
target_link_libraries(client_conv3d_fwd_fp16_comp_fp8 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_conv3d_fwd_fp8 conv3d_fwd_fp8.cpp)
target_link_libraries(client_conv3d_fwd_fp8 PRIVATE composable_kernel::device_conv_operations)
endif()
if((DTYPES MATCHES "fp32") OR NOT DEFINED DTYPES)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
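// Example client: 3-D grouped convolution forward with fp8 input/weight/output tensors
// in NDHWGC/GKZYXC/NDHWGK layouts.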
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using OutDataType = ck::f8_t;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 64;
static constexpr ck::index_t K = 128;
static constexpr ck::index_t C = 64;
static constexpr ck::index_t Z = 3;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Di = 28;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
int main()
{
return run_grouped_conv_fwd<NumDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout,
3,
ck::f8_t>(
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
? EXIT_SUCCESS
: EXIT_FAILURE;
}
@@ -38,3 +38,11 @@ target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE c
add_executable(client_grouped_convnd_bwd_data_bilinear_residual_fp16
grouped_convnd_bwd_data_bilinear/grouped_conv_bwd_data_bilinear_residual_fp16.cpp)
target_link_libraries(client_grouped_convnd_bwd_data_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
# Fwd scale
add_executable(client_grouped_convnd_fwd_scale_fp16
grouped_convnd_fwd_scale/grouped_conv_fwd_scale_fp16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scale_fp16 PRIVATE composable_kernel::device_conv_operations)
# Bwd data scale
add_executable(client_grouped_convnd_bwd_data_scale_fp16
grouped_convnd_bwd_data_scale/grouped_conv_bwd_data_scale_fp16.cpp)
target_link_libraries(client_grouped_convnd_bwd_data_scale_fp16 PRIVATE composable_kernel::device_conv_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_scale.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t>;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 64; // batch size
static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Z = 3; // filter D
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
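// Minimal device-memory holder for these examples; hipMalloc/hipFree return codes are
// deliberately ignored to keep the sample short.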
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int execute_conv_bwd_data_scale()
{
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, C, Di, Hi, Wi};
std::array<ck::index_t, NumDimSpatial + 3> in_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, C, Z, Y, X};
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, NumDimSpatial + 3> out_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
SimpleDeviceMem in(sizeof(InDataType) * G * N * Di * Hi * Wi * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Do * Ho * Wo * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<>,
InLayout,
OutDataType,
WeiDataType,
ck::Tuple<>,
InDataType,
PassThrough,
PassThrough,
Scale>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
in_lengths,
in_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Scale{2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
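// Rough arithmetic count (implicit-GEMM view of the convolution) and byte traffic;
// these only feed the TFlops/GB/s numbers used to rank the instances below.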
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Y * X +
3 * G * N * Di * Hi * Wi * C;
std::size_t num_bytes = 2 * sizeof(InDataType) * G * N * Di * Hi * Wi * C +
sizeof(WeiDataType) * G * K * Z * Y * X * C +
sizeof(OutDataType) * G * N * Do * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best instance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
in_lengths,
in_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Scale{2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
int main() { return execute_conv_bwd_data_scale(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t>;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale = ck::tensor_operation::element_wise::Scale;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 64; // batch size
static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Z = 3; // filter D
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int execute_conv_fwd_scale()
{
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
// Hence, we need to adjust the order of strides.
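// For example, with NDHWGC input the G stride is C and the N stride is Di * Hi * Wi * G * C,
// exactly as written out below.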
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
std::array<ck::index_t, 6> in_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
std::array<ck::index_t, 6> wei_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, 6> out_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
PassThrough,
PassThrough,
Scale>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Scale{2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop =
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 3 * N * Ho * Wo * G * K;
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(WeiDataType) * G * K * Y * X * C +
sizeof(OutDataType) * 2 * N * Ho * Wo * G * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best instance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Scale{2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
int main() { return execute_conv_fwd_scale(); }
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
if(WIN32)
set(EMBED_USE RC CACHE STRING "Use RC or CArrays to embed data files")
set_property(CACHE EMBED_USE PROPERTY STRINGS "RC;CArrays")
else()
if(BUILD_SHARED_LIBS)
set(EMBED_USE LD CACHE STRING "Use LD or CArrays to embed data files")
else()
set(EMBED_USE CArrays CACHE STRING "Use LD or CArrays to embed data files")
endif()
set_property(CACHE EMBED_USE PROPERTY STRINGS "LD;CArrays")
endif()
if(EMBED_USE STREQUAL "LD")
find_program(EMBED_LD ld REQUIRED)
find_program(EMBED_OBJCOPY objcopy REQUIRED)
endif()
function(embed_wrap_string)
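# Wraps the string stored in VARIABLE into newline-separated chunks of at most AT_COLUMN characters.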
set(options)
set(oneValueArgs VARIABLE AT_COLUMN)
set(multiValueArgs)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
string(LENGTH ${${PARSE_VARIABLE}} string_length)
math(EXPR offset "0")
while(string_length GREATER 0)
if(string_length GREATER ${PARSE_AT_COLUMN})
math(EXPR length "${PARSE_AT_COLUMN}")
else()
math(EXPR length "${string_length}")
endif()
string(SUBSTRING ${${PARSE_VARIABLE}} ${offset} ${length} line)
set(lines "${lines}\n${line}")
math(EXPR string_length "${string_length} - ${length}")
math(EXPR offset "${offset} + ${length}")
endwhile()
set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
endfunction()
function(generate_embed_source EMBED_NAME EMBED_DIR BASE_DIRECTORY)
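# Generates ${EMBED_NAME}.cpp/.hpp defining a function ${EMBED_NAME}() that maps each file's
# path (relative to BASE_DIRECTORY) to its embedded contents, for the RC, LD, or CArrays backend.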
set(options)
set(oneValueArgs)
set(multiValueArgs SYMBOLS FILES)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(RESOURCE_ID 100)
list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
list(LENGTH PARSE_FILES FILES_LEN)
if(NOT ${SYMBOLS_LEN} EQUAL ${FILES_LEN})
message(FATAL_ERROR "Symbols and files don't match: ${SYMBOLS_LEN} != ${FILES_LEN}")
endif()
math(EXPR LEN "${SYMBOLS_LEN} - 1")
foreach(idx RANGE ${LEN})
list(GET PARSE_SYMBOLS ${idx} SYMBOL)
list(GET PARSE_FILES ${idx} FILE)
file(RELATIVE_PATH BASE_NAME "${BASE_DIRECTORY}" ${FILE})
if(EMBED_USE STREQUAL "RC")
string(TOUPPER "${SYMBOL}" SYMBOL)
string(APPEND FILE_IDS "#define IDR_${SYMBOL} ${RESOURCE_ID}\n")
file(TO_NATIVE_PATH "${FILE}" NATIVE_FILE)
string(REPLACE "\\" "\\\\" NATIVE_FILE "${NATIVE_FILE}")
string(APPEND RC_FILE_MAPPING "IDR_${SYMBOL} TEXTFILE \"${NATIVE_FILE}\"\n")
string(APPEND INIT_KERNELS "\n {\"${BASE_NAME}\", resource::read(IDR_${SYMBOL})},")
math(EXPR RESOURCE_ID "${RESOURCE_ID} + 1" OUTPUT_FORMAT DECIMAL)
else()
set(START_SYMBOL "_binary_${SYMBOL}_start")
set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
if(EMBED_USE STREQUAL "LD")
string(APPEND EXTERNS "
extern const char ${START_SYMBOL}[];
extern const size_t _binary_${SYMBOL}_size;
const auto ${LENGTH_SYMBOL} = reinterpret_cast<size_t>(&_binary_${SYMBOL}_size);
")
else()
string(APPEND EXTERNS "
extern const char ${START_SYMBOL}[];
extern const size_t ${LENGTH_SYMBOL};
")
endif()
string(APPEND INIT_KERNELS "
{ \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
endif()
endforeach()
if(EMBED_USE STREQUAL "RC")
file(WRITE "${EMBED_DIR}/include/resource.h" "
#define TEXTFILE 256
${FILE_IDS}
")
file(WRITE "${EMBED_DIR}/resource.rc" "
#include \"resource.h\"
${RC_FILE_MAPPING}
")
set(EXTERNS "
#include <Windows.h>
#include \"resource.h\"
namespace resource {
std::string_view read(int id)
{
HMODULE handle = GetModuleHandle(nullptr);
HRSRC rc = FindResource(handle, MAKEINTRESOURCE(id), MAKEINTRESOURCE(TEXTFILE));
HGLOBAL data = LoadResource(handle, rc);
return {static_cast<const char*>(LockResource(data)), SizeofResource(handle, rc)};
}
}
")
set(EMBED_FILES ${EMBED_DIR}/include/resource.h ${EMBED_DIR}/resource.rc)
endif()
file(WRITE "${EMBED_DIR}/include/${EMBED_NAME}.hpp" "
#include <string_view>
#include <unordered_map>
#include <utility>
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}();
")
file(WRITE "${EMBED_DIR}/${EMBED_NAME}.cpp" "
#include <${EMBED_NAME}.hpp>
${EXTERNS}
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}()
{
static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}
};
return result;
}
")
list(APPEND EMBED_FILES ${EMBED_DIR}/${EMBED_NAME}.cpp ${EMBED_DIR}/include/${EMBED_NAME}.hpp)
set(EMBED_FILES ${EMBED_FILES} PARENT_SCOPE)
endfunction()
function(embed_file FILE BASE_DIRECTORY)
message(STATUS " ${FILE}")
file(RELATIVE_PATH REL_FILE "${BASE_DIRECTORY}" ${FILE})
string(MAKE_C_IDENTIFIER "${REL_FILE}" OUTPUT_SYMBOL)
get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
if(EMBED_USE STREQUAL "LD")
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
add_custom_command(
OUTPUT "${OUTPUT_FILE}"
COMMAND ${EMBED_LD} -r -o "${OUTPUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"
COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUTPUT_FILE}"
WORKING_DIRECTORY "${BASE_DIRECTORY}"
DEPENDS "${FILE}"
VERBATIM)
set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
elseif(EMBED_USE STREQUAL "CArrays")
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
# reads source file contents as hex string
file(READ ${FILE} HEX_STRING HEX)
# wraps the hex string into multiple lines
embed_wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
# adds '0x' prefix and comma suffix before and after every byte respectively
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
# removes trailing comma
string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
file(WRITE "${OUTPUT_FILE}" "
#include <cstddef>
extern const char _binary_${OUTPUT_SYMBOL}_start[] = { ${ARRAY_VALUES} };
extern const size_t _binary_${OUTPUT_SYMBOL}_length = sizeof(_binary_${OUTPUT_SYMBOL}_start);
")
set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
endif()
set(OUTPUT_SYMBOL ${OUTPUT_SYMBOL} PARENT_SCOPE)
endfunction()
function(add_embed_library EMBED_NAME)
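# Usage: add_embed_library(<name> <files...> RELATIVE <base_dir>) creates an INTERFACE target <name>;
# consumers link against it and call the generated <name>() function to get the embedded file map.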
set(options)
set(oneValueArgs RELATIVE)
set(multiValueArgs)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
file(MAKE_DIRECTORY ${EMBED_DIR})
message(STATUS "Embedding kernel files:")
foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
embed_file(${FILE} ${PARSE_RELATIVE})
list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
endforeach()
message(STATUS "Generating embedding library '${EMBED_NAME}'")
generate_embed_source(${EMBED_NAME} ${EMBED_DIR} "${PARSE_RELATIVE}" SYMBOLS ${SYMBOLS} FILES ${PARSE_UNPARSED_ARGUMENTS})
set(INTERNAL_EMBED_LIB embed_lib_${EMBED_NAME})
if(EMBED_USE STREQUAL "LD")
add_library(${INTERNAL_EMBED_LIB} STATIC ${EMBED_FILES} ${OUTPUT_FILES})
else()
add_library(${INTERNAL_EMBED_LIB} OBJECT ${EMBED_FILES})
endif()
if(EMBED_USE STREQUAL "CArrays")
target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
endif()
target_include_directories(${INTERNAL_EMBED_LIB} PRIVATE "${EMBED_DIR}/include")
target_compile_options(${INTERNAL_EMBED_LIB} PRIVATE -Wno-reserved-identifier -Wno-extern-initializer -Wno-missing-variable-declarations)
set_target_properties(${INTERNAL_EMBED_LIB} PROPERTIES POSITION_INDEPENDENT_CODE On)
add_library(${EMBED_NAME} INTERFACE)
if(EMBED_USE STREQUAL "RC")
target_link_libraries(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
elseif(EMBED_USE STREQUAL "LD")
target_link_libraries(${EMBED_NAME} INTERFACE ${INTERNAL_EMBED_LIB})
else()
target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
endif()
target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
endfunction()
cmake_minimum_required(VERSION 3.16)
project(composable_kernel_host)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
find_package(ROCM)
include(ROCMInstallTargets)
include(ROCMTest)
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${CK_ROOT}/include/ck/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
add_definitions(-std=c++17)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers)
set_target_properties(ck_host PROPERTIES
LINKER_LANGUAGE CXX
POSITION_INDEPENDENT_CODE ON)
target_include_directories(ck_host PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)
add_executable(ck-template-driver driver/main.cpp)
target_link_libraries(ck-template-driver ck_host)
rocm_install(
TARGETS ck_host ck_headers
EXPORT ck_hostTargets
)
rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(BUILD_TESTING)
add_subdirectory(test)
endif()
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/stringutils.hpp"
using ck::host::Transform;
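// Registry that maps a template name to a generator rendering each group of operation
// configurations as a std::tuple<...> template-argument string.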
struct Emitters
{
std::unordered_map<std::string, std::function<std::vector<std::string>()>> m;
template <class T>
void Register(const std::string& name)
{
m[name] = [] {
auto configs = T::CreateOperations();
return Transform(configs, [](const auto& ops) { return ToTuple(ops); });
};
}
template <class T>
static std::string ToTuple(const T& ops)
{
auto templates = Transform(
ops, [](const auto& op) { return " " + op.ToSolution().ToTemplateString(); });
return "std::tuple<\n" + ck::host::JoinStrings(templates, ",\n") + ">";
}
std::string Emit(const std::string& name) { return ck::host::JoinStrings(m.at(name)(), "\n"); }
std::vector<std::string> List() const
{
return Transform(m, [](auto&& p) { return p.first; });
}
};
int main(int argc, const char* argv[])
{
std::string prog = argv[0];
std::vector<std::string> args(argv + 1, argv + argc);
Emitters e;
e.Register<ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle>(
"DeviceGemmMultipleD_Xdl_CShuffle");
if(args.empty() or std::any_of(args.begin(), args.end(), [](const std::string& arg) {
return arg == "-h" or arg == "--help";
}))
{
std::cout << "USAGE:" << std::endl;
std::cout << " " << prog << " [TEMPLATE]" << std::endl;
std::cout << std::endl;
std::cout << "FLAGS:" << std::endl;
std::cout << " -h, --help Show help" << std::endl;
std::cout << std::endl;
std::cout << "TEMPLATES:" << std::endl;
for(const auto& x : e.List())
std::cout << " " << x << std::endl;
std::cout << std::endl;
return 0;
}
for(const auto& name : args)
std::cout << e.Emit(name) << std::endl;
return 0;
}
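// Example session (a sketch; output abridged, exact instance text comes from
// ToTemplateString()):
//   $ ck-template-driver DeviceGemmMultipleD_Xdl_CShuffle
//   std::tuple<
//     ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<...>,
//     ...>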
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include <sstream>
#include <iterator>
#include <numeric>
#include "ck/host/types.hpp"
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
struct Problem
{
std::size_t M = 0;
std::size_t N = 0;
std::size_t K = 0;
bool TransA = false;
bool TransB = false;
bool TransE = false;
std::vector<bool> DsTrans = {};
DataType ADataType = DataType::Half;
DataType BDataType = DataType::Half;
DataType EDataType = DataType::Half;
std::vector<DataType> DsDataType = {};
std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
std::string CDEElementOp = "ck::Tuple<>";
std::string GetIncludeHeader() const;
std::vector<Solution> GetSolutions(const std::string& arch) const;
};
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
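// A minimal usage sketch (values are hypothetical; the arch string is assumed
// to name an XDL-capable target, e.g. "gfx90a" — see get_xdlop_archs() below):
//
//   ck::host::device_gemm_multiple_d::Problem prob;
//   prob.M = 1024; prob.N = 1024; prob.K = 64;
//   prob.TransB = true; // column-major B
//   for(const auto& sol : prob.GetSolutions("gfx90a"))
//       std::cout << sol.ToTemplateString() << "\n";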
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <string>
#include "ck/host/types.hpp"
#include "ck/host/operation/gemm.hpp"
#include "ck/host/device_gemm_multiple_d/problem.hpp"
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
struct Operation_Xdl_CShuffle
{
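// CreateOperations() enumerates the built-in tuning space; the driver renders
// each inner vector as one std::tuple of instances (see main.cpp above).
// CreateOperations(prob) presumably narrows that space to a single problem.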
static std::vector<std::vector<Operation_Xdl_CShuffle>> CreateOperations();
static std::vector<Operation_Xdl_CShuffle> CreateOperations(const Problem& prob);
TensorDesc A{};
TensorDesc B{};
DataType acc = DataType::Float;
DataType cs_type = DataType::Half;
std::vector<TensorDesc> Ds = {};
TensorDesc E{};
std::string a_elem_op = PassThrough;
std::string b_elem_op = PassThrough;
std::string cde_elem_op = Bilinear;
std::string gemm_specialization = "ck::tensor_operation::device::GemmSpecialization::Default";
operation::TileDesc tile_desc{};
operation::BlockTransferDesc a_block_transfer{};
operation::BlockTransferDesc b_block_transfer{};
operation::CShuffleDesc cshuffle{};
operation::CBlockTransferDesc c_block_transfer{};
Solution ToSolution() const;
};
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <string>
#include "ck/host/types.hpp"
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
struct Problem
{
std::size_t M = 0;
std::size_t N = 0;
std::size_t K = 0;
bool TransA = false;
bool TransB = false;
bool TransE = false;
std::vector<bool> DsTrans = {};
DataType ADataType = DataType::Half;
DataType BDataType = DataType::Half;
DataType EDataType = DataType::Half;
std::vector<DataType> DsDataType = {};
std::string AElementOp = PassThrough;
std::string BElementOp = PassThrough;
std::string CDEElementOp = PassThrough;
std::string GetIncludeHeader() const;
std::vector<Solution> GetSolutions(const std::string& arch) const;
};
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <string_view>
#include <utility>
#include <unordered_map>
#include <vector>
namespace ck {
namespace host {
std::unordered_map<std::string_view, std::string_view> GetHeaders();
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
namespace ck {
namespace host {
namespace operation {
struct TileDesc
{
int block_size = 0;
int m_per_block = 0;
int n_per_block = 0;
int k_per_block = 0;
int ak1 = 0;
int bk1 = 0;
int m_per_XDL = 0;
int n_per_XDL = 0;
int m_Xdl_per_wave = 0;
int n_Xdl_per_wave = 0;
int num_gemmk_prefetch_stage = 0;
};
struct BlockTransferDesc
{
std::string thread_cluster_length = "";
std::string thread_cluster_arrange_order = "";
std::string src_access_order = "";
int src_vec_dim = 0;
int src_scalar_per_vector = 0;
int dst_scalar_per_vector_k1 = 0;
int lds_add_extra_dim = 0;
};
struct CShuffleDesc
{
int m_Xdl_per_wave_per_shuffle = 0;
int n_Xdl_per_wave_per_shuffle = 0;
};
struct CBlockTransferDesc
{
std::string cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl = "";
int scalar_per_vector_n_wave_n_per_Xdl = 0;
};
} // namespace operation
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cassert>
#include <cctype>
#include <iterator>
#include <numeric>
#include <stdexcept>
#include <string>
#include <utility>
#include <unordered_map>
#include <vector>
namespace ck {
namespace host {
template <class F>
std::string trim(const std::string& s, F f)
{
auto start = std::find_if_not(s.begin(), s.end(), f);
auto last = std::find_if_not(s.rbegin(), std::string::const_reverse_iterator(start), f).base();
return {start, last};
}
inline std::string trim(const std::string& s)
{
return trim(s, [](unsigned char c) { return std::isspace(c); });
}
template <class Strings>
inline std::string JoinStrings(Strings strings, const std::string& delim)
{
auto it = strings.begin();
if(it == strings.end())
return "";
auto nit = std::next(it);
return std::accumulate(nit, strings.end(), *it, [&](std::string x, std::string y) {
return std::move(x) + delim + std::move(y);
});
}
template <class F>
inline std::string
InterpolateString(const std::string& input, F f, std::string start = "${", std::string end = "}")
{
std::string result = "";
result.reserve(input.size());
auto it = input.begin();
while(it != input.end())
{
auto next_start = std::search(it, input.end(), start.begin(), start.end());
auto next_end = std::search(next_start, input.end(), end.begin(), end.end());
result.append(it, next_start);
if(next_start == input.end())
break;
if(next_end == input.end())
{
throw std::runtime_error("Unbalanced brackets");
}
auto r = f(next_start + start.size(), next_end);
result.append(r.begin(), r.end());
it = next_end + end.size();
}
return result;
}
inline std::string InterpolateString(const std::string& input,
const std::unordered_map<std::string, std::string>& vars,
std::string start = "${",
std::string end = "}")
{
return InterpolateString(
input,
[&](auto start_it, auto last_it) {
auto key = trim({start_it, last_it});
auto it = vars.find(key);
if(it == vars.end())
throw std::runtime_error("Unknown key: " + key);
return it->second;
},
std::move(start),
std::move(end));
}
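// Sketch: InterpolateString("Hi ${who}!", {{"who", "CK"}}) yields "Hi CK!".
// Keys are trimmed of surrounding whitespace; an unknown key or an
// unterminated "${" throws std::runtime_error.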
template <class Range, class F>
inline auto Transform(const Range& r, F f) -> std::vector<decltype(f(*r.begin()))>
{
std::vector<decltype(f(*r.begin()))> result;
std::transform(r.begin(), r.end(), std::back_inserter(result), f);
return result;
}
template <class Range1, class Range2, class F>
inline auto Transform(const Range1& r1, const Range2& r2, F f)
-> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
{
std::vector<decltype(f(*r1.begin(), *r2.begin()))> result;
assert(std::distance(r1.begin(), r1.end()) == std::distance(r2.begin(), r2.end()));
std::transform(r1.begin(), r1.end(), r2.begin(), std::back_inserter(result), f);
return result;
}
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <sstream>
#include <utility>
#include <unordered_map>
#include <vector>
namespace ck {
namespace host {
struct Solution
{
Solution() = default;
Solution(std::string str, std::unordered_map<std::string, std::string> values);
std::string ToTemplateString() const;
std::string GetTemplateParameter(const std::string& name) const;
template <class T>
T GetTemplateParameter(const std::string& name) const
{
T result;
std::stringstream ss(GetTemplateParameter(name));
ss >> result;
return result;
}
private:
std::string template_str;
std::unordered_map<std::string, std::string> template_values;
};
enum class DataType
{
Half,
Float,
Int8,
Int32
};
std::string ToString(DataType dt);
enum class Layout
{
Row,
Column
};
std::string ToString(Layout dl);
enum class GemmType
{
Default
};
std::string ToString(GemmType gt);
struct TensorDesc
{
DataType element;
Layout layout;
};
std::string SequenceStr(const std::vector<int>& v);
std::string MakeTuple(const std::vector<std::string>& v);
template <int... xs>
const std::string S = SequenceStr({xs...});
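// e.g. S<1, 4, 8> is the string form of the integer sequence {1, 4, 8};
// the exact spelling comes from SequenceStr, declared above.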
constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough";
constexpr const char* Bilinear = "ck::tensor_operation::element_wise::Bilinear";
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstddef>
#include <string>
#include <unordered_set>
namespace ck {
namespace host {
std::size_t integer_divide_ceil(std::size_t x, std::size_t y);
const std::unordered_set<std::string>& get_xdlop_archs();
} // namespace host
} // namespace ck