Commit 09d4c3a4 authored by illsilin

merge from public repo

parents 171ed358 8e4c3fb1
......@@ -26,11 +26,15 @@ set(version 1.1.0)
project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
include(CTest)
# Usage: for a custom Python location, pass cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
# CK Codegen requires dataclasses, which were added in Python 3.7
# Python 3.8 is required as a matter of general good practice, since it is the default for Ubuntu 20.04
if(NOT CK_USE_ALTERNATIVE_PYTHON)
find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
find_package(Python3 3.8 COMPONENTS Interpreter REQUIRED)
else()
message("Using alternative python version")
set(EXTRA_PYTHON_PATH)
# This is overly restrictive; we may need to make the following replacement more flexible
string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}")
message("alternative python path is: ${EXTRA_PYTHON_PATH}")
find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
......
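For reference, a minimal configure sketch covering both Python modes described above (compiler, paths, and targets are illustrative, not taken from this commit):

```bash
# default: let CMake locate the system Python 3.8 interpreter
cmake -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D GPU_TARGETS=gfx90a ..

# legacy-OS images: point CK at an alternative Python build
cmake -D CK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8" \
      -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D GPU_TARGETS=gfx90a ..
```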
......@@ -23,6 +23,11 @@ RUN if [ "$ROCMVERSION" != "6.3" ]; then \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.3" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.3.0.1-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.3.0.1-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.3.0.1 rel-5 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=2033700; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
......@@ -130,6 +135,8 @@ ENV compiler_commit=$compiler_commit
RUN sh -c "echo compiler version = '$compiler_version'"
RUN sh -c "echo compiler commit = '$compiler_commit'"
ARG DISABLE_CACHE=0
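# Passing a changing value for DISABLE_CACHE (e.g. the current branch HEAD) invalidates the cached layers below so the compiler is re-cloned.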
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
cd llvm-project && mkdir build && cd build && \
......
......@@ -94,13 +94,21 @@ def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm")
def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
if(no_cache)
{
dockerArgs = dockerArgs + " --no-cache "
}
echo "Docker Args: ${dockerArgs}"
def image = getDockerImageName()
def image
if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){
image = conf.get("docker_name", "")
echo "Using legacy docker: ${image}"
}
else{
image = getDockerImageName()
echo "Using default docker: ${image}"
}
//Check if image exists
def retimage
try
......@@ -124,8 +132,10 @@ def buildDocker(install_prefix){
checkout scm
def image_name = getDockerImageName()
echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerArgs = dockerArgs + " --no-cache "
}
echo "Build Args: ${dockerArgs}"
try{
if(params.BUILD_DOCKER){
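Roughly, the docker build these args feed looks like the following when run by hand (tag, Dockerfile path, and values are placeholders):

```bash
DOCKER_BUILDKIT=1 docker build -t ck-ub20.04:latest \
    --build-arg BUILDKIT_INLINE_CACHE=1 \
    --build-arg PREFIX=/opt/rocm \
    --build-arg CK_SCCACHE="$CK_SCCACHE" \
    --build-arg compiler_version=amd-staging \
    --build-arg compiler_commit= \
    --build-arg ROCMVERSION=6.3 \
    --no-cache .
```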
......@@ -259,6 +269,7 @@ def cmake_build(Map conf=[:]){
""")
sh cmd3
}
// Reduce parallelism when compiling; clang uses too much memory
def nt = nthreads()
def cmd
......@@ -273,7 +284,7 @@ def cmake_build(Map conf=[:]){
}
else{
setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}")
build_cmd = conf.get("build_cmd", "${build_envs} make -j${nt} ${config_targets}")
}
cmd = conf.get("cmd", """
${setup_cmd}
......@@ -292,8 +303,8 @@ def cmake_build(Map conf=[:]){
dir("build"){
//build CK
sh cmd
//run tests
if(!setup_args.contains("NO_CK_BUILD")){
// run tests except when NO_CK_BUILD or BUILD_LEGACY_OS is set
if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){
sh "/ninjatracing/ninjatracing .ninja_log > ck_build_trace.json"
archiveArtifacts "ck_build_trace.json"
......@@ -330,7 +341,15 @@ def buildHipClangJob(Map conf=[:]){
env.HSA_ENABLE_SDMA=0
checkout scm
def image = getDockerImageName()
def image
if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){
image = conf.get("docker_name", "")
echo "Using legacy docker: ${image}"
}
else{
image = getDockerImageName()
echo "Using default docker: ${image}"
}
def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group
......@@ -352,7 +371,7 @@ def buildHipClangJob(Map conf=[:]){
def retimage
(retimage, image) = getDockerImage(conf)
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 48, unit: 'HOURS')
{
......@@ -407,7 +426,7 @@ def runCKProfiler(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
try {
(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
......@@ -512,7 +531,16 @@ def Build_CK(Map conf=[:]){
env.DOCKER_BUILDKIT=1
checkout scm
def image = getDockerImageName()
def image
if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){
image = conf.get("docker_name", "")
echo "Using legacy docker: ${image}"
}
else{
image = getDockerImageName()
echo "Using default docker: ${image}"
}
def prefixpath = conf.get("prefixpath", "/opt/rocm")
// Jenkins is complaining about the render group
......@@ -524,6 +552,9 @@ def Build_CK(Map conf=[:]){
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
if(params.BUILD_LEGACY_OS){
dockerOpts = dockerOpts + " --env LD_LIBRARY_PATH='/opt/Python-3.8.13/lib' "
}
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
......@@ -532,7 +563,7 @@ def Build_CK(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
try {
(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
......@@ -637,7 +668,7 @@ def process_results(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
try {
(retimage, image) = getDockerImage(conf)
}
......@@ -765,9 +796,13 @@ pipeline {
defaultValue: false,
description: "Run the grouped conv large cases tests (default: OFF)")
booleanParam(
name: "RUN_CK_TILE_TESTS",
name: "RUN_CK_TILE_FMHA_TESTS",
defaultValue: false,
description: "Run the ck_tile tests (default: OFF)")
description: "Run the ck_tile FMHA tests (default: OFF)")
booleanParam(
name: "RUN_CK_TILE_GEMM_TESTS",
defaultValue: false,
description: "Run the ck_tile GEMM tests (default: OFF)")
booleanParam(
name: "BUILD_INSTANCES_ONLY",
defaultValue: false,
......@@ -780,6 +815,10 @@ pipeline {
name: "NINJA_BUILD_TRACE",
defaultValue: false,
description: "Generate a ninja build trace (default: OFF)")
booleanParam(
name: "BUILD_LEGACY_OS",
defaultValue: false,
description: "Try building CK with legacy OS dockers: RHEL8 and SLES15 (default: OFF)")
}
environment{
dbuser = "${dbuser}"
......@@ -788,7 +827,7 @@ pipeline {
dbsshport = "${dbsshport}"
dbsshuser = "${dbsshuser}"
dbsshpassword = "${dbsshpassword}"
status_wrapper_creds = "${status_wrapper_creds}"
ck_git_creds = "${ck_git_creds}"
gerrit_cred="${gerrit_cred}"
DOCKER_BUILDKIT = "1"
}
......@@ -884,15 +923,15 @@ pipeline {
}
}
}
stage("Run CK_TILE Tests")
stage("Run CK_TILE_FMHA Tests")
{
parallel
{
stage("Run CK_TILE Tests on gfx90a")
stage("Run CK_TILE_FMHA Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_TESTS.toBoolean() }
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
......@@ -907,11 +946,11 @@ pipeline {
cleanWs()
}
}
stage("Run CK_TILE Tests on gfx942")
stage("Run CK_TILE_FMHA Tests on gfx942")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_TESTS.toBoolean() }
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
......@@ -928,21 +967,103 @@ pipeline {
}
}
}
stage("Run CK_TILE_GEMM Tests")
{
parallel
{
stage("Run CK_TILE_GEMM Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
stage("Run CK_TILE_GEMM Tests on gfx942")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Build CK and run Tests")
{
parallel
{
stage("Build CK with RHEL8")
{
when {
beforeAgent true
expression { params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_rhel8_rocm6.3"
setup_args = """ -DGPU_TARGETS="gfx942" \
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args = " "
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
cleanWs()
}
}
stage("Build CK with SLES15")
{
when {
beforeAgent true
expression { params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_sles15_rocm6.3"
setup_args = """ -DGPU_TARGETS="gfx942" \
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args = " "
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
cleanWs()
}
}
stage("Build CK for all gfx9 targets")
{
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() }
expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
-DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " \
-DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
......@@ -959,7 +1080,7 @@ pipeline {
{
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() }
expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
......@@ -979,7 +1100,7 @@ pipeline {
{
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
......@@ -999,7 +1120,7 @@ pipeline {
{
when {
beforeAgent true
expression { params.BUILD_INSTANCES_ONLY.toBoolean() && !params.RUN_FULL_QA.toBoolean() }
expression { params.BUILD_INSTANCES_ONLY.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
......@@ -1018,7 +1139,7 @@ pipeline {
{
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx1030") }
environment{
......@@ -1038,7 +1159,7 @@ pipeline {
{
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx1101") }
environment{
......@@ -1058,7 +1179,7 @@ pipeline {
{
when {
beforeAgent true
expression { params.BUILD_GFX12.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
expression { params.BUILD_GFX12.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx1201") }
environment{
......@@ -1085,7 +1206,7 @@ pipeline {
{
when {
beforeAgent true
expression { params.RUN_PERFORMANCE_TESTS.toBoolean() }
expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
options { retry(1) }
agent{ label rocmnode("gfx90a")}
......@@ -1106,7 +1227,7 @@ pipeline {
stage("Process results"){
when {
beforeAgent true
expression { params.RUN_PERFORMANCE_TESTS.toBoolean() }
expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent { label 'mici' }
steps{
......
......@@ -6,6 +6,9 @@ if(NOT INSTANCES_ONLY)
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
if(CK_USE_ALTERNATIVE_PYTHON)
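# the older libstdc++ on the legacy-OS images that use the alternative Python keeps std::filesystem in the separate stdc++fs library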
target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs)
endif()
add_dependencies(codegen codegen_test_${BASE_NAME})
add_dependencies(tests codegen_test_${BASE_NAME})
add_dependencies(check codegen_test_${BASE_NAME})
......
......@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
#include <filesystem>
#include <ck/filesystem.hpp>
#include <string>
namespace rtc {
struct src_file
{
std::filesystem::path path;
CK::fs::path path;
std::string_view content;
};
......
......@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
#include <filesystem>
#include <ck/filesystem.hpp>
namespace rtc {
struct tmp_dir
{
std::filesystem::path path;
CK::fs::path path;
tmp_dir(const std::string& prefix = "");
void execute(const std::string& cmd) const;
......
......@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for(const auto& src : srcs)
{
std::filesystem::path full_path = td.path / src.path;
std::filesystem::path parent_path = full_path.parent_path();
std::filesystem::create_directories(parent_path);
CK::fs::path full_path = td.path / src.path;
CK::fs::path parent_path = full_path.parent_path();
CK::fs::create_directories(parent_path);
write_string(full_path.string(), src.content);
if(src.path.extension().string() == ".cpp")
{
......@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td.execute(compiler() + options.flags);
auto out_path = td.path / out;
if(not std::filesystem::exists(out_path))
if(not CK::fs::exists(out_path))
throw std::runtime_error("Output file missing: " + out);
auto obj = read_buffer(out_path.string());
......
......@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
}
tmp_dir::tmp_dir(const std::string& prefix)
: path(std::filesystem::temp_directory_path() /
: path(CK::fs::temp_directory_path() /
unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
std::filesystem::create_directories(this->path);
CK::fs::create_directories(this->path);
}
void tmp_dir::execute(const std::string& cmd) const
......@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std::system(s.c_str());
}
tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); }
tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
} // namespace rtc
rocm-docs-core==1.7.2
sphinxcontrib-bibtex==2.6.2
rocm-docs-core==1.8.2
sphinxcontrib-bibtex==2.6.3
......@@ -103,7 +103,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==1.7.2
rocm-docs-core==1.8.2
# via -r requirements.in
six==1.16.0
# via pybtex
......@@ -137,7 +137,7 @@ sphinx-notfound-page==1.0.3
# via rocm-docs-core
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-bibtex==2.6.2
sphinxcontrib-bibtex==2.6.3
# via -r requirements.in
sphinxcontrib-devhelp==2.0.0
# via sphinx
......
......@@ -305,6 +305,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
}
#endif
}
else
{
// When the problem type and problem size do not fit this instance.
std::cerr << gemm.GetTypeString() << ": the instance does not support the problem config."
<< std::endl;
return true;
}
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
......
......@@ -161,18 +161,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
}
#if 0
printf("B matrix:\n");
for (int in = 0; in < N; in++)
{
for (int ik = 0; ik < K; ik++)
{
printf("%02x ", *(reinterpret_cast<uint8_t*>(&b_k_n(ik,in))));
if(ik%8==7) printf("|");
}
printf("\n");
}
#endif
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
......@@ -272,7 +260,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
if(config.time_kernel)
{
ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
ave_time =
invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4});
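// extra StreamConfig fields here are assumed to be: log level, warm-up iterations,
// timed iterations, cache flushing, and rotating-buffer count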
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
......
add_example_executable(example_complex_contraction_bilinear_xdl_fp32 complex_contraction_bilinear_xdl_fp32.cpp)
add_example_executable(example_complex_contraction_bilinear_xdl_fp64 complex_contraction_bilinear_xdl_fp64.cpp)
# Instructions for `example_complex_contraction_bilinear_xdl_fp32`
## Run
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: time kernel (0=no, 1=yes)
./bin/example_complex_contraction_bilinear_xdl_fp32 1 1 1
```
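The fp64 example added alongside it shares the same driver and accepts the same arguments, e.g.:
```bash
./bin/example_complex_contraction_bilinear_xdl_fp64 1 1 1
```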
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F32 = float;
using F64 = double;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// Generic instances for fp32, fp16 and bf16 data types.
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceKK_Generic = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>;
// clang-format on
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceKN_Generic = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>;
// clang-format on
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceMK_Generic = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>;
// clang-format on
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceMN_Generic = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>;
// clang-format on
// Fp64 instances.
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceKK_FP64 = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>;
// clang-format on
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceKN_FP64 = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>;
// clang-format on
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceMK_FP64 = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>;
// clang-format on
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename AccDataType,
typename CShuffleDataType,
typename DsDataType,
typename EDataType,
typename ComputeDataType,
typename AElementOp,
typename BElementOp,
typename CDEElementOp>
// clang-format off
using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device::
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>;
// clang-format on
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "common_instances.hpp"
using ADataType = F32;
using BDataType = F32;
using AccDataType = F32;
using CShuffleDataType = F32;
using DDataType = F32;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F32;
using ComputeDataType = F32;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstance = DeviceOpInstanceKKNN;
#include "run_complex_contraction_bilinear_example.inc"
int main(int argc, char* argv[]) { return run_complex_contraction_bilinear_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "common_instances.hpp"
using ADataType = F64;
using BDataType = F64;
using AccDataType = F64;
using CShuffleDataType = F64;
using DDataType = F64;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F64;
using ComputeDataType = F64;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 2;
static constexpr ck::index_t NumDimK = 2;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
ComputeDataType,
AElementOp,
BElementOp,
CDEElementOp>;
using DeviceOpInstance = DeviceOpInstanceKKNN;
#include "run_complex_contraction_bilinear_example.inc"
int main(int argc, char* argv[]) { return run_complex_contraction_bilinear_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
int run_complex_contraction_bilinear_example(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// A[M0, M1, K0, K1]
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
// B[N0, N1, K0, K1]
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// D[M0, M1, N0, N1]
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
// E[M0, M1, N0, N1]
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
float alpha = 1.f;
float beta = 1.f;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 28)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t M0 = std::stoi(argv[4]);
const ck::index_t M1 = std::stoi(argv[5]);
const ck::index_t N0 = std::stoi(argv[6]);
const ck::index_t N1 = std::stoi(argv[7]);
const ck::index_t K0 = std::stoi(argv[8]);
const ck::index_t K1 = std::stoi(argv[9]);
a_ms_ks_lengths = {M0, M1, K0, K1};
a_ms_ks_strides = {
std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])};
b_ns_ks_lengths = {N0, N1, K0, K1};
b_ns_ks_strides = {
std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])};
d_ms_ns_lengths = {M0, M1, N0, N1};
d_ms_ns_strides = {
std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
alpha = std::stof(argv[26]);
beta = std::stof(argv[27]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg18 to 21: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
printf("arg22 to 25: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
printf("arg26 to 27: alpha, beta\n");
exit(0);
}
// For Real Part of Complex Tensor
Tensor<ADataType> a_ms_ks_re(a_ms_ks_lengths, a_ms_ks_strides);
Tensor<BDataType> b_ns_ks_re(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<EDataType> d_ms_ns_re(d_ms_ns_lengths, d_ms_ns_strides);
Tensor<EDataType> e_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result_re(e_ms_ns_lengths, e_ms_ns_strides);
// For Imaginary Part of Complex Tensor
Tensor<ADataType> a_ms_ks_img(a_ms_ks_lengths, a_ms_ks_strides);
Tensor<BDataType> b_ns_ks_img(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<EDataType> d_ms_ns_img(d_ms_ns_lengths, d_ms_ns_strides);
Tensor<EDataType> e_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result_img(e_ms_ns_lengths, e_ms_ns_strides);
// Intermediate E tensor Definition
Tensor<EDataType> e_ms_ns_device_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
std::cout << "a_ms_ks_re: " << a_ms_ks_re.mDesc << std::endl;
std::cout << "b_ns_ks_re: " << b_ns_ks_re.mDesc << std::endl;
std::cout << "d_ms_ns_re: " << d_ms_ns_re.mDesc << std::endl;
std::cout << "e_ms_ns_re: " << e_ms_ns_host_result_re.mDesc << std::endl;
std::cout << "a_ms_ks_img: " << a_ms_ks_img.mDesc << std::endl;
std::cout << "b_ns_ks_img: " << b_ns_ks_img.mDesc << std::endl;
std::cout << "d_ms_ns_img: " << d_ms_ns_img.mDesc << std::endl;
std::cout << "e_ms_ns_img: " << e_ms_ns_host_result_img.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_ms_ks_re.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_ns_ks_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d_ms_ns_re.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
a_ms_ks_img.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_ns_ks_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d_ms_ns_img.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_ms_ks_re.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_ns_ks_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d_ms_ns_re.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
a_ms_ks_img.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_ns_ks_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d_ms_ns_img.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
DeviceMem a_device_buf_re(sizeof(ADataType) * a_ms_ks_re.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf_re(sizeof(BDataType) * b_ns_ks_re.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf_re(sizeof(DDataType) * d_ms_ns_re.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_re(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf_img(sizeof(ADataType) * a_ms_ks_img.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf_img(sizeof(BDataType) * b_ns_ks_img.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf_img(sizeof(DDataType) * d_ms_ns_img.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_img(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
// Intermediate Value For E Real and Img
DeviceMem e_device_buf_re1(sizeof(EDataType) * e_ms_ns_device_result_re.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf_img1(sizeof(EDataType) * e_ms_ns_device_result_img.mDesc.GetElementSpaceSize());
a_device_buf_re.ToDevice(a_ms_ks_re.mData.data());
b_device_buf_re.ToDevice(b_ns_ks_re.mData.data());
d_device_buf_re.ToDevice(d_ms_ns_re.mData.data());
a_device_buf_img.ToDevice(a_ms_ks_img.mData.data());
b_device_buf_img.ToDevice(b_ns_ks_img.mData.data());
d_device_buf_img.ToDevice(d_ms_ns_img.mData.data());
// set zero
e_device_buf_re.SetZero();
e_device_buf_img.SetZero();
// set zero for intermediate values
e_device_buf_re1.SetZero();
e_device_buf_img1.SetZero();
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{alpha, beta};
// device operation
// For real Intermediate Value re_1
auto op = DeviceOpInstance{};
auto invoker = op.MakeInvoker();
auto argument_re1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
b_device_buf_re.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf_re.GetDeviceBuffer()},
e_device_buf_re1.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
if(!op.IsSupportedArgument(argument_re1))
{
std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
return 0;
}
float ave_time_re1 = invoker.Run(argument_re1, StreamConfig{nullptr, time_kernel});
alpha = -1.f;
beta = 1.f;
a_element_op = AElementOp{};
b_element_op = BElementOp{};
cde_element_op = CDEElementOp{alpha, beta};
// device operation
// For real Intermediate Value re_2
// auto op = DeviceOpInstance{};
// auto invoker = op.MakeInvoker();
auto argument_re2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
b_device_buf_img.GetDeviceBuffer(),
std::array<const void*, 1>{e_device_buf_re1.GetDeviceBuffer()},
e_device_buf_re.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
if(!op.IsSupportedArgument(argument_re2))
{
std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
return 0;
}
float ave_time_re2 = invoker.Run(argument_re2, StreamConfig{nullptr, time_kernel});
alpha = 1.f;
beta = 1.f;
a_element_op = AElementOp{};
b_element_op = BElementOp{};
cde_element_op = CDEElementOp{alpha, beta};
auto argument_img1 = op.MakeArgument(a_device_buf_re.GetDeviceBuffer(),
b_device_buf_img.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf_img.GetDeviceBuffer()},
e_device_buf_img1.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
if(!op.IsSupportedArgument(argument_img1))
{
std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
return 0;
}
float ave_time_img1 = invoker.Run(argument_img1, StreamConfig{nullptr, time_kernel});
alpha = 1.f;
beta = 1.f;
auto argument_img2 = op.MakeArgument(a_device_buf_img.GetDeviceBuffer(),
b_device_buf_re.GetDeviceBuffer(),
std::array<const void*, 1>{e_device_buf_img1.GetDeviceBuffer()},
e_device_buf_img.GetDeviceBuffer(),
a_ms_ks_lengths,
a_ms_ks_strides,
b_ns_ks_lengths,
b_ns_ks_strides,
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
e_ms_ns_lengths,
e_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
if(!op.IsSupportedArgument(argument_img2))
{
std::cout << op.GetTypeString() << " does not support this problem" << std::endl;
return 0;
}
float ave_time_img2 = invoker.Run(argument_img2, StreamConfig{nullptr, time_kernel});
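// The four launches above assemble the complex bilinear contraction from real-valued
// contractions; alpha = beta = 1 overall (the minus sign enters as the alpha = -1 of the
// second launch):
//   Re(E) = Re(A)*Re(B) - Im(A)*Im(B) + Re(D)
//   Im(E) = Re(A)*Im(B) + Im(A)*Re(B) + Im(D)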
ck::index_t M =
ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
ck::index_t N = ck::accumulate_n<ck::index_t>(
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
ck::index_t K = ck::accumulate_n<ck::index_t>(
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
std::size_t flop = std::size_t(2) * M * N * K * 2;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N * 2;
float ave_time = ave_time_img2 + ave_time_img1 + ave_time_re2 + ave_time_re1;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< op.GetTypeString() << std::endl;
e_device_buf_re.FromDevice(e_ms_ns_device_result_re.mData.data());
e_device_buf_img.FromDevice(e_ms_ns_device_result_img.mData.data());
auto isRealOk = 0;
auto isImgOk = 0;
if(do_verification)
{
// Real Part Verification
Tensor<CShuffleDataType> c_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
F32,
AElementOp,
BElementOp>;
auto ref_op = ReferenceOpInstance{};
auto ref_invoker = ref_op.MakeInvoker();
auto ref_argument_re =
ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_re, c_ms_ns_host_result_re, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_re);
alpha = 1.f;
beta = 1.f;
cde_element_op = CDEElementOp{alpha, beta};
for(size_t m0 = 0; m0 < e_ms_ns_host_result_re.mDesc.GetLengths()[0]; ++m0)
{
for(size_t m1 = 0; m1 < e_ms_ns_host_result_re.mDesc.GetLengths()[1]; ++m1)
{
for(size_t n0 = 0; n0 < e_ms_ns_host_result_re.mDesc.GetLengths()[2]; ++n0)
{
for(size_t n1 = 0; n1 < e_ms_ns_host_result_re.mDesc.GetLengths()[3]; ++n1)
{
cde_element_op(e_ms_ns_host_result_re(m0, m1, n0, n1),
c_ms_ns_host_result_re(m0, m1, n0, n1),
d_ms_ns_re(m0, m1, n0, n1));
}
}
}
}
alpha = 1.f;
beta = -1.f;
cde_element_op = CDEElementOp{alpha, beta};
auto ref_argument_re1 =
ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_img, c_ms_ns_host_result_re1, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_re1);
for(size_t m0 = 0; m0 < e_ms_ns_host_result_re.mDesc.GetLengths()[0]; ++m0)
{
for(size_t m1 = 0; m1 < e_ms_ns_host_result_re.mDesc.GetLengths()[1]; ++m1)
{
for(size_t n0 = 0; n0 < e_ms_ns_host_result_re.mDesc.GetLengths()[2]; ++n0)
{
for(size_t n1 = 0; n1 < e_ms_ns_host_result_re.mDesc.GetLengths()[3]; ++n1)
{
cde_element_op(e_ms_ns_host_result_re(m0, m1, n0, n1),
e_ms_ns_host_result_re(m0, m1, n0, n1),
c_ms_ns_host_result_re1(m0, m1, n0, n1));
}
}
}
}
isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
// Img Part Verification
Tensor<CShuffleDataType> c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
auto ref_argument_img =
ref_op.MakeArgument(a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_img);
alpha = 1.f;
beta = 1.f;
cde_element_op = CDEElementOp{alpha, beta};
for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0)
{
for(size_t m1 = 0; m1 < e_ms_ns_host_result_img.mDesc.GetLengths()[1]; ++m1)
{
for(size_t n0 = 0; n0 < e_ms_ns_host_result_img.mDesc.GetLengths()[2]; ++n0)
{
for(size_t n1 = 0; n1 < e_ms_ns_host_result_img.mDesc.GetLengths()[3]; ++n1)
{
cde_element_op(e_ms_ns_host_result_img(m0, m1, n0, n1),
c_ms_ns_host_result_img(m0, m1, n0, n1),
d_ms_ns_img(m0, m1, n0, n1));
}
}
}
}
auto ref_argument_img1 =
ref_op.MakeArgument(a_ms_ks_img, b_ns_ks_re, c_ms_ns_host_result_img1, a_element_op, b_element_op);
ref_invoker.Run(ref_argument_img1);
for(size_t m0 = 0; m0 < e_ms_ns_host_result_img.mDesc.GetLengths()[0]; ++m0)
{
for(size_t m1 = 0; m1 < e_ms_ns_host_result_img.mDesc.GetLengths()[1]; ++m1)
{
for(size_t n0 = 0; n0 < e_ms_ns_host_result_img.mDesc.GetLengths()[2]; ++n0)
{
for(size_t n1 = 0; n1 < e_ms_ns_host_result_img.mDesc.GetLengths()[3]; ++n1)
{
cde_element_op(e_ms_ns_host_result_img(m0, m1, n0, n1),
e_ms_ns_host_result_img(m0, m1, n0, n1),
c_ms_ns_host_result_img1(m0, m1, n0, n1));
}
}
}
}
isImgOk = ck::utils::check_err(e_ms_ns_device_result_img, e_ms_ns_host_result_img) ? 0 : 1;
// isRealOk/isImgOk are 0 on success, so report failure if either part mismatched
return (isRealOk || isImgOk);
}
return 0;
}
......@@ -22,12 +22,20 @@ string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
execute_process(
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api ${FMHA_FWD_APIS} --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
RESULT_VARIABLE ret
)
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of FWD kernels via Python.")
endif()
execute_process(
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
--api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt --receipt 3
RESULT_VARIABLE ret
)
if(ret AND NOT ret EQUAL 0)
message( FATAL_ERROR "CK Tile FMHA FAILED to generate a list of BWD kernels via Python.")
endif()
# NOTE: for CMake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
# as the current CMake list; otherwise CMake will not figure out the dependency properly.
......
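For reference, the same generator can be invoked by hand with the flags used above (working directory and output paths are placeholders):

```bash
# forward kernels: the API list is whatever FMHA_FWD_ENABLE_APIS expands to, e.g. "fwd"
python3 generate.py --api fwd --list_blobs /tmp/fwd_blob_list.txt
# backward kernels use receipt 3, as in the CMake call above
python3 generate.py --api bwd --list_blobs /tmp/bwd_blob_list.txt --receipt 3
```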
......@@ -99,13 +99,26 @@ auto create_args(int argc, char* argv[])
// different threshold for different dtype
template <typename DataType>
auto get_elimit(int /*init_method*/)
auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/)
{
double rtol = 1e-2;
double atol = 1e-2;
return ck_tile::make_tuple(rtol, atol);
}
template <>
auto get_elimit<ck_tile::bf16_t>(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v)
{
double rtol = 1e-2;
double atol = 1e-2;
if(hdim_q > 128 && hdim_v > 128) // needs ~3.2e-2 with RTZ rounding (~1.5e-2 with RTN)
{
rtol = 3.2e-2;
atol = 3.2e-2;
}
return ck_tile::make_tuple(rtol, atol);
}
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
......@@ -899,7 +912,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
}
// clang-format on
auto [rtol, atol] = get_elimit<DataType>(init_method);
auto [rtol, atol] = get_elimit<DataType>(hdim_q, hdim_v);
bool dq_cur_pass = ck_tile::check_err(dq_host_result,
dq_host_ref,
std::string("Error: QGrad Incorrect results!"),
......