Commit 86f8ac01 authored by Jakub Piasecki

Merge remote-tracking branch 'origin/develop' into jakpias/pool1d_fwd

parents 3f6360d0 ab60b390
......@@ -189,7 +189,9 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
message("Adding the enable-post-misched=0 compiler flag")
add_compile_options("SHELL: -mllvm -enable-post-misched=0")
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
set(check-coerce)
check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
message("Adding the amdgpu-coerce-illegal-types=1")
add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
endif()
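The new guard applies the workaround only when the compiler actually accepts the flag, instead of keying on the HIP version alone. A minimal standalone sketch of that pattern (result variable name hypothetical; the diff above uses check-coerce):

    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag("-mllvm -amdgpu-coerce-illegal-types=1" HAS_COERCE_FLAG)
    if(HAS_COERCE_FLAG)
      # "SHELL:" keeps the two tokens together instead of letting CMake de-duplicate them
      add_compile_options("SHELL:-mllvm -amdgpu-coerce-illegal-types=1")
    endif()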
......@@ -539,6 +541,9 @@ if(NOT DEFINED INSTANCES_ONLY)
PACKAGE_NAME examples
)
add_subdirectory(example)
if(GPU_TARGETS MATCHES "gfx9" AND NOT INSTANCES_ONLY)
add_subdirectory(codegen)
endif()
if(BUILD_TESTING)
add_subdirectory(test)
endif()
......
FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=6.1
ARG ROCMVERSION=6.2
ARG compiler_version=""
ARG compiler_commit=""
ARG CK_SCCACHE=""
......@@ -17,17 +17,12 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN if [ "$ROCMVERSION" != "6.2" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/6.1/ubuntu/focal/amdgpu-install_6.1.60100-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.1.60100-1_all.deb && \
RUN if [ "$ROCMVERSION" != "6.3" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc4" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.2-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.2-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-63 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=2009461; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
......@@ -64,6 +59,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
python3-dev \
python3-pip \
redis \
rocm-llvm-dev \
sshpass \
stunnel \
software-properties-common \
......
......@@ -38,7 +38,7 @@ def getDockerImageName(){
img = "${params.USE_CUSTOM_DOCKER}"
}
else{
if (params.ROCMVERSION != "6.2"){
if (params.ROCMVERSION != "6.3"){
if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
}
......@@ -204,6 +204,9 @@ def cmake_build(Map conf=[:]){
cd build
"""
def invocation_tag=""
if (setup_args.contains("gfx12")){
invocation_tag="gfx12"
}
if (setup_args.contains("gfx11")){
invocation_tag="gfx11"
}
......@@ -285,6 +288,19 @@ def cmake_build(Map conf=[:]){
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
}
if (params.RUN_CK_TILE_TESTS){
try{
archiveArtifacts "perf_fmha_fwd_*.log"
archiveArtifacts "perf_fmha_bwd_*.log"
stash name: "perf_fmha_fwd_gfx942.log"
stash name: "perf_fmha_bwd_gfx942.log"
stash name: "perf_fmha_fwd_gfx90a.log"
stash name: "perf_fmha_bwd_gfx90a.log"
}
catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
}
}
}
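A note on the stash calls above: Jenkins' stash step defaults to includes '**' when only a name is given, so each call stashes the whole workspace under the log's name; process_results later retrieves them with unstash by name. A narrower, hypothetical equivalent would pass an explicit pattern:

    stash name: "perf_fmha_fwd_gfx942.log", includes: "perf_fmha_fwd_gfx942.log"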
def buildHipClangJob(Map conf=[:]){
......@@ -518,7 +534,7 @@ def Build_CK(Map conf=[:]){
//check whether to run performance tests on this node
def do_perf_tests = 0
sh 'rocminfo | tee rocminfo.log'
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){
if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){
do_perf_tests = 1
echo "Stash profiler and run performance tests"
}
......@@ -612,6 +628,17 @@ def process_results(Map conf=[:]){
timeout(time: 1, unit: 'HOURS'){
try{
dir("script"){
if (params.RUN_CK_TILE_TESTS){
try{
unstash "perf_fmha_fwd_gfx942.log"
unstash "perf_fmha_bwd_gfx942.log"
unstash "perf_fmha_fwd_gfx90a.log"
unstash "perf_fmha_bwd_gfx90a.log"
}
catch(Exception err){
echo "could not locate the FMHA performance logs: ${err.getMessage()}."
}
}
if (params.RUN_FULL_QA){
// unstash perf files to master
unstash "ckprofiler_0.2.0_amd64.deb"
......@@ -652,10 +679,10 @@ def process_results(Map conf=[:]){
}
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1; RUN_CK_TILE_TESTS=true
0 21 * * * % ROCMVERSION=6.1;hipTensor_test=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false
0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : ""
pipeline {
......@@ -677,8 +704,8 @@ pipeline {
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string(
name: 'ROCMVERSION',
defaultValue: '6.1',
description: 'Specify which ROCM version to use: 6.1 (default).')
defaultValue: '6.2',
description: 'Specify which ROCM version to use: 6.2 (default).')
string(
name: 'COMPILER_VERSION',
defaultValue: '',
......@@ -719,10 +746,6 @@ pipeline {
name: "RUN_PERFORMANCE_TESTS",
defaultValue: true,
description: "Run the performance tests (default: ON)")
booleanParam(
name: "RUN_CODEGEN_TESTS",
defaultValue: true,
description: "Run the codegen tests (default: ON)")
booleanParam(
name: "RUN_CK_TILE_TESTS",
defaultValue: false,
......@@ -731,6 +754,11 @@ pipeline {
name: "BUILD_INSTANCES_ONLY",
defaultValue: false,
description: "Test building instances for various architectures simultaneously (default: OFF)")
booleanParam(
name: "BUILD_GFX12",
defaultValue: false,
description: "Build CK and run tests on gfx12 (default: OFF)")
}
environment{
dbuser = "${dbuser}"
......@@ -809,33 +837,6 @@ pipeline {
}
}
}
stage("Run Codegen Tests")
{
parallel
{
stage("Run Codegen Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CODEGEN_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ cd ../codegen && rm -rf build && mkdir build && cd build && \
cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx90a" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j check"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run CK_TILE Tests")
{
parallel
......@@ -852,8 +853,7 @@ pipeline {
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
......@@ -872,8 +872,7 @@ pipeline {
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
......@@ -1008,6 +1007,26 @@ pipeline {
cleanWs()
}
}
stage("Build CK and run Tests on gfx1201")
{
when {
beforeAgent true
expression { params.BUILD_GFX12.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() }
}
agent{ label rocmnode("gfx1201") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1201" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-DGPU_TARGETS="gfx1201" \
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
}
}
}
}
......
cmake_minimum_required(VERSION 3.16)
project(composable_kernel_host LANGUAGES CXX HIP)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
......@@ -8,17 +5,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
find_package(ROCM)
include(ROCMInstallTargets)
include(ROCMTest)
add_compile_options(-std=c++17)
find_package(hip)
## HIP
set(CMAKE_HIP_PLATFORM amd)
set(CMAKE_HIP_COMPILER ${CMAKE_CXX_COMPILER})
set(CMAKE_HIP_EXTENSIONS ON)
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
add_custom_target(codegen)
# add include directories
include_directories(BEFORE
......@@ -32,8 +21,9 @@ list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${CK_ROOT}/include/ck/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
message(STATUS "RELATIVE: ${CK_ROOT}/include")
#printouts for debug purposes
#message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
#message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
......
......@@ -76,8 +76,11 @@ std::string SequenceStr(const std::vector<int>& v);
std::string MakeTuple(const std::vector<std::string>& v);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
template <int... xs>
const std::string S = SequenceStr({xs...});
#pragma clang diagnostic pop
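S is a variable template: each instantiation creates one global std::string at program start, which is exactly what the -Wglobal-constructors suppression above acknowledges. Usage sketch (hypothetical values; the rendered text is whatever SequenceStr produces for the pack):

    // const std::string& tile = S<256, 128, 32>; // same string as SequenceStr({256, 128, 32})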
constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough";
constexpr const char* Bilinear = "ck::tensor_operation::element_wise::Bilinear";
......
......@@ -3,6 +3,7 @@
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/types.hpp"
#include "ck/host/utils.hpp"
#include <cassert>
......@@ -32,11 +33,11 @@ static std::string GetGemmSpec(const std::size_t m,
}
// function to update the prologue/epilogue with a user-provided operation
void Operation_Xdl_CShuffle::update_prologue(const std::string& prologue)
void Operation_Xdl_CShuffle::update_prologue(const std::string& pro)
{
if(!prologue.empty())
if(!pro.empty())
{
this->prologue = prologue;
this->prologue = pro;
this->cde_elem_op = "CDEElementOp";
}
else
......@@ -45,11 +46,11 @@ void Operation_Xdl_CShuffle::update_prologue(const std::string& prologue)
}
}
void Operation_Xdl_CShuffle::update_epilogue(const std::string& epilogue)
void Operation_Xdl_CShuffle::update_epilogue(const std::string& epi)
{
if(!epilogue.empty())
if(!epi.empty())
{
this->epilogue = epilogue;
this->epilogue = epi;
this->cde_elem_op = "CDEElementOp";
}
else
......
......@@ -4,6 +4,7 @@
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include <iostream>
#include "ck/host/stringutils.hpp"
#include "ck/host/types.hpp"
#include "ck/host/utils.hpp"
#include <cassert>
......@@ -11,34 +12,15 @@ namespace ck {
namespace host {
namespace conv {
// calculate appropriate Gemm Specification based on input tensor dimensions
// NOTE: in CK, MNKPadding is always used for forward convolution
static std::string GetGemmSpec(const std::size_t m,
const std::size_t n,
const std::size_t k,
const std::size_t m_per_block,
const std::size_t n_per_block,
const std::size_t k_per_block)
{
std::string spec = "";
if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
spec += "M";
if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
spec += "N";
if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
spec += "K";
if(spec == "")
return "ck::tensor_operation::device::GemmSpecialization::Default";
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
}
// NOTE: in CK, MNKPadding is always used for forward convolution, so no
// GemmSpec helper is added here
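For context, the GetGemmSpec helper removed above chooses a GemmSpecialization by checking which of M, N, and K do not divide evenly into their per-block tile sizes; only those dimensions get padded. A minimal sketch of that decision, independent of CK (hypothetical free function):

    #include <cstddef>
    #include <string>

    std::string gemm_spec(std::size_t m, std::size_t n, std::size_t k,
                          std::size_t m_per_block, std::size_t n_per_block,
                          std::size_t k_per_block)
    {
        std::string spec;
        // integer_divide_ceil(m, m_per_block) * m_per_block - m != 0  <=>  m % m_per_block != 0
        if(m % m_per_block != 0) spec += "M";
        if(n % n_per_block != 0) spec += "N";
        if(k % k_per_block != 0) spec += "K";
        return spec.empty() ? "Default" : spec + "Padding";
    }
    // e.g. gemm_spec(1024, 1000, 64, 256, 128, 32) returns "NPadding"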
// function to update the prologue/epilogue with a user-provided operation
void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& prologue)
void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& pro)
{
if(!prologue.empty())
if(!pro.empty())
{
this->prologue = prologue;
this->prologue = pro;
this->cde_elem_op = "CDEElementOp";
}
else
......@@ -47,11 +29,11 @@ void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& prologu
}
}
void Operation_Conv_Fwd_Xdl_Cshuffle::update_epilogue(const std::string& epilogue)
void Operation_Conv_Fwd_Xdl_Cshuffle::update_epilogue(const std::string& epi)
{
if(!epilogue.empty())
if(!epi.empty())
{
this->epilogue = epilogue;
this->epilogue = epi;
this->cde_elem_op = "CDEElementOp";
}
else
......
......@@ -4,7 +4,10 @@
namespace ck {
namespace host {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
const std::string config_header = "";
#pragma clang diagnostic pop
std::unordered_map<std::string_view, std::string_view> GetHeaders()
{
......
......@@ -4,7 +4,9 @@ file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
foreach(TEST_SRC ${TEST_SRCS})
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC})
add_executable(test_host_${BASE_NAME} ${TEST_SRC})
add_dependencies(codegen test_host_${BASE_NAME})
add_test(NAME codegen_test_${BASE_NAME} COMMAND test_host_${BASE_NAME})
target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
# target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a)
target_include_directories(test_host_${BASE_NAME} PUBLIC include())
......
......@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
......@@ -109,7 +108,6 @@ struct Epilogue
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
......@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
......@@ -109,7 +108,6 @@ struct Epilogue
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
......@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
......@@ -109,7 +108,6 @@ struct Epilogue
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
......@@ -92,7 +92,6 @@ struct Epilogue
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
......@@ -109,7 +108,6 @@ struct Epilogue
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
......
......@@ -118,4 +118,4 @@ void kernel::launch(hipStream_t stream,
launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
}
} // namespace rtc
\ No newline at end of file
} // namespace rtc
......@@ -45,4 +45,4 @@ void tmp_dir::execute(const std::string& cmd) const
tmp_dir::~tmp_dir() { std::filesystem::remove_all(this->path); }
} // namespace rtc
\ No newline at end of file
} // namespace rtc
rocm-docs-core==1.6.1
rocm-docs-core==1.6.2
sphinxcontrib-bibtex==2.6.2
......@@ -103,7 +103,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==1.6.1
rocm-docs-core==1.6.2
# via -r requirements.in
six==1.16.0
# via pybtex
......
#!/bin/bash
#
# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_example_fmha_bwd executables in ../build/bin/
#
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>"
# input arguments:
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# host name : $hostname
# gpu architecture: e.g., gfx90a, or gfx942, etc.
#get the command line arguments:
export env_type=$1
echo 'Environment type: ' $env_type
export branch=$2
echo 'Branch name: ' $branch
export host_name=$3
echo 'Host name: ' $host_name
export GPU_arch=$4
echo 'GPU_arch: ' $GPU_arch
function print_log_header(){
rm -f $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run verification tests
example/ck_tile/01_fmha/script/smoke_test_fwd.sh
example/ck_tile/01_fmha/script/smoke_test_bwd.sh
#run performance benchmarks
export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
print_log_header $fmha_fwd_log $env_type $branch $host_name
example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log"
print_log_header $fmha_bwd_log $env_type $branch $host_name
example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log
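A concrete invocation, mirroring how the CK_TILE stages in the Jenkinsfile above call this script (host name hypothetical):

    ./run_full_test.sh CI_amd-staging develop ck-node-01 gfx942
    # writes perf_fmha_fwd_gfx942.log and perf_fmha_bwd_gfx942.log into the current directory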
......@@ -102,9 +102,10 @@ __global__ void
// offset base pointer for each work-group
const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
const long_index_t e_group_offset =
const long_index_t e_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetEPtrOffset(g_idx));
const auto& ds_group_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
const auto& ds_batch_offset = compute_ptr_offset_of_groups.GetDsPtrOffset(g_idx);
const long_index_t e_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
......@@ -117,14 +118,14 @@ __global__ void
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
static_for<0, NumDTensor, 1>{}(
[&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; });
[&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
if constexpr(isMultiA || isMultiB)
{
AsPointer p_as_grid_grp;
BsPointer p_bs_grid_grp;
const auto& as_group_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx);
const auto& as_batch_offset = compute_ptr_offset_of_groups.GetAsPtrOffset(g_idx);
// compute_ptr_offset_of_n_ does not need BatchStrideB, so
// in the case where isMultiA is false but isMultiB is true
......@@ -135,27 +136,27 @@ __global__ void
static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size();
static_for<0, NumATensor, 1>{}([&](auto i) {
p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + as_n_offset[i];
p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i] + as_n_offset[i];
});
}
else
{
const long_index_t a_n_offset = compute_ptr_offset_of_n.GetAPtrOffset(n_idx);
static_for<0, 1, 1>{}(
[&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_group_offset[i] + a_n_offset; });
[&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i] + a_n_offset; });
}
const auto& bs_group_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx);
const auto& bs_batch_offset = compute_ptr_offset_of_groups.GetBsPtrOffset(g_idx);
static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size();
static_for<0, NumBTensor, 1>{}(
[&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_group_offset[i]; });
[&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_batch_offset[i]; });
GridwiseGemm::template Run<HasMainKBlockLoop>(
p_as_grid_grp,
p_bs_grid_grp,
p_ds_grid_grp,
p_e_grid + e_group_offset + e_n_offset,
p_e_grid + e_batch_offset + e_n_offset,
p_shared,
a_element_op,
b_element_op,
......@@ -168,19 +169,19 @@ __global__ void
}
else
{
const long_index_t a_group_offset =
const long_index_t a_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
const long_index_t b_group_offset =
const long_index_t b_batch_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
const long_index_t a_n_offset =
amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
GridwiseGemm::template Run<HasMainKBlockLoop>(
p_as_grid + a_group_offset + a_n_offset,
p_bs_grid + b_group_offset,
p_as_grid + a_batch_offset + a_n_offset,
p_bs_grid + b_batch_offset,
p_ds_grid_grp,
p_e_grid + e_group_offset + e_n_offset,
p_e_grid + e_batch_offset + e_n_offset,
p_shared,
a_element_op,
b_element_op,
......@@ -282,8 +283,7 @@ template <index_t NDimSpatial,
// in tuple for MultiAB), unpack if tuple was
// passed
typename BComputeDataType = AComputeDataType,
LoopScheduler LoopSched = make_default_loop_scheduler(),
index_t NumGroupsToMerge = 1>
LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
: public DeviceGroupedConvFwdMultipleABD<NDimSpatial,
ALayout,
......@@ -302,8 +302,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
{
using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle;
static_assert(NumGroupsToMerge >= 1);
static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value;
static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
......@@ -319,9 +317,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
using ConvToGemmFwdTransformer = TransformConvFwdToGemm<NDimSpatial,
ConvForwardSpecialization,
true /*SplitN*/,
ALayout,
ELayout,
NumGroupsToMerge>;
ADataType,
EDataType>;
static constexpr auto matrix_padder =
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
......@@ -520,8 +517,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
{
static_for<0, NumATensor, 1>{}([&](auto i) {
// Init compute_ptr_offset_of_groups_ for multiple AB
compute_ptr_offset_of_groups_.BatchStrideA_(i) =
a_g_n_c_wis_strides[0] * NumGroupsToMerge;
compute_ptr_offset_of_groups_.BatchStrideA_(i) = a_g_n_c_wis_strides[0];
// Use GemmADataType/GemmBDataType to iterate over tuple (even if passed data
// type is not tuple)
......@@ -549,8 +545,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
});
static_for<0, NumBTensor, 1>{}([&](auto i) {
// Init compute_ptr_offset_of_groups_ for multiple AB
compute_ptr_offset_of_groups_.BatchStrideB_(i) =
b_g_k_c_xs_strides[0] * NumGroupsToMerge;
compute_ptr_offset_of_groups_.BatchStrideB_(i) = b_g_k_c_xs_strides[0];
using DataType = remove_cvref_t<tuple_element_t<i.value, GemmBDataType>>;
// It is possible that one of the AB is a pointer and one is a tuple.
......@@ -570,10 +565,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
}
else
{
compute_ptr_offset_of_groups_.BatchStrideA_ =
a_g_n_c_wis_strides[0] * NumGroupsToMerge;
compute_ptr_offset_of_groups_.BatchStrideB_ =
b_g_k_c_xs_strides[0] * NumGroupsToMerge;
compute_ptr_offset_of_groups_.BatchStrideA_ = a_g_n_c_wis_strides[0];
compute_ptr_offset_of_groups_.BatchStrideB_ = b_g_k_c_xs_strides[0];
compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_c_wis_strides[1] * conv_N_per_block_;
// p_as and p_bs are pointers
......@@ -590,8 +583,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
// D batch stride
compute_ptr_offset_of_groups_.BatchStrideDs_(i) =
ds_g_n_k_wos_strides[i][0] * NumGroupsToMerge;
compute_ptr_offset_of_groups_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0];
compute_ptr_offset_of_n_.BatchStrideDs_(i) =
ds_g_n_k_wos_strides[i][1] * conv_N_per_block_;
......@@ -610,7 +602,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
ds_grid_desc_m_n_(i) =
DeviceOp::MakeEGridDescriptor_M_N<DLayout>(conv_to_gemm_transformer_d);
});
compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0] * NumGroupsToMerge;
compute_ptr_offset_of_groups_.BatchStrideE_ = e_g_n_k_wos_strides[0];
compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_k_wos_strides[1] * conv_N_per_block_;
// populate desc for Ds/E
......@@ -734,7 +726,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
arg.a_g_n_c_wis_lengths_[I1] / arg.conv_N_per_block_;
const index_t gdx = arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
const index_t gdy = arg.num_group_ / NumGroupsToMerge;
const index_t gdy = arg.num_group_;
const index_t gdz = num_workgroups_per_Conv_N;
const auto K =
......@@ -858,10 +850,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
{
namespace ctc = tensor_layout::convolution;
const index_t G = arg.b_g_k_c_xs_lengths_[I0];
const index_t K = arg.b_g_k_c_xs_lengths_[I1];
const index_t C = arg.b_g_k_c_xs_lengths_[I2];
// check device
if(get_device_name() == "gfx908")
{
......@@ -910,42 +898,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
}
}
}
else if constexpr(ConvForwardSpecialization == ConvolutionForwardSpecialization::Filter3x3)
{
if(C != 1)
{
return false;
}
for(index_t i = 0; i < NDimSpatial; ++i)
{
const index_t filter_spatial_dim = arg.b_g_k_c_xs_lengths_[i + I3];
if(filter_spatial_dim != I3)
{
return false;
}
}
if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC<ALayout, BLayout, ELayout>())
{
return false;
}
}
if constexpr(NumGroupsToMerge > 1)
{
if(!(C == 1))
{
return false;
}
if(G % NumGroupsToMerge != 0)
{
return false;
}
if constexpr(!is_NSpatialGK_GKSpatial_NSpatialGC<ALayout, BLayout, ELayout>())
{
return false;
}
}
// check vector access of A
// FIXME: layout
......@@ -955,16 +907,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<ALayout, ctc::NWGC> || is_same_v<ALayout, ctc::NHWGC> ||
is_same_v<ALayout, ctc::NDHWGC>)
{
// Check access per C
const index_t C = arg.a_g_n_c_wis_lengths_[2];
if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0))
{
// If not possible, check access per G
if(!(ABlockTransferSrcVectorDim == 1 && C == 1 &&
is_NSpatialGK_GKSpatial_NSpatialGC<ALayout, BLayout, ELayout>() &&
G % ABlockTransferSrcScalarPerVector == 0))
{
return false;
}
return false;
}
}
else
......@@ -981,6 +928,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<BLayout, ctc::KZYXGC>)
{
const index_t C = arg.b_g_k_c_xs_lengths_[2];
if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0))
{
return false;
......@@ -1004,6 +953,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
{
const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0))
{
valid = false;
......@@ -1048,6 +999,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
is_same_v<ELayout, ctc::NWGK> || is_same_v<ELayout, ctc::NHWGK> ||
is_same_v<ELayout, ctc::NDHWGK>)
{
const index_t K = arg.e_g_n_k_wos_lengths_[2];
if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0))
{
return false;
......@@ -1345,8 +1298,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
<< BBlockTransferSrcScalarPerVector << ", "
<< CDEBlockTransferScalarPerVector_NPerBlock << ", "
<< CShuffleMXdlPerWavePerShuffle << ", "
<< CShuffleNXdlPerWavePerShuffle << ", "
<< NumGroupsToMerge
<< CShuffleNXdlPerWavePerShuffle
<< ">";
// clang-format on
......
......@@ -44,7 +44,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
// convert to bitwise
using T_bitwise = typename NumericUtils<X>::bitwise_type;
T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));
T_bitwise x_bitwise = bit_cast<T_bitwise>(x);
// unpack the input, depends on datatype
head = x_bitwise & NumericUtils<X>::head_mask;
......@@ -196,18 +196,17 @@ __host__ __device__ Y run_cast_from_f8(X x)
// prepare the codes
constexpr X nan_code = 0x80;
Y Inf, NegInf, NaN, Neg0;
using T_bitwise = typename NumericUtils<Y>::bitwise_type;
using T_bitwise = typename NumericUtils<Y>::bitwise_type;
constexpr T_bitwise Inf_bitwise = NumericUtils<Y>::Inf;
constexpr T_bitwise NegInf_bitwise = NumericUtils<Y>::NegInf;
constexpr T_bitwise NaN_bitwise = NumericUtils<Y>::NaN;
constexpr T_bitwise Neg0_bitwise = NumericUtils<Y>::Neg0;
Inf = *(reinterpret_cast<const Y*>(&Inf_bitwise));
NegInf = *(reinterpret_cast<const Y*>(&NegInf_bitwise));
NaN = *(reinterpret_cast<const Y*>(&NaN_bitwise));
Neg0 = *(reinterpret_cast<const Y*>(&Neg0_bitwise));
constexpr Y Inf = bit_cast<Y>(Inf_bitwise);
constexpr Y NegInf = bit_cast<Y>(NegInf_bitwise);
constexpr Y NaN = bit_cast<Y>(NaN_bitwise);
constexpr Y Neg0 = bit_cast<Y>(Neg0_bitwise);
// check if x is 0.0
if(x == 0)
......@@ -240,7 +239,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
{
retval = x;
retval <<= 8;
return *(reinterpret_cast<const Y*>(&retval));
return bit_cast<Y>(retval);
}
// subnormal input
......@@ -264,7 +263,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
}
retval = (sign << (out_exp + out_mant)) | (exponent << out_mant) | mantissa;
return *(reinterpret_cast<const Y*>(&retval));
return bit_cast<Y>(retval);
}
} // namespace
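The reinterpret_cast-and-dereference pattern replaced throughout this file reads one type's storage through an lvalue of another type, which violates C++ strict-aliasing rules; bit_cast instead copies the object representation into the target type. A minimal sketch of the difference, using C++20 std::bit_cast (CK's ck::bit_cast used above is the in-tree equivalent):

    #include <bit>
    #include <cstdint>

    constexpr std::uint32_t float_bits(float f)
    {
        // UB alternative: *reinterpret_cast<std::uint32_t*>(&f)
        // bit_cast copies the 4 bytes of f into a fresh uint32_t, no aliasing violation
        return std::bit_cast<std::uint32_t>(f); // float_bits(1.0f) == 0x3f800000
    }

Because bit_cast is usable in constant expressions while reinterpret_cast is not, the Inf/NegInf/NaN/Neg0 values above can now also become constexpr.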
......