Merge branch 'develop' into ck_codegen_build

c136e9b2 · arai713 · GitHub · 3068d5c0 · ae3b8ff8 · c136e9b2
Unverified Commit c136e9b2 authored Aug 08, 2024 by arai713 Committed by GitHub Aug 08, 2024
13 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -189,7 +189,9 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
   message("Adding the enable-post-misched=0 compiler flag")
   add_compile_options("SHELL: -mllvm -enable-post-misched=0")
 endif()
-if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
+set(check-coerce)
+check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
+if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
   message("Adding the amdgpu-coerce-illegal-types=1")
   add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
 endif()

--- a/Dockerfile
+++ b/Dockerfile
 FROM ubuntu:20.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.1
+ARG ROCMVERSION=6.2
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
@@ -17,17 +17,12 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
-RUN if [ "$ROCMVERSION" != "6.2" ]; then \
+RUN if [ "$ROCMVERSION" != "6.3" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/6.1/ubuntu/focal/amdgpu-install_6.1.60100-1_all.deb  --no-check-certificate" && \
+        sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb  --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.1.60100-1_all.deb && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc4" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.2-20.04-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.2-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-63 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=2009461; \
    fi
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
@@ -64,6 +59,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    python3-dev \
    python3-pip \
    redis \
+    rocm-llvm-dev \
    sshpass \
    stunnel \
    software-properties-common \

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -38,7 +38,7 @@ def getDockerImageName(){
        img = "${params.USE_CUSTOM_DOCKER}"
    }
    else{
-    if (params.ROCMVERSION != "6.2"){
+    if (params.ROCMVERSION != "6.3"){
       if (params.COMPILER_VERSION == "") {
           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
       }
@@ -285,6 +285,19 @@ def cmake_build(Map conf=[:]){
    if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
        archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
    }
+    if (params.RUN_CK_TILE_TESTS){
+        try{
+            archiveArtifacts "perf_fmha_fwd_*.log"
+            archiveArtifacts "perf_fmha_bwd_*.log"
+            stash name: "perf_fmha_fwd_gfx942.log"
+            stash name: "perf_fmha_bwd_gfx942.log"
+            stash name: "perf_fmha_fwd_gfx90a.log"
+            stash name: "perf_fmha_bwd_gfx90a.log"
+        }
+        catch(Exception err){
+            echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
+        }
+    }
 }
 def buildHipClangJob(Map conf=[:]){
@@ -612,6 +625,17 @@ def process_results(Map conf=[:]){
        timeout(time: 1, unit: 'HOURS'){
            try{
                dir("script"){
+                    if (params.RUN_CK_TILE_TESTS){
+                        try{
+                            unstash "perf_fmha_fwd_gfx942.log"
+                            unstash "perf_fmha_bwd_gfx942.log"
+                            unstash "perf_fmha_fwd_gfx90a.log"
+                            unstash "perf_fmha_bwd_gfx90a.log"
+                        }
+                        catch(Exception err){
+                            echo "could not locate the FMHA performance logs: ${err.getMessage()}."
+                        }
+                    }
                    if (params.RUN_FULL_QA){
                        // unstash perf files to master
                        unstash "ckprofiler_0.2.0_amd64.deb"
@@ -652,8 +676,8 @@ def process_results(Map conf=[:]){
 }
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1; RUN_CK_TILE_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
-                                              0 21 * * * % ROCMVERSION=6.1;hipTensor_test=true
+                                              0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
                                              0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : ""
@@ -677,8 +701,8 @@ pipeline {
            description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
        string(
            name: 'ROCMVERSION', 
-            defaultValue: '6.1', 
+            defaultValue: '6.2', 
-            description: 'Specify which ROCM version to use: 6.1 (default).')
+            description: 'Specify which ROCM version to use: 6.2 (default).')
        string(
            name: 'COMPILER_VERSION', 
            defaultValue: '', 
@@ -821,8 +845,7 @@ pipeline {
                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
                                           make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
                                           cd ../ &&
-                                           example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
+                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
-                                           example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
                   }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -841,8 +864,7 @@ pipeline {
                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
                                           make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
                                           cd ../ &&
-                                           example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
+                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
-                                           example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
                   }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)

--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.6.1
+rocm-docs-core==1.6.2
 sphinxcontrib-bibtex==2.6.2
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.6.1
+rocm-docs-core==1.6.2
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/example/ck_tile/01_fmha/script/run_full_test.sh
+++ b/example/ck_tile/01_fmha/script/run_full_test.sh
+#!/bin/bash 
+#
+# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_eaxmple_fmha_bwd executables in ../build/bin/
+#
+# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
+# input arguments: 
+# environment tag  : a string describing the specifics of your test environment
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# host name        : $hostname
+# gpu architecture: e.g., gfx90a, or gfx942, etc.
+#get the command line arguments:
+export env_type=$1
+echo 'Environment type: ' $env_type
+export branch=$2
+echo 'Branch name: ' $branch
+export host_name=$3
+echo 'Host name: ' $host_name
+export GPU_arch=$4
+echo 'GPU_arch: ' $GPU_arch
+function print_log_header(){
+	rm -f $1;
+	echo 'On branch ' $3 &> $1;
+	echo 'Node name: ' $4 >> $1;
+	#get GPU_arch and number of compute units from rocminfo
+	echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
+	rocminfo | grep "Compute Unit:" >> $1;
+	hipcc --version | grep -e 'HIP version'  >> $1;
+	echo 'Environment type: ' $2 >> $1;
+	/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
+}
+#run verification tests
+example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+#run performance benchmarks
+export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
+print_log_header $fmha_fwd_log $env_type $branch $host_name
+example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
+export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log"
+print_log_header $fmha_bwd_log $env_type $branch $host_name
+example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log
--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -44,7 +44,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
    // convert to bitwise
    using T_bitwise     = typename NumericUtils<X>::bitwise_type;
-    T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));
+    T_bitwise x_bitwise = bit_cast<T_bitwise>(x);
    // unpack the input, depends on datatype
    head     = x_bitwise & NumericUtils<X>::head_mask;
@@ -196,18 +196,17 @@ __host__ __device__ Y run_cast_from_f8(X x)
    // prepare the codes
    constexpr X nan_code = 0x80;
-    Y Inf, NegInf, NaN, Neg0;
+    using T_bitwise      = typename NumericUtils<Y>::bitwise_type;
-    using T_bitwise = typename NumericUtils<Y>::bitwise_type;
    constexpr T_bitwise Inf_bitwise    = NumericUtils<Y>::Inf;
    constexpr T_bitwise NegInf_bitwise = NumericUtils<Y>::NegInf;
    constexpr T_bitwise NaN_bitwise    = NumericUtils<Y>::NaN;
    constexpr T_bitwise Neg0_bitwise   = NumericUtils<Y>::Neg0;
-    Inf    = *(reinterpret_cast<const Y*>(&Inf_bitwise));
+    constexpr Y Inf    = bit_cast<Y>(Inf_bitwise);
-    NegInf = *(reinterpret_cast<const Y*>(&NegInf_bitwise));
+    constexpr Y NegInf = bit_cast<Y>(NegInf_bitwise);
-    NaN    = *(reinterpret_cast<const Y*>(&NaN_bitwise));
+    constexpr Y NaN    = bit_cast<Y>(NaN_bitwise);
-    Neg0   = *(reinterpret_cast<const Y*>(&Neg0_bitwise));
+    constexpr Y Neg0   = bit_cast<Y>(Neg0_bitwise);
    // check if x is 0.0
    if(x == 0)
@@ -240,7 +239,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
    {
        retval = x;
        retval <<= 8;
-        return *(reinterpret_cast<const Y*>(&retval));
+        return bit_cast<Y>(retval);
    }
    // subnormal input
@@ -264,7 +263,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
    }
    retval = (sign << (out_exp + out_mant)) | (exponent << out_mant) | mantissa;
-    return *(reinterpret_cast<const Y*>(&retval));
+    return bit_cast<Y>(retval);
 }
 } // namespace

--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -46,8 +46,10 @@ if(GPU_TARGETS MATCHES "gfx9")
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
  endif()
  list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
-  list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
+  if(GPU_TARGETS MATCHES "gfx94")
-  list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
+  endif()
  list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
  list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
@@ -128,8 +130,10 @@ if(GPU_TARGETS MATCHES "gfx9")
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
-  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
+  if(GPU_TARGETS MATCHES "gfx94")
-  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
+  endif()
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)

--- a/python/ck4inductor/universal_gemm/gen_instances.py
+++ b/python/ck4inductor/universal_gemm/gen_instances.py
@@ -62,17 +62,13 @@ def parse_instances(str_instances: List[str]) -> List[CKGemmOperation]:
                    i_current = i_next + 1
            if i_next == -1:
                break
-        # pad with `None`s for the fields which are not defined in the instance
+        template_args.insert(2, tuple())  # ds layout
+        template_args.insert(6, tuple())  # ds dtype
        new_instance = CKGemmOperation(
            *template_args,  # type: ignore[arg-type]
-            *((None,) * (len(fields(CKGemmOperation)) - len(template_args))),
        )
-        # the last 2 template parameters are optional
-        # if they are absent, substitute them with default values from Universal Gemm C++ template declaration
-        if new_instance.a_compute_dtype is None:
-            new_instance.a_compute_dtype = new_instance.c_element_dtype
-        if new_instance.b_compute_dtype is None:
-            new_instance.b_compute_dtype = new_instance.c_element_dtype
        op_instances.append(new_instance)
    return op_instances
@@ -208,6 +204,8 @@ def gen_ops_preselected() -> List[CKGemmOperation]:
        a_layout="Row",
        b_layout="Col",
        c_layout="Row",
+        ds_element_dtypes=tuple(),
+        ds_layouts=tuple(),
        a_element_dtype="F16",
        b_element_dtype="F16",
        c_element_dtype="F16",

--- a/python/ck4inductor/universal_gemm/op.py
+++ b/python/ck4inductor/universal_gemm/op.py
@@ -10,10 +10,12 @@ class CKGemmOperation:
    a_layout: str
    b_layout: str
+    ds_layouts: Tuple[str]  # addmm specific
    c_layout: str
    a_element_dtype: str
    b_element_dtype: str
+    ds_element_dtypes: Tuple[str]  # addmm specific
    c_element_dtype: str
    acc_dtype: str
@@ -64,16 +66,15 @@ class CKGemmOperation:
        Tuple[int, int, int, int]
    )
    c_shuffle_block_transfer_scalar_per_vector_n_per_block: int
    block_gemm_pipeline_scheduler: str
-    block_gemm_pipeline_version: Optional[str]
+    block_gemm_pipeline_version: str
-    a_compute_dtype: Optional[str]
+    a_compute_dtype: Optional[str] = None
-    b_compute_dtype: Optional[str]
+    b_compute_dtype: Optional[str] = None
    def name(self):
        # cpp alias for template instance
-        return f"ck_devicegemm_xdl_shuffle_v3_{self.key_name()}"
+        return f"ck_devicegemm_multid_xdl_shuffle_v3_{self.key_name()}"
    def key_name(self):
        # TBD; must be unique per instance. Intended to use as dict key

--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -143,6 +143,12 @@ def parse_logfile(logfile):
            if 'Best Perf' in line:
                lst=line.split()
                res.append(lst[36])
+    elif 'perf_fmha' in logfile:
+        for line in open(logfile):
+            if 'TFlops' in line:
+                lst=line.split()
+                line_dict=dict(zip(lst[1:],lst))
+                res.append(line_dict['TFlops,'])
    return res
@@ -304,6 +310,14 @@ def main():
            for i in range(1,len(results)+1):
                testlist.append("Test%i"%i)
            table_name="ck_mixed_gemm_tflops"
+        if 'fmha_fwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_fwd_tflops"
+        if 'fmha_bwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_bwd_tflops"
        tflops_base = get_baseline(table_name,conn)
        store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)

--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -13,3 +13,20 @@
 python3 process_perf_data.py perf_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -21,3 +21,20 @@ python3 process_perf_data.py perf_gemm_bilinear.log
 python3 process_perf_data.py perf_reduction.log
 python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi