Re-enable the performance tracking in CI. (#1203)

* test CK with rocm6.1 RC2
* add docker credentials for pull
* update the performance db name
* use environment variable for db name
* add rocm-llvm-dev package to ck docker
* turn off verification for daily performance runs
* do not stash ckProfiler on MI300 node
* add processing of mixed gemms to qa, fix parsing of splitk gemm logs
* fix the splitk gemm log file name
* turn the timing on for splitk gemm performance

Re-enable the performance tracking in CI. (#1203)
* test CK with rocm6.1 RC2
* add docker credentials for pull
* update the performance db name
* use environment variable for db name
* add rocm-llvm-dev package to ck docker
* turn off verification for daily performance runs
* do not stash ckProfiler on MI300 node
* add processing of mixed gemms to qa, fix parsing of splitk gemm logs
* fix the splitk gemm log file name
* turn the timing on for splitk gemm performance
bdcd0374 · Illia Silin · GitHub · e626d520 · bdcd0374 · bdcd0374
Unverified Commit bdcd0374 authored Mar 18, 2024 by Illia Silin Committed by GitHub Mar 18, 2024
Showing with 52 additions and 49 deletions

Dockerfile Dockerfile +11 -8

Jenkinsfile Jenkinsfile +26 -21

script/process_perf_data.py script/process_perf_data.py +7 -2

script/run_full_performance_tests.sh script/run_full_performance_tests.sh +8 -18

No files found.
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,17 +16,17 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg

-RUN if [ "$ROCMVERSION" != "6.0.1" ]; then \
+RUN if [ "$ROCMVERSION" != "6.1" ]; then \
        sh -c "wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb  --no-check-certificate" && \
        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb && \
        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.0.1" ] && [ "$compiler_version" = "rc1" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.0-20.04-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.0-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.0.1 rel-95 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=1704947; \
+    elif [ "$ROCMVERSION" = "6.1" ] && [ "$compiler_version" = "rc2" ]; then \
+        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.1-20.04-1_all.deb --no-check-certificate" && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.1-20.04-1_all.deb && \
+        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.1 rel-48 > /etc/apt/sources.list.d/rocm-build.list' && \
+        amdgpu-repo --amdgpu-build=1736298; \
    fi

 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
@@ -41,6 +41,7 @@ chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
 ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}

 # Install dependencies
+# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    build-essential \
    cmake \
@@ -60,6 +61,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    python3-dev \
    python3-pip \
    redis \
+    rocm-llvm-dev \
    sshpass \
    stunnel \
    software-properties-common \
@@ -73,6 +75,9 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

+# Update the cmake to version 3.27.5
+RUN pip install --upgrade cmake==3.27.5
+
 #Install latest ccache
 RUN git clone https://github.com/ccache/ccache.git && \
    cd ccache && mkdir build && cd build && cmake .. && make install
@@ -82,8 +87,6 @@ RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releas
 RUN gunzip /usr/local/bin/ninja.gz
 RUN chmod a+x /usr/local/bin/ninja
 RUN git clone https://github.com/nico/ninjatracing.git
-# Update the cmake to the latest version
-RUN pip install --upgrade cmake==3.27.5

 #Install latest cppcheck
 RUN git clone https://github.com/danmar/cppcheck.git && \

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -38,7 +38,7 @@ def getDockerImageName(){
        img = "${params.USE_CUSTOM_DOCKER}"
    }
    else{
-    if (params.ROCMVERSION != "6.0.1"){
+    if (params.ROCMVERSION != "6.1"){
       if (params.COMPILER_VERSION == "") {
           img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
       }
@@ -117,8 +117,10 @@ def getDockerImage(Map conf=[:]){
    {
        echo "Pulling down image: ${image}"
        retimage = docker.image("${image}")
+        withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) {
            retimage.pull()
        }
+    }
    catch(Exception ex)
    {
        error "Unable to locate image: ${image}"
@@ -406,7 +408,7 @@ def runCKProfiler(Map conf=[:]){

 					dir("script"){
                        if (params.RUN_FULL_QA){
-                            sh "./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
+                            sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
                            archiveArtifacts "perf_gemm.log"
                            archiveArtifacts "perf_resnet50_N256.log"
                            archiveArtifacts "perf_resnet50_N4.log"
@@ -416,9 +418,9 @@ def runCKProfiler(Map conf=[:]){
                            archiveArtifacts "perf_conv_bwd_data.log"
                            archiveArtifacts "perf_gemm_bilinear.log"
                            archiveArtifacts "perf_reduction.log"
-                            archiveArtifacts "perf_splitK_gemm_verify.log"
                            archiveArtifacts "perf_splitK_gemm.log"
                            archiveArtifacts "perf_onnx_gemm.log"
+                            archiveArtifacts "perf_mixed_gemm.log"
                           // stash perf files to master
                            stash name: "perf_gemm.log"
                            stash name: "perf_resnet50_N256.log"
@@ -431,6 +433,7 @@ def runCKProfiler(Map conf=[:]){
                            stash name: "perf_reduction.log"
                            stash name: "perf_splitK_gemm.log"
                            stash name: "perf_onnx_gemm.log"
+                            stash name: "perf_mixed_gemm.log"
                            //we will process results on the master node
                        }
                        else{
@@ -493,9 +496,6 @@ def Build_CK(Map conf=[:]){

        def variant = env.STAGE_NAME
        def retimage
-        def navi_node = 0
-        def mi300_node = 0
-
        gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
            try {
                (retimage, image) = getDockerImage(conf)
@@ -508,14 +508,6 @@ def Build_CK(Map conf=[:]){
                        else{
                            echo "GPU is OK"
                        }
-                        if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') ){
-                            navi_node = 1
-                            echo "This is a Navi node"
-                        }
-                        if ( runShell('grep -n "gfx942" rocminfo.log') ){
-                            mi300_node = 1
-                            echo "This is MI300 node"
-                        }
                    }
                }
            }
@@ -526,15 +518,27 @@ def Build_CK(Map conf=[:]){
            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
                timeout(time: 24, unit: 'HOURS')
                {
+                    //check whether running on Navi or MI300 node
+                    def navi_node = 0
+                    def mi300_node = 0
+                    sh 'rocminfo | tee rocminfo.log'
+                    if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') ){
+                        navi_node = 1
+                        echo "This is a Navi node"
+                    }
+                    if ( runShell('grep -n "gfx942" rocminfo.log') ){
+                        mi300_node = 1
+                        echo "This is MI300 node"
+                    }
                    cmake_build(conf)
                    dir("build"){
                        //run tests and examples
                        sh 'make -j check'
-                        if (navi_node == 0 ){
+                        if (params.RUN_PERFORMANCE_TESTS && navi_node == 0 && mi300_node == 0 ){
                            //we only need the ckProfiler to run the performance tests, so we pack and stash it
-                            //do not stash profiler on Navi nodes
+                            //do not stash profiler on Navi or MI300 nodes
                           sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
-                           stash "ckProfiler.tar.gz"
+                           stash name: "ckProfiler.tar.gz"
                        }
                        if (params.RUN_FULL_QA && mi300_node == 0 ){
                           // build deb packages for all MI100/200/300 targets and prepare to export
@@ -542,7 +546,7 @@ def Build_CK(Map conf=[:]){
                           archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
                           archiveArtifacts artifacts: 'composablekernel-tests_*.deb'
                           sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
-                           stash "ckprofiler_0.2.0_amd64.deb"
+                           stash name: "ckprofiler_0.2.0_amd64.deb"
                        }
                    }
                    if (params.hipTensor_test && navi_node == 0 ){
@@ -629,6 +633,7 @@ def process_results(Map conf=[:]){
                        unstash "perf_reduction.log"
                        unstash "perf_splitK_gemm.log"
                        unstash "perf_onnx_gemm.log"
+                        unstash "perf_mixed_gemm.log"
                        sh "./process_qa_data.sh"
                        unstash "ckprofiler_0.2.0_amd64.deb"
                        sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
@@ -716,8 +721,8 @@ pipeline {
            description: "Run the cppcheck static analysis (default: OFF)")
        booleanParam(
            name: "RUN_PERFORMANCE_TESTS",
-            defaultValue: false,
-            description: "Run the performance tests (default: OFF)")
+            defaultValue: true,
+            description: "Run the performance tests (default: ON)")
        booleanParam(
            name: "RUN_CODEGEN_TESTS",
            defaultValue: true,

--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -133,11 +133,16 @@ def parse_logfile(logfile):
            if 'Best Perf' in line:
                lst=line.split()
                res.append(lst[4])
-    elif 'onnx_gemm' in logfile or 'splitK_gemm' in logfile or 'mixed_gemm' in logfile:
+    elif 'onnx_gemm' in logfile or 'mixed_gemm' in logfile:
        for line in open(logfile):
            if 'Best Perf' in line:
                lst=line.split()
                res.append(lst[33])
+    elif 'splitK_gemm' in logfile:
+        for line in open(logfile):
+            if 'Best Perf' in line:
+                lst=line.split()
+                res.append(lst[36])
    return res


@@ -231,7 +236,7 @@ def main():
    sql_hostname = '127.0.0.1'
    sql_username = os.environ["dbuser"]
    sql_password = os.environ["dbpassword"]
-    sql_main_database = 'miopen_perf'
+    sql_main_database = os.environ["ck_perf_db"]
    sql_port = 3306
    ssh_host = os.environ["dbsship"]
    ssh_user = os.environ["dbsshuser"]

--- a/script/run_full_performance_tests.sh
+++ b/script/run_full_performance_tests.sh
@@ -121,26 +121,16 @@ print_log_header $reduction_log $env_type $branch $host_name
 ./profile_reduce_no_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log

 #run splitK_gemm tests, first correctness verification, then performance
-export splitK_gemm_ver_log="perf_splitK_gemm_verify.log"
-print_log_header $splitK_gemm_ver_log $env_type $branch $host_name
-./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
-./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
-./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
-./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
-./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
-./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
-./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
-./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
 export splitK_gemm_log="perf_splitK_gemm.log"
 print_log_header $splitK_gemm_log $env_type $branch $host_name
-./profile_splitK_gemm.sh gemm_splitk 0 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log

 #run ONNX gemm tests
 export onnx_log="perf_onnx_gemm.log"