"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "246ceee49e7a12c7b0298a01f83c959fd130770a"
Unverified Commit 73a076ee authored by Thomas Ning's avatar Thomas Ning Committed by GitHub
Browse files

Ck tile/gemm perf measure (#1750)



* Finished adding the performance benchmark for ck tile gemm

* Fix the executable rename problem

* fix the executable name error

* delete the unsupported layout combinations

* Update run_full_test.sh

* Update benchmark_mem_pipeline.sh

* Update benchmark_basic.sh

* change the executable of gemm_universal

* change ck_tile_gemm script permissions

* Addressed the comment

* Addressed the comment

* Fixed the comments

* Fixed Comment

* roll back the malfunctioned change

* Fix the Typo

* finalize the tile_gemm_fp16 performance monitoring

* fix the stash names for ck_tile gemm logs

* change the stashing logic

* change stashing syntax

---------
Co-authored-by: default avatarIllia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: default avatarillsilin <Illia.Silin@amd.com>
parent 26b3829c
...@@ -326,12 +326,38 @@ def cmake_build(Map conf=[:]){ ...@@ -326,12 +326,38 @@ def cmake_build(Map conf=[:]){
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) { if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
} }
//check the node gpu architecture
def arch_type = 0
sh 'rocminfo | tee rocminfo.log'
if ( runShell('grep -n "gfx90a" rocminfo.log') ){
arch_type = 1
}
else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
arch_type = 2
}
if (params.RUN_CK_TILE_FMHA_TESTS){ if (params.RUN_CK_TILE_FMHA_TESTS){
try{ try{
archiveArtifacts "perf_fmha_fwd_*.log" archiveArtifacts "perf_fmha_*.log"
archiveArtifacts "perf_fmha_bwd_*.log" if (arch_type == 1){
stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a"
stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" }
else if (arch_type == 2){
stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942"
}
}
catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
}
}
if (params.RUN_CK_TILE_GEMM_TESTS){
try{
archiveArtifacts "perf_tile_gemm_*.log"
if (arch_type == 1){
stash includes: "perf_tile_gemm_**_fp16_gfx90a.log", name: "perf_tile_gemm_log_gfx90a"
}
else if (arch_type == 2){
stash includes: "perf_tile_gemm_**_fp16_gfx942.log", name: "perf_tile_gemm_log_gfx942"
}
} }
catch(Exception err){ catch(Exception err){
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
...@@ -630,6 +656,15 @@ def process_results(Map conf=[:]){ ...@@ -630,6 +656,15 @@ def process_results(Map conf=[:]){
echo "could not locate the FMHA performance logs: ${err.getMessage()}." echo "could not locate the FMHA performance logs: ${err.getMessage()}."
} }
} }
if (params.RUN_CK_TILE_GEMM_TESTS){
try{
unstash "perf_tile_gemm_log_gfx942"
unstash "perf_tile_gemm_log_gfx90a"
}
catch(Exception err){
echo "could not locate the GEMM performance logs: ${err.getMessage()}."
}
}
if (params.RUN_FULL_QA){ if (params.RUN_FULL_QA){
// unstash perf files to master // unstash perf files to master
unstash "ckprofiler_0.2.0_amd64.deb" unstash "ckprofiler_0.2.0_amd64.deb"
...@@ -956,7 +991,7 @@ pipeline { ...@@ -956,7 +991,7 @@ pipeline {
environment{ environment{
setup_args = "NO_CK_BUILD" setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_gemm_basic && \ make -j64 tile_example_gemm_basic tile_example_gemm_universal && \
cd ../ && cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
} }
...@@ -975,7 +1010,7 @@ pipeline { ...@@ -975,7 +1010,7 @@ pipeline {
environment{ environment{
setup_args = "NO_CK_BUILD" setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_gemm_basic && \ make -j64 tile_example_gemm_basic tile_example_gemm_universal && \
cd ../ && cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
} }
......
add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
add_executable(tile_example_universal_gemm EXCLUDE_FROM_ALL universal_gemm.cpp) add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
...@@ -11,9 +11,9 @@ sh ../script/cmake-ck-dev.sh ../ <arch> ...@@ -11,9 +11,9 @@ sh ../script/cmake-ck-dev.sh ../ <arch>
# The basic pipeline method on the gemm calculation # The basic pipeline method on the gemm calculation
make tile_example_gemm_basic -j make tile_example_gemm_basic -j
# The memory bound pipeline on the gemm calculation # The memory bound pipeline on the gemm calculation
make tile_example_gemm_mem_pipeline -j make tile_example_gemm_universal -j
``` ```
This will result in an executable `build/bin/tile_example_gemm_basic` This will result in an executable `build/bin/tile_example_gemm_basic` & `build/bin/tile_example_gemm_universal`
## example ## example
``` ```
...@@ -22,6 +22,9 @@ args: ...@@ -22,6 +22,9 @@ args:
-m m dimension (default:1024) -m m dimension (default:1024)
-n n dimension (default:2048) -n n dimension (default:2048)
-k k dimension (default:64) -k k dimension (default:64)
-a_layout Tensor A data layout (default: R)
-b_layout Tensor B data layout (default: R)
-c_layout Tensor C data layout (default: R)
-stride_a Tensor A stride (default:0) -stride_a Tensor A stride (default:0)
-stride_b Tensor B stride (default:0) -stride_b Tensor B stride (default:0)
-stride_c Tensor C stride (default:0) -stride_c Tensor C stride (default:0)
......
#!/bin/sh
# Performance sweep for the basic ck_tile GEMM example in fp16.
# Finds the tile_example_gemm_basic binary below the current directory and
# runs it once for every (B layout, m, n, k) combination listed below, with
# the A and C layouts fixed to "R".

EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"

# If nothing was found, an empty $EXE would make the shell try to execute the
# first argument ("-prec=fp16") as a command for every combination. Fail once,
# with a clear message, instead.
if [ -z "$EXE" ]; then
    echo "Error: tile_example_gemm_basic executable not found under $(pwd)" >&2
    exit 1
fi

# Forwarded as -v. NOTE(review): presumably 0 turns result validation off - confirm.
VALID=0

for b_matrix_layout in "R" "C"; do
    for m in "64" "512" "1024" "2048"; do
        for n in "512" "1024" "2048"; do
            for k in "64" "512" "1024" "2048"; do
                # Quote the path so a build directory containing spaces still works.
                "$EXE" -prec=fp16 -b=1 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID
            done
        done
    done
done
#!/bin/sh
# Performance sweep for the universal (memory-pipeline) ck_tile GEMM example
# in fp16. Finds the tile_example_gemm_universal binary below the current
# directory and runs it once for every (B layout, m, n, k) combination listed
# below, with the A and C layouts fixed to "R".

EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)"

# If nothing was found, an empty $EXE would make the shell try to execute the
# first argument ("-prec=fp16") as a command for every combination. Fail once,
# with a clear message, instead.
if [ -z "$EXE" ]; then
    echo "Error: tile_example_gemm_universal executable not found under $(pwd)" >&2
    exit 1
fi

# Forwarded as -v. NOTE(review): presumably 0 turns result validation off - confirm.
VALID=0

for b_matrix_layout in "R" "C"; do
    for m in "64" "512" "1024" "2048"; do
        for n in "512" "1024" "2048"; do
            for k in "64" "512" "1024" "2048"; do
                # Quote the path so a build directory containing spaces still works.
                "$EXE" -prec=fp16 -b=1 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID
            done
        done
    done
done
...@@ -19,7 +19,27 @@ echo 'Host name: ' $host_name ...@@ -19,7 +19,27 @@ echo 'Host name: ' $host_name
export GPU_arch=$4 export GPU_arch=$4
echo 'GPU_arch: ' $GPU_arch echo 'GPU_arch: ' $GPU_arch
# Start a fresh log file and write the run-metadata header into it.
#   $1 - path of the log file; any existing file is removed first
#   $2 - environment type label
#   $3 - branch name
#   $4 - node name
# All expansions are quoted so that a value containing whitespace or glob
# characters cannot split into several words or turn the redirect target
# into an "ambiguous redirect".
function print_log_header(){
    rm -f "$1";
    # The first write truncates/creates the file; everything after appends.
    echo 'On branch ' "$3" &> "$1";
    echo 'Node name: ' "$4" >> "$1";
    # get GPU architecture and compute units from rocminfo
    echo -n "GPU_arch: " >> "$1"; rocminfo | grep "Name:" | grep "gfx" >> "$1";
    rocminfo | grep "Compute Unit:" >> "$1";
    hipcc --version | grep -e 'HIP version' >> "$1";
    echo 'Environment type: ' "$2" >> "$1";
    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$1";
}
# run verification tests # run verification tests
example/ck_tile/03_gemm/script/smoke_test.sh example/ck_tile/03_gemm/script/smoke_test_basic.sh
example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
# run performance benchmarks
export gemm_basic_log="perf_tile_gemm_basic_fp16_$GPU_arch.log"
print_log_header $gemm_basic_log $env_type $branch $host_name
example/ck_tile/03_gemm/script/benchmark_basic.sh 2>&1 | tee -a $gemm_basic_log
# We do not have a performance benchmark for gemm yet. Will add it in the future. export gemm_mem_pipeline_log="perf_tile_gemm_mem_pipeline_fp16_$GPU_arch.log"
\ No newline at end of file print_log_header $gemm_mem_pipeline_log $env_type $branch $host_name
example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh 2>&1 | tee -a $gemm_mem_pipeline_log
...@@ -32,4 +32,4 @@ set -x ...@@ -32,4 +32,4 @@ set -x
run_fp16_tests run_fp16_tests
set +x set +x
\ No newline at end of file
#!/bin/bash
# Smoke test for the universal ck_tile GEMM example.
# Runs tile_example_gemm_universal in fp16 over a small grid of
# batch/m/n/k values and prints a Success/Error line per run based on the
# executable's exit status. A failing run does not stop the sweep and is not
# reflected in this script's own exit status.

# First matching binary found below the current directory (empty if none).
EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)"
# NOTE(review): KNAME is assigned but never read in this script - confirm it can go.
KNAME=1
# Exported for child processes. NOTE(review): presumably read by the example
# to control warm-up and repeat counts - confirm against the executable.
export CK_WARMUP=0
export CK_REPEAT=1
# Arguments appended to every invocation; left unquoted at the call site on
# purpose so the string splits into three separate arguments.
COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
# Run the executable for every combination of batch (1, 2), m (128, 1024),
# n (128, 2048) and k (32, 64) and report the outcome of each run.
run_fp16_tests() {
for batch in 1 2; do
for m in 128 1024; do
for n in 128 2048; do
for k in 32 64; do
# Strides are passed as 0 for all three tensors; $? below refers to this command.
$EXE -b=$batch -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -e=1e-5 -prec=fp16 $COMMON_ARGS
if [ $? -eq 0 ]; then
echo "Success: Test with batch=$batch, m=$m, n=$n, k=$k executed successfully."
else
echo "Error: Test with batch=$batch, m=$m, n=$n, k=$k failed to execute properly."
# Optionally, exit or break if you need to halt further execution
# exit 1
fi
done
done
done
done
}
# Trace every command while the tests run, then switch tracing back off.
set -x
run_fp16_tests
set +x
...@@ -149,6 +149,12 @@ def parse_logfile(logfile): ...@@ -149,6 +149,12 @@ def parse_logfile(logfile):
lst=line.split() lst=line.split()
line_dict=dict(zip(lst[1:],lst)) line_dict=dict(zip(lst[1:],lst))
res.append(line_dict['TFlops,']) res.append(line_dict['TFlops,'])
elif 'perf_tile_gemm_basic' in logfile or 'perf_tile_gemm_mem_pipeline' in logfile:
for line in open(logfile):
if 'TFlops' in line:
lst=line.split()
line_dict=dict(zip(lst[1:],lst))
res.append(line_dict['TFlops,'])
return res return res
...@@ -330,6 +336,14 @@ def main(): ...@@ -330,6 +336,14 @@ def main():
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_fmha_bwd_tflops" table_name="ck_fmha_bwd_tflops"
if 'gemm_basic_fp16' in filename:
for i in range(1, len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_tile_gemm_basic_fp16_tflops"
if 'gemm_mem_pipeline_fp16' in filename:
for i in range(1, len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_tile_gemm_mem_pipeline_fp16_tflops"
tflops_base = get_baseline(table_name,conn) tflops_base = get_baseline(table_name,conn)
store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine) store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine)
......
...@@ -43,3 +43,19 @@ file=./perf_fmha_bwd_gfx90a.log ...@@ -43,3 +43,19 @@ file=./perf_fmha_bwd_gfx90a.log
if [ -e "$file" ]; then if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx90a.log python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
fi fi
# Hand every ck_tile GEMM fp16 performance log that exists in the current
# directory to process_perf_data.py, in a fixed order; absent logs are skipped.
for file in \
    ./perf_tile_gemm_basic_fp16_gfx942.log \
    ./perf_tile_gemm_basic_fp16_gfx90a.log \
    ./perf_tile_gemm_mem_pipeline_fp16_gfx942.log \
    ./perf_tile_gemm_mem_pipeline_fp16_gfx90a.log
do
    if [ -e "$file" ]; then
        # The Python script receives the bare file name, without the "./" prefix.
        python3 process_perf_data.py "${file#./}"
    fi
done
...@@ -52,3 +52,19 @@ file=./perf_fmha_bwd_gfx90a.log ...@@ -52,3 +52,19 @@ file=./perf_fmha_bwd_gfx90a.log
if [ -e "$file" ]; then if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx90a.log python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
fi fi
# Hand every ck_tile GEMM fp16 performance log that exists in the current
# directory to process_perf_data.py; absent logs are skipped.
# The names must be the ones the ck_tile GEMM benchmark run writes,
# perf_tile_gemm_<variant>_fp16_<arch>.log. The previous
# perf_gemm_<variant>_<arch>.log names never matched a generated file, and
# process_perf_data.py would not have recognised them either.
file=./perf_tile_gemm_basic_fp16_gfx942.log
if [ -e "$file" ]; then
    python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx942.log
fi
file=./perf_tile_gemm_basic_fp16_gfx90a.log
if [ -e "$file" ]; then
    python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx90a.log
fi
file=./perf_tile_gemm_mem_pipeline_fp16_gfx942.log
if [ -e "$file" ]; then
    python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx942.log
fi
file=./perf_tile_gemm_mem_pipeline_fp16_gfx90a.log
if [ -e "$file" ]; then
    python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx90a.log
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment