"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "f52b8c93ee0e7dfa6b804706f53355215ced952f"
Unverified Commit 1ced00a5 authored by Illia Silin, committed by GitHub
Browse files

Add performance tests on MI200 in CI, reporting number of CUs, add stand-alone perf test. (#277)

* use pre-built docker instead of building a new one

* try docker.image.pull

* change syntax in docker.image()

* add 30 min timeout

* increase timeout to 3 hours

* move performance tests to first stage for testing

* set image variable to the new container name

* update image name

* check available images

* check available images in both places

* try different image name

* use image ID to refer to image

* run performance on gfx90a

* fix the gpu_arch labeling, add parameter

* move env vars out of stages

* add stand-alone performance script, MI200 tests, CU numbers
parent 1677cf70
......@@ -100,9 +100,9 @@ def buildHipClangJob(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
if (params.USE_DOCKERFILE){
try {
retimage = docker.build("${image}", dockerArgs + '.')
withDockerContainer(image: image, args: dockerOpts) {
......@@ -125,10 +125,19 @@ def buildHipClangJob(Map conf=[:]){
}
}
}
}
else{
timeout(time: 3, unit: 'HOURS'){
retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
image="b56f8ac0d6ea"
sh "docker images"
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS')
{
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
cmake_build(conf)
}
}
......@@ -181,9 +190,9 @@ def runCKProfiler(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
if (params.USE_DOCKERFILE){
try {
retimage = docker.build("${image}", dockerArgs + '.')
withDockerContainer(image: image, args: dockerOpts) {
......@@ -206,6 +215,14 @@ def runCKProfiler(Map conf=[:]){
}
}
}
}
else{
timeout(time: 3, unit: 'HOURS'){
retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
image="b56f8ac0d6ea"
sh "docker images"
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS')
......@@ -217,7 +234,8 @@ def runCKProfiler(Map conf=[:]){
sh "rm -f ${gemm_log}"
sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
sh "echo GPU_arch: ${gpu_arch} >> ${gemm_log}"
sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
......@@ -246,7 +264,8 @@ def runCKProfiler(Map conf=[:]){
sh "rm -f ${resnet_log}"
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
sh "echo Node name: ${NODE_NAME} >> ${resnet_log}"
sh "echo GPU_arch: ${gpu_arch} >> ${resnet_log}"
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
sh "rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
sh "hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
//first run tests with N=256
......@@ -285,9 +304,20 @@ pipeline {
options {
parallelsAlwaysFailFast()
}
// environment{
// variable = value
// }
parameters {
booleanParam(
name: "USE_DOCKERFILE",
defaultValue: true,
description: "")
}
environment{
dbuser = "${dbuser}"
dbpassword = "${dbpassword}"
dbsship = "${dbsship}"
dbsshport = "${dbsshport}"
dbsshuser = "${dbsshuser}"
dbsshpassword = "${dbsshpassword}"
}
stages{
stage("Static checks") {
parallel{
......@@ -302,30 +332,6 @@ pipeline {
// buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
// }
// }
// we will build and run ckProfiler release version later, during the performance test stage
//stage('Build Profiler: Release, gfx908')
//{
// agent { label rocmnode("nogpu")}
// environment{
// setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
// }
// steps{
// buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
// }
//}
//stage('Build Profiler: Debug, gfx908')
//{
// agent { label rocmnode("nogpu")}
// environment{
// setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
// }
// steps{
// // until we stabilize debug build due to compiler crashes
// catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
// buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug')
// }
// }
//}
stage('Clang Format') {
agent{ label rocmnode("nogpu") }
environment{
......@@ -353,12 +359,11 @@ pipeline {
{
agent{ label rocmnode("gfx908")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908")
}
}
stage("Run Tests: gfx90a")
{
......@@ -367,11 +372,9 @@ pipeline {
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a")
}
}
}
}
stage("Client App")
......@@ -400,33 +403,37 @@ pipeline {
agent{ label rocmnode("gfx908")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
dbuser = "${dbuser}"
dbpassword = "${dbpassword}"
dbsship = "${dbsship}"
dbsshport = "${dbsshport}"
dbsshuser = "${dbsshuser}"
dbsshpassword = "${dbsshpassword}"
}
steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908")
}
}
stage("Run ckProfiler: gfx90a")
{
agent{ label rocmnode("gfx90a")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
}
steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a")
}
// enable after the cmake file supports packaging
// stage("Packages") {
// when {
// expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
// }
// parallel {
// stage("Package /opt/rocm") {
// agent{ label rocmnode("nogpu") }
// steps{
// buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
// }
// }
// }
// }
}
}
}
/* enable after the cmake file supports packaging
stage("Packages") {
when {
expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
}
parallel {
stage("Package /opt/rocm") {
agent{ label rocmnode("nogpu") }
steps{
buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
}
}
}
}
*/
}
}
......@@ -52,21 +52,28 @@ def main():
if 'Branch name' in line:
lst=line.split()
branch_name=lst[2]
if 'On branch' in line:
lst=line.split()
branch_name=lst[2]
if 'Node name' in line:
lst=line.split()
node_id=lst[2]
if 'GPU_arch' in line:
lst=line.split()
gpu_arch=lst[1]
gpu_arch=lst[2]
if 'HIP version' in line:
lst=line.split()
hip_vers=lst[2]
if 'Compute Unit' in line:
lst=line.split()
compute_units=lst[2]
if 'InstalledDir' in line:
lst=line.split()
rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
print("Branch name:",branch_name)
print("Node name:",node_id)
print("GPU_arch:",gpu_arch)
print("Compute units:",compute_units)
print("ROCM_version:",rocm_vers)
print("HIP_version:",hip_vers)
......@@ -188,8 +195,8 @@ def main():
testlist=[]
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i)
ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops for gemm tests:",flops)
......@@ -207,8 +214,8 @@ def main():
testlist=[]
for i in range(1,50):
testlist.append("Layer%i"%i)
ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
flops=pd.concat([flops0,df_add],axis=1)
print("new tflops for N=256 resnet50 test:",flops)
......
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
# pip3 install --upgrade pip
# pip3 install sqlalchemy
# pip3 install pymysql
# pip3 install pandas
# pip3 install sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#
# Write the common header lines (branch, node, GPU info, HIP/compiler version)
# into the log file named by $1. The first write truncates the file; all
# subsequent writes append to it.
write_log_header() {
    local log_file="$1"
    git status | grep -e 'On branch' > "${log_file}"
    {
        echo -n 'Node name: '
        hostname
    } >> "${log_file}"
    # get GPU_arch and number of compute units from rocminfo
    {
        echo -n "GPU_arch: "
        rocminfo | grep "Name:" | grep "gfx"
    } >> "${log_file}"
    rocminfo | grep "Compute Unit:" >> "${log_file}"
    hipcc --version | grep -e 'HIP version' >> "${log_file}"
    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "${log_file}"
}

# ---- gemm tests ----
export gemm_log="perf_gemm.log"
rm -f "${gemm_log}"
write_log_header "${gemm_log}"
# Sweep the first two profile_gemm.sh arguments over 0..3; the first argument
# varies fastest, giving the sequence (0,0) (1,0) (2,0) (3,0) (0,1) ... (3,3).
for arg2 in 0 1 2 3; do
    for arg1 in 0 1 2 3; do
        ./profile_gemm.sh gemm "${arg1}" "${arg2}" 0 1 0 5 | tee -a "${gemm_log}"
    done
done
python3 parse_perf_data.py "${gemm_log}"

# ---- resnet50 tests ----
export resnet_log="perf_resnet50.log"
rm -f "${resnet_log}"
write_log_header "${resnet_log}"
# Run once with N=256 and once with N=4 (last argument), in that order;
# the parsing script will put the results from the two runs into separate tables.
for batch_n in 256 4; do
    ./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 "${batch_n}" | tee -a "${resnet_log}"
done
python3 parse_perf_data.py "${resnet_log}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment