"git@developer.sourcefind.cn:modelzoo/qwen_lmdeploy.git" did not exist on "64936449929bb6e61853eb8e93e76c0e7d39ab02"
Unverified Commit d8415a96 authored by Illia Silin's avatar Illia Silin Committed by GitHub
Browse files

Add full QA with verification option, few other changes. (#331)

* add verify flag and update scripts

* replace old check_error function with the new check_err

* fix syntax

* remove blank spaces

* remove empty line

* add check_err for tensors

* fix syntax

* replace tensors with vectors in check_err calls

* fix syntax

* remove blank spaces

* fix syntax

* add new line at end of file

* disable conv2d_bwd_weight test, add gpu check

* set check_gpu using export

* check GPU using runShell

* add definition of runShell

* fix script syntax

* reduce the number of threads, add full qa option

* run processing scripts in bash

* fix the branch and host names in performance scripts, add chronos

* replace parameterizedCron with cron

* archive the perf log files

* try to fix git call

* pass branch and host names as arguments into scripts

* fix script arguments

* fix script arguments

* process results on master

* fix pipeline

* add definition of gpu_arch

* run processing scripts in docker

* fix the brackets

* add agent master for the processing stage

* get rid of show_node_info call on master

* try using mici label instead of master, disable MI100 tests for now

* fix syntax

* simplify container for results processing

* remove node(master) from the process_results stage

* put all stages in original order

* change the agent label from master to mici for gfx908
parent 7959dad5
...@@ -11,6 +11,12 @@ def show_node_info() { ...@@ -11,6 +11,12 @@ def show_node_info() {
""" """
} }
def runShell(String command){
def responseCode = sh returnStatus: true, script: "${command} &> tmp.txt"
def output = readFile(file: "tmp.txt")
return (output != "")
}
def cmake_build(Map conf=[:]){ def cmake_build(Map conf=[:]){
def compiler = conf.get("compiler","/opt/rocm/bin/hipcc") def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
...@@ -60,7 +66,7 @@ def cmake_build(Map conf=[:]){ ...@@ -60,7 +66,7 @@ def cmake_build(Map conf=[:]){
""" """
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory // reduce parallelism when compiling, clang uses too much memory
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}") def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
def execute_cmd = conf.get("execute_cmd", "") def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """ def cmd = conf.get("cmd", """
...@@ -113,7 +119,14 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -113,7 +119,14 @@ def buildHipClangJob(Map conf=[:]){
retimage = docker.build("${image}", dockerArgs + '.') retimage = docker.build("${image}", dockerArgs + '.')
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
echo "GPU not found"
throw e
}
else{
echo "GPU is OK"
}
} }
} }
} }
...@@ -125,7 +138,14 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -125,7 +138,14 @@ def buildHipClangJob(Map conf=[:]){
retimage = docker.build("${image}", dockerArgs + " --no-cache .") retimage = docker.build("${image}", dockerArgs + " --no-cache .")
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
echo "GPU not found"
throw e
}
else{
echo "GPU is OK"
}
} }
} }
} }
...@@ -133,7 +153,14 @@ def buildHipClangJob(Map conf=[:]){ ...@@ -133,7 +153,14 @@ def buildHipClangJob(Map conf=[:]){
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS') timeout(time: 5, unit: 'HOURS')
{ {
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
echo "GPU not found"
throw e
}
else{
echo "GPU is OK"
}
cmake_build(conf) cmake_build(conf)
} }
} }
...@@ -145,7 +172,6 @@ def reboot(){ ...@@ -145,7 +172,6 @@ def reboot(){
build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),] build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
} }
def buildHipClangJobAndReboot(Map conf=[:]){ def buildHipClangJobAndReboot(Map conf=[:]){
try{ try{
buildHipClangJob(conf) buildHipClangJob(conf)
...@@ -162,7 +188,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){ ...@@ -162,7 +188,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
} }
} }
def runCKProfiler(Map conf=[:]){ def runCKProfiler(Map conf=[:]){
show_node_info() show_node_info()
...@@ -189,7 +214,6 @@ def runCKProfiler(Map conf=[:]){ ...@@ -189,7 +214,6 @@ def runCKProfiler(Map conf=[:]){
} }
def variant = env.STAGE_NAME def variant = env.STAGE_NAME
def retimage def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
...@@ -197,7 +221,14 @@ def runCKProfiler(Map conf=[:]){ ...@@ -197,7 +221,14 @@ def runCKProfiler(Map conf=[:]){
retimage = docker.build("${image}", dockerArgs + '.') retimage = docker.build("${image}", dockerArgs + '.')
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
echo "GPU not found"
throw e
}
else{
echo "GPU is OK"
}
} }
} }
} }
...@@ -209,89 +240,69 @@ def runCKProfiler(Map conf=[:]){ ...@@ -209,89 +240,69 @@ def runCKProfiler(Map conf=[:]){
retimage = docker.build("${image}", dockerArgs + " --no-cache .") retimage = docker.build("${image}", dockerArgs + " --no-cache .")
withDockerContainer(image: image, args: dockerOpts) { withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){ timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
echo "GPU not found"
throw e
}
else{
echo "GPU is OK"
}
} }
} }
} }
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS') timeout(time: 24, unit: 'HOURS')
{ {
cmake_build(conf) cmake_build(conf)
dir("script"){ dir("script"){
//run gemm performance tests if (params.RUN_FULL_QA){
def gemm_log = "perf_gemm_${gpu_arch}.log" def qa_log = "qa_${gpu_arch}.log"
sh "rm -f ${gemm_log}" if (params.USE_9110){
sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}" sh "./run_full_performance_tests.sh 1 QA_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
sh "echo Node name: ${NODE_NAME} >> ${gemm_log}" }
sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}" else{
sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} " sh "./run_full_performance_tests.sh 1 QA_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}" }
if (params.USE_9110){ archiveArtifacts "perf_gemm_${gpu_arch}.log"
sh "echo Environment type: CI_9110 >> ${gemm_log}" archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
} archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
else{ archiveArtifacts "perf_bathced_gemm_${gpu_arch}.log"
sh "echo Environment type: CI_release >> ${gemm_log}" archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
} archiveArtifacts "perf_fwd_conv_${gpu_arch}.log"
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}" archiveArtifacts "perf_bwd_conv_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}" archiveArtifacts "perf_fusion_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}" archiveArtifacts "perf_reduction_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}" // stash perf files to master
sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_gemm_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_resnet50_N256_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_resnet50_N4_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_bathced_gemm_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_grouped_gemm_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_fwd_conv_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_bwd_conv_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_fusion_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}" stash name: "perf_reduction_${gpu_arch}.log"
sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}" //we will process results on the master node
sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
//results will be parsed, stored, and analyzed within the python script
//the script will return 0 if the performance criteria are met
//or return 1 if the criteria are not met
archiveArtifacts "${gemm_log}"
sh "python3 process_perf_data.py ${gemm_log} "
//run resnet50 test
def resnet256_log = "perf_resnet50_N256_${gpu_arch}.log"
sh "rm -f ${resnet256_log}"
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet256_log}"
sh "echo Node name: ${NODE_NAME} >> ${resnet256_log}"
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet256_log}"
sh "rocminfo | grep 'Compute Unit:' >> ${resnet256_log} "
sh "hipcc --version | grep -e 'HIP version' >> ${resnet256_log}"
if (params.USE_9110){
sh "echo Environment type: CI_9110 >> ${resnet256_log}"
} }
else{ else{
sh "echo Environment type: CI_release >> ${resnet256_log}" if (params.USE_9110){
sh "./run_performance_tests.sh 0 CI_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
}
else{
sh "./run_performance_tests.sh 0 CI_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
}
archiveArtifacts "perf_gemm_${gpu_arch}.log"
archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
// stash perf files to master
stash name: "perf_gemm_${gpu_arch}.log"
stash name: "perf_resnet50_N256_${gpu_arch}.log"
stash name: "perf_resnet50_N4_${gpu_arch}.log"
//we will process the results on the master node
} }
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet256_log}"
//first run tests with N=256
sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet256_log}"
archiveArtifacts "${resnet256_log}"
sh "python3 process_perf_data.py ${resnet256_log} "
//then run with N=4
def resnet4_log = "perf_resnet50_N4_${gpu_arch}.log"
sh "rm -f ${resnet4_log}"
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet4_log}"
sh "echo Node name: ${NODE_NAME} >> ${resnet4_log}"
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet4_log}"
sh "rocminfo | grep 'Compute Unit:' >> ${resnet4_log} "
sh "hipcc --version | grep -e 'HIP version' >> ${resnet4_log}"
if (params.USE_9110){
sh "echo Environment type: CI_9110 >> ${resnet4_log}"
}
else{
sh "echo Environment type: CI_release >> ${resnet4_log}"
}
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet4_log}"
sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet4_log}"
archiveArtifacts "${resnet4_log}"
sh "python3 process_perf_data.py ${resnet4_log} "
} }
} }
} }
...@@ -299,7 +310,6 @@ def runCKProfiler(Map conf=[:]){ ...@@ -299,7 +310,6 @@ def runCKProfiler(Map conf=[:]){
return retimage return retimage
} }
def runPerfTest(Map conf=[:]){ def runPerfTest(Map conf=[:]){
try{ try{
runCKProfiler(conf) runCKProfiler(conf)
...@@ -316,8 +326,76 @@ def runPerfTest(Map conf=[:]){ ...@@ -316,8 +326,76 @@ def runPerfTest(Map conf=[:]){
} }
} }
def process_results(Map conf=[:]){
env.HSA_ENABLE_SDMA=0
checkout scm
def image = "composable_kernels"
def prefixpath = "/opt/rocm"
def gpu_arch = conf.get("gpu_arch", "gfx908")
// Jenkins is complaining about the render group
def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1"
}
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
retimage = docker.build("${image}", dockerArgs + '.')
}
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
echo "The job was cancelled or aborted"
throw e
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 1, unit: 'HOURS'){
try{
dir("script"){
if (params.RUN_FULL_QA){
// unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log"
unstash "perf_resnet50_N256_${gpu_arch}.log"
unstash "perf_resnet50_N4_${gpu_arch}.log"
unstash "perf_bathced_gemm_${gpu_arch}.log"
unstash "perf_grouped_gemm_${gpu_arch}.log"
unstash "perf_fwd_conv_${gpu_arch}.log"
unstash "perf_bwd_conv_${gpu_arch}.log"
unstash "perf_fusion_${gpu_arch}.log"
unstash "perf_reduction_${gpu_arch}.log"
sh "./process_qa_data.sh ${gpu_arch}"
}
else{
// unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log"
unstash "perf_resnet50_N256_${gpu_arch}.log"
unstash "perf_resnet50_N4_${gpu_arch}.log"
sh "./process_perf_data.sh ${gpu_arch}"
}
}
}
catch(e){
echo "throwing error exception while processing performance test results"
echo 'Exception occurred: ' + e.toString()
throw e
}
}
}
}
//launch develop branch daily at 23:00 in FULL_QA mode
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;USE_9110=true''' : ""
pipeline { pipeline {
agent none agent none
triggers {
cron(CRON_SETTINGS)
}
options { options {
parallelsAlwaysFailFast() parallelsAlwaysFailFast()
} }
...@@ -325,7 +403,11 @@ pipeline { ...@@ -325,7 +403,11 @@ pipeline {
booleanParam( booleanParam(
name: "USE_9110", name: "USE_9110",
defaultValue: true, defaultValue: true,
description: "") description: "Select compiler version: 9110 (default) or release")
booleanParam(
name: "RUN_FULL_QA",
defaultValue: false,
description: "Select whether to run small set of performance tests (default) or full QA")
} }
environment{ environment{
dbuser = "${dbuser}" dbuser = "${dbuser}"
...@@ -438,6 +520,25 @@ pipeline { ...@@ -438,6 +520,25 @@ pipeline {
} }
} }
} }
stage("Process Performance Test Results")
{
parallel
{
stage("Process results for gfx908"){
agent { label 'mici' }
steps{
process_results(gpu_arch: "gfx908")
}
}
stage("Process results for gfx90a"){
agent { label 'mici' }
steps{
process_results(gpu_arch: "gfx90a")
}
}
}
}
/* enable after the cmake file supports packaging /* enable after the cmake file supports packaging
stage("Packages") { stage("Packages") {
when { when {
......
...@@ -381,52 +381,3 @@ HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens, ...@@ -381,52 +381,3 @@ HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens,
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()) : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{ {
} }
#if 1
// FIXME: remove
template <typename T>
float check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
float l1_error = 0;
float linf_error = -1;
float linf_rel_error = -1;
float linf_ref_value = 0, linf_result_value = 0;
float linf_rel_ref_value = 0, linf_rel_result_value = 0;
constexpr float eps = 1e-10;
for(std::size_t i = 0; i < ref.mData.size(); ++i)
{
float ref_v = ck::type_convert<float>(ref.mData[i]);
float result_v = ck::type_convert<float>(result.mData[i]);
float diff = std::abs(ref_v - result_v);
float rel_diff = diff / std::max(std::abs(ref_v), eps);
l1_error += diff;
if(linf_error < diff)
{
linf_error = diff;
linf_ref_value = ref_v;
linf_result_value = result_v;
}
if(linf_rel_error < rel_diff)
{
linf_rel_error = rel_diff;
linf_rel_ref_value = ref_v;
linf_rel_result_value = result_v;
}
}
std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
<< linf_ref_value << ", result " << linf_result_value << std::endl;
std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
<< linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;
return linf_error;
}
#endif
...@@ -29,9 +29,8 @@ check_err(const std::vector<T>& out, ...@@ -29,9 +29,8 @@ check_err(const std::vector<T>& out,
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl << std::endl;
<< msg << std::endl;
return false; return false;
} }
...@@ -48,9 +47,8 @@ check_err(const std::vector<T>& out, ...@@ -48,9 +47,8 @@ check_err(const std::vector<T>& out,
err_count++; err_count++;
if(err_count < 5) if(err_count < 5)
{ {
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< i << "]: " << out[i] << " != " << ref[i] << std::endl << "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl;
<< msg << std::endl;
} }
res = false; res = false;
} }
...@@ -72,9 +70,8 @@ check_err(const std::vector<T>& out, ...@@ -72,9 +70,8 @@ check_err(const std::vector<T>& out,
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl << std::endl;
<< msg << std::endl;
return false; return false;
} }
...@@ -94,9 +91,8 @@ check_err(const std::vector<T>& out, ...@@ -94,9 +91,8 @@ check_err(const std::vector<T>& out,
err_count++; err_count++;
if(err_count < 5) if(err_count < 5)
{ {
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< i << "]: " << o << " != " << r << std::endl << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
<< msg << std::endl;
} }
res = false; res = false;
} }
...@@ -118,9 +114,8 @@ check_err(const std::vector<T>& out, ...@@ -118,9 +114,8 @@ check_err(const std::vector<T>& out,
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl << std::endl;
<< msg << std::endl;
return false; return false;
} }
...@@ -139,9 +134,8 @@ check_err(const std::vector<T>& out, ...@@ -139,9 +134,8 @@ check_err(const std::vector<T>& out,
err_count++; err_count++;
if(err_count < 5) if(err_count < 5)
{ {
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< i << "]: " << o << " != " << r << std::endl << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
<< msg << std::endl;
} }
res = false; res = false;
} }
...@@ -163,9 +157,8 @@ check_err(const std::vector<T>& out, ...@@ -163,9 +157,8 @@ check_err(const std::vector<T>& out,
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl << std::endl;
<< msg << std::endl;
return false; return false;
} }
...@@ -185,9 +178,9 @@ check_err(const std::vector<T>& out, ...@@ -185,9 +178,9 @@ check_err(const std::vector<T>& out,
err_count++; err_count++;
if(err_count < 5) if(err_count < 5)
{ {
std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i]) std::cout << msg << " out[" << i << "] != ref[" << i
<< " != " << static_cast<int>(ref[i]) << std::endl << "]: " << static_cast<int>(out[i]) << " != " << static_cast<int>(ref[i])
<< msg << std::endl; << std::endl;
} }
res = false; res = false;
} }
......
...@@ -318,13 +318,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -318,13 +318,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data()); reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data()); reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
float c_error = check_error(c_g_m_n_host_result, c_g_m_n_device_result); bool c_error =
float d0_error = check_error(d0_g_m_host_result, d0_g_m_device_result); ck::utils::check_err(c_g_m_n_host_result.mData, c_g_m_n_device_result.mData);
float d1_error = check_error(d1_g_m_host_result, d1_g_m_device_result); bool d0_error =
ck::utils::check_err(d0_g_m_host_result.mData, d0_g_m_device_result.mData);
pass = pass && (c_error < 1E-6); bool d1_error =
pass = pass && (d0_error < 1E-6); ck::utils::check_err(d1_g_m_host_result.mData, d1_g_m_device_result.mData);
pass = pass && (d1_error < 1E-6);
pass = pass && (c_error == true);
pass = pass && (d0_error == true);
pass = pass && (d1_error == true);
if(do_log) if(do_log)
{ {
......
...@@ -250,11 +250,11 @@ bool profile_conv_bwd_weight_impl(int do_verification, ...@@ -250,11 +250,11 @@ bool profile_conv_bwd_weight_impl(int do_verification,
{ {
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data()); wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result); pass = ck::utils::check_err(wei_k_c_y_x_host_result.mData,
wei_k_c_y_x_device_result.mData);
if(max_error > 8) if(pass == false)
{ {
pass = false;
std::cout << "Fail info:" << conv_ptr->GetTypeString() << std::endl; std::cout << "Fail info:" << conv_ptr->GetTypeString() << std::endl;
} }
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" #include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/host_tensor/host_tensor.hpp"
...@@ -452,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -452,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
} }
check_error(input_host_result, input_device_result); success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -433,21 +433,17 @@ bool profile_convnd_bwd_weight_impl(int do_verification, ...@@ -433,21 +433,17 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
{ {
wei_device_buf.FromDevice(weights_device_result.mData.data()); wei_device_buf.FromDevice(weights_device_result.mData.data());
float max_error = check_error(weights_host_result, weights_device_result); success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
if(max_error > 8) if(success == false)
{ {
std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl; std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
success = false;
} }
else else
{ {
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
} }
check_error(weights_host_result, weights_device_result);
if(do_log) if(do_log)
{ {
std::cout << "in : "; std::cout << "in : ";
......
File mode changed from 100644 to 100755
...@@ -85,7 +85,6 @@ def parse_logfile(logfile): ...@@ -85,7 +85,6 @@ def parse_logfile(logfile):
for line in open(logfile): for line in open(logfile):
if 'Best Perf' in line: if 'Best Perf' in line:
lst=line.split() lst=line.split()
print("len(lst)=",len(lst),"lst:",lst)
if len(lst)>=37: #the line is complete if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30])) tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:])) kernels.append(glue.join(lst[37:]))
...@@ -293,4 +292,4 @@ def main(): ...@@ -293,4 +292,4 @@ def main():
return regression return regression
if __name__ == '__main__': if __name__ == '__main__':
main() main()
\ No newline at end of file
#!/bin/bash
#
# in order to run this script you'd need the following python packages:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#process results
gpu_arch=$1
python3 process_perf_data.py perf_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N265_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
\ No newline at end of file
#!/bin/bash
#
# in order to run this script you'd need the following python packages:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#process results
gpu_arch=$1
python3 process_perf_data.py perf_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N265_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
python3 process_perf_data.py perf_batched_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_grouped_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_fwd_conv_"$gpu_arch".log
python3 process_perf_data.py perf_bwd_conv_"$gpu_arch".log
python3 process_perf_data.py perf_fusion_"$gpu_arch".log
python3 process_perf_data.py perf_reduction_"$gpu_arch".log
\ No newline at end of file
...@@ -11,26 +11,34 @@ INIT=$5 ...@@ -11,26 +11,34 @@ INIT=$5
LOG=$6 LOG=$6
REPEAT=$7 REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount OP=$1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 8 DATATYPE=$2
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 8 LAYOUT=$3
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 4 VERIFY=$4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 2 INIT=$5
LOG=$6
REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 -1 -1 -1 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount ####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 8 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 8 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096 4 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 2 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 -1 -1 -1 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount ####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 8 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 8 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 4 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 2 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 -1 -1 -1 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount ####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 8 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 8 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 4 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 2 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 -1 -1 -1 2
\ No newline at end of file \ No newline at end of file
#!/bin/bash
# Profiling driver: runs ckProfiler over a fixed set of GEMM problem sizes.
# Usage: ./script <op> <datatype> <layout> <verify> <init> <log> <time>
# All seven positional arguments are forwarded verbatim to every invocation.

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Helper: forwards the common profiler options, then the per-case problem
# description (M N K StrideA StrideB StrideD StrideE Alpha Beta).
run_case () {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

######## M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
# Irregular M, all strides chosen automatically (-1).
run_case  960 1024 1024   -1   -1   -1   -1 1 1
run_case 1920 2048 2048   -1   -1   -1   -1 1 1
run_case 3840 4096 4096   -1   -1   -1   -1 1 1
run_case 7680 8192 8192   -1   -1   -1   -1 1 1
# Irregular M with StrideD forced to 0 (broadcast D along rows).
run_case  960 1024 1024   -1   -1    0   -1 1 1
run_case 1920 2048 2048   -1   -1    0   -1 1 1
run_case 3840 4096 4096   -1   -1    0   -1 1 1
run_case 7680 8192 8192   -1   -1    0   -1 1 1
# Non-power-of-two square problems, StrideD = 0.
run_case 1000 1000 1000   -1   -1    0   -1 1 1
run_case 2000 2000 2000   -1   -1    0   -1 1 1
run_case 4000 4000 4000   -1   -1    0   -1 1 1
run_case 8000 8000 8000   -1   -1    0   -1 1 1
# Square problems with padded strides (size + 32).
run_case 1024 1024 1024 1056 1056 1056 1056 1 1
run_case 2048 2048 2048 2080 2080 2080 2080 1 1
run_case 4096 4096 4096 4128 4128 4128 4128 1 1
run_case 8192 8192 8192 8224 8224 8224 8224 1 1
# Square problems with padded strides (size + 64).
run_case 1024 1024 1024 1088 1088 1088 1088 1 1
run_case 2048 2048 2048 2112 2112 2112 2112 1 1
run_case 4096 4096 4096 4160 4160 4160 4160 1 1
run_case 8192 8192 8192 8256 8256 8256 8256 1 1
\ No newline at end of file
#!/bin/bash #!/bin/bash
# #
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to # you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline # post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details # please contact Illia.Silin@amd.com for more details
# #
# run the script as "./run_full_performance_tests.sh <tag for your test environment> # run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
# input arguments:
#get the test environment type: # verification = 0 : do not verify result correctness on CPU
export env_type=$1 # = 1 : verifuy correctness on CPU (may take a long time)
echo 'Environment type ' $env_type # environment tag : a string describing the specifics of your test environment
# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# node name : $hostname
#get the command line arguments:
export verify=$1
echo 'Verification: ' $verify
export env_type=$2
echo 'Environment type: ' $env_type
export gpu_arch=$3
echo 'GPU architecture: ' $gpu_arch
export branch=$4
echo 'Branch name: ' $branch
export host_name=$5
echo 'Host name: ' $host_name
function print_log_header(){ function print_log_header(){
rm -f $1; rm -f $1;
git status | grep -e 'On branch' > $1; echo 'On branch ' $3 &> $1;
echo -n 'Node name: ' >>$1; hostname >> $1; echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo #get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1; rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1; hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >>$1; echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
} }
#run gemm tests #run gemm tests
export gemm_log="perf_gemm.log" export gemm_log="perf_gemm_${gpu_arch}.log"
print_log_header $gemm_log $env_type print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
python3 process_perf_data.py $gemm_log
#run resnet50 tests #run resnet50 tests
export resnet256_log="perf_resnet50_N256.log" export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
print_log_header $resnet256_log $env_type print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
python3 process_perf_data.py $resnet256_log export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
export resnet4_log="perf_resnet50_N4.log" print_log_header $resnet4_log $env_type $branch $host_name
print_log_header $resnet4_log $env_type ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
python3 process_perf_data.py $resnet4_log
#run batched_gemm tests #run batched_gemm tests
export batched_gemm_log="perf_batched_gemm.log" export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log"
print_log_header $batched_gemm_log $env_type print_log_header $batched_gemm_log $env_type $branch $host_name
./profile_batched_gemm.sh batched_gemm 0 0 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 0 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 0 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 0 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 0 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 1 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 1 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 1 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 1 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 2 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 2 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 2 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 2 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 3 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 3 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 3 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3 0 2 0 5 | tee -a $batched_gemm_log ./profile_batched_gemm.sh batched_gemm 3 3 $verify 2 0 5 | tee -a $batched_gemm_log
python3 process_perf_data.py $batched_gemm_log
#run grouped_gemm tests #run grouped_gemm tests
export grouped_gemm_log="perf_grouped_gemm.log" export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log"
print_log_header $grouped_gemm_log $env_type print_log_header $grouped_gemm_log $env_type $branch $host_name
./profile_grouped_gemm.sh grouped_gemm 1 0 0 2 0 5 | tee -a $grouped_gemm_log ./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1 0 2 0 5 | tee -a $grouped_gemm_log ./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2 0 2 0 5 | tee -a $grouped_gemm_log ./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3 0 2 0 5 | tee -a $grouped_gemm_log ./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 2 0 5 | tee -a $grouped_gemm_log
python3 process_perf_data.py $grouped_gemm_log
#run fwd_conv tests #run fwd_conv tests
export fwd_conv_log="perf_fwd_conv.log" export fwd_conv_log="perf_fwd_conv_${gpu_arch}.log"
print_log_header $fwd_conv_log $env_type print_log_header $fwd_conv_log $env_type $branch $host_name
./profile_conv.sh conv_fwd 0 1 0 2 0 5 2 256 | tee -a $fwd_conv_log ./profile_conv.sh conv_fwd 0 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 1 1 0 2 0 5 2 256 | tee -a $fwd_conv_log ./profile_conv.sh conv_fwd 1 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 2 1 0 2 0 5 2 256 | tee -a $fwd_conv_log ./profile_conv.sh conv_fwd 2 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 3 1 0 2 0 5 2 256 | tee -a $fwd_conv_log ./profile_conv.sh conv_fwd 3 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
python3 process_perf_data.py $fwd_conv_log
#run bwd_conv tests #run bwd_conv tests
export bwd_conv_log="perf_bwd_conv.log" export bwd_conv_log="perf_bwd_conv_${gpu_arch}.log"
print_log_header $bwd_conv_log $env_type print_log_header $bwd_conv_log $env_type $branch $host_name
./profile_conv.sh conv2d_bwd_data 0 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log ./profile_conv.sh conv2d_bwd_data 0 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 1 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log ./profile_conv.sh conv2d_bwd_data 1 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 2 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log ./profile_conv.sh conv2d_bwd_data 2 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 3 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log ./profile_conv.sh conv2d_bwd_data 3 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
python3 process_perf_data.py $bwd_conv_log
#run fusion tests #run fusion tests
export fusion_log="perf_fusion.log" export fusion_log="perf_fusion_${gpu_arch}.log"
print_log_header $fusion_log $env_type print_log_header $fusion_log $env_type $branch $host_name
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 0 0 2 0 5 | tee -a $fusion_log ./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 1 0 2 0 5 | tee -a $fusion_log ./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 2 0 2 0 5 | tee -a $fusion_log ./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 3 0 2 0 5 | tee -a $fusion_log ./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 2 0 1 | tee -a $fusion_log
python3 process_perf_data.py $fusion_log
#run reduction tests #run reduction tests
export reduction_log="perf_reduction.log" export reduction_log="perf_reduction_${gpu_arch}.log"
print_log_header $reduction_log $env_type print_log_header $reduction_log $env_type $branch $host_name
./profile_reduce_with_index.sh 0 2 10 --half | tee -a $reduction_log ./profile_reduce_with_index.sh $verify 2 10 --half | tee -a $reduction_log
./profile_reduce_no_index.sh 0 2 10 --half | tee -a $reduction_log ./profile_reduce_no_index.sh $verify 2 10 --half | tee -a $reduction_log
python3 process_perf_data.py $reduction_log
\ No newline at end of file
#!/bin/bash #!/bin/bash
# #
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment: # run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
# input arguments:
# verification = 0 : do not verify result correctness on CPU
# = 1 : verify correctness on CPU (may take a long time)
# environment tag : a string describing the specifics of your test environment
# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# node name : $hostname
pip3 install --upgrade pip #get the command line arguments:
pip3 install sqlalchemy pymysql pandas sshtunnel export verify=$1
echo 'Verification: ' $verify
# you would also need to set up some environment variables in order to export env_type=$2
# post your new test results to the database and compare them to the baseline echo 'Environment type: ' $env_type
# please contact Illia.Silin@amd.com for more details export gpu_arch=$3
# echo 'GPU architecture: ' $gpu_arch
# run the script as "./run_performance_tests.sh <tag for your test environment> export branch=$4
echo 'Branch name: ' $branch
#get the test environment type: export host_name=$5
export env_type=$1 echo 'Host name: ' $host_name
echo 'Environment type ' $env_type
function print_log_header(){ function print_log_header(){
rm -f $1; rm -f $1;
git status | grep -e 'On branch' > $1; echo 'On branch ' $3 &> $1;
echo -n 'Node name: ' >>$1; hostname >> $1; echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo #get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1; rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1; hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >>$1; echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
} }
#run gemm tests #run gemm tests
export gemm_log="perf_gemm.log" export gemm_log="perf_gemm_${gpu_arch}.log"
print_log_header $gemm_log $env_type print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log ./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
python3 process_perf_data.py $gemm_log
#run resnet50 test #run resnet50 test
export resnet256_log="perf_resnet50_N256.log" export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
print_log_header $resnet256_log $env_type print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
python3 process_perf_data.py $resnet256_log export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
export resnet4_log="perf_resnet50_N4.log" print_log_header $resnet4_log $env_type $branch $host_name
print_log_header $resnet4_log $env_type ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
python3 process_perf_data.py $resnet4_log
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp) #add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util) #target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment