Unverified commit 926008bc authored by Chao Liu, committed by GitHub

Merge branch 'develop' into batched_gemm_multiD

parents e9652cbc d8415a96
File mode changed from 100644 to 100755
@@ -85,7 +85,6 @@ def parse_logfile(logfile):
for line in open(logfile):
if 'Best Perf' in line:
lst=line.split()
print("len(lst)=",len(lst),"lst:",lst)
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
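For context, the loop above can be read as a small self-contained parser. A minimal sketch, assuming `glue` is the separator string defined earlier in process_perf_data.py (e.g. a single space) and with an illustrative function name:

glue = ' '  # assumption: field separator defined elsewhere in process_perf_data.py

def parse_best_perf(logfile):
    # Collect the test configuration and the winning kernel from every 'Best Perf' line.
    tests, kernels = [], []
    for line in open(logfile):
        if 'Best Perf' in line:
            lst = line.split()
            if len(lst) >= 37:  # a complete line has at least 37 whitespace-separated fields
                tests.append(glue.join(lst[5:30]))   # fields 5..29 describe the test case
                kernels.append(glue.join(lst[37:]))  # fields 37.. name the best kernel
    return tests, kernels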
#!/bin/bash
#
# in order to run this script you'd need the following python packages:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#process results
gpu_arch=$1
python3 process_perf_data.py perf_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N265_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
\ No newline at end of file
#!/bin/bash
#
# in order to run this script you'd need the following python packages:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#process results
gpu_arch=$1
python3 process_perf_data.py perf_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N265_"$gpu_arch".log
python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
python3 process_perf_data.py perf_batched_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_grouped_gemm_"$gpu_arch".log
python3 process_perf_data.py perf_fwd_conv_"$gpu_arch".log
python3 process_perf_data.py perf_bwd_conv_"$gpu_arch".log
python3 process_perf_data.py perf_fusion_"$gpu_arch".log
python3 process_perf_data.py perf_reduction_"$gpu_arch".log
\ No newline at end of file
@@ -11,26 +11,34 @@ INIT=$5
LOG=$6
REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 2
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
REPEAT=$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
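# Note: a stride or batch-stride of -1 appears to tell ckProfiler to derive the default packed
# value from the problem size; explicit values (as in the groups further below) override it.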
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 -1 -1 -1 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 -1 -1 -1 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 -1 -1 -1 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 2
\ No newline at end of file
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 -1 -1 -1 8
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 -1 -1 -1 4
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 -1 -1 -1 2
\ No newline at end of file
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
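# gemm_bilinear computes E = alpha * (A x B) + beta * D elementwise; below, a stride of -1 appears
# to request the default packed stride, and StrideD = 0 appears to broadcast D (a bias-like input) across rows.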
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 -1 1 1
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 0 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 0 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 0 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 0 -1 1 1
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1000 1000 1000 -1 -1 0 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2000 2000 2000 -1 -1 0 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4000 4000 4000 -1 -1 0 -1 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8000 8000 8000 -1 -1 0 -1 1 1
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 1056 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 2080 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 4128 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 8224 1 1
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 1088 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 2112 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 4160 1 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 8256 1 1
\ No newline at end of file
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#
# run the script as "./run_full_performance_tests.sh <tag for your test environment>
#get the test environment type:
export env_type=$1
echo 'Environment type ' $env_type
# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
# input arguments:
# verification = 0 : do not verify result correctness on CPU
# = 1 : verify correctness on CPU (may take a long time)
# environment tag : a string describing the specifics of your test environment
# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# node name : $hostname
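# example with hypothetical values: ./run_full_performance_tests.sh 1 "rocm5-docker" gfx90a develop my-test-node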
#get the command line arguments:
export verify=$1
echo 'Verification: ' $verify
export env_type=$2
echo 'Environment type: ' $env_type
export gpu_arch=$3
echo 'GPU architecture: ' $gpu_arch
export branch=$4
echo 'Branch name: ' $branch
export host_name=$5
echo 'Host name: ' $host_name
function print_log_header(){
rm -f $1;
git status | grep -e 'On branch' > $1;
echo -n 'Node name: ' >>$1; hostname >> $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run gemm tests
export gemm_log="perf_gemm.log"
print_log_header $gemm_log $env_type
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
python3 process_perf_data.py $gemm_log
export gemm_log="perf_gemm_${gpu_arch}.log"
print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
#run resnet50 tests
export resnet256_log="perf_resnet50_N256.log"
print_log_header $resnet256_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
python3 process_perf_data.py $resnet256_log
export resnet4_log="perf_resnet50_N4.log"
print_log_header $resnet4_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
python3 process_perf_data.py $resnet4_log
export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
print_log_header $resnet4_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
#run batched_gemm tests
export batched_gemm_log="perf_batched_gemm.log"
print_log_header $batched_gemm_log $env_type
./profile_batched_gemm.sh batched_gemm 0 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3 0 2 0 5 | tee -a $batched_gemm_log
python3 process_perf_data.py $batched_gemm_log
export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log"
print_log_header $batched_gemm_log $env_type $branch $host_name
./profile_batched_gemm.sh batched_gemm 0 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3 $verify 2 0 5 | tee -a $batched_gemm_log
#run grouped_gemm tests
export grouped_gemm_log="perf_grouped_gemm.log"
print_log_header $grouped_gemm_log $env_type
./profile_grouped_gemm.sh grouped_gemm 1 0 0 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1 0 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2 0 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3 0 2 0 5 | tee -a $grouped_gemm_log
python3 process_perf_data.py $grouped_gemm_log
export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log"
print_log_header $grouped_gemm_log $env_type $branch $host_name
./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 2 0 5 | tee -a $grouped_gemm_log
#run fwd_conv tests
export fwd_conv_log="perf_fwd_conv.log"
print_log_header $fwd_conv_log $env_type
./profile_conv.sh conv_fwd 0 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 1 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 2 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 3 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
python3 process_perf_data.py $fwd_conv_log
export fwd_conv_log="perf_fwd_conv_${gpu_arch}.log"
print_log_header $fwd_conv_log $env_type $branch $host_name
./profile_conv.sh conv_fwd 0 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 1 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 2 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 3 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
#run bwd_conv tests
export bwd_conv_log="perf_bwd_conv.log"
print_log_header $bwd_conv_log $env_type
./profile_conv.sh conv2d_bwd_data 0 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 1 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 2 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 3 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
python3 process_perf_data.py $bwd_conv_log
export bwd_conv_log="perf_bwd_conv_${gpu_arch}.log"
print_log_header $bwd_conv_log $env_type $branch $host_name
./profile_conv.sh conv2d_bwd_data 0 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 1 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 2 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 3 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
#run fusion tests
export fusion_log="perf_fusion.log"
print_log_header $fusion_log $env_type
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 0 0 2 0 5 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 1 0 2 0 5 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 2 0 2 0 5 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 3 0 2 0 5 | tee -a $fusion_log
python3 process_perf_data.py $fusion_log
export fusion_log="perf_fusion_${gpu_arch}.log"
print_log_header $fusion_log $env_type $branch $host_name
./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 2 0 1 | tee -a $fusion_log
#run reduction tests
export reduction_log="perf_reduction.log"
print_log_header $reduction_log $env_type
./profile_reduce_with_index.sh 0 2 10 --half | tee -a $reduction_log
./profile_reduce_no_index.sh 0 2 10 --half | tee -a $reduction_log
python3 process_perf_data.py $reduction_log
\ No newline at end of file
export reduction_log="perf_reduction_${gpu_arch}.log"
print_log_header $reduction_log $env_type $branch $host_name
./profile_reduce_with_index.sh $verify 2 10 --half | tee -a $reduction_log
./profile_reduce_no_index.sh $verify 2 10 --half | tee -a $reduction_log
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
# input arguments:
# verification = 0 : do not verify result correctness on CPU
# = 1 : verify correctness on CPU (may take a long time)
# environment tag : a string describing the specifics of your test environment
# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# node name : $hostname
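# example with hypothetical values: ./run_performance_tests.sh 0 "bare-metal-rocm5" gfx908 develop my-test-node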
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#
# run the script as "./run_performance_tests.sh <tag for your test environment>
#get the test environment type:
export env_type=$1
echo 'Environment type ' $env_type
#get the command line arguments:
export verify=$1
echo 'Verification: ' $verify
export env_type=$2
echo 'Environment type: ' $env_type
export gpu_arch=$3
echo 'GPU architecture: ' $gpu_arch
export branch=$4
echo 'Branch name: ' $branch
export host_name=$5
echo 'Host name: ' $host_name
function print_log_header(){
rm -f $1;
git status | grep -e 'On branch' > $1;
echo -n 'Node name: ' >>$1; hostname >> $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run gemm tests
export gemm_log="perf_gemm.log"
print_log_header $gemm_log $env_type
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
python3 process_perf_data.py $gemm_log
export gemm_log="perf_gemm_${gpu_arch}.log"
print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
#run resnet50 test
export resnet256_log="perf_resnet50_N256.log"
print_log_header $resnet256_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
python3 process_perf_data.py $resnet256_log
export resnet4_log="perf_resnet50_N4.log"
print_log_header $resnet4_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
python3 process_perf_data.py $resnet4_log
export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
print_log_header $resnet4_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
#add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
#target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
@@ -2,39 +2,8 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGroupedGemmPtr_ = ck::tensor_operation::device::DeviceGroupedGemmPtr<
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGroupedGemmPtr_>&);
}
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include "profiler/include/profile_grouped_gemm_impl.hpp"
namespace {
@@ -43,169 +12,52 @@ using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
template <typename ALayout, typename BLayout, typename CLayout>
bool TestGroupedGemm()
{
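    // Generate between 1 and 10 random GEMM problems per run; M and N are multiples of 256, K of 128,
    // and strides follow the row-/column-major convention chosen by the layout template parameters.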
int group_count = rand() % 10 + 1;
// GEMM shape
    std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes;
    std::vector<const void*> p_a, p_b;
    std::vector<void*> p_c;
    gemm_shapes.reserve(group_count);
    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
    std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideCs;
    for(int i = 0; i < group_count; i++)
    {
        int M = 256 + 256 * (rand() % 10);
        int N = 256 + 256 * (rand() % 10);
        int K = 128 + 128 * (rand() % 10);
        int AStride = std::is_same<ck::tensor_layout::gemm::RowMajor, ALayout>::value ? K : M;
        int BStride = std::is_same<ck::tensor_layout::gemm::RowMajor, BLayout>::value ? N : K;
        int CStride = std::is_same<ck::tensor_layout::gemm::RowMajor, CLayout>::value ? N : M;
        gemm_shapes.push_back({M, N, K, AStride, BStride, CStride});
        Ms.push_back(256 + 256 * (rand() % 10));
        Ns.push_back(256 + 256 * (rand() % 10));
        Ks.push_back(128 + 128 * (rand() % 10));
        StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
        StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
        StrideCs.push_back(std::is_same<Row, CLayout>::value ? Ns[i] : Ms[i]);
    }
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({1, stride}));
            }
        };
    std::vector<Tensor<ADataType>> a_tensors;
std::vector<Tensor<BDataType>> b_tensors;
std::vector<Tensor<CDataType>> c_host_tensors;
std::vector<Tensor<CDataType>> c_device_tensors;
a_tensors.reserve(group_count);
b_tensors.reserve(group_count);
c_host_tensors.reserve(group_count);
c_device_tensors.reserve(group_count);
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, c_tensors_device;
a_tensors_device.reserve(group_count);
b_tensors_device.reserve(group_count);
c_tensors_device.reserve(group_count);
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
{
a_tensors.emplace_back(Tensor<ADataType>(f_host_tensor_descriptor(
gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
b_tensors.emplace_back(Tensor<BDataType>(f_host_tensor_descriptor(
gemm_shapes[i].K, gemm_shapes[i].N, gemm_shapes[i].StrideB, BLayout{})));
c_host_tensors.emplace_back(Tensor<CDataType>(f_host_tensor_descriptor(
gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
c_device_tensors.emplace_back(Tensor<CDataType>(f_host_tensor_descriptor(
gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
a_tensors[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
}
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
{
a_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));
b_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize()));
c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize()));
a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
p_b.push_back(b_tensors_device[i]->GetDeviceBuffer());
p_c.push_back(c_tensors_device[i]->GetDeviceBuffer());
}
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
auto c_element_op = PassThrough{};
// do GEMM
auto invoker_ptr = groupedGemmPtr->MakeInvokerPointer();
auto argument_ptr = groupedGemmPtr->MakeArgumentPointer(
p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);
DeviceMem gemm_desc_workspace(groupedGemmPtr->GetWorkSpaceSize(argument_ptr.get()));
groupedGemmPtr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get());
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
{
c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                CDataType,
                                                                                AccDataType,
                                                                                PassThrough,
                                                                                PassThrough,
                                                                                PassThrough>;
        auto ref_gemm     = ReferenceGemmInstance{};
        auto ref_invoker  = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(a_tensors[i],
                                                  b_tensors[i],
                                                  c_host_tensors[i],
                                                  a_element_op,
                                                  b_element_op,
                                                  c_element_op);
        if(!groupedGemmPtr->IsSupportedArgument(argument_ptr.get()))
        {
            return false;
        }
        ref_invoker.Run(ref_argument);
        bool res = ck::utils::check_err(c_host_tensors[i].mData, c_device_tensors[i].mData);
        std::cout << "group_id: " << i << (res ? " SUCCESS" : " FAILURE") << std::endl;
        if(!res)
            return false;
    }
    return true;
    return ck::profiler::profile_grouped_gemm_impl<ADataType,
                                                   BDataType,
                                                   CDataType,
                                                   AccDataType,
                                                   ALayout,
                                                   BLayout,
                                                   CLayout>(
        true, 1, false, 1, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
}
} // anonymous namespace
int main()
{
std::vector<DeviceGroupedGemmPtr_> groupedGemmPtrs;
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(groupedGemmPtrs);
bool res = true;
for(auto& gemmPtr : groupedGemmPtrs)
{
res &= TestGroupedGemm(gemmPtr);
}
res = res && TestGroupedGemm<Row, Row, Row>();
res = res && TestGroupedGemm<Row, Col, Row>();
res = res && TestGroupedGemm<Col, Row, Row>();
res = res && TestGroupedGemm<Col, Col, Row>();
std::cout << "TestGroupedGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;