gaoqiong / composable_kernel · Commit 8a370fbb, authored Jul 26, 2022 by Chao Liu
Merge remote-tracking branch 'origin/develop' into group_conv
Parents: d8fdd226, 85978e02
Changes: 44 files in this commit; showing 4 changed files with 166 additions and 309 deletions (+166, -309):
script/run_full_performance_tests.sh      +92  -92
script/run_performance_tests.sh           +46  -43
test/conv2d_bwd_weight/CMakeLists.txt     +2   -0
test/grouped_gemm/grouped_gemm_fp16.cpp   +26  -174
script/run_full_performance_tests.sh (view file @ 8a370fbb)
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline;
# please contact Illia.Silin@amd.com for more details
#
# run the script as "./run_full_performance_tests.sh <tag for your test environment>"
#get the test environment type:
export env_type=$1
echo 'Environment type ' $env_type

# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> <node name>"
# input arguments:
# verification = 0 : do not verify result correctness on CPU
#              = 1 : verify correctness on CPU (may take a long time)
# environment tag : a string describing the specifics of your test environment
# gpu_arch        : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
# branch name     : name of the branch in git repo (git status | grep -e 'On branch')
# node name       : $hostname
#get the command line arguments:
export verify=$1
echo 'Verification: ' $verify
export env_type=$2
echo 'Environment type: ' $env_type
export gpu_arch=$3
echo 'GPU architecture: ' $gpu_arch
export branch=$4
echo 'Branch name: ' $branch
export host_name=$5
echo 'Host name: ' $host_name
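
# Example invocation (all values hypothetical):
#   ./run_full_performance_tests.sh 1 "rocm-docker" gfx90a develop $(hostname)

# print_log_header <log file> <env type> <branch> <node> overwrites <log file> with a short
# system summary (branch, node name, GPU arch and compute-unit count from rocminfo, HIP and
# amdclang++ versions, environment type) before profiler output is appended to it.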
function print_log_header(){
    rm -f $1;
    git status | grep -e 'On branch' > $1;
    echo -n 'Node name: ' >> $1;
    hostname >> $1;
    echo 'On branch ' $3 &> $1;
    echo 'Node name: ' $4 >> $1;
    #get GPU_arch and number of compute units from rocminfo
    echo -n "GPU_arch: " >> $1;
    rocminfo | grep "Name:" | grep "gfx" >> $1;
    rocminfo | grep "Compute Unit:" >> $1;
    hipcc --version | grep -e 'HIP version' >> $1;
    echo 'Environment type: ' $2 >> $1;
    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
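
# The sweeps below vary the two numeric arguments that follow the op name on each
# profile_*.sh call. By ckProfiler convention these select the data type and the A/B
# matrix-layout combination (0-3 each); that mapping is an assumption, not stated here.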
#run gemm tests
export gemm_log="perf_gemm.log"
print_log_header $gemm_log $env_type
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
python3 process_perf_data.py $gemm_log
export gemm_log="perf_gemm_${gpu_arch}.log"
print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
#run resnet50 tests
export resnet256_log="perf_resnet50_N256.log"
print_log_header $resnet256_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
python3 process_perf_data.py $resnet256_log
export resnet4_log="perf_resnet50_N4.log"
print_log_header $resnet4_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
python3 process_perf_data.py $resnet4_log
export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
print_log_header $resnet4_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
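
# note: the trailing argument of profile_resnet50.sh above (256 or 4) is the batch size,
# matching the N256 / N4 log-file names.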
#run batched_gemm tests
export batched_gemm_log="perf_batched_gemm.log"
print_log_header $batched_gemm_log $env_type
./profile_batched_gemm.sh batched_gemm 0 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2 0 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3 0 2 0 5 | tee -a $batched_gemm_log
python3 process_perf_data.py $batched_gemm_log
export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log"
print_log_header $batched_gemm_log $env_type $branch $host_name
./profile_batched_gemm.sh batched_gemm 0 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2 $verify 2 0 5 | tee -a $batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3 $verify 2 0 5 | tee -a $batched_gemm_log
#run grouped_gemm tests
export grouped_gemm_log="perf_grouped_gemm.log"
print_log_header $grouped_gemm_log $env_type
./profile_grouped_gemm.sh grouped_gemm 1 0 0 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1 0 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2 0 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3 0 2 0 5 | tee -a $grouped_gemm_log
python3 process_perf_data.py $grouped_gemm_log
export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log"
print_log_header $grouped_gemm_log $env_type $branch $host_name
./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 2 0 5 | tee -a $grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 2 0 5 | tee -a $grouped_gemm_log
#run fwd_conv tests
export fwd_conv_log="perf_fwd_conv.log"
print_log_header $fwd_conv_log $env_type
./profile_conv.sh conv_fwd 0 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 1 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 2 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 3 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
python3 process_perf_data.py $fwd_conv_log
export fwd_conv_log="perf_fwd_conv_${gpu_arch}.log"
print_log_header $fwd_conv_log $env_type $branch $host_name
./profile_conv.sh conv_fwd 0 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 1 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 2 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
./profile_conv.sh conv_fwd 3 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
#run bwd_conv tests
export bwd_conv_log="perf_bwd_conv.log"
print_log_header $bwd_conv_log $env_type
./profile_conv.sh conv2d_bwd_data 0 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 1 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 2 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 3 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
python3 process_perf_data.py $bwd_conv_log
export bwd_conv_log="perf_bwd_conv_${gpu_arch}.log"
print_log_header $bwd_conv_log $env_type $branch $host_name
./profile_conv.sh conv2d_bwd_data 0 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 1 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 2 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
./profile_conv.sh conv2d_bwd_data 3 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
#run fusion tests
export fusion_log="perf_fusion.log"
print_log_header $fusion_log $env_type
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 0 0 2 0 5 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 1 0 2 0 5 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 2 0 2 0 5 | tee -a $fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 3 0 2 0 5 | tee -a $fusion_log
python3 process_perf_data.py $fusion_log
export fusion_log="perf_fusion_${gpu_arch}.log"
print_log_header $fusion_log $env_type $branch $host_name
./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 2 0 1 | tee -a $fusion_log
./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 2 0 1 | tee -a $fusion_log
#run reduction tests
export reduction_log="perf_reduction.log"
print_log_header $reduction_log $env_type
./profile_reduce_with_index.sh 0 2 10 --half | tee -a $reduction_log
./profile_reduce_no_index.sh 0 2 10 --half | tee -a $reduction_log
python3 process_perf_data.py $reduction_log
export reduction_log="perf_reduction_${gpu_arch}.log"
print_log_header $reduction_log $env_type $branch $host_name
./profile_reduce_with_index.sh $verify 2 10 --half | tee -a $reduction_log
./profile_reduce_no_index.sh $verify 2 10 --half | tee -a $reduction_log
script/run_performance_tests.sh (view file @ 8a370fbb)
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> <node name>"
# input arguments:
# verification = 0 : do not verify result correctness on CPU
#              = 1 : verify correctness on CPU (may take a long time)
# environment tag : a string describing the specifics of your test environment
# gpu_arch        : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
# branch name     : name of the branch in git repo (git status | grep -e 'On branch')
# node name       : $hostname
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline;
# please contact Illia.Silin@amd.com for more details
#
# run the script as "./run_performance_tests.sh <tag for your test environment>"
#get the test environment type:
export env_type=$1
echo 'Environment type ' $env_type
#get the command line arguments:
export verify=$1
echo 'Verification: ' $verify
export env_type=$2
echo 'Environment type: ' $env_type
export gpu_arch=$3
echo 'GPU architecture: ' $gpu_arch
export branch=$4
echo 'Branch name: ' $branch
export host_name=$5
echo 'Host name: ' $host_name
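
# Example invocation (all values hypothetical):
#   ./run_performance_tests.sh 0 "bare-metal" gfx908 develop $(hostname)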
function print_log_header(){
    rm -f $1;
    git status | grep -e 'On branch' > $1;
    echo -n 'Node name: ' >> $1;
    hostname >> $1;
    echo 'On branch ' $3 &> $1;
    echo 'Node name: ' $4 >> $1;
    #get GPU_arch and number of compute units from rocminfo
    echo -n "GPU_arch: " >> $1;
    rocminfo | grep "Name:" | grep "gfx" >> $1;
    rocminfo | grep "Compute Unit:" >> $1;
    hipcc --version | grep -e 'HIP version' >> $1;
    echo 'Environment type: ' $2 >> $1;
    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run gemm tests
export gemm_log="perf_gemm.log"
print_log_header $gemm_log $env_type
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
python3 process_perf_data.py $gemm_log
export gemm_log="perf_gemm_${gpu_arch}.log"
print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
#run resnet50 test
export resnet256_log="perf_resnet50_N256.log"
print_log_header $resnet256_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
python3 process_perf_data.py $resnet256_log
export resnet4_log="perf_resnet50_N4.log"
print_log_header $resnet4_log $env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
python3 process_perf_data.py $resnet4_log
export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
print_log_header $resnet4_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
test/conv2d_bwd_weight/CMakeLists.txt (new file, mode 100644; view file @ 8a370fbb)
#add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
#target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
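# Note: both lines above are committed commented-out, so no test target is built yet.
# Uncommenting them should register the test, assuming add_test_executable is this
# repo's usual CMake helper for adding test executables.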
test/grouped_gemm/grouped_gemm_fp16.cpp (view file @ 8a370fbb)

@@ -2,39 +2,8 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using DeviceGroupedGemmPtr_ = ck::tensor_operation::device::DeviceGroupedGemmPtr<
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough>;

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
    std::vector<DeviceGroupedGemmPtr_>&);

}
} // namespace device
} // namespace tensor_operation
} // namespace ck

#include "profiler/include/profile_grouped_gemm_impl.hpp"

namespace {
@@ -43,169 +12,52 @@
using BDataType = ck::half_t;
using CDataType   = ck::half_t;
using AccDataType = float;

using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
template <typename ALayout, typename BLayout, typename CLayout>
bool TestGroupedGemm()
{
    int group_count = rand() % 10 + 1;

    // GEMM shape
    std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes;
    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
    std::vector<const void*> p_a, p_b;
    std::vector<void*> p_c;

    gemm_shapes.reserve(group_count);

    std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideCs;
    for(int i = 0; i < group_count; i++)
    {
        int M = 256 + 256 * (rand() % 10);
        int N = 256 + 256 * (rand() % 10);
        int K = 128 + 128 * (rand() % 10);

        int AStride = std::is_same<ck::tensor_layout::gemm::RowMajor, ALayout>::value ? K : M;
        int BStride = std::is_same<ck::tensor_layout::gemm::RowMajor, BLayout>::value ? N : K;
        int CStride = std::is_same<ck::tensor_layout::gemm::RowMajor, CLayout>::value ? N : M;

        gemm_shapes.push_back({M, N, K, AStride, BStride, CStride});
    }

        Ms.push_back(256 + 256 * (rand() % 10));
        Ns.push_back(256 + 256 * (rand() % 10));
        Ks.push_back(128 + 128 * (rand() % 10));
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({1, stride}));
            }
        };
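
    // For example (hypothetical shapes): f_host_tensor_descriptor(4, 8, 8, Row{}) yields
    // lengths {4, 8} with strides {8, 1}, while f_host_tensor_descriptor(4, 8, 4, Col{})
    // yields strides {1, 4}; i.e. the stride argument is the matrix's leading dimension.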
    std::vector<Tensor<ADataType>> a_tensors;
    std::vector<Tensor<BDataType>> b_tensors;
    std::vector<Tensor<CDataType>> c_host_tensors;
    std::vector<Tensor<CDataType>> c_device_tensors;

    a_tensors.reserve(group_count);
    b_tensors.reserve(group_count);
    c_host_tensors.reserve(group_count);
    c_device_tensors.reserve(group_count);

    using DeviceMemPtr = std::unique_ptr<DeviceMem>;

    std::vector<DeviceMemPtr> a_tensors_device, b_tensors_device, c_tensors_device;

    a_tensors_device.reserve(group_count);
    b_tensors_device.reserve(group_count);
    c_tensors_device.reserve(group_count);
    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors.emplace_back(Tensor<ADataType>(f_host_tensor_descriptor(
            gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
        b_tensors.emplace_back(Tensor<BDataType>(f_host_tensor_descriptor(
            gemm_shapes[i].K, gemm_shapes[i].N, gemm_shapes[i].StrideB, BLayout{})));
        c_host_tensors.emplace_back(Tensor<CDataType>(f_host_tensor_descriptor(
            gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));
        c_device_tensors.emplace_back(Tensor<CDataType>(f_host_tensor_descriptor(
            gemm_shapes[i].M, gemm_shapes[i].N, gemm_shapes[i].StrideC, CLayout{})));

        a_tensors[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});

        StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
        StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
        StrideCs.push_back(std::is_same<Row, CLayout>::value ? Ns[i] : Ms[i]);
    }
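
    // Above, GeneratorTensor_2<...>{-5, 5} fills A and B with small random values in the
    // given range, presumably to keep fp16 round-off small for the reference check below.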
    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));
        b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize()));
        c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize()));

        a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
        b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());

        p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
        p_b.push_back(b_tensors_device[i]->GetDeviceBuffer());
        p_c.push_back(c_tensors_device[i]->GetDeviceBuffer());
    }
    auto a_element_op = PassThrough{};
    auto b_element_op = PassThrough{};
    auto c_element_op = PassThrough{};

    // do GEMM
    auto invoker_ptr  = groupedGemmPtr->MakeInvokerPointer();
    auto argument_ptr = groupedGemmPtr->MakeArgumentPointer(
        p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);

    DeviceMem gemm_desc_workspace(groupedGemmPtr->GetWorkSpaceSize(argument_ptr.get()));
    groupedGemmPtr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());

    invoker_ptr->Run(argument_ptr.get());
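
    // Note: unlike a single GEMM, the grouped kernel needs a device-side workspace to hold
    // the per-group problem descriptors: GetWorkSpaceSize()/SetWorkSpacePointer() above size
    // and attach it, and the single Run() call then executes all groups.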
    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());

        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                CDataType,
                                                                                AccDataType,
                                                                                PassThrough,
                                                                                PassThrough,
                                                                                PassThrough>;

        auto ref_gemm     = ReferenceGemmInstance{};
        auto ref_invoker  = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(
            a_tensors[i], b_tensors[i], c_host_tensors[i], a_element_op, b_element_op, c_element_op);

        if(!groupedGemmPtr->IsSupportedArgument(argument_ptr.get()))
        {
            return false;
        }

        ref_invoker.Run(ref_argument);

        bool res = ck::utils::check_err(c_host_tensors[i].mData, c_device_tensors[i].mData);

        std::cout << "group_id: " << i << (res ? " SUCCESS" : " FAILURE") << std::endl;

        if(!res)
            return false;
    }

    return true;
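
    // The scalar arguments below are presumably (do_verification, init_method, do_log,
    // time_kernel), mirroring other ckProfiler profile_*_impl helpers; that mapping is an
    // assumption, since this diff does not show the signature of profile_grouped_gemm_impl.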
    return ck::profiler::profile_grouped_gemm_impl<ADataType,
                                                   BDataType,
                                                   CDataType,
                                                   AccDataType,
                                                   ALayout,
                                                   BLayout,
                                                   CLayout>(
        true, 1, false, 1, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
}

} // anonymous namespace
int main()
{
    std::vector<DeviceGroupedGemmPtr_> groupedGemmPtrs;
    ck::tensor_operation::device::instance::add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
        groupedGemmPtrs);

    bool res = true;
    for(auto& gemmPtr : groupedGemmPtrs)
    {
        res &= TestGroupedGemm(gemmPtr);
    }

    res = res && TestGroupedGemm<Row, Row, Row>();
    res = res && TestGroupedGemm<Row, Col, Row>();
    res = res && TestGroupedGemm<Col, Row, Row>();
    res = res && TestGroupedGemm<Col, Col, Row>();

    std::cout << "TestGroupedGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
...