Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b78c8719
Commit
b78c8719
authored
Jul 17, 2022
by
Jing Zhang
Browse files
Merge remote-tracking branch 'origin/develop' into grouped_gemm_multiD
parents
2113ce2e
a11680cc
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1940 additions
and
328 deletions
+1940
-328
Dockerfile
Dockerfile
+17
-0
Jenkinsfile
Jenkinsfile
+88
-72
client_example/01_gemm/gemm.cpp
client_example/01_gemm/gemm.cpp
+1
-1
example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+1
-1
example/23_softmax/softmax_blockwise.cpp
example/23_softmax/softmax_blockwise.cpp
+2
-0
example/27_layernorm/CMakeLists.txt
example/27_layernorm/CMakeLists.txt
+1
-0
example/27_layernorm/layernorm_blockwise.cpp
example/27_layernorm/layernorm_blockwise.cpp
+133
-0
example/CMakeLists.txt
example/CMakeLists.txt
+1
-0
include/ck/tensor_operation/gpu/device/device_layernorm.hpp
include/ck/tensor_operation/gpu/device/device_layernorm.hpp
+346
-0
include/ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp
include/ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp
+392
-0
library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
...ry/reference_tensor_operation/cpu/reference_layernorm.hpp
+170
-0
script/process_perf_data.py
script/process_perf_data.py
+296
-0
script/profile_batched_gemm.sh
script/profile_batched_gemm.sh
+36
-0
script/profile_conv.sh
script/profile_conv.sh
+25
-164
script/profile_gemm_bias_relu_add.sh
script/profile_gemm_bias_relu_add.sh
+36
-0
script/profile_grouped_gemm.sh
script/profile_grouped_gemm.sh
+18
-0
script/profile_reduce_no_index.sh
script/profile_reduce_no_index.sh
+41
-46
script/profile_reduce_with_index.sh
script/profile_reduce_with_index.sh
+41
-44
script/profile_resnet50.sh
script/profile_resnet50.sh
+171
-0
script/run_full_performance_tests.sh
script/run_full_performance_tests.sh
+124
-0
No files found.
Dockerfile
View file @
b78c8719
...
@@ -2,6 +2,7 @@ FROM ubuntu:18.04
...
@@ -2,6 +2,7 @@ FROM ubuntu:18.04
ARG
ROCMVERSION=5.1
ARG
ROCMVERSION=5.1
ARG
OSDB_BKC_VERSION
ARG
OSDB_BKC_VERSION
ARG
compiler_version
RUN
set
-xe
RUN
set
-xe
...
@@ -93,3 +94,19 @@ RUN groupadd -f render
...
@@ -93,3 +94,19 @@ RUN groupadd -f render
RUN
git clone
-b
master https://github.com/RadeonOpenCompute/rocm-cmake.git
&&
\
RUN
git clone
-b
master https://github.com/RadeonOpenCompute/rocm-cmake.git
&&
\
cd
rocm-cmake
&&
mkdir
build
&&
cd
build
&&
\
cd
rocm-cmake
&&
mkdir
build
&&
cd
build
&&
\
cmake ..
&&
cmake
--build
.
&&
cmake
--build
.
--target
install
cmake ..
&&
cmake
--build
.
&&
cmake
--build
.
--target
install
WORKDIR
/
ENV
compiler_version=$compiler_version
RUN
sh
-c
"echo compiler version = '
$compiler_version
'"
RUN if
[
"
$compiler_version
"
=
"9110"
]
;
then
\
git clone
-b
ck-9110 https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
cd
llvm-project
&&
mkdir
build
&&
cd
build
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
make
-j
8
;
\
else
echo
"using the release compiler"
;
\
fi
#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
Jenkinsfile
View file @
b78c8719
...
@@ -95,42 +95,38 @@ def buildHipClangJob(Map conf=[:]){
...
@@ -95,42 +95,38 @@ def buildHipClangJob(Map conf=[:]){
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
}
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def
dockerArgs
if
(
params
.
USE_9110
){
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='9110' "
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
else
{
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
}
def
variant
=
env
.
STAGE_NAME
def
variant
=
env
.
STAGE_NAME
def
retimage
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
if
(
params
.
USE_DOCKERFILE
){
try
{
try
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
){
timeout
(
time:
5
,
unit:
'MINUTES'
)
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
}
}
}
catch
(
org
.
jenkinsci
.
plugins
.
workflow
.
steps
.
FlowInterruptedException
e
){
echo
"The job was cancelled or aborted"
throw
e
}
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"--no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
}
}
}
}
}
}
}
else
{
catch
(
org
.
jenkinsci
.
plugins
.
workflow
.
steps
.
FlowInterruptedException
e
){
timeout
(
time:
3
,
unit:
'HOURS'
){
echo
"The job was cancelled or aborted"
retimage
=
docker
.
image
(
'compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54'
).
pull
()
throw
e
image
=
"b56f8ac0d6ea"
}
sh
"docker images"
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
" --no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
){
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
}
}
}
}
}
...
@@ -150,9 +146,6 @@ def reboot(){
...
@@ -150,9 +146,6 @@ def reboot(){
}
}
def
buildHipClangJobAndReboot
(
Map
conf
=[:]){
def
buildHipClangJobAndReboot
(
Map
conf
=[:]){
try
{
try
{
buildHipClangJob
(
conf
)
buildHipClangJob
(
conf
)
...
@@ -186,42 +179,38 @@ def runCKProfiler(Map conf=[:]){
...
@@ -186,42 +179,38 @@ def runCKProfiler(Map conf=[:]){
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
}
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def
dockerArgs
if
(
params
.
USE_9110
){
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='9110' "
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
else
{
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
}
def
variant
=
env
.
STAGE_NAME
def
variant
=
env
.
STAGE_NAME
def
retimage
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
if
(
params
.
USE_DOCKERFILE
){
try
{
try
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
){
timeout
(
time:
5
,
unit:
'MINUTES'
)
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
}
}
}
catch
(
org
.
jenkinsci
.
plugins
.
workflow
.
steps
.
FlowInterruptedException
e
){
echo
"The job was cancelled or aborted"
throw
e
}
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"--no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
}
}
}
}
}
}
}
else
{
catch
(
org
.
jenkinsci
.
plugins
.
workflow
.
steps
.
FlowInterruptedException
e
){
timeout
(
time:
3
,
unit:
'HOURS'
){
echo
"The job was cancelled or aborted"
retimage
=
docker
.
image
(
'compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54'
).
pull
()
throw
e
image
=
"b56f8ac0d6ea"
}
sh
"docker images"
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
" --no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
){
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
}
}
}
}
}
...
@@ -238,6 +227,12 @@ def runCKProfiler(Map conf=[:]){
...
@@ -238,6 +227,12 @@ def runCKProfiler(Map conf=[:]){
sh
"echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
sh
"echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
sh
"rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
sh
"rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
sh
"hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
sh
"hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
if
(
params
.
USE_9110
){
sh
"echo Environment type: CI_9110 >> ${gemm_log}"
}
else
{
sh
"echo Environment type: CI_release >> ${gemm_log}"
}
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
sh
"./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
...
@@ -259,23 +254,44 @@ def runCKProfiler(Map conf=[:]){
...
@@ -259,23 +254,44 @@ def runCKProfiler(Map conf=[:]){
//the script will return 0 if the performance criteria are met
//the script will return 0 if the performance criteria are met
//or return 1 if the criteria are not met
//or return 1 if the criteria are not met
archiveArtifacts
"${gemm_log}"
archiveArtifacts
"${gemm_log}"
sh
"python3 p
arse
_perf_data.py ${gemm_log} "
sh
"python3 p
rocess
_perf_data.py ${gemm_log} "
//run resnet50 test
//run resnet50 test
def
resnet_log
=
"perf_resnet50_${gpu_arch}.log"
def
resnet256_log
=
"perf_resnet50_N256_${gpu_arch}.log"
sh
"rm -f ${resnet_log}"
sh
"rm -f ${resnet256_log}"
sh
"echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
sh
"echo Branch name: ${env.BRANCH_NAME} > ${resnet256_log}"
sh
"echo Node name: ${NODE_NAME} >> ${resnet_log}"
sh
"echo Node name: ${NODE_NAME} >> ${resnet256_log}"
sh
"echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
sh
"echo GPU_arch name: ${gpu_arch} >> ${resnet256_log}"
sh
"rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
sh
"rocminfo | grep 'Compute Unit:' >> ${resnet256_log} "
sh
"hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
sh
"hipcc --version | grep -e 'HIP version' >> ${resnet256_log}"
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
if
(
params
.
USE_9110
){
sh
"echo Environment type: CI_9110 >> ${resnet256_log}"
}
else
{
sh
"echo Environment type: CI_release >> ${resnet256_log}"
}
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet256_log}"
//first run tests with N=256
//first run tests with N=256
sh
"./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}"
sh
"./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet256_log}"
archiveArtifacts
"${resnet256_log}"
sh
"python3 process_perf_data.py ${resnet256_log} "
//then run with N=4
//then run with N=4
sh
"./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}"
def
resnet4_log
=
"perf_resnet50_N4_${gpu_arch}.log"
archiveArtifacts
"${resnet_log}"
sh
"rm -f ${resnet4_log}"
//the script will put the results from N=256 and N=4 runs into separate tables
sh
"echo Branch name: ${env.BRANCH_NAME} > ${resnet4_log}"
sh
"python3 parse_perf_data.py ${resnet_log} "
sh
"echo Node name: ${NODE_NAME} >> ${resnet4_log}"
sh
"echo GPU_arch name: ${gpu_arch} >> ${resnet4_log}"
sh
"rocminfo | grep 'Compute Unit:' >> ${resnet4_log} "
sh
"hipcc --version | grep -e 'HIP version' >> ${resnet4_log}"
if
(
params
.
USE_9110
){
sh
"echo Environment type: CI_9110 >> ${resnet4_log}"
}
else
{
sh
"echo Environment type: CI_release >> ${resnet4_log}"
}
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet4_log}"
sh
"./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet4_log}"
archiveArtifacts
"${resnet4_log}"
sh
"python3 process_perf_data.py ${resnet4_log} "
}
}
}
}
}
}
...
@@ -307,7 +323,7 @@ pipeline {
...
@@ -307,7 +323,7 @@ pipeline {
}
}
parameters
{
parameters
{
booleanParam
(
booleanParam
(
name:
"USE_
DOCKERFILE
"
,
name:
"USE_
9110
"
,
defaultValue:
true
,
defaultValue:
true
,
description:
""
)
description:
""
)
}
}
...
...
client_example/01_gemm/gemm.cpp
View file @
b78c8719
...
@@ -63,7 +63,7 @@ int main(int argc, char* argv[])
...
@@ -63,7 +63,7 @@ int main(int argc, char* argv[])
{
{
// use default case
// use default case
}
}
else
if
(
argc
==
5
)
else
if
(
argc
==
7
)
{
{
M
=
std
::
stoi
(
argv
[
1
]);
M
=
std
::
stoi
(
argv
[
1
]);
N
=
std
::
stoi
(
argv
[
2
]);
N
=
std
::
stoi
(
argv
[
2
]);
...
...
example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
View file @
b78c8719
...
@@ -129,7 +129,7 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
...
@@ -129,7 +129,7 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
const
Tensor
<
ADataType
>&
a_m_k
,
const
Tensor
<
ADataType
>&
a_m_k
,
const
Tensor
<
ADataType
>&
b_k_n
,
const
Tensor
<
ADataType
>&
b_k_n
,
const
Tensor
<
GammaDataType
>&
gamma_n
,
const
Tensor
<
GammaDataType
>&
gamma_n
,
const
Tensor
<
Gamm
aDataType
>&
beta_n
,
const
Tensor
<
Bet
aDataType
>&
beta_n
,
A_functor
a_element_op
,
A_functor
a_element_op
,
B_functor
b_element_op
,
B_functor
b_element_op
,
C_functor
c_element_op
,
C_functor
c_element_op
,
...
...
example/23_softmax/softmax_blockwise.cpp
View file @
b78c8719
...
@@ -212,6 +212,8 @@ int main(int argc, char* argv[])
...
@@ -212,6 +212,8 @@ int main(int argc, char* argv[])
auto
device_instance
=
DeviceInstance
{};
auto
device_instance
=
DeviceInstance
{};
std
::
cout
<<
i_inLengths
.
size
()
<<
", "
<<
i_inStrides
.
size
()
<<
std
::
endl
;
auto
argument_ptr
=
device_instance
.
MakeArgumentPointer
(
i_inLengths
,
auto
argument_ptr
=
device_instance
.
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_inStrides
,
reduceDims
,
reduceDims
,
...
...
example/27_layernorm/CMakeLists.txt
0 → 100644
View file @
b78c8719
add_example_executable
(
example_layernorm_blockwise layernorm_blockwise.cpp
)
\ No newline at end of file
example/27_layernorm/layernorm_blockwise.cpp
0 → 100644
View file @
b78c8719
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_common_util.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DeviceLayernorm
<
XDataType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
YDataType
,
PassThrough
,
Rank
,
NumReduceDim
,
256
,
// BlockSize
8
,
// ClusterM
32
,
// ClusterK
1
,
// SliceM
8
,
// SliceK
1
,
// SrcVecDim (0=M, 1=K)
8
,
// SrcScalarPerVector
8
,
// GammaScalarPerVector
8
,
// BetaScalarPerVector
1
>
;
// OutScalarPerVector
int
main
()
{
bool
time_kernel
=
false
;
ck
::
index_t
M
=
1024
;
ck
::
index_t
N
=
1024
;
ck
::
index_t
Stride
=
N
;
auto
f_host_tensor_descriptor1d
=
[](
std
::
size_t
len
,
std
::
size_t
stride
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
len
}),
std
::
vector
<
std
::
size_t
>
({
stride
}));
};
auto
f_host_tensor_descriptor2d
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
};
Tensor
<
XDataType
>
x
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
Tensor
<
GammaDataType
>
gamma
(
f_host_tensor_descriptor1d
(
N
,
1
));
Tensor
<
BetaDataType
>
beta
(
f_host_tensor_descriptor1d
(
N
,
1
));
Tensor
<
YDataType
>
y
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
x
.
GenerateTensorValue
(
GeneratorTensor_3
<
XDataType
>
{
0.0
,
1.0
});
gamma
.
GenerateTensorValue
(
GeneratorTensor_3
<
GammaDataType
>
{
0.0
,
1.0
});
beta
.
GenerateTensorValue
(
GeneratorTensor_3
<
BetaDataType
>
{
0.0
,
1.0
});
DeviceMem
x_dev
(
sizeof
(
XDataType
)
*
x
.
mDesc
.
GetElementSpace
());
DeviceMem
gamma_dev
(
sizeof
(
GammaDataType
)
*
gamma
.
mDesc
.
GetElementSpace
());
DeviceMem
beta_dev
(
sizeof
(
BetaDataType
)
*
beta
.
mDesc
.
GetElementSpace
());
DeviceMem
y_dev
(
sizeof
(
YDataType
)
*
y
.
mDesc
.
GetElementSpace
());
x_dev
.
ToDevice
(
x
.
mData
.
data
());
gamma_dev
.
ToDevice
(
gamma
.
mData
.
data
());
beta_dev
.
ToDevice
(
beta
.
mData
.
data
());
auto
device_instance
=
DeviceInstance
{};
auto
argument_ptr
=
device_instance
.
MakeArgumentPointer
(
{
M
,
N
},
std
::
vector
<
ck
::
index_t
>
{
x
.
mDesc
.
GetStrides
().
begin
(),
x
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
gamma
.
mDesc
.
GetStrides
().
begin
(),
gamma
.
mDesc
.
GetStrides
().
end
()},
std
::
vector
<
ck
::
index_t
>
{
beta
.
mDesc
.
GetStrides
().
begin
(),
beta
.
mDesc
.
GetStrides
().
end
()},
{
1
},
1e-4
,
x_dev
.
GetDeviceBuffer
(),
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
PassThrough
{});
if
(
!
device_instance
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
std
::
cout
<<
"The runtime parameters are not supported"
<<
std
::
endl
;
return
1
;
};
auto
invoker_ptr
=
device_instance
.
MakeInvokerPointer
();
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
bool
pass
=
true
;
{
Tensor
<
YDataType
>
host_y
(
f_host_tensor_descriptor2d
(
M
,
N
,
Stride
));
using
ReferenceInstance
=
ck
::
tensor_operation
::
host
::
ReferenceLayernorm
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
PassThrough
,
Rank
,
NumReduceDim
>
;
ReferenceInstance
ref
;
auto
ref_argument
=
ref
.
MakeArgument
(
x
,
gamma
,
beta
,
host_y
,
PassThrough
{},
{
M
,
N
},
{
1
},
1e-4
);
auto
ref_invoker
=
ref
.
MakeInvoker
();
ref_invoker
.
Run
(
ref_argument
);
y_dev
.
FromDevice
(
y
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
y
.
mData
,
host_y
.
mData
,
"Error: Incorrect results d1"
,
1e-3
,
1e-3
);
}
return
(
pass
?
0
:
1
);
}
example/CMakeLists.txt
View file @
b78c8719
...
@@ -45,3 +45,4 @@ add_subdirectory(23_softmax)
...
@@ -45,3 +45,4 @@ add_subdirectory(23_softmax)
add_subdirectory
(
24_batched_gemm_c_permute
)
add_subdirectory
(
24_batched_gemm_c_permute
)
add_subdirectory
(
25_gemm_bias_c_permute
)
add_subdirectory
(
25_gemm_bias_c_permute
)
add_subdirectory
(
26_contraction
)
add_subdirectory
(
26_contraction
)
add_subdirectory
(
27_layernorm
)
include/ck/tensor_operation/gpu/device/device_layernorm.hpp
0 → 100644
View file @
b78c8719
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// Y = LayerNorm(X, Beta, Gamma)
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
AccDataType
,
typename
YDataType
,
typename
AccElementwiseOperation
,
index_t
Rank
,
index_t
NumReduceDim
,
index_t
BlockSize
,
index_t
MThreadClusterSize
,
index_t
KThreadClusterSize
,
index_t
MThreadSliceSize
,
index_t
KThreadSliceSize
,
index_t
XYSrcVectorDim
,
index_t
XSrcVectorSize
,
index_t
GammaSrcVectorSize
,
index_t
BetaSrcVectorSize
,
index_t
YDstVectorSize
>
struct
DeviceLayernorm
:
public
BaseOperator
{
static_assert
(
(
KThreadSliceSize
%
GammaSrcVectorSize
==
0
),
"Invalid thread slice sizes and/or gamma vector sizes configuration, please check!"
);
static_assert
(
(
KThreadSliceSize
%
BetaSrcVectorSize
==
0
),
"Invalid thread slice sizes and/or beta vector sizes configuration, please check!"
);
using
PassThrough
=
tensor_operation
::
element_wise
::
PassThrough
;
// Used for freeloading of some handy functions from DeviceReduceMultiBlock
using
Reduction
=
DeviceReduceMultiBlock
<
XDataType
,
AccDataType
,
YDataType
,
Rank
,
NumReduceDim
,
reduce
::
Add
,
PassThrough
,
// InElementwiseOperation
AccElementwiseOperation
,
// AccElementwiseOperation
InMemoryDataOperationEnum
::
Set
,
false
,
// PropagateNan
false
,
// OutputIndex
false
,
// HaveIndexInputIfOutputIndex
BlockSize
,
MThreadClusterSize
,
KThreadClusterSize
,
MThreadSliceSize
,
KThreadSliceSize
,
XYSrcVectorDim
,
XSrcVectorSize
,
1
>
;
// YDstVectorSize
static
auto
MakeAffine1dDescriptor
(
const
std
::
vector
<
index_t
>&
Lengths
,
const
std
::
vector
<
index_t
>&
Strides
,
int
blkGroupSize
,
int
numBlockTileIteration
)
{
const
auto
tupleLengths
=
make_tuple_from_array
(
Lengths
,
Number
<
NumReduceDim
>
{});
const
auto
tupleStrides
=
make_tuple_from_array
(
Strides
,
Number
<
NumReduceDim
>
{});
auto
desc
=
make_naive_tensor_descriptor
(
tupleLengths
,
tupleStrides
);
auto
grid_desc_k
=
transform_tensor_descriptor
(
desc
,
make_tuple
(
make_merge_transform
(
tupleLengths
)),
make_tuple
(
typename
arithmetic_sequence_gen
<
0
,
NumReduceDim
,
1
>::
type
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
reduceTotalLength
=
grid_desc_k
.
GetLength
(
Number
<
0
>
{});
const
int
reduceSizePerBlock
=
Reduction
::
K_BlockTileSize
*
numBlockTileIteration
;
const
auto
Pad_K
=
reduceSizePerBlock
*
blkGroupSize
-
reduceTotalLength
;
auto
grid_desc_k_padded
=
transform_tensor_descriptor
(
grid_desc_k
,
make_tuple
(
make_right_pad_transform
(
reduceTotalLength
,
Pad_K
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
(
grid_desc_k_padded
);
};
using
GridDesc_M_K
=
decltype
(
Reduction
::
MakeSrc2dDescriptor
({
1
},
{
1
},
1
,
1
));
using
GridDesc_K
=
decltype
(
MakeAffine1dDescriptor
({
1
},
{
1
},
1
,
1
));
using
GridwiseReduceLayernormGeneric
=
GridwiseLayernorm_mk_to_mk
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
AccElementwiseOperation
,
GridDesc_M_K
,
GridDesc_K
,
BlockSize
,
MThreadClusterSize
,
KThreadClusterSize
,
MThreadSliceSize
,
KThreadSliceSize
,
XYSrcVectorDim
,
XSrcVectorSize
,
GammaSrcVectorSize
,
BetaSrcVectorSize
,
XYSrcVectorDim
,
YDstVectorSize
,
false
>
;
using
GridwiseReduceLayernormSweepOnce
=
GridwiseLayernorm_mk_to_mk
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
AccElementwiseOperation
,
GridDesc_M_K
,
GridDesc_K
,
BlockSize
,
MThreadClusterSize
,
KThreadClusterSize
,
MThreadSliceSize
,
KThreadSliceSize
,
XYSrcVectorDim
,
XSrcVectorSize
,
GammaSrcVectorSize
,
BetaSrcVectorSize
,
XYSrcVectorDim
,
YDstVectorSize
,
true
>
;
struct
Argument
:
public
Reduction
::
Argument
{
Argument
(
const
std
::
vector
<
index_t
>
lengths
,
const
std
::
vector
<
index_t
>
xStrides
,
const
std
::
vector
<
index_t
>
gammaStrides
,
const
std
::
vector
<
index_t
>
betaStrides
,
const
std
::
vector
<
index_t
>
reduceDims
,
AccElementwiseOperation
acc_elementwise_op
,
AccDataType
epsilon
,
const
XDataType
*
p_x
,
const
GammaDataType
*
p_gamma
,
const
BetaDataType
*
p_beta
,
YDataType
*
p_y
)
:
Reduction
::
Argument
(
lengths
,
xStrides
,
{},
{},
reduceDims
,
0.0
f
,
// alpha
0.0
f
,
// beta
p_x
,
nullptr
,
p_y
,
nullptr
,
acc_elementwise_op
,
PassThrough
{}),
epsilon_
(
epsilon
),
p_gamma_
(
p_gamma
),
p_beta_
(
p_beta
),
gammaStrides_
(
gammaStrides
),
betaStrides_
(
betaStrides
)
{
reduceLength_
.
resize
(
NumReduceDim
);
for
(
int
i
=
0
;
i
<
NumReduceDim
;
++
i
)
{
reduceLength_
[
i
]
=
lengths
[
reduceDims
[
i
]];
}
}
AccDataType
epsilon_
;
const
GammaDataType
*
p_gamma_
;
const
BetaDataType
*
p_beta_
;
std
::
vector
<
index_t
>
reduceLength_
;
std
::
vector
<
index_t
>
gammaStrides_
;
std
::
vector
<
index_t
>
betaStrides_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
const
auto
x_grid_desc_m_k
=
Reduction
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
,
arg
.
blkGroupSize
,
arg
.
numBlockTileIteration
);
const
auto
gamma_grid_desc_k
=
MakeAffine1dDescriptor
(
arg
.
reduceLength_
,
arg
.
gammaStrides_
,
arg
.
blkGroupSize
,
arg
.
numBlockTileIteration
);
const
auto
beta_grid_desc_k
=
MakeAffine1dDescriptor
(
arg
.
reduceLength_
,
arg
.
betaStrides_
,
arg
.
blkGroupSize
,
arg
.
numBlockTileIteration
);
const
auto
y_grid_desc_m_k
=
Reduction
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
,
arg
.
blkGroupSize
,
arg
.
numBlockTileIteration
);
bool
sweep_once
=
x_grid_desc_m_k
.
GetLength
(
Number
<
1
>
{})
<=
KThreadClusterSize
*
KThreadSliceSize
;
const
auto
kernel_main
=
sweep_once
?
kernel_layernorm
<
GridwiseReduceLayernormSweepOnce
,
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
AccElementwiseOperation
,
GridDesc_M_K
,
GridDesc_K
>
:
kernel_layernorm
<
GridwiseReduceLayernormGeneric
,
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
AccElementwiseOperation
,
GridDesc_M_K
,
GridDesc_K
>
;
float
avg_time
=
0
;
avg_time
+=
launch_and_time_kernel
(
stream_config
,
kernel_main
,
dim3
(
arg
.
gridSize
),
dim3
(
BlockSize
),
0
,
x_grid_desc_m_k
,
gamma_grid_desc_k
,
beta_grid_desc_k
,
y_grid_desc_m_k
,
arg
.
numBlockTileIteration
,
arg
.
epsilon_
,
arg
.
in_dev_
,
arg
.
p_gamma_
,
arg
.
p_beta_
,
arg
.
out_dev_
,
arg
.
acc_elementwise_op_
);
return
(
avg_time
);
};
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
};
};
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
const
Argument
*
p_arg_
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
!
Reduction
::
IsSupportedArgument
(
p_arg_
))
{
return
false
;
}
if
(
p_arg_
->
inLengths_
[
Rank
-
1
]
%
YDstVectorSize
!=
0
)
{
return
false
;
}
if
(
p_arg_
->
gammaStrides_
.
size
()
!=
NumReduceDim
||
p_arg_
->
betaStrides_
.
size
()
!=
NumReduceDim
)
return
false
;
auto
IsScalarPerVectorValid
=
[](
bool
isLastDimensionCoalesced
,
int
scalarPerVector
)
{
bool
ret
=
true
;
if
(
!
isLastDimensionCoalesced
)
ret
=
scalarPerVector
==
1
;
else
ret
=
KThreadSliceSize
%
scalarPerVector
==
0
;
return
ret
;
};
if
(
!
IsScalarPerVectorValid
(
p_arg_
->
gammaStrides_
.
back
()
==
1
,
GammaSrcVectorSize
))
return
false
;
if
(
!
IsScalarPerVectorValid
(
p_arg_
->
betaStrides_
.
back
()
==
1
,
BetaSrcVectorSize
))
return
false
;
return
true
;
};
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
std
::
vector
<
index_t
>
lengths
,
const
std
::
vector
<
index_t
>
xStrides
,
const
std
::
vector
<
index_t
>
gammaStrides
,
const
std
::
vector
<
index_t
>
betaStrides
,
const
std
::
vector
<
index_t
>
reduceDims
,
AccDataType
epsilon
,
const
void
*
p_x
,
const
void
*
p_gamma
,
const
void
*
p_beta
,
void
*
p_y
,
AccElementwiseOperation
acc_elementwise_op
)
{
return
std
::
make_unique
<
Argument
>
(
lengths
,
xStrides
,
gammaStrides
,
betaStrides
,
reduceDims
,
acc_elementwise_op
,
epsilon
,
static_cast
<
const
XDataType
*>
(
p_x
),
static_cast
<
const
GammaDataType
*>
(
p_gamma
),
static_cast
<
const
BetaDataType
*>
(
p_beta
),
static_cast
<
YDataType
*>
(
p_y
));
};
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
();
};
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DeviceLayernorm<"
<<
BlockSize
<<
","
;
str
<<
"M_C"
<<
MThreadClusterSize
<<
"_S"
<<
MThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
KThreadClusterSize
<<
"_S"
<<
KThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
KThreadClusterSize
<<
"_S"
<<
KThreadSliceSize
<<
","
;
str
<<
"XYSrcVectorDim_"
<<
XYSrcVectorDim
<<
","
;
str
<<
"VectorSize_X"
<<
XSrcVectorSize
<<
"_Gamma"
<<
GammaSrcVectorSize
<<
"_Beta"
<<
BetaSrcVectorSize
<<
"_Y"
<<
YDstVectorSize
<<
">"
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp
0 → 100644
View file @
b78c8719
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
template
<
typename
GridwiseReduction
,
typename
XDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
YDataType
,
typename
AccDataType
,
typename
AccElementwiseOperation
,
typename
GridDesc_M_K
,
typename
GridDesc_K
>
__global__
void
kernel_layernorm
(
const
GridDesc_M_K
x_grid_desc_m_k
,
const
GridDesc_K
gamma_grid_desc_k
,
const
GridDesc_K
beta_grid_desc_k
,
const
GridDesc_M_K
y_grid_desc_m_k
,
index_t
num_k_block_tile_iteration
,
AccDataType
epsilon
,
const
XDataType
*
const
__restrict__
p_x_global
,
const
GammaDataType
*
const
__restrict__
p_gamma_global
,
const
BetaDataType
*
const
__restrict__
p_beta_global
,
YDataType
*
const
__restrict__
p_y_global
,
const
AccElementwiseOperation
acc_elementwise_op
)
{
GridwiseReduction
::
Run
(
x_grid_desc_m_k
,
gamma_grid_desc_k
,
beta_grid_desc_k
,
y_grid_desc_m_k
,
num_k_block_tile_iteration
,
epsilon
,
p_x_global
,
p_gamma_global
,
p_beta_global
,
p_y_global
,
acc_elementwise_op
);
};
// Y = LayerNorm(X, Beta, Gamma)
//
// Two-pass block-wise layernorm over a 2D (M, K) view, reducing along K:
//   pass 1 accumulates sum(x) and sum(x^2) per M row across K tiles,
//   pass 2 re-reads X (tiles walked backward) and writes
//       Y = ((X - E[X]) / sqrt(Var[X] + epsilon)) * Gamma + Beta.
// When SweepOnce is true the whole K extent fits in one tile, so X is kept in
// registers between the two passes and no slice-window stepping happens.
template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
          typename AccDataType,
          typename AccElementwiseOperation,
          typename GridDesc_M_K,
          typename GridDesc_K,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XSrcVectorDim,
          index_t XSrcVectorSize,
          index_t GammaSrcVectorSize,
          index_t BetaSrcVectorSize,
          index_t YDstVectorDim,
          index_t YDstVectorSize,
          bool SweepOnce>
struct GridwiseLayernorm_mk_to_mk
{
    // The vectorized dimension's per-thread slice must be a whole multiple of
    // the vector width, for both the X load and the Y store.
    static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
                      (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) ||
                      (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    // When X is vectorized along M (dim 0), threads are arranged K-major so
    // the vector dimension stays contiguous per thread.
    static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    // Per-thread source (M x K slice) and destination (M) descriptors for the
    // threadwise sum reduction.
    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using BlockwiseSumReduce = PartitionedBlockwiseReduction<AccDataType,
                                                             BlockSize,
                                                             ThreadClusterLengths_M_K,
                                                             ThreadClusterArrangeOrder,
                                                             reduce::Add,
                                                             true>;

    using ThreadwiseSumReduce = ThreadwiseReduction<AccDataType,
                                                    ThreadReduceSrcDesc_M_K,
                                                    ThreadReduceDstDesc_M,
                                                    reduce::Add,
                                                    true>;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    // Elements of each dimension covered by one workgroup per tile.
    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    __device__ static void Run(const GridDesc_M_K& x_grid_desc_m_k,
                               const GridDesc_K& gamma_grid_desc_k,
                               const GridDesc_K& beta_grid_desc_k,
                               const GridDesc_M_K& y_grid_desc_m_k,
                               index_t num_k_block_tile_iteration,
                               AccDataType epsilon,
                               const XDataType* const __restrict__ p_x_global,
                               const GammaDataType* const __restrict__ p_gamma_global,
                               const BetaDataType* const __restrict__ p_beta_global,
                               YDataType* const __restrict__ p_y_global,
                               const AccElementwiseOperation acc_elementwise_op)
    {
        // Single-tile mode: force exactly one iteration regardless of caller.
        if constexpr(SweepOnce)
        {
            num_k_block_tile_iteration = 1;
        }

        // LDS
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            x_thread_buf;

        // NOTE: beta_thread_buf deliberately ALIASES gamma_thread_buf (reference) to
        // save registers; gamma is fully consumed before beta is loaded into it.
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, KThreadSliceSize, true> gamma_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, KThreadSliceSize, true>& beta_thread_buf =
            gamma_thread_buf;

        // NOTE: x_square_thread_buf ALIASES y_thread_buf — x^2 is only needed during
        // pass 1, y only during pass 2, so they share storage.
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            y_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr,
                     AccDataType,
                     MThreadSliceSize * KThreadSliceSize,
                     true>& x_square_thread_buf = y_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            mean_square_thread_buf;
        // var(x) overwrites mean_square_thread_buf in place once E[x^2] is final.
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>& var_value_buf =
            mean_square_thread_buf;

        // Initialize accumulators with the additive identity.
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            mean_thread_buf(I)        = reduce::Add::template GetIdentityValue<AccDataType>();
            mean_square_thread_buf(I) = reduce::Add::template GetIdentityValue<AccDataType>();
        });

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();

        // Map the flat thread id onto the (M, K) thread cluster.
        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
        using ThreadBufferLengths_K   = Sequence<KThreadSliceSize>;
        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_k =
            make_naive_tensor_descriptor_packed(make_tuple(Number<KThreadSliceSize>{}));

        // Per-thread vectorized loader for X, positioned at this thread's tile origin.
        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
                                                                  AccDataType,
                                                                  GridDesc_M_K,
                                                                  decltype(thread_buffer_desc_m_k),
                                                                  ThreadBufferLengths_M_K,
                                                                  ThreadBufferDimAccessOrder,
                                                                  XSrcVectorDim,
                                                                  XSrcVectorSize,
                                                                  1,
                                                                  true>(
            x_grid_desc_m_k,
            make_multi_index(block_global_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize,
                             thread_k_cluster_id * KThreadSliceSize));

        // Gamma/beta are 1D over K, shared by all M rows.
        auto threadwise_gamma_load = ThreadwiseTensorSliceTransfer_v2<GammaDataType,
                                                                      AccDataType,
                                                                      GridDesc_K,
                                                                      decltype(thread_buffer_desc_k),
                                                                      ThreadBufferLengths_K,
                                                                      Sequence<0>,
                                                                      0,
                                                                      GammaSrcVectorSize,
                                                                      1,
                                                                      true>(
            gamma_grid_desc_k, make_multi_index(thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_beta_load = ThreadwiseTensorSliceTransfer_v2<BetaDataType,
                                                                     AccDataType,
                                                                     GridDesc_K,
                                                                     decltype(thread_buffer_desc_k),
                                                                     ThreadBufferLengths_K,
                                                                     Sequence<0>,
                                                                     0,
                                                                     BetaSrcVectorSize,
                                                                     1,
                                                                     true>(
            beta_grid_desc_k, make_multi_index(thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_y_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               YDataType,
                                               decltype(thread_buffer_desc_m_k),
                                               GridDesc_M_K,
                                               AccElementwiseOperation,
                                               ThreadBufferLengths_M_K,
                                               ThreadBufferDimAccessOrder,
                                               YDstVectorDim,
                                               YDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                y_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * KThreadSliceSize),
                acc_elementwise_op);

        // Copy x from Cache
        // one pass: fwd, second pass: bwd
        // All step sizes collapse to 0 in SweepOnce mode (windows never move).
        constexpr auto thread_copy_fwd_step_k = make_multi_index(SweepOnce ? 0 : K_BlockTileSize);
        constexpr auto thread_copy_bwd_step_k = make_multi_index(SweepOnce ? 0 : -K_BlockTileSize);
        constexpr auto thread_copy_fwd_step_m_k =
            make_multi_index(0, SweepOnce ? 0 : K_BlockTileSize);
        constexpr auto thread_copy_bwd_step_m_k =
            make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize);

        const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_x_global, x_grid_desc_m_k.GetElementSpaceSize());

        const auto gamma_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_gamma_global, gamma_grid_desc_k.GetElementSpaceSize());

        const auto beta_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_beta_global, beta_grid_desc_k.GetElementSpaceSize());

        // E(x), E[x^2], var(x)
        // NOTE(review): assumes the first transform's second upper length is the full
        // K (reduction) extent of the descriptor — confirm against descriptor builder.
        int reduce_length = x_grid_desc_m_k.GetTransforms()[I0].GetUpperLengths()[I1];

        index_t reducedTiles = 0;

        // --- Pass 1: accumulate sum(x) and sum(x^2) over all K tiles ---
        do
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_val_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset_m_k =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    x_square_thread_buf(Number<offset_m_k>{}) =
                        x_thread_buf(Number<offset_m_k>{}) * x_thread_buf(Number<offset_m_k>{});
                });
            });

            ThreadwiseSumReduce::Reduce(x_thread_buf, mean_thread_buf);
            ThreadwiseSumReduce::Reduce(x_square_thread_buf, mean_square_thread_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);

            ++reducedTiles;
        } while(reducedTiles < num_k_block_tile_iteration);

        // Cross-thread (blockwise) reduction of the per-thread partial sums,
        // then convert sums to E[x], E[x^2] and variance.
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            // Reuse of the LDS scratch between rows requires a barrier.
            if constexpr(I > 0)
                block_sync_lds();

            BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I));
            mean_thread_buf(I) = mean_thread_buf(I) / reduce_length;

            block_sync_lds();

            BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I));
            mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length;

            // var(x) = E[x^2] - E[x]^2
            var_value_buf(I) = mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
        });

        // y = (x - E[x]) / sqrt(var[x] + epsilon)
        // --- Pass 2: walk the tiles BACKWARD from the last one, normalizing
        // and applying gamma/beta.  After pass 1 the X window sits one step
        // past the last tile, so one bwd step lands on it; gamma/beta/y
        // windows are advanced to the tail directly.
        auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k;
        auto thread_copy_tail_k   = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_k;

        threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
        threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_k, thread_copy_tail_k);
        threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_k, thread_copy_tail_k);
        threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k);

        reducedTiles = 0;

        do
        {
            // In SweepOnce mode x_thread_buf still holds the (only) tile from pass 1.
            if constexpr(!SweepOnce)
            {
                threadwise_x_load.Run(x_grid_desc_m_k,
                                      x_global_val_buf,
                                      thread_buffer_desc_m_k,
                                      make_tuple(I0, I0),
                                      x_thread_buf);
            }

            threadwise_gamma_load.Run(gamma_grid_desc_k,
                                      gamma_global_val_buf,
                                      thread_buffer_desc_k,
                                      make_tuple(I0),
                                      gamma_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset_m_k =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    constexpr auto offset_k = thread_buffer_desc_k.CalculateOffset(make_tuple(iK));

                    // normalize
                    y_thread_buf(Number<offset_m_k>{}) =
                        (x_thread_buf(Number<offset_m_k>{}) - mean_thread_buf(iM)) /
                        sqrt(var_value_buf(iM) + epsilon);

                    // gamma
                    y_thread_buf(Number<offset_m_k>{}) =
                        y_thread_buf(Number<offset_m_k>{}) * gamma_thread_buf(Number<offset_k>{});
                });
            });

            // Loading beta here overwrites gamma_thread_buf (aliased) — gamma has
            // already been fully applied above.
            threadwise_beta_load.Run(beta_grid_desc_k,
                                     beta_global_val_buf,
                                     thread_buffer_desc_k,
                                     make_tuple(I0),
                                     beta_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset_m_k =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    constexpr auto offset_k = thread_buffer_desc_k.CalculateOffset(make_tuple(iK));

                    // beta
                    y_thread_buf(Number<offset_m_k>{}) =
                        y_thread_buf(Number<offset_m_k>{}) + beta_thread_buf(Number<offset_k>{});
                });
            });

            threadwise_y_store.Run(thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   y_thread_buf,
                                   y_grid_desc_m_k,
                                   y_global_val_buf);

            // Step all windows one tile backward for the next iteration.
            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
            threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_k, thread_copy_bwd_step_k);
            threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_k, thread_copy_bwd_step_k);
            threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_bwd_step_m_k);

            ++reducedTiles;
        } while(reducedTiles < num_k_block_tile_iteration);
    }
};
}
// namespace ck
library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp
0 → 100644
View file @
b78c8719
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include <vector>
#include <algorithm>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
host
{
// CPU reference implementation of 2D layernorm, used to verify GPU results:
//   Y(m, n) = ((X(m, n) - E[X(m, :)]) / sqrt(Var[X(m, :)] + epsilon)) * Gamma(n) + Beta(n)
// Statistics are accumulated in AccDataType; the final value is converted to YDataType.
template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
          typename AccDataType,
          typename AccElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
struct ReferenceLayernorm : public device::BaseOperator
{
    // TODO - support generic layernorm
    static_assert((Rank == 2 && NumReduceDim == 1), "Only support 2D version so far");

    // Argument
    struct Argument : public device::BaseArgument
    {
        // x_m_n:   input tensor (M x N)
        // gamma_n: per-column scale (N)
        // beta_n:  per-column shift (N)
        // y_m_n:   output tensor (M x N), written in place through the reference
        // lengths: {M, N}; reduceDims: must be {1}
        Argument(const Tensor<XDataType>& x_m_n,
                 const Tensor<GammaDataType>& gamma_n,
                 const Tensor<BetaDataType>& beta_n,
                 Tensor<YDataType>& y_m_n,
                 AccElementwiseOperation acc_elementwise_op,
                 const std::vector<index_t> lengths,
                 const std::vector<index_t> reduceDims,
                 AccDataType epsilon)
            : x_m_n_(x_m_n),
              gamma_n_(gamma_n),
              beta_n_(beta_n),
              y_m_n_(y_m_n),
              acc_elementwise_op_(acc_elementwise_op),
              lengths_(lengths),
              reduceDims_(reduceDims),
              epsilon_(epsilon)
        {
        }

        const Tensor<XDataType> x_m_n_;
        // FIX: gamma_n_/beta_n_ were declared as Tensor<XDataType>, silently
        // converting the gamma/beta tensors to the X element type on copy.
        // They now use their own template data types.
        const Tensor<GammaDataType> gamma_n_;
        const Tensor<BetaDataType> beta_n_;
        Tensor<YDataType>& y_m_n_;
        AccElementwiseOperation acc_elementwise_op_;
        std::vector<index_t> lengths_;
        std::vector<index_t> reduceDims_;
        AccDataType epsilon_;
    };

    // Invoker
    struct Invoker : public device::BaseInvoker
    {
        // Runs the reference computation; returns 0 (no timing is measured).
        float Run(const Argument& arg)
        {
            int M = arg.lengths_[0];
            int N = arg.lengths_[1];

            Tensor<AccDataType> mean({M});
            Tensor<AccDataType> var({M});

            // First pass: per-row mean and (biased) variance, E[x^2] - E[x]^2.
            for(int m = 0; m < M; ++m)
            {
                mean(m) = 0;
                var(m)  = 0;

                for(int n = 0; n < N; ++n)
                {
                    auto x_val = ck::type_convert<AccDataType>(arg.x_m_n_(m, n));
                    mean(m) += x_val;
                    var(m) += x_val * x_val;
                }

                mean(m) = mean(m) / N;
                var(m)  = (var(m) / N) - (mean(m) * mean(m));
            }

            // Second pass: normalize, then scale and shift.
            // NOTE(review): acc_elementwise_op_ is not applied here, while the
            // device kernel applies it on the Y store — confirm intended parity.
            for(int m = 0; m < M; ++m)
            {
                for(int n = 0; n < N; ++n)
                {
                    auto x_val = ck::type_convert<AccDataType>(arg.x_m_n_(m, n));
                    auto y_val = (x_val - mean(m)) / sqrt(var(m) + arg.epsilon_);
                    y_val      = (y_val * arg.gamma_n_(n)) + arg.beta_n_(n);

                    arg.y_m_n_(m, n) = ck::type_convert<YDataType>(y_val);
                }
            }

            return 0;
        }

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        }
    };

    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
        return true;
    }

    // Accepts only the 2D, reduce-innermost-dim configuration.
    bool IsSupportedArgument(const device::BaseArgument* p_arg) override
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

        // FIX: guard against a mismatched argument type; the original
        // dereferenced the dynamic_cast result without a null check.
        if(p_arg_ == nullptr)
            return false;

        // TODO - support generic layernorm
        if(p_arg_->lengths_.size() != 2)
            return false;

        if(p_arg_->reduceDims_.size() != 1)
            return false;

        if(p_arg_->reduceDims_[0] != 1)
            return false;

        return true;
    }

    static auto MakeArgument(const Tensor<XDataType>& x_m_n,
                             const Tensor<GammaDataType>& gamma_n,
                             const Tensor<BetaDataType>& beta_n,
                             Tensor<YDataType>& y_m_n,
                             AccElementwiseOperation acc_elementwise_op,
                             const std::vector<index_t> lengths,
                             const std::vector<index_t> reduceDims,
                             AccDataType epsilon)
    {
        return Argument{
            x_m_n, gamma_n, beta_n, y_m_n, acc_elementwise_op, lengths, reduceDims, epsilon};
    }

    static auto MakeInvoker() { return Invoker{}; }

    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "ReferenceLayernorm" << std::endl;
        // clang-format on

        return str.str();
    }
};
}
// namespace host
}
// namespace tensor_operation
}
// namespace ck
script/process_perf_data.py
0 → 100644
View file @
b78c8719
#!/usr/bin/env python3
import
os
,
io
,
argparse
,
datetime
#import numpy as np
import
sqlalchemy
from
sqlalchemy.types
import
NVARCHAR
,
Float
,
Integer
import
pymysql
import
pandas
as
pd
from
sshtunnel
import
SSHTunnelForwarder
def print_to_string(*args, **kwargs):
    """Format arguments exactly like print() but return the text instead of writing it."""
    with io.StringIO() as sink:
        print(*args, file=sink, **kwargs)
        return sink.getvalue()
def parse_args():
    """Parse command-line arguments.

    The single positional argument is either one log file or a directory;
    for a directory, every entry whose name contains 'log' is collected.

    Returns the argparse namespace with an extra ``files`` attribute
    holding the list of log-file paths to process.
    """
    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
    # FIX: corrected typo in user-facing help text ('prase' -> 'parse').
    parser.add_argument('filename',
                        type=str,
                        help='Log file to parse or directory containing log files')
    args = parser.parse_args()

    if os.path.isdir(args.filename):
        # Keep only directory entries that look like log files.
        args.files = [os.path.join(args.filename, name)
                      for name in os.listdir(args.filename)
                      if 'log' in name]
    else:
        args.files = [args.filename]
    return args
def get_log_params(logfile):
    """Scan a profiling log for run metadata.

    Recognized markers (value is the third whitespace token of the line,
    except the ROCm version, which is cut out of the clang InstalledDir path):
    'Branch name'/'On branch', 'Node name', 'GPU_arch', 'HIP version',
    'Compute Unit', 'Environment type', 'InstalledDir'.

    Returns (branch_name, node_id, gpu_arch, compute_units, rocm_vers,
    hip_vers, environment); fields never seen stay ' ' (compute_units stays 0).
    """
    print("logfile=", logfile)
    branch_name = ' '
    node_id = ' '
    gpu_arch = ' '
    hip_vers = ' '
    compute_units = 0
    environment = ' '
    rocm_vers = ' '
    # FIX: open the log in a context manager so the handle is always closed
    # (the original relied on GC), and split each line once instead of once
    # per matching marker.
    with open(logfile) as log:
        for line in log:
            lst = line.split()
            if 'Branch name' in line or 'On branch' in line:
                branch_name = lst[2]
            if 'Node name' in line:
                node_id = lst[2]
            if 'GPU_arch' in line:
                gpu_arch = lst[2]
            if 'HIP version' in line:
                hip_vers = lst[2]
            if 'Compute Unit' in line:
                compute_units = lst[2]
            if 'Environment type' in line:
                environment = lst[2]
            if 'InstalledDir' in line:
                # Extract "X.Y.Z" from a path like /opt/rocm-X.Y.Z/llvm/bin.
                path = lst[1]
                rocm_vers = path[path.find('/opt/rocm-') + len('/opt/rocm-'):path.rfind('/llvm/bin')]
    return branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment
def parse_logfile(logfile):
    """Extract performance numbers from a ckProfiler log.

    The log type is inferred from the file *name*:
      * 'perf_gemm'  -> parse "Best Perf" lines field-by-field (tflops at token 33),
                        result sorted by the test-descriptor string;
      * 'fwd_conv'   -> parse "tflops:" lines (value at token 1);
      * a name containing any of resnet50 / batched_gemm / grouped_gemm /
        bwd_conv / fusion / reduction -> parse "Best Perf" lines (value at token 4).

    Returns a list of performance values as strings; an unrecognized file
    name yields an empty list.
    """
    glue = ''
    res = []
    tests = []
    kernels = []
    tflops = []
    dtype = []
    alayout = []
    blayout = []
    M = []
    N = []
    K = []
    StrideA = []
    StrideB = []
    StrideC = []
    if 'perf_gemm' in logfile:
        with open(logfile) as f:
            for line in f:
                if 'Best Perf' not in line:
                    continue
                lst = line.split()
                print("len(lst)=", len(lst), "lst:", lst)
                if len(lst) >= 37:  # the line is complete
                    tests.append(glue.join(lst[5:30]))
                    kernels.append(glue.join(lst[37:]))
                    tflops.append(lst[33])
                    dtype.append(lst[5])
                    alayout.append(lst[8])
                    blayout.append(lst[11])
                    M.append(lst[14])
                    N.append(lst[17])
                    K.append(lst[20])
                    StrideA.append(lst[23])
                    StrideB.append(lst[26])
                    StrideC.append(lst[29])
                elif 33 <= len(lst) < 37:  # the tflops are available
                    tests.append(glue.join(lst[5:30]))
                    kernels.append("N/A")
                    tflops.append(lst[33])
                    dtype.append(lst[5])
                    alayout.append(lst[8])
                    blayout.append(lst[11])
                    M.append(lst[14])
                    N.append(lst[17])
                    K.append(lst[20])
                    StrideA.append(lst[23])
                    StrideB.append(lst[26])
                    StrideC.append(lst[29])
                    print("warning: incomplete line:", lst)
                else:  # even the tflops are not available
                    print("Error in ckProfiler output!")
                    print("warning: incomplete line=", lst)
        # sort results by the test-descriptor string
        res = [x for _, x in sorted(zip(tests, tflops))]
    # parse fwd_conv performance tests:
    elif 'fwd_conv' in logfile:
        with open(logfile) as f:
            for line in f:
                if 'tflops:' in line:
                    res.append(line.split()[1])
    # parse all other performance tests:
    # FIX: the original condition was
    #     elif 'resnet50' or 'batched_gemm' or ... or 'reduction' in logfile:
    # which is always true ('resnet50' is a truthy literal, only the last term
    # was tested for membership), so ANY remaining file fell into this branch.
    # Now the branch is taken only when the name actually contains one of the
    # known tags, and unknown files return an empty result.
    elif any(tag in logfile for tag in
             ('resnet50', 'batched_gemm', 'grouped_gemm', 'bwd_conv', 'fusion', 'reduction')):
        with open(logfile) as f:
            for line in f:
                if 'Best Perf' in line:
                    res.append(line.split()[4])
    return res
def get_baseline(table, connection):
    """Return the newest 'develop'-branch row of `table` as a DataFrame."""
    query = ("SELECT * from {0} WHERE Datetime = "
             "(SELECT MAX(Datetime) FROM {0} where Branch_ID='develop' );").format(table)
    return pd.read_sql_query(query, connection)
def store_new_test_result(table_name,
                          test_results,
                          testlist,
                          branch_name,
                          node_id,
                          gpu_arch,
                          compute_units,
                          rocm_vers,
                          hip_vers,
                          environment,
                          connection):
    """Append one row (run metadata + per-test results) to `table_name`.

    Metadata columns come first, then one column per entry of `testlist`.
    Returns 0.
    """
    meta_cols = ['Branch_ID', 'Node_ID', 'GPU_arch', 'Compute Units',
                 'ROCM_version', 'HIP_version', 'Environment', 'Datetime']
    meta_vals = [str(branch_name), str(node_id), str(gpu_arch), compute_units,
                 str(rocm_vers), str(hip_vers), str(environment),
                 str(datetime.datetime.now())]
    combined = pd.concat([pd.DataFrame(data=[meta_vals], columns=meta_cols),
                          pd.DataFrame(data=[test_results], columns=testlist)],
                         axis=1)
    print("new test results dataframe:", combined)
    combined.to_sql(table_name, connection, if_exists='append', index=False)
    return 0
def compare_test_to_baseline(baseline, test, testlist):
    """Compare a new result list against the baseline DataFrame.

    A test regresses when its baseline value exceeds the new value by more
    than 1%. Returns 1 if any test regressed, 0 otherwise (also 0 when the
    baseline frame is empty).
    """
    regression = 0
    if baseline.empty:
        print("could not find a baseline")
        return regression

    base_values = baseline[testlist].to_numpy(dtype='float')[0]
    ave_perf = 0
    for i, base_val in enumerate(base_values):
        current = float(test[i])
        # success criterion:
        if base_val > 1.01 * current:
            print("test # ", i,
                  "shows regression by {:.3f}%".format((current - base_val) / base_val * 100))
            regression = 1
        ave_perf = ave_perf + current / base_val
    if regression == 0:
        print("no regressions found")
    ave_perf = ave_perf / len(base_values)
    print("average performance relative to baseline:", ave_perf)
    return regression
'''
def post_test_params(tlist,connection):
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_K = [x for _,x in sorted(zip(tests,K))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
ck_gemm_params=[tlist,sorted_dtypes,sorted_alayout,sorted_blayout,
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
print(df)
dtypes = {
'Test_number': Integer(),
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12),
'Blayout': NVARCHAR(length=12),
'M': Integer(),
'N': Integer(),
'K': Integer(),
'StrideA': Integer(),
'StrideB': Integer(),
'StrideC': Integer()
}
df.to_sql("ck_gemm_test_params",connection,if_exists='replace',index=False, dtype=dtypes)
'''
def main():
    """Process each log file: extract run metadata and results, upload them to
    the MySQL perf database through an SSH tunnel, and compare against the
    latest 'develop' baseline.

    Returns the regression flag (1 if any test regressed) from the LAST file
    processed. NOTE(review): if args.files is empty, `regression` is never
    bound and the final return raises UnboundLocalError — confirm callers
    always pass at least one file.
    """
    args = parse_args()
    results = []
    tflops_base = []
    # NOTE(review): testlist accumulates across files in the loop below (it is
    # never reset per file) — confirm each invocation processes one log type.
    testlist = []
    #parse the test parameters from the logfile
    for filename in args.files:
        branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment = get_log_params(filename)
        print("Branch name:", branch_name)
        print("Node name:", node_id)
        print("GPU_arch:", gpu_arch)
        print("Compute units:", compute_units)
        print("ROCM_version:", rocm_vers)
        print("HIP_version:", hip_vers)
        print("Environment:", environment)
        #parse results, get the Tflops value for "Best Perf" kernels
        results = parse_logfile(filename)
        print("Number of tests:", len(results))
        # Database endpoint/credentials come from the environment (CI secrets);
        # the MySQL server is reached via an SSH tunnel bound to localhost.
        sql_hostname = '127.0.0.1'
        sql_username = os.environ["dbuser"]
        sql_password = os.environ["dbpassword"]
        sql_main_database = 'miopen_perf'
        sql_port = 3306
        ssh_host = os.environ["dbsship"]
        ssh_user = os.environ["dbsshuser"]
        ssh_port = int(os.environ["dbsshport"])
        ssh_pass = os.environ["dbsshpassword"]
        with SSHTunnelForwarder(
                (ssh_host, ssh_port),
                ssh_username=ssh_user,
                ssh_password=ssh_pass,
                remote_bind_address=(sql_hostname, sql_port)) as tunnel:
            sqlEngine = sqlalchemy.create_engine(
                'mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.format(
                    sql_username, sql_password, sql_hostname,
                    tunnel.local_bind_port, sql_main_database))
            conn = sqlEngine.connect()
            # Pick the destination table and synthesize column names from the
            # log-file name. NOTE(review): table_name is unbound if the file
            # name matches none of the patterns below — the get_baseline call
            # would then raise; confirm input naming is controlled upstream.
            #save gemm performance tests:
            if 'perf_gemm' in filename:
                #write the ck_gemm_test_params table only needed once the test set changes
                #post_test_params(test_list,conn)
                for i in range(1, len(results) + 1):
                    testlist.append("Test%i" % i)
                table_name = "ck_gemm_tflops"
            if 'batched_gemm' in filename:
                for i in range(1, len(results) + 1):
                    testlist.append("Test%i" % i)
                table_name = "ck_batched_gemm_tflops"
            if 'grouped_gemm' in filename:
                for i in range(1, len(results) + 1):
                    testlist.append("Test%i" % i)
                table_name = "ck_grouped_gemm_tflops"
            if 'fwd_conv' in filename:
                for i in range(1, len(results) + 1):
                    testlist.append("Test%i" % i)
                table_name = "ck_fwd_conv_tflops"
            if 'bwd_conv' in filename:
                for i in range(1, len(results) + 1):
                    testlist.append("Test%i" % i)
                table_name = "ck_bwd_conv_tflops"
            if 'fusion' in filename:
                for i in range(1, len(results) + 1):
                    testlist.append("Test%i" % i)
                table_name = "ck_fusion_tflops"
            if 'reduction' in filename:
                for i in range(1, len(results) + 1):
                    testlist.append("Test%i" % i)
                table_name = "ck_reduction_GBps"
            # Resnet50 runs always use a fixed set of 49 layer columns.
            if 'resnet50_N4' in filename:
                for i in range(1, 50):
                    testlist.append("Layer%i" % i)
                table_name = "ck_resnet50_N4_tflops"
            if 'resnet50_N256' in filename:
                for i in range(1, 50):
                    testlist.append("Layer%i" % i)
                table_name = "ck_resnet50_N256_tflops"
            tflops_base = get_baseline(table_name, conn)
            store_new_test_result(table_name, results, testlist, branch_name, node_id,
                                  gpu_arch, compute_units, rocm_vers, hip_vers,
                                  environment, conn)
            conn.close()
        #compare the results to the baseline if baseline exists
        regression = 0
        regression = compare_test_to_baseline(tflops_base, results, testlist)
    return regression
# Script entry point. NOTE(review): main() returns the regression flag but the
# value is discarded here, so the process exit code is always 0 — confirm
# whether CI relies on the exit status.
if __name__ == '__main__':
    main()
\ No newline at end of file
script/profile_batched_gemm.sh
0 → 100755
View file @
b78c8719
#!/bin/bash
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
DRIVER
=
"../build/bin/ckProfiler"
OP
=
$1
DATATYPE
=
$2
LAYOUT
=
$3
VERIFY
=
$4
INIT
=
$5
LOG
=
$6
REPEAT
=
$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
960 1024 1024
-1
-1
-1
8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1920 2048 2048
-1
-1
-1
8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
3840 4096 4096
-1
-1
-1
4
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
7680 8192 8192
-1
-1
-1
2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1024 1024 1024 8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2048 2048 2048 8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4096 4096 4096 4
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8192 8192 8192 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1056 1056 1056 8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2080 2080 2080 8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4128 4128 4128 4
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8224 8224 8224 2
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1088 1088 1088 8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2112 2112 2112 8
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4160 4160 4160 4
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8256 8256 8256 2
\ No newline at end of file
script/profile_conv.sh
View file @
b78c8719
#!/bin/bash
#!/bin/bash
## GPU visibility
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
export
HIP_VISIBLE_DEVICES
=
0
DRIVER
=
"../build/bin/ckProfiler"
# make -j ckProfiler
DRIVER
=
"../build/bin/ckProfiler"
OP
=
$1
OP
=
$1
DATATYPE
=
$2
DATATYPE
=
$2
IN_LAYOUT
=
$3
IN_LAYOUT
=
$3
...
@@ -16,162 +12,27 @@ VERIFY=$6
...
@@ -16,162 +12,27 @@ VERIFY=$6
INIT
=
$7
INIT
=
$7
LOG
=
$8
LOG
=
$8
REPEAT
=
$9
REPEAT
=
$9
N
=
${
10
}
# test
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 256 3 3 30 30 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
N
=
${
10
}
# Resnet50 (no duplicated layer)
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 1024 1 1 14 14 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 56 56 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 2048 1 1 7 7 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 28 28 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 256 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 256 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 14 14 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 512 1 1 28 28 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 7 7 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
$DRIVER
$OP
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 3 7 7 224 224 2 2 1 1 3 3 3 3
# Resnet50 fusion
####### op_________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C_ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 3 7 7 224 224 2 2 1 1 3 3 3 3
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
# Resnet50
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 1024 1 1 14 14 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 58 58 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 30 30 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 256 1 1 56 56 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 16 16 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 512 1 1 28 28 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 230 230 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
# SSD
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 3 7 7 300 300 2 2 1 1 3 3 3 3
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 1 1 75 75 2 2 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 3 3 75 75 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 38 38 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 512 1 1 19 19 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 19 19 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 512 1 1 10 10 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 10 10 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 19 19 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 10 10 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 256 3 3 5 5 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 3 3 1 1 1 1 1 1 1 1
script/profile_gemm_bias_relu_add.sh
0 → 100755
View file @
b78c8719
#!/bin/bash
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
DRIVER
=
"../build/bin/ckProfiler"
OP
=
$1
DATATYPE
=
$2
LAYOUT
=
$3
VERIFY
=
$4
INIT
=
$5
LOG
=
$6
REPEAT
=
$7
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC StrideC1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
960 1024 1024
-1
-1
-1
-1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1920 2048 2048
-1
-1
-1
-1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
3840 4096 4096
-1
-1
-1
-1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
7680 8192 8192
-1
-1
-1
-1
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC StrideC1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1024 1024 1024 1024
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2048 2048 2048 2048
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4096 4096 4096 4096
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8192 8192 8192 8192
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC StrideC1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1056 1056 1056 1056
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2080 2080 2080 2080
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4128 4128 4128 4128
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8224 8224 8224 8224
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC StrideC1
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
1024 1024 1024 1088 1088 1088 1088
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
2048 2048 2048 2112 2112 2112 2112
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
4096 4096 4096 4160 4160 4160 4160
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
8192 8192 8192 8256 8256 8256 8256
\ No newline at end of file
script/profile_grouped_gemm.sh
0 → 100755
View file @
b78c8719
#!/bin/bash
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
DRIVER
=
"../build/bin/ckProfiler"
OP
=
$1
DATATYPE
=
$2
LAYOUT
=
$3
VERIFY
=
$4
INIT
=
$5
LOG
=
$6
REPEAT
=
$7
######## op datatype layout verify init log repeat Ms______________ Ns______________ Ks_____________ StrideAs___________ StrideBs__________ StrideCs___________
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
256,512,1024,768 128,256,384,1024 128,192,256,512 1024,1025,1044,1026 1024,1024,1024,1024 1025,1024,1028,1024
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
512,768,2048,128 128,256,384,1024 128,192,256,512 1024,1025,2053,1026 1024,1024,1024,1024 1025,1024,2054,1024
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
256,512,1024,768 512,256,768,1024 128,192,256,512 1024,1045,1034,1026 1024,1024,1024,1024 1025,1063,1028,1024
$DRIVER
$OP
$DATATYPE
$LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
512,768,4096,768 128,768,512,2048 128,192,256,512 1024,1027,4096,2050 1024,1024,1024,2048 1025,1024,4099,2049
\ No newline at end of file
script/profile_reduce_no_index.sh
View file @
b78c8719
#!/bin/bash
#!/bin/bash
DRIVER
=
"../build/bin/ckProfiler"
PRECISION
=
VERIFY
=
"-v
$1
"
INIT
=
$2
NREPEAT
=
$3
PRECISION
=
$4
##PRECISION=--half
##PRECISION=--half
##PRECISION=--double
##PRECISION=--double
##PRECISION=--int8
##PRECISION=--int8
...
@@ -12,14 +15,6 @@ elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
...
@@ -12,14 +15,6 @@ elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
ACCTYPE
=
"-C 2"
ACCTYPE
=
"-C 2"
fi
fi
driver
=
"./bin/ckProfiler"
VERIFY
=
"-v
$1
"
INIT
=
$2
NREPEAT
=
$3
#### 0 - ADD, 5 - AVG, 7 - NORM2
#### 0 - ADD, 5 - AVG, 7 - NORM2
Operations
=
"0 5 7"
Operations
=
"0 5 7"
...
@@ -32,19 +27,19 @@ fi
...
@@ -32,19 +27,19 @@ fi
for
op
in
$Operations
;
do
for
op
in
$Operations
;
do
set
-x
set
-x
####### datatype layout reduce dims op acctype verify init repeats
####### datatype layout reduce dims op acctype verify init repeats
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
1,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
1,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,22960
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,22960
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,22960
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,22960
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
4,1469440
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
4,1469440
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
4,1469440
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
4,1469440
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
...
@@ -55,29 +50,29 @@ Operations=5
...
@@ -55,29 +50,29 @@ Operations=5
for
op
in
$Operations
;
do
for
op
in
$Operations
;
do
set
-x
set
-x
####### datatype layout reduce dims op acctype verify init repeats
####### datatype layout reduce dims op acctype verify init repeats
$
driver
reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
script/profile_reduce_with_index.sh
View file @
b78c8719
#!/bin/bash
#!/bin/bash
DRIVER
=
"../build/bin/ckProfiler"
PRECISION
=
VERIFY
=
"-v
$1
"
INIT
=
$2
NREPEAT
=
$3
PRECISION
=
$4
##PRECISION=--half
##PRECISION=--half
##PRECISION=--double
##PRECISION=--double
##PRECISION=--int8
##PRECISION=--int8
##PRECISION=--bf16
##PRECISION=--bf16
driver
=
"./bin/ckProfiler"
VERIFY
=
"-v
$1
"
INIT
=
$2
NREPEAT
=
$3
#### 2 - MIN, 3 - MAX, 4 - AMAX
#### 2 - MIN, 3 - MAX, 4 - AMAX
Operations
=
"2 4"
Operations
=
"2 4"
...
@@ -20,19 +17,19 @@ for op in $Operations; do
...
@@ -20,19 +17,19 @@ for op in $Operations; do
for
use_idx
in
0 1
;
do
for
use_idx
in
0 1
;
do
set
-x
set
-x
####### datatype layout reduce dims op use index verify init repeats
####### datatype layout reduce dims op use index verify init repeats
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
1,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
1,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,22960
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,22960
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,22960
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,22960
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
4,1469440
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
4,1469440
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
4,1469440
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
4,1469440
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
done
done
...
@@ -44,29 +41,29 @@ for op in $Operations; do
...
@@ -44,29 +41,29 @@ for op in $Operations; do
for
use_idx
in
0 1
;
do
for
use_idx
in
0 1
;
do
set
-x
set
-x
####### datatype layout reduce dims op use index verify init repeats
####### datatype layout reduce dims op use index verify init repeats
$
driver
reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
driver
reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$
DRIVER
reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
done
done
...
...
script/profile_resnet50.sh
0 → 100755
View file @
b78c8719
#!/bin/bash
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
DRIVER
=
"../build/bin/ckProfiler"
OP
=
$1
DATATYPE
=
$2
IN_LAYOUT
=
$3
WEI_LAYOUT
=
$4
OUT_LAYOUT
=
$5
VERIFY
=
$6
INIT
=
$7
LOG
=
$8
REPEAT
=
$9
N
=
${
10
}
# test
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 256 3 3 30 30 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 128 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
# Resnet50 (no duplicated layer)
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
# Resnet50 fusion
####### op_________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C_ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 3 7 7 224 224 2 2 1 1 3 3 3 3
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
# Resnet50
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 1024 1 1 14 14 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 58 58 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 30 30 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 256 1 1 56 56 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 16 16 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 512 1 1 28 28 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 230 230 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
# SSD
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 3 7 7 300 300 2 2 1 1 3 3 3 3
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 1 1 75 75 2 2 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 3 3 75 75 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 38 38 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 512 1 1 19 19 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 19 19 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 512 1 1 10 10 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 10 10 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 19 19 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 10 10 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 256 3 3 5 5 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 3 3 1 1 1 1 1 1 1 1
script/run_full_performance_tests.sh
0 → 100755
View file @
b78c8719
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
pip3
install
--upgrade
pip
pip3
install
sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#
# run the script as "./run_full_performance_tests.sh <tag for your test environment>
#get the test environment type:
export
env_type
=
$1
echo
'Environment type '
$env_type
function
print_log_header
(){
rm
-f
$1
;
git status |
grep
-e
'On branch'
>
$1
;
echo
-n
'Node name: '
>>
$1
;
hostname
>>
$1
;
#get GPU_arch and number of compute units from rocminfo
echo
-n
"GPU_arch: "
>>
$1
;
rocminfo |
grep
"Name:"
|
grep
"gfx"
>>
$1
;
rocminfo |
grep
"Compute Unit:"
>>
$1
;
hipcc
--version
|
grep
-e
'HIP version'
>>
$1
;
echo
'Environment type: '
$2
>>
$1
;
/opt/rocm/bin/amdclang++
--version
|
grep
-e
'InstalledDir'
>>
$1
;
}
#run gemm tests
export
gemm_log
=
"perf_gemm.log"
print_log_header
$gemm_log
$env_type
./profile_gemm.sh gemm 0 0 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 0 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 |
tee
-a
$gemm_log
python3 process_perf_data.py
$gemm_log
#run resnet50 tests
export
resnet256_log
=
"perf_resnet50_N256.log"
print_log_header
$resnet256_log
$env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 |
tee
-a
$resnet256_log
python3 process_perf_data.py
$resnet256_log
export
resnet4_log
=
"perf_resnet50_N4.log"
print_log_header
$resnet4_log
$env_type
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 |
tee
-a
$resnet4_log
python3 process_perf_data.py
$resnet4_log
#run batched_gemm tests
export
batched_gemm_log
=
"perf_batched_gemm.log"
print_log_header
$batched_gemm_log
$env_type
./profile_batched_gemm.sh batched_gemm 0 0 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2 0 2 0 5 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3 0 2 0 5 |
tee
-a
$batched_gemm_log
python3 process_perf_data.py
$batched_gemm_log
#run grouped_gemm tests
export
grouped_gemm_log
=
"perf_grouped_gemm.log"
print_log_header
$grouped_gemm_log
$env_type
./profile_grouped_gemm.sh grouped_gemm 1 0 0 2 0 5 |
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1 0 2 0 5 |
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2 0 2 0 5 |
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3 0 2 0 5 |
tee
-a
$grouped_gemm_log
python3 process_perf_data.py
$grouped_gemm_log
#run fwd_conv tests
export
fwd_conv_log
=
"perf_fwd_conv.log"
print_log_header
$fwd_conv_log
$env_type
./profile_conv.sh conv_fwd 0 1 0 2 0 5 2 256 |
tee
-a
$fwd_conv_log
./profile_conv.sh conv_fwd 1 1 0 2 0 5 2 256 |
tee
-a
$fwd_conv_log
./profile_conv.sh conv_fwd 2 1 0 2 0 5 2 256 |
tee
-a
$fwd_conv_log
./profile_conv.sh conv_fwd 3 1 0 2 0 5 2 256 |
tee
-a
$fwd_conv_log
python3 process_perf_data.py
$fwd_conv_log
#run bwd_conv tests
export
bwd_conv_log
=
"perf_bwd_conv.log"
print_log_header
$bwd_conv_log
$env_type
./profile_conv.sh conv2d_bwd_data 0 1 1 1 0 2 0 5 128 |
tee
-a
$bwd_conv_log
./profile_conv.sh conv2d_bwd_data 1 1 1 1 0 2 0 5 128 |
tee
-a
$bwd_conv_log
./profile_conv.sh conv2d_bwd_data 2 1 1 1 0 2 0 5 128 |
tee
-a
$bwd_conv_log
./profile_conv.sh conv2d_bwd_data 3 1 1 1 0 2 0 5 128 |
tee
-a
$bwd_conv_log
python3 process_perf_data.py
$bwd_conv_log
#run fusion tests
export
fusion_log
=
"perf_fusion.log"
print_log_header
$fusion_log
$env_type
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 0 0 2 0 5 |
tee
-a
$fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 1 0 2 0 5 |
tee
-a
$fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 2 0 2 0 5 |
tee
-a
$fusion_log
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 3 0 2 0 5 |
tee
-a
$fusion_log
python3 process_perf_data.py
$fusion_log
#run reduction tests
export
reduction_log
=
"perf_reduction.log"
print_log_header
$reduction_log
$env_type
./profile_reduce_with_index.sh 0 2 10
--half
|
tee
-a
$reduction_log
./profile_reduce_no_index.sh 0 2 10
--half
|
tee
-a
$reduction_log
python3 process_perf_data.py
$reduction_log
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment