Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a9bbab4d
Commit
a9bbab4d
authored
Sep 08, 2022
by
Po-Yen, Chen
Browse files
Merge branch 'develop' into feature/add-permute-device-op
parents
ac9d0a67
d6709dc3
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
150 additions
and
79 deletions
+150
-79
Jenkinsfile
Jenkinsfile
+6
-0
example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
...gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+5
-4
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
...device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+3
-3
include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
...n/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
+11
-4
include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
...id/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+11
-4
script/process_perf_data.py
script/process_perf_data.py
+14
-1
script/process_qa_data.sh
script/process_qa_data.sh
+4
-2
script/profile_onnx_gemm.sh
script/profile_onnx_gemm.sh
+31
-0
script/run_full_performance_tests.sh
script/run_full_performance_tests.sh
+65
-61
No files found.
Jenkinsfile
View file @
a9bbab4d
...
...
@@ -352,6 +352,8 @@ def runCKProfiler(Map conf=[:]){
archiveArtifacts
"perf_conv_bwd_data_${gpu_arch}.log"
archiveArtifacts
"perf_gemm_bilinear_${gpu_arch}.log"
archiveArtifacts
"perf_reduction_${gpu_arch}.log"
archiveArtifacts
"perf_splitK_gemm_${gpu_arch}.log"
archiveArtifacts
"perf_onnx_gemm_${gpu_arch}.log"
// stash perf files to master
stash
name:
"perf_gemm_${gpu_arch}.log"
stash
name:
"perf_resnet50_N256_${gpu_arch}.log"
...
...
@@ -362,6 +364,8 @@ def runCKProfiler(Map conf=[:]){
stash
name:
"perf_conv_bwd_data_${gpu_arch}.log"
stash
name:
"perf_gemm_bilinear_${gpu_arch}.log"
stash
name:
"perf_reduction_${gpu_arch}.log"
stash
name:
"perf_splitK_gemm_${gpu_arch}.log"
stash
name:
"perf_onnx_gemm_${gpu_arch}.log"
//we will process results on the master node
}
else
{
...
...
@@ -442,6 +446,8 @@ def process_results(Map conf=[:]){
unstash
"perf_conv_bwd_data_${gpu_arch}.log"
unstash
"perf_gemm_bilinear_${gpu_arch}.log"
unstash
"perf_reduction_${gpu_arch}.log"
unstash
"perf_splitK_gemm_${gpu_arch}.log"
unstash
"perf_onnx_gemm_${gpu_arch}.log"
sh
"./process_qa_data.sh ${gpu_arch}"
}
else
{
...
...
example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
View file @
a9bbab4d
...
...
@@ -58,7 +58,7 @@ using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
using
B1ElementOp
=
PassThrough
;
using
CElementOp
=
PassThrough
;
static
constexpr
auto
Gemm
Default
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
Gemm
Spec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNOPadding
;
using
DeviceGemmInstance
=
ck
::
tensor_operation
::
device
::
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
...
...
@@ -77,7 +77,7 @@ using DeviceGemmInstance =
Acc0ElementOp
,
B1ElementOp
,
CElementOp
,
Gemm
Default
,
Gemm
Spec
,
1
,
256
,
128
,
// MPerBlock
...
...
@@ -166,8 +166,6 @@ int main(int argc, char* argv[])
// C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3])
ck
::
index_t
G0
=
7
;
ck
::
index_t
G1
=
13
;
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_lengths
{
G0
,
G1
,
M
,
O
};
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_strides
{
M
*
G1
*
O
,
O
,
G1
*
O
,
1
};
if
(
argc
==
1
)
{
...
...
@@ -204,6 +202,9 @@ int main(int argc, char* argv[])
exit
(
0
);
}
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_lengths
{
G0
,
G1
,
M
,
O
};
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_strides
{
M
*
G1
*
O
,
O
,
G1
*
O
,
1
};
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB0
=
ck
::
is_same_v
<
B0Layout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideB1
=
ck
::
is_same_v
<
B1Layout
,
Row
>
?
O
:
N
;
...
...
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
View file @
a9bbab4d
...
...
@@ -693,9 +693,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
}
// Check if C permute dimension matches GEMM + GEMM shape
const
index_t
c_g
=
arg
.
c_grid_desc_g_m_n_
.
GetLength
(
I0
);
const
index_t
c_m
=
arg
.
c_grid_desc_
g_
m_n_
.
GetLength
(
I
1
);
const
index_t
c_gemm1n
=
arg
.
c_grid_desc_
g_
m_n_
.
GetLength
(
I
2
);
const
index_t
c_g
=
arg
.
c_grid_desc_g_m_n_
.
GetLength
(
I0
);
// unpadded
const
index_t
c_m
=
arg
.
c_grid_desc_m_n_
.
GetLength
(
I
0
);
const
index_t
c_gemm1n
=
arg
.
c_grid_desc_m_n_
.
GetLength
(
I
1
);
const
index_t
a_m
=
arg
.
a_grid_desc_ak0_m_ak1_
.
GetLength
(
I1
);
const
index_t
b1_gemm1n
=
arg
.
b1_grid_desc_bk0_n_bk1_
.
GetLength
(
I1
);
if
(
!
(
c_g
==
arg
.
batch_count_
&&
c_m
==
a_m
&&
c_gemm1n
==
b1_gemm1n
))
...
...
include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp
View file @
a9bbab4d
...
...
@@ -594,10 +594,17 @@ struct GridwiseBatchedGemmGemm_Xdl_CShuffle
static_cast
<
FloatAB
*>
(
p_shared
)
+
SharedMemTrait
::
b1_block_space_offset
,
b1_block_desc_bk0_n_bk1
.
GetElementSpaceSize
());
// selected_mfma.k_per_blk <= B1K1 <= selected_mfma.group_size
constexpr
index_t
Gemm1KPack
=
math
::
max
(
math
::
gcd
(
MfmaSelector
<
FloatAB
,
MPerXdl
,
NPerXdl
>::
selected_mfma
.
group_size
,
B1K1
),
MfmaSelector
<
FloatAB
,
MPerXdl
,
NPerXdl
>::
selected_mfma
.
k_per_blk
);
// selected_mfma.group_size or B1K1 <= Gemm1KPack <= selected_mfma.group_size
// selected_mfma.k_per_blk <= Gemm1KPack
//
// Following a rationale similar to the one behind Gemm0KPack, let Gemm1KPack be the least common
// multiple of A1K1 (predetermined by selected_mfma.group_size) and B1K1. But in this case
// Gemm1KPack can't be higher than A1K1 itself, because the A1 matrix is distributed in VGPRs
// in runs of 'group_size' contiguous elements. Having Gemm1KPack greater than A1K1 would
// cause a mismatch in the summation index, for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
// Therefore we may just as well assign Gemm1KPack = group_size.
constexpr
index_t
Gemm1KPack
=
MfmaSelector
<
FloatAB
,
MPerXdl
,
NPerXdl
>::
selected_mfma
.
group_size
;
auto
gemm1_blockwise_gemm
=
BlockwiseGemmXdlops_v2
<
BlockSize
,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
View file @
a9bbab4d
...
...
@@ -645,10 +645,17 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
static_cast
<
FloatAB
*>
(
p_shared
)
+
SharedMemTrait
::
b1_block_space_offset
,
b1_block_desc_bk0_n_bk1
.
GetElementSpaceSize
());
// selected_mfma.k_per_blk <= B1K1 <= selected_mfma.group_size
constexpr
index_t
Gemm1KPack
=
math
::
max
(
math
::
gcd
(
MfmaSelector
<
FloatAB
,
MPerXdl
,
NPerXdl
>::
selected_mfma
.
group_size
,
B1K1
),
MfmaSelector
<
FloatAB
,
MPerXdl
,
NPerXdl
>::
selected_mfma
.
k_per_blk
);
// selected_mfma.group_size or B1K1 <= Gemm1KPack <= selected_mfma.group_size
// selected_mfma.k_per_blk <= Gemm1KPack
//
// Following a rationale similar to the one behind Gemm0KPack, let Gemm1KPack be the least common
// multiple of A1K1 (predetermined by selected_mfma.group_size) and B1K1. But in this case
// Gemm1KPack can't be higher than A1K1 itself, because the A1 matrix is distributed in VGPRs
// in runs of 'group_size' contiguous elements. Having Gemm1KPack greater than A1K1 would
// cause a mismatch in the summation index, for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
// Therefore we may just as well assign Gemm1KPack = group_size.
constexpr
index_t
Gemm1KPack
=
MfmaSelector
<
FloatAB
,
MPerXdl
,
NPerXdl
>::
selected_mfma
.
group_size
;
auto
gemm1_blockwise_gemm
=
BlockwiseGemmXdlops_v2
<
BlockSize
,
...
...
script/process_perf_data.py
View file @
a9bbab4d
...
...
@@ -127,11 +127,16 @@ def parse_logfile(logfile):
lst
=
line
.
split
()
res
.
append
(
lst
[
1
])
#parse all other performance tests:
elif
'resnet50'
or
'batched_gemm'
or
'grouped_gemm'
or
'conv_bwd_data'
or
'gemm_bilinear'
or
'reduction'
in
logfile
:
elif
'resnet50'
in
logfile
or
'batched_gemm'
in
logfile
or
'grouped_gemm'
in
logfile
or
'conv_bwd_data'
in
logfile
or
'gemm_bilinear'
in
logfile
or
'reduction'
in
logfile
:
for
line
in
open
(
logfile
):
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
res
.
append
(
lst
[
4
])
elif
'onnx_gemm'
in
logfile
or
'splitK_gemm'
in
logfile
:
for
line
in
open
(
logfile
):
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
res
.
append
(
lst
[
33
])
return
res
...
...
@@ -281,6 +286,14 @@ def main():
for
i
in
range
(
1
,
50
):
testlist
.
append
(
"Layer%i"
%
i
)
table_name
=
"ck_resnet50_N256_tflops"
if
'onnx_gemm'
in
filename
:
for
i
in
range
(
1
,
len
(
results
)
+
1
):
testlist
.
append
(
"Test%i"
%
i
)
table_name
=
"ck_onnx_gemm_tflops"
if
'splitK_gemm'
in
filename
:
for
i
in
range
(
1
,
len
(
results
)
+
1
):
testlist
.
append
(
"Test%i"
%
i
)
table_name
=
"ck_splitK_gemm_tflops"
tflops_base
=
get_baseline
(
table_name
,
conn
)
store_new_test_result
(
table_name
,
results
,
testlist
,
branch_name
,
node_id
,
gpu_arch
,
compute_units
,
rocm_vers
,
hip_vers
,
environment
,
conn
)
...
...
script/process_qa_data.sh
View file @
a9bbab4d
...
...
@@ -2,8 +2,8 @@
#
# in order to run this script you'd need the following python packages:
pip3
install
--upgrade
pip
pip3
install
sqlalchemy pymysql pandas sshtunnel
#
pip3 install --upgrade pip
#
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
...
...
@@ -20,3 +20,5 @@ python3 process_perf_data.py perf_conv_fwd_"$gpu_arch".log
python3 process_perf_data.py perf_conv_bwd_data_
"
$gpu_arch
"
.log
python3 process_perf_data.py perf_gemm_bilinear_
"
$gpu_arch
"
.log
python3 process_perf_data.py perf_reduction_
"
$gpu_arch
"
.log
python3 process_perf_data.py perf_splitK_gemm_
"
$gpu_arch
"
.log
python3 process_perf_data.py perf_onnx_gemm_
"
$gpu_arch
"
.log
script/profile_onnx_gemm.sh
0 → 100755
View file @
a9bbab4d
#!/bin/bash
#
# Runs the ckProfiler binary once per GEMM problem size in the ONNX list below.
#
# Positional arguments, forwarded to the profiler in this order:
#   $1 op   $2 datatype   $3 layout   $4 verify   $5 init   $6 log   $7 time

## GPU visibility
export HIP_VISIBLE_DEVICES=0

DRIVER="../build/bin/ckProfiler"
echo $DRIVER

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# GEMM kernel benchmarks used by ONNX.
# Each entry is "M N K"; it is expanded unquoted on purpose so that it splits
# into three separate arguments. StrideA/StrideB/StrideC are always passed as -1
# (NOTE(review): presumably "use the default stride" -- confirm against ckProfiler).
for mnk in \
    "384 768 768" \
    "384 768 2304" \
    "384 768 3072" \
    "384 3072 768" \
    "384 1024 1024" \
    "384 1024 3072" \
    "384 1024 4096" \
    "384 4096 1024" \
    "24576 768 768" \
    "24576 768 2304" \
    "24576 768 3072" \
    "24576 3072 768" \
    "24576 1024 1024" \
    "24576 1024 3072" \
    "24576 1024 4096" \
    "24576 4096 1024"
do
    ######## op  datatype  layout  verify  init  log  time  M N K  StrideA StrideB StrideC
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $mnk -1 -1 -1
done
script/run_full_performance_tests.sh
View file @
a9bbab4d
...
...
@@ -40,99 +40,103 @@ function print_log_header(){
#run gemm tests
export
gemm_log
=
"perf_gemm_
${
gpu_arch
}
.log"
print_log_header
$gemm_log
$env_type
$branch
$host_name
./profile_gemm.sh gemm 0 0
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 0
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 0
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 0
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 1
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 1
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 1
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 1
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 2
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 2
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 2
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 2
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 3
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 3
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 3
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 3
$verify
1 0 1 |
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 0
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 0
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 0
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 0
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 1
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 1
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 1
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 1
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 2
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 2
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 2
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 2
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 0 3
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 1 3
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 2 3
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
./profile_gemm.sh gemm 3 3
$verify
1 0 1
2>&1
|
tee
-a
$gemm_log
#run batched_gemm tests
export
batched_gemm_log
=
"perf_batched_gemm_
${
gpu_arch
}
.log"
print_log_header
$batched_gemm_log
$env_type
$branch
$host_name
./profile_batched_gemm.sh batched_gemm 0 0
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3
$verify
1 0 1 |
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 0
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 1
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 2
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 0 3
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 0
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 1
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 2
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 1 3
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 0
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 1
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 2
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 2 3
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 0
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 1
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 2
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
./profile_batched_gemm.sh batched_gemm 3 3
$verify
1 0 1
2>&1
|
tee
-a
$batched_gemm_log
#run grouped_gemm tests
export
grouped_gemm_log
=
"perf_grouped_gemm_
${
gpu_arch
}
.log"
print_log_header
$grouped_gemm_log
$env_type
$branch
$host_name
./profile_grouped_gemm.sh grouped_gemm 1 0
$verify
1 0 1 |
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1
$verify
1 0 1 |
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2
$verify
1 0 1 |
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3
$verify
1 0 1 |
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 0
$verify
1 0 1
2>&1
|
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 1
$verify
1 0 1
2>&1
|
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 2
$verify
1 0 1
2>&1
|
tee
-a
$grouped_gemm_log
./profile_grouped_gemm.sh grouped_gemm 1 3
$verify
1 0 1
2>&1
|
tee
-a
$grouped_gemm_log
#run GEMM+Bilinear tests
export
gemm_bilinear_log
=
"perf_gemm_bilinear_
${
gpu_arch
}
.log"
print_log_header
$gemm_bilinear_log
$env_type
$branch
$host_name
./profile_gemm_bilinear.sh gemm_bilinear 1 0
$verify
1 0 1 |
tee
-a
$gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 1
$verify
1 0 1 |
tee
-a
$gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 2
$verify
1 0 1 |
tee
-a
$gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 3
$verify
1 0 1 |
tee
-a
$gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 0
$verify
1 0 1
2>&1
|
tee
-a
$gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 1
$verify
1 0 1
2>&1
|
tee
-a
$gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 2
$verify
1 0 1
2>&1
|
tee
-a
$gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 3
$verify
1 0 1
2>&1
|
tee
-a
$gemm_bilinear_log
#run conv_fwd tests
export
conv_fwd_log
=
"perf_conv_fwd_
${
gpu_arch
}
.log"
print_log_header
$conv_fwd_log
$env_type
$branch
$host_name
./profile_conv_fwd.sh conv_fwd 0 1
$verify
1 0 1 256 |
tee
-a
$conv_fwd_log
./profile_conv_fwd.sh conv_fwd 1 1
$verify
1 0 1 256 |
tee
-a
$conv_fwd_log
./profile_conv_fwd.sh conv_fwd 2 1
$verify
1 0 1 256 |
tee
-a
$conv_fwd_log
./profile_conv_fwd.sh conv_fwd 3 1
$verify
1 0 1 256 |
tee
-a
$conv_fwd_log
./profile_conv_fwd.sh conv_fwd 0 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_fwd_log
./profile_conv_fwd.sh conv_fwd 1 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_fwd_log
./profile_conv_fwd.sh conv_fwd 2 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_fwd_log
./profile_conv_fwd.sh conv_fwd 3 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_fwd_log
#run conv_bwd_data tests
export
conv_bwd_data_log
=
"perf_conv_bwd_data_
${
gpu_arch
}
.log"
print_log_header
$conv_bwd_data_log
$env_type
$branch
$host_name
./profile_conv_bwd_data.sh conv_bwd_data 0 1
$verify
1 0 1 256 |
tee
-a
$conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 1 1
$verify
1 0 1 256 |
tee
-a
$conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 2 1
$verify
1 0 1 256 |
tee
-a
$conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 3 1
$verify
1 0 1 256 |
tee
-a
$conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 0 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 1 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 2 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_bwd_data_log
./profile_conv_bwd_data.sh conv_bwd_data 3 1
$verify
1 0 1 256
2>&1
|
tee
-a
$conv_bwd_data_log
#run resnet50 tests
export
resnet256_log
=
"perf_resnet50_N256_
${
gpu_arch
}
.log"
print_log_header
$resnet256_log
$env_type
$branch
$host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1
$verify
1 0 1 256 |
tee
-a
$resnet256_log
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1
$verify
1 0 1 256
2>&1
|
tee
-a
$resnet256_log
export
resnet4_log
=
"perf_resnet50_N4_
${
gpu_arch
}
.log"
print_log_header
$resnet4_log
$env_type
$branch
$host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1
$verify
1 0 1 4 |
tee
-a
$resnet4_log
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1
$verify
1 0 1 4
2>&1
|
tee
-a
$resnet4_log
#run reduction tests
export
reduction_log
=
"perf_reduction_
${
gpu_arch
}
.log"
print_log_header
$reduction_log
$env_type
$branch
$host_name
./profile_reduce_with_index.sh
$verify
2 10
--half
|
tee
-a
$reduction_log
./profile_reduce_no_index.sh
$verify
2 10
--half
|
tee
-a
$reduction_log
./profile_reduce_with_index.sh
$verify
2 10
--half
2>&1
|
tee
-a
$reduction_log
./profile_reduce_no_index.sh
$verify
2 10
--half
2>&1
|
tee
-a
$reduction_log
#run splitK_gemm tests
export
splitK_gemm_log
=
"perf_splitK_gemm_
${
gpu_arch
}
.log"
print_log_header
$splitK_gemm_log
$env_type
$branch
$host_name
./profile_splitK_gemm.sh gemm_splitk 0 0
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 0 1
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 0 2
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 0 3
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 0
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 1
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 2
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 3
$verify
1 0 1 4 2>&1 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 0 0
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 0 1
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 0 2
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 0 3
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 1 0
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 1 1
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 1 2
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
../script/profile_splitK_gemm.sh gemm_splitk 1 3
$verify
1 0 1 4 |
tee
-a
$splitK_gemm_log
#run ONNX gemm tests
export
onnx_log
=
"perf_onnx_gemm_
${
gpu_arch
}
.log"
print_log_header
$onnx_log
$env_type
$branch
$host_name
./profile_onnx_gemm.sh gemm 0 0
$verify
2 0 1 2>&1 |
tee
-a
$onnx_log
./profile_onnx_gemm.sh gemm 1 0
$verify
2 0 1 2>&1 |
tee
-a
$onnx_log
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment