Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
67fcb0bd
Commit
67fcb0bd
authored
Jun 14, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into gelu
parents
578ffb6b
1ced00a5
Changes
45
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
334 additions
and
172 deletions
+334
-172
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+12
-10
script/parse_perf_data.py
script/parse_perf_data.py
+206
-109
script/profile_conv.sh
script/profile_conv.sh
+52
-52
script/run_performance_tests.sh
script/run_performance_tests.sh
+58
-0
test/grouped_gemm/grouped_gemm_fp16.cpp
test/grouped_gemm/grouped_gemm_fp16.cpp
+6
-1
No files found.
profiler/include/profile_reduce_impl.hpp
View file @
67fcb0bd
...
@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification,
...
@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification,
{
{
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
::
device_reduce_instance
;
using
namespace
ck
::
tensor_operation
::
device
::
device_reduce_instance
;
using
namespace
ck
::
host_reduce
;
using
ck
::
host_common
::
dumpBufferToFile
;
using
ck
::
host_common
::
dumpBufferToFile
;
constexpr
bool
op_support_indices
=
constexpr
bool
op_support_indices
=
...
@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification,
...
@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification,
float
best_avg_time
=
0
;
float
best_avg_time
=
0
;
float
best_gb_per_sec
=
0
;
float
best_gb_per_sec
=
0
;
using
InElementwiseOperation
_0
=
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
InElementwiseOperation
;
using
AccElementwiseOperation
_0
=
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
AccElementwiseOperation
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
DeviceReduceInstPtr0
=
using
DeviceReduceInstPtr0
=
DeviceReducePtr
<
InElementwiseOperation
_0
,
AccElementwiseOperation
_0
>
;
DeviceReducePtr
<
InElementwiseOperation
,
AccElementwiseOperation
>
;
std
::
vector
<
DeviceReduceInstPtr0
>
reduce0_ptrs
;
std
::
vector
<
DeviceReduceInstPtr0
>
reduce0_ptrs
;
...
@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification,
...
@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification,
ReductionHost
<
InDataType
,
ReductionHost
<
InDataType
,
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
ReduceOpId
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
Rank
,
Rank
,
NumReduceDim
,
NumReduceDim
,
PropagateNan
,
PropagateNan
,
...
@@ -337,9 +340,8 @@ bool profile_reduce_impl_impl(bool do_verification,
...
@@ -337,9 +340,8 @@ bool profile_reduce_impl_impl(bool do_verification,
for
(
auto
&
reduce_ptr
:
reduce0_ptrs
)
for
(
auto
&
reduce_ptr
:
reduce0_ptrs
)
{
{
InElementwiseOperation_0
in_elementwise_op_0
(
static_cast
<
int32_t
>
(
reduce_total_length
));
InElementwiseOperation
in_elementwise_op
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_0
acc_elementwise_op_0
(
AccElementwiseOperation
acc_elementwise_op
(
static_cast
<
int32_t
>
(
reduce_total_length
));
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_inStrides
,
...
@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification,
...
@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification,
nullptr
,
nullptr
,
out_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
in_elementwise_op
_0
,
in_elementwise_op
,
acc_elementwise_op
_0
);
acc_elementwise_op
);
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
continue
;
continue
;
...
...
script/parse_perf_data.py
View file @
67fcb0bd
#!/usr/bin/env python3
#!/usr/bin/env python3
import
os
,
io
,
argparse
,
datetime
import
os
,
io
,
argparse
,
datetime
,
re
import
numpy
as
np
import
numpy
as
np
import
sqlalchemy
import
sqlalchemy
from
sqlalchemy.types
import
NVARCHAR
,
Float
,
Integer
from
sqlalchemy.types
import
NVARCHAR
,
Float
,
Integer
...
@@ -45,12 +45,41 @@ def main():
...
@@ -45,12 +45,41 @@ def main():
StrideB
=
[]
StrideB
=
[]
StrideC
=
[]
StrideC
=
[]
#parse results, get the Tflops value for "Best Perf" kernels
#parse results, get the Tflops value for "Best Perf" kernels
glue
=
""
glue
=
""
for
filename
in
args
.
files
:
for
filename
in
args
.
files
:
for
line
in
open
(
filename
):
for
line
in
open
(
filename
):
if
'Branch name'
in
line
:
if
'Branch name'
in
line
:
lst
=
line
.
split
()
lst
=
line
.
split
()
branch_name
=
lst
[
2
]
branch_name
=
lst
[
2
]
if
'On branch'
in
line
:
lst
=
line
.
split
()
branch_name
=
lst
[
2
]
if
'Node name'
in
line
:
lst
=
line
.
split
()
node_id
=
lst
[
2
]
if
'GPU_arch'
in
line
:
lst
=
line
.
split
()
gpu_arch
=
lst
[
2
]
if
'HIP version'
in
line
:
lst
=
line
.
split
()
hip_vers
=
lst
[
2
]
if
'Compute Unit'
in
line
:
lst
=
line
.
split
()
compute_units
=
lst
[
2
]
if
'InstalledDir'
in
line
:
lst
=
line
.
split
()
rocm_vers
=
lst
[
1
][
lst
[
1
].
find
(
'/opt/rocm-'
)
+
len
(
'/opt/rocm-'
):
lst
[
1
].
rfind
(
'/llvm/bin'
)]
print
(
"Branch name:"
,
branch_name
)
print
(
"Node name:"
,
node_id
)
print
(
"GPU_arch:"
,
gpu_arch
)
print
(
"Compute units:"
,
compute_units
)
print
(
"ROCM_version:"
,
rocm_vers
)
print
(
"HIP_version:"
,
hip_vers
)
#parse gemm performance tests:
if
'gemm'
in
filename
:
for
filename
in
args
.
files
:
for
filename
in
args
.
files
:
for
line
in
open
(
filename
):
for
line
in
open
(
filename
):
if
'Best Perf'
in
line
:
if
'Best Perf'
in
line
:
...
@@ -85,26 +114,29 @@ def main():
...
@@ -85,26 +114,29 @@ def main():
elif
len
(
lst
)
<
33
:
#even the tflops are not available
elif
len
(
lst
)
<
33
:
#even the tflops are not available
print
(
"Error in ckProfiler output!"
)
print
(
"Error in ckProfiler output!"
)
print
(
"warning: incomplete line="
,
lst
)
print
(
"warning: incomplete line="
,
lst
)
#sort results
#sort results
print
(
"Number of tests:"
,
len
(
tests
))
print
(
"Branch name:"
,
branch_name
)
#sorted_tests = sorted(tests)
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
#print("sorted tests:",sorted_tests)
sorted_tflops
=
[
x
for
_
,
x
in
sorted
(
zip
(
tests
,
tflops
))]
sorted_tflops
=
[
x
for
_
,
x
in
sorted
(
zip
(
tests
,
tflops
))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list
=
list
(
range
(
1
,
len
(
tests
)
+
1
))
test_list
=
list
(
range
(
1
,
len
(
tests
)
+
1
))
#parse resnet50 performance tests:
if
'resnet50'
in
filename
:
for
filename
in
args
.
files
:
for
line
in
open
(
filename
):
if
'Best Perf'
in
line
:
lst
=
line
.
split
()
tflops
.
append
(
lst
[
4
])
print
(
"Number of tests:"
,
len
(
tflops
))
sql_hostname
=
'127.0.0.1'
sql_hostname
=
'127.0.0.1'
sql_username
=
os
.
environ
[
"dbuser"
]
sql_username
=
os
.
environ
[
"dbuser"
]
print
(
"sql_username="
,
sql_username
)
sql_password
=
os
.
environ
[
"dbpassword"
]
sql_password
=
os
.
environ
[
"dbpassword"
]
sql_main_database
=
'miopen_perf'
sql_main_database
=
'miopen_perf'
sql_port
=
3306
sql_port
=
3306
ssh_host
=
os
.
environ
[
"dbsship"
]
ssh_host
=
os
.
environ
[
"dbsship"
]
print
(
"ssh_host="
,
ssh_host
)
ssh_user
=
os
.
environ
[
"dbsshuser"
]
ssh_user
=
os
.
environ
[
"dbsshuser"
]
print
(
"ssh_user="
,
ssh_user
)
ssh_port
=
int
(
os
.
environ
[
"dbsshport"
])
ssh_port
=
int
(
os
.
environ
[
"dbsshport"
])
ssh_pass
=
os
.
environ
[
"dbsshpassword"
]
ssh_pass
=
os
.
environ
[
"dbsshpassword"
]
...
@@ -118,6 +150,9 @@ def main():
...
@@ -118,6 +150,9 @@ def main():
format
(
sql_username
,
sql_password
,
sql_hostname
,
tunnel
.
local_bind_port
,
sql_main_database
))
format
(
sql_username
,
sql_password
,
sql_hostname
,
tunnel
.
local_bind_port
,
sql_main_database
))
conn
=
sqlEngine
.
connect
()
conn
=
sqlEngine
.
connect
()
#save gemm performance tests:
if
'gemm'
in
filename
:
#write the ck_gemm_test_params table
#write the ck_gemm_test_params table
#only needed once the test set changes
#only needed once the test set changes
'''
'''
...
@@ -160,16 +195,42 @@ def main():
...
@@ -160,16 +195,42 @@ def main():
testlist
=
[]
testlist
=
[]
for
i
in
range
(
1
,
len
(
tests
)
+
1
):
for
i
in
range
(
1
,
len
(
tests
)
+
1
):
testlist
.
append
(
"Test%i"
%
i
)
testlist
.
append
(
"Test%i"
%
i
)
ck_gemm_tflops
=
[
str
(
branch_name
),
str
(
datetime
.
datetime
.
now
())]
ck_gemm_tflops
=
[
str
(
branch_name
),
str
(
node_id
),
str
(
gpu_arch
),
compute_units
,
str
(
rocm_vers
),
str
(
hip_vers
),
str
(
datetime
.
datetime
.
now
())]
flops
=
pd
.
DataFrame
(
data
=
[
ck_gemm_tflops
],
columns
=
[
'Branch_ID'
,
'Datetime'
])
flops
=
pd
.
DataFrame
(
data
=
[
ck_gemm_tflops
],
columns
=
[
'Branch_ID'
,
'
Node_ID'
,
'GPU_arch'
,
'Compute Units'
,
'ROCM_version'
,
'HIP_version'
,
'
Datetime'
])
df_add
=
pd
.
DataFrame
(
data
=
[
sorted_tflops
],
columns
=
testlist
)
df_add
=
pd
.
DataFrame
(
data
=
[
sorted_tflops
],
columns
=
testlist
)
flops
=
pd
.
concat
([
flops
,
df_add
],
axis
=
1
)
flops
=
pd
.
concat
([
flops
,
df_add
],
axis
=
1
)
print
(
"new tflops
resul
ts:"
,
flops
)
print
(
"new tflops
for gemm tes
ts:"
,
flops
)
flops
.
to_sql
(
"ck_gemm_tflops"
,
conn
,
if_exists
=
'append'
,
index
=
False
)
flops
.
to_sql
(
"ck_gemm_tflops"
,
conn
,
if_exists
=
'append'
,
index
=
False
)
#save resnet50 performance tests:
if
'resnet50'
in
filename
:
#read baseline results for the latest develop branch
query
=
'''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
tflops_base_N256
=
pd
.
read_sql_query
(
query
,
conn
)
query
=
'''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
tflops_base_N4
=
pd
.
read_sql_query
(
query
,
conn
)
#write new results to the db
testlist
=
[]
for
i
in
range
(
1
,
50
):
testlist
.
append
(
"Layer%i"
%
i
)
ck_resnet_tflops
=
[
str
(
branch_name
),
str
(
node_id
),
str
(
gpu_arch
),
compute_units
,
str
(
rocm_vers
),
str
(
hip_vers
),
str
(
datetime
.
datetime
.
now
())]
flops0
=
pd
.
DataFrame
(
data
=
[
ck_resnet_tflops
],
columns
=
[
'Branch_ID'
,
'Node_ID'
,
'GPU_arch'
,
'Compute Units'
,
'ROCM_version'
,
'HIP_version'
,
'Datetime'
])
df_add
=
pd
.
DataFrame
(
data
=
[
tflops
[
0
:
49
]],
columns
=
testlist
)
flops
=
pd
.
concat
([
flops0
,
df_add
],
axis
=
1
)
print
(
"new tflops for N=256 resnet50 test:"
,
flops
)
flops
.
to_sql
(
"ck_resnet50_N256_tflops"
,
conn
,
if_exists
=
'append'
,
index
=
False
)
df_add
=
pd
.
DataFrame
(
data
=
[
tflops
[
49
:
98
]],
columns
=
testlist
)
flops
=
pd
.
concat
([
flops0
,
df_add
],
axis
=
1
)
print
(
"new tflops for N=4 resnet50 test:"
,
flops
)
flops
.
to_sql
(
"ck_resnet50_N4_tflops"
,
conn
,
if_exists
=
'append'
,
index
=
False
)
conn
.
close
()
conn
.
close
()
#compare the results to the baseline
#compare the results to the baseline
if baseline exists
regression
=
0
regression
=
0
if
'gemm'
in
filename
:
if
not
tflops_base
.
empty
:
base
=
tflops_base
[
testlist
].
to_numpy
(
dtype
=
'float'
)
base
=
tflops_base
[
testlist
].
to_numpy
(
dtype
=
'float'
)
base_list
=
base
[
0
]
base_list
=
base
[
0
]
ave_perf
=
0
ave_perf
=
0
...
@@ -184,9 +245,45 @@ def main():
...
@@ -184,9 +245,45 @@ def main():
print
(
"no regressions found"
)
print
(
"no regressions found"
)
ave_perf
=
ave_perf
/
len
(
base_list
)
ave_perf
=
ave_perf
/
len
(
base_list
)
print
(
"average performance relative to baseline:"
,
ave_perf
)
print
(
"average performance relative to baseline:"
,
ave_perf
)
else
:
print
(
"could not find a baseline"
)
if
'resnet50'
in
filename
:
if
not
tflops_base_N256
.
empty
:
base
=
tflops_base_N256
[
testlist
].
to_numpy
(
dtype
=
'float'
)
base_list
=
base
[
0
]
ave_perf
=
0
for
i
in
range
(
len
(
base_list
)):
# success criterion:
if
base_list
[
i
]
>
1.01
*
float
(
tflops
[
i
]):
print
(
"layer # "
,
i
,
"shows regression by {:.3f}%"
.
format
(
(
float
(
tflops
[
i
])
-
base_list
[
i
])
/
base_list
[
i
]
*
100
))
regression
=
1
ave_perf
=
ave_perf
+
float
(
tflops
[
i
])
/
base_list
[
i
]
if
regression
==
0
:
print
(
"no regressions found"
)
ave_perf
=
ave_perf
/
len
(
base_list
)
print
(
"average performance relative to baseline:"
,
ave_perf
)
else
:
print
(
"could not find a baseline for N=256"
)
if
not
tflops_base_N4
.
empty
:
base
=
tflops_base_N4
[
testlist
].
to_numpy
(
dtype
=
'float'
)
base_list
=
base
[
0
]
ave_perf
=
0
for
i
in
range
(
len
(
base_list
)):
# success criterion:
if
base_list
[
i
]
>
1.01
*
float
(
tflops
[
i
+
49
]):
print
(
"layer # "
,
i
,
"shows regression by {:.3f}%"
.
format
(
(
float
(
tflops
[
i
+
49
])
-
base_list
[
i
])
/
base_list
[
i
]
*
100
))
regression
=
1
ave_perf
=
ave_perf
+
float
(
tflops
[
i
+
49
])
/
base_list
[
i
]
if
regression
==
0
:
print
(
"no regressions found"
)
ave_perf
=
ave_perf
/
len
(
base_list
)
print
(
"average performance relative to baseline:"
,
ave_perf
)
else
:
print
(
"could not find a baseline for N=4"
)
#return 0 if performance criteria met, otherwise return 1
#return 0 if performance criteria met, otherwise return 1
return
regression
return
regression
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
script/profile_conv.sh
View file @
67fcb0bd
...
@@ -3,9 +3,9 @@
...
@@ -3,9 +3,9 @@
## GPU visibility
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
export
HIP_VISIBLE_DEVICES
=
0
make
-j
ckProfiler
#
make -j ckProfiler
DRIVER
=
".
/profiler
/ckProfiler"
DRIVER
=
".
./build/bin
/ckProfiler"
OP
=
$1
OP
=
$1
DATATYPE
=
$2
DATATYPE
=
$2
...
@@ -51,56 +51,56 @@ REPEAT=$9
...
@@ -51,56 +51,56 @@ REPEAT=$9
# Resnet50 from Bing
# Resnet50 from Bing
#############
####### op_________________
___
datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C_
__
Y X Hi_
_
Wi__ Strides Dilations LeftPads RightPads
####### op_________________
datatype in_layout wei_layout out_layout verify init log repeat
N__
K___ C_ Y X
Hi_ Wi__ Strides Dilations LeftPads RightPads
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 3 7 7 224 224 2 2 1 1 3 3 3 3
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 256 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 256 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
64 64 3 3 56 56 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 256 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 56 56 2 2 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
128 128 3 3 28 28 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 28 28 2 2 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 14 14 2 2 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 2048 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 7 7 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 2048 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler
conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER
conv_fwd_bias_relu
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
512 512 3 3 7 7 1 1 1 1 1 1 1 1
#profiler/ckProfiler
conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER
conv_fwd_bias_relu_add
$DATATYPE
$IN_LAYOUT
$WEI_LAYOUT
$OUT_LAYOUT
$VERIFY
$INIT
$LOG
$REPEAT
$N
2048 512 1 1 7 7 1 1 1 1 0 0 0 0
# Resnet50
# Resnet50
...
...
script/run_performance_tests.sh
0 → 100644
View file @
67fcb0bd
#!/bin/bash
#
# Runs the gemm and resnet50 performance tests and hands each resulting log to
# parse_perf_data.py.
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
# pip3 install --upgrade pip
# pip3 install sqlalchemy
# pip3 install pymysql
# pip3 install pandas
# pip3 install sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#

# write_env_header LOGFILE
#
# Starts LOGFILE afresh and records the environment the tests run in: git
# branch, node name, GPU architecture, compute units, HIP version and the
# compiler install directory. The labels written here ('On branch',
# 'Node name', 'GPU_arch', 'Compute Unit', 'HIP version', 'InstalledDir') are
# the strings parse_perf_data.py searches for, so they must not be reworded.
write_env_header() {
    local log_file="$1"

    rm -f "$log_file"
    # '>' (not '>>'): this is the first line of a fresh log.
    git status | grep -e 'On branch' > "$log_file"
    # 'echo -n' keeps the label and the command output on the same line.
    echo -n 'Node name: ' >> "$log_file"; hostname >> "$log_file"
    #get GPU_arch and number of compute units from rocminfo
    echo -n "GPU_arch: " >> "$log_file"; rocminfo | grep "Name:" | grep "gfx" >> "$log_file"
    rocminfo | grep "Compute Unit:" >> "$log_file"
    hipcc --version | grep -e 'HIP version' >> "$log_file"
    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$log_file"
}

#run gemm tests
export gemm_log="perf_gemm.log"
write_env_header "$gemm_log"

# Sweep the first two numeric arguments of profile_gemm.sh over 0..3 each
# (16 runs); the first argument varies fastest, matching the original call order.
# NOTE(review): presumably these are the data type and layout selectors --
# confirm against profile_gemm.sh.
for arg2 in 0 1 2 3; do
    for arg1 in 0 1 2 3; do
        ./profile_gemm.sh gemm "$arg1" "$arg2" 0 1 0 5 | tee -a "$gemm_log"
    done
done

python3 parse_perf_data.py "$gemm_log"

#run resnet50 test
export resnet_log="perf_resnet50.log"
write_env_header "$resnet_log"

#first run tests with N=256
./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a "$resnet_log"
#then run with N=4
./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a "$resnet_log"

#the script will put the results from N=256 and N=4 runs into separate tables
python3 parse_perf_data.py "$resnet_log"
test/grouped_gemm/grouped_gemm_fp16.cpp
View file @
67fcb0bd
...
@@ -142,9 +142,14 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
...
@@ -142,9 +142,14 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
// do GEMM
// do GEMM
auto
invoker_ptr
=
groupedGemmPtr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
groupedGemmPtr
->
MakeInvokerPointer
();
auto
argument_ptr
=
groupedGemmPtr
->
MakeArgumentPointer
(
auto
argument_ptr
=
groupedGemmPtr
->
MakeArgumentPointer
(
p_a
,
p_b
,
p_c
,
gemm_shapes
,
a_element_op
,
b_element_op
,
c_element_op
);
p_a
,
p_b
,
p_c
,
gemm_shapes
,
a_element_op
,
b_element_op
,
c_element_op
);
DeviceMem
gemm_desc_workspace
(
groupedGemmPtr
->
GetWorkSpaceSize
(
argument_ptr
.
get
()));
groupedGemmPtr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
gemm_desc_workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
());
invoker_ptr
->
Run
(
argument_ptr
.
get
());
for
(
std
::
size_t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment