"tests/pipelines/vscode:/vscode.git/clone" did not exist on "17f9aed79cd073f4475bd3af1c6f34b681839685"
Commit 67fcb0bd authored by Chao Liu's avatar Chao Liu
Browse files

Merge remote-tracking branch 'origin/develop' into gelu

parents 578ffb6b 1ced00a5
...@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification,
{ {
using namespace ck::tensor_operation::device; using namespace ck::tensor_operation::device;
using namespace ck::tensor_operation::device::device_reduce_instance; using namespace ck::tensor_operation::device::device_reduce_instance;
using namespace ck::host_reduce;
using ck::host_common::dumpBufferToFile; using ck::host_common::dumpBufferToFile;
constexpr bool op_support_indices = constexpr bool op_support_indices =
...@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
using InElementwiseOperation_0 = using InElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>:: typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
InElementwiseOperation; InElementwiseOperation;
using AccElementwiseOperation_0 = using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>:: typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
AccElementwiseOperation; AccElementwiseOperation;
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using DeviceReduceInstPtr0 = using DeviceReduceInstPtr0 =
DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>; DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;
std::vector<DeviceReduceInstPtr0> reduce0_ptrs; std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
...@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification,
ReductionHost<InDataType, ReductionHost<InDataType,
AccDataType, AccDataType,
OutDataType, OutDataType,
ReduceOpId, ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
Rank, Rank,
NumReduceDim, NumReduceDim,
PropagateNan, PropagateNan,
...@@ -337,9 +340,8 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -337,9 +340,8 @@ bool profile_reduce_impl_impl(bool do_verification,
for(auto& reduce_ptr : reduce0_ptrs) for(auto& reduce_ptr : reduce0_ptrs)
{ {
InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length)); InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation_0 acc_elementwise_op_0( AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));
static_cast<int32_t>(reduce_total_length));
auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
i_inStrides, i_inStrides,
...@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification,
nullptr, nullptr,
out_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(), out_indices_dev.GetDeviceBuffer(),
in_elementwise_op_0, in_elementwise_op,
acc_elementwise_op_0); acc_elementwise_op);
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue; continue;
......
#!/usr/bin/env python3 #!/usr/bin/env python3
import os, io, argparse, datetime import os, io, argparse, datetime, re
import numpy as np import numpy as np
import sqlalchemy import sqlalchemy
from sqlalchemy.types import NVARCHAR, Float, Integer from sqlalchemy.types import NVARCHAR, Float, Integer
...@@ -45,66 +45,98 @@ def main(): ...@@ -45,66 +45,98 @@ def main():
StrideB=[] StrideB=[]
StrideC=[] StrideC=[]
#parse results, get the Tflops value for "Best Perf" kernels #parse results, get the Tflops value for "Best Perf" kernels
glue="" glue=""
for filename in args.files: for filename in args.files:
for line in open(filename): for line in open(filename):
if 'Branch name' in line: if 'Branch name' in line:
lst=line.split() lst=line.split()
branch_name=lst[2] branch_name=lst[2]
for filename in args.files: if 'On branch' in line:
for line in open(filename): lst=line.split()
if 'Best Perf' in line: branch_name=lst[2]
if 'Node name' in line:
lst=line.split() lst=line.split()
if len(lst)>=37: #the line is complete node_id=lst[2]
tests.append(glue.join(lst[5:30])) if 'GPU_arch' in line:
kernels.append(glue.join(lst[37:])) lst=line.split()
tflops.append(lst[33]) gpu_arch=lst[2]
dtype.append(lst[5]) if 'HIP version' in line:
alayout.append(lst[8]) lst=line.split()
blayout.append(lst[11]) hip_vers=lst[2]
M.append(lst[14]) if 'Compute Unit' in line:
N.append(lst[17]) lst=line.split()
K.append(lst[20]) compute_units=lst[2]
StrideA.append(lst[23]) if 'InstalledDir' in line:
StrideB.append(lst[26]) lst=line.split()
StrideC.append(lst[29]) rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
print("Number of tests:",len(tests))
print("Branch name:",branch_name) print("Branch name:",branch_name)
#sorted_tests = sorted(tests) print("Node name:",node_id)
#print("sorted tests:",sorted_tests) print("GPU_arch:",gpu_arch)
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))] print("Compute units:",compute_units)
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] print("ROCM_version:",rocm_vers)
test_list=list(range(1,len(tests)+1)) print("HIP_version:",hip_vers)
#parse gemm performance tests:
if 'gemm' in filename:
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
#parse resnet50 performance tests:
if 'resnet50' in filename:
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
tflops.append(lst[4])
print("Number of tests:",len(tflops))
sql_hostname = '127.0.0.1' sql_hostname = '127.0.0.1'
sql_username = os.environ["dbuser"] sql_username = os.environ["dbuser"]
print("sql_username=",sql_username)
sql_password = os.environ["dbpassword"] sql_password = os.environ["dbpassword"]
sql_main_database = 'miopen_perf' sql_main_database = 'miopen_perf'
sql_port = 3306 sql_port = 3306
ssh_host = os.environ["dbsship"] ssh_host = os.environ["dbsship"]
print("ssh_host=",ssh_host)
ssh_user = os.environ["dbsshuser"] ssh_user = os.environ["dbsshuser"]
print("ssh_user=",ssh_user)
ssh_port = int(os.environ["dbsshport"]) ssh_port = int(os.environ["dbsshport"])
ssh_pass = os.environ["dbsshpassword"] ssh_pass = os.environ["dbsshpassword"]
...@@ -118,75 +150,140 @@ def main(): ...@@ -118,75 +150,140 @@ def main():
format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database)) format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
conn = sqlEngine.connect() conn = sqlEngine.connect()
#write the ck_gemm_test_params table #save gemm performance tests:
#only needed once the test set changes if 'gemm' in filename:
'''
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))] #write the ck_gemm_test_params table
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))] #only needed once the test set changes
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))] '''
sorted_M = [x for _,x in sorted(zip(tests,M))] sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_N = [x for _,x in sorted(zip(tests,N))] sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_K = [x for _,x in sorted(zip(tests,K))] sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))] sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))] sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))] sorted_K = [x for _,x in sorted(zip(tests,K))]
ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout, sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB, sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC] sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type', ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC']) sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
print(df) sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
dtypes = { 'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
'Test_number': Integer(), print(df)
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12), dtypes = {
'Blayout': NVARCHAR(length=12), 'Test_number': Integer(),
'M': Integer(), 'Data_type': NVARCHAR(length=5),
'N': Integer(), 'Alayout': NVARCHAR(length=12),
'K': Integer(), 'Blayout': NVARCHAR(length=12),
'StrideA': Integer(), 'M': Integer(),
'StrideB': Integer(), 'N': Integer(),
'StrideC': Integer() 'K': Integer(),
} 'StrideA': Integer(),
df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes) 'StrideB': Integer(),
''' 'StrideC': Integer()
}
#read baseline results for the latest develop branch df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );''' '''
tflops_base = pd.read_sql_query(query, conn)
#read baseline results for the latest develop branch
#write new results to the db query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
testlist=[] tflops_base = pd.read_sql_query(query, conn)
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i) #write new results to the db
ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())] testlist=[]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime']) for i in range(1,len(tests)+1):
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist) testlist.append("Test%i"%i)
flops=pd.concat([flops,df_add],axis=1) ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
print("new tflops results:",flops) flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False) df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops for gemm tests:",flops)
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
#save resnet50 performance tests:
if 'resnet50' in filename:
#read baseline results for the latest develop branch
query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
tflops_base_N256 = pd.read_sql_query(query, conn)
query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
tflops_base_N4 = pd.read_sql_query(query, conn)
#write new results to the db
testlist=[]
for i in range(1,50):
testlist.append("Layer%i"%i)
ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
flops=pd.concat([flops0,df_add],axis=1)
print("new tflops for N=256 resnet50 test:",flops)
flops.to_sql("ck_resnet50_N256_tflops",conn,if_exists='append',index=False)
df_add=pd.DataFrame(data=[tflops[49:98]],columns=testlist)
flops=pd.concat([flops0,df_add],axis=1)
print("new tflops for N=4 resnet50 test:",flops)
flops.to_sql("ck_resnet50_N4_tflops",conn,if_exists='append',index=False)
conn.close() conn.close()
#compare the results to the baseline #compare the results to the baseline if baseline exists
regression=0 regression=0
base=tflops_base[testlist].to_numpy(dtype='float') if 'gemm' in filename:
base_list=base[0] if not tflops_base.empty:
ave_perf=0 base=tflops_base[testlist].to_numpy(dtype='float')
for i in range(len(base_list)): base_list=base[0]
# success criterion: ave_perf=0
if base_list[i]>1.01*float(sorted_tflops[i]): for i in range(len(base_list)):
print("test # ",i,"shows regression by {:.3f}%".format( # success criterion:
(float(sorted_tflops[i])-base_list[i])/base_list[i]*100)) if base_list[i]>1.01*float(sorted_tflops[i]):
regression=1 print("test # ",i,"shows regression by {:.3f}%".format(
ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i] (float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
if regression==0: regression=1
print("no regressions found") ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
ave_perf=ave_perf/len(base_list) if regression==0:
print("average performance relative to baseline:",ave_perf) print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline")
if 'resnet50' in filename:
if not tflops_base_N256.empty:
base=tflops_base_N256[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(tflops[i]):
print("layer # ",i,"shows regression by {:.3f}%".format(
(float(tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline for N=256")
if not tflops_base_N4.empty:
base=tflops_base_N4[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(tflops[i+49]):
print("layer # ",i,"shows regression by {:.3f}%".format(
(float(tflops[i+49])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(tflops[i+49])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline for N=4")
#return 0 if performance criteria met, otherwise return 1 #return 0 if performance criteria met, otherwise return 1
return regression return regression
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
## GPU visibility ## GPU visibility
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
make -j ckProfiler # make -j ckProfiler
DRIVER="./profiler/ckProfiler" DRIVER="../build/bin/ckProfiler"
OP=$1 OP=$1
DATATYPE=$2 DATATYPE=$2
...@@ -51,56 +51,56 @@ REPEAT=$9 ...@@ -51,56 +51,56 @@ REPEAT=$9
# Resnet50 from Bing # Resnet50 from Bing
#################### op____________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads ####### op_________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C_ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
# Resnet50 # Resnet50
......
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
# pip3 install --upgrade pip
# pip3 install sqlalchemy
# pip3 install pymysql
# pip3 install pandas
# pip3 install sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#
export gemm_log="perf_gemm.log"
rm -f $gemm_log
git status | grep -e 'On branch' > ${gemm_log}
echo -n 'Node name: ' >>${gemm_log}; hostname >> ${gemm_log}
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> ${gemm_log}; rocminfo | grep "Name:" | grep "gfx" >> ${gemm_log}
rocminfo | grep "Compute Unit:" >> ${gemm_log}
hipcc --version | grep -e 'HIP version' >> ${gemm_log}
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
python3 parse_perf_data.py ${gemm_log}
#run resnet50 test
export resnet_log="perf_resnet50.log"
rm -f $resnet_log
git status | grep -e 'On branch' > ${resnet_log}
echo -n 'Node name: '>>${resnet_log}; hostname >>${resnet_log}
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> ${resnet_log}; rocminfo | grep "Name:" | grep "gfx" >> ${resnet_log}
rocminfo | grep "Compute Unit:" >> ${resnet_log}
hipcc --version | grep -e 'HIP version' >> ${resnet_log}
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}
#first run tests with N=256
./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}
#then run with N=4
./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}
#the script will put the results from N=256 and N=4 runs into separate tables
python3 parse_perf_data.py ${resnet_log}
...@@ -141,10 +141,15 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr) ...@@ -141,10 +141,15 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
auto c_element_op = PassThrough{}; auto c_element_op = PassThrough{};
// do GEMM // do GEMM
auto invoker_ptr = groupedGemmPtr->MakeInvokerPointer(); auto invoker_ptr = groupedGemmPtr->MakeInvokerPointer();
auto argument_ptr = groupedGemmPtr->MakeArgumentPointer( auto argument_ptr = groupedGemmPtr->MakeArgumentPointer(
p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op); p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);
DeviceMem gemm_desc_workspace(groupedGemmPtr->GetWorkSpaceSize(argument_ptr.get()));
groupedGemmPtr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get()); invoker_ptr->Run(argument_ptr.get());
for(std::size_t i = 0; i < gemm_shapes.size(); i++) for(std::size_t i = 0; i < gemm_shapes.size(); i++)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment