Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
1945c26b
Commit
1945c26b
authored
Apr 26, 2023
by
Adam Osewski
Browse files
Reduce number of logged output. Add constant initialization.
parent
88436bd9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
44 additions
and
25 deletions
+44
-25
profiler/include/profiler/profile_grouped_gemm_impl.hpp
profiler/include/profiler/profile_grouped_gemm_impl.hpp
+44
-25
No files found.
profiler/include/profiler/profile_grouped_gemm_impl.hpp
View file @
1945c26b
...
@@ -19,6 +19,7 @@
...
@@ -19,6 +19,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -43,7 +44,6 @@ bool profile_grouped_gemm_impl(int do_verification,
...
@@ -43,7 +44,6 @@ bool profile_grouped_gemm_impl(int do_verification,
const
std
::
vector
<
int
>&
StrideCs
,
const
std
::
vector
<
int
>&
StrideCs
,
int
kbatch
=
1
)
int
kbatch
=
1
)
{
{
bool
pass
=
true
;
bool
pass
=
true
;
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
...
@@ -81,15 +81,18 @@ bool profile_grouped_gemm_impl(int do_verification,
...
@@ -81,15 +81,18 @@ bool profile_grouped_gemm_impl(int do_verification,
c_m_n_device_results
.
push_back
(
c_m_n_device_results
.
push_back
(
Tensor
<
CDataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideCs
[
i
],
CLayout
{})));
Tensor
<
CDataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideCs
[
i
],
CLayout
{})));
#if DEBUG_LOG
std
::
cout
<<
"group: "
<<
i
<<
" a_m_k["
<<
i
<<
"]:"
<<
a_m_k
[
i
].
mDesc
<<
", b_k_n["
<<
i
std
::
cout
<<
"group: "
<<
i
<<
" a_m_k["
<<
i
<<
"]:"
<<
a_m_k
[
i
].
mDesc
<<
", b_k_n["
<<
i
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
#endif // DEBUG_LOG
std
::
size_t
num_thread
=
1
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
utils
::
FillConstant
<
ADataType
>
{
1.0
}(
a_m_k
[
i
]);
utils
::
FillConstant
<
BDataType
>
{
1.0
}(
b_k_n
[
i
]);
break
;
case
1
:
case
1
:
a_m_k
[
i
].
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
},
num_thread
);
a_m_k
[
i
].
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
},
num_thread
);
b_k_n
[
i
].
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
},
num_thread
);
b_k_n
[
i
].
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
},
num_thread
);
...
@@ -191,10 +194,10 @@ bool profile_grouped_gemm_impl(int do_verification,
...
@@ -191,10 +194,10 @@ bool profile_grouped_gemm_impl(int do_verification,
DeviceMem
gemm_desc_workspace
(
gemm_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
()));
DeviceMem
gemm_desc_workspace
(
gemm_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
()));
gemm_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
gemm_desc_workspace
.
GetDeviceBuffer
());
gemm_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
gemm_desc_workspace
.
GetDeviceBuffer
());
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
if
(
kbatch
>
1
)
if
(
kbatch
>
1
)
{
{
...
@@ -221,35 +224,41 @@ bool profile_grouped_gemm_impl(int do_verification,
...
@@ -221,35 +224,41 @@ bool profile_grouped_gemm_impl(int do_verification,
float
ave_time
=
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
0
,
num_btype
=
0
;
if
(
time_kernel
)
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
{
flop
+=
std
::
size_t
(
2
)
*
Ms
[
i
]
*
Ns
[
i
]
*
Ks
[
i
];
std
::
size_t
flop
=
0
,
num_btype
=
0
;
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
flop
+=
std
::
size_t
(
2
)
*
Ms
[
i
]
*
Ns
[
i
]
*
Ks
[
i
];
num_btype
+=
sizeof
(
ADataType
)
*
Ms
[
i
]
*
Ks
[
i
]
+
sizeof
(
BDataType
)
*
Ks
[
i
]
*
Ns
[
i
]
+
num_btype
+=
sizeof
(
ADataType
)
*
Ms
[
i
]
*
Ks
[
i
]
+
sizeof
(
CDataType
)
*
Ms
[
i
]
*
Ns
[
i
];
sizeof
(
BDataType
)
*
Ks
[
i
]
*
Ns
[
i
]
+
}
sizeof
(
CDataType
)
*
Ms
[
i
]
*
Ns
[
i
];
}
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
gb_per_sec
<<
" GB/s, "
<<
gemm_name
<<
std
::
endl
;
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
if
(
tflops
>
best_tflops
)
{
{
best_gemm_name
=
gemm_name
;
best_gemm_name
=
gemm_name
;
best_tflops
=
tflops
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
best_gb_per_sec
=
gb_per_sec
;
}
}
}
if
(
do_verification
)
if
(
do_verification
)
{
{
bool
instance_pass
=
true
;
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
{
c_device_buf
[
i
]
->
FromDevice
(
c_m_n_device_results
[
i
].
mData
.
data
());
c_device_buf
[
i
]
->
FromDevice
(
c_m_n_device_results
[
i
].
mData
.
data
());
c_device_buf
[
i
]
->
SetZero
();
Tensor
<
CDataType
>
c_m_n_host_result
(
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideCs
[
i
],
CLayout
{}));
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideCs
[
i
],
CLayout
{}));
...
@@ -274,7 +283,8 @@ bool profile_grouped_gemm_impl(int do_verification,
...
@@ -274,7 +283,8 @@ bool profile_grouped_gemm_impl(int do_verification,
c_element_op
);
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
c_m_n_device_results
[
i
],
c_m_n_host_result
);
instance_pass
=
instance_pass
&&
ck
::
utils
::
check_err
(
c_m_n_device_results
[
i
],
c_m_n_host_result
);
if
(
do_log
)
if
(
do_log
)
{
{
...
@@ -289,16 +299,25 @@ bool profile_grouped_gemm_impl(int do_verification,
...
@@ -289,16 +299,25 @@ bool profile_grouped_gemm_impl(int do_verification,
<<
std
::
endl
;
<<
std
::
endl
;
}
}
}
}
std
::
cout
<<
"Instance: "
<<
gemm_name
<<
" verification "
<<
(
instance_pass
?
"SUCCEED"
:
"FAILED"
)
<<
std
::
endl
;
pass
=
pass
&&
instance_pass
;
}
}
}
}
else
else
{
{
std
::
cout
<<
"does not support this GEMM problem"
<<
std
::
endl
;
std
::
cout
<<
"Instance: "
<<
gemm_name
<<
", does not support this GEMM problem"
<<
std
::
endl
;
}
}
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
if
(
time_kernel
)
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
{
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
}
return
pass
;
return
pass
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment