Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
b3e5048f
Commit
b3e5048f
authored
Oct 30, 2024
by
aska-0096
Browse files
tempsave
parent
e8c19535
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
190 additions
and
158 deletions
+190
-158
example/65_gemm_multiply_multiply/CMakeLists.txt
example/65_gemm_multiply_multiply/CMakeLists.txt
+2
-1
example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
...emm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
+25
-5
include/ck/host_utility/flush_cache.hpp
include/ck/host_utility/flush_cache.hpp
+5
-2
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
...or_operation/gpu/element/unary_element_wise_operation.hpp
+6
-0
profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
.../include/profiler/profile_gemm_multiply_multiply_impl.hpp
+3
-1
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+149
-149
No files found.
example/65_gemm_multiply_multiply/CMakeLists.txt
View file @
b3e5048f
add_example_executable
(
example_gemm_multiply_multiply_xdl_fp8 gemm_multiply_multiply_xdl_fp8.cpp
)
add_example_executable
(
example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_multiply_multiply_xdl_fp8_ab_scale.cpp
)
add_example_executable
(
example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp
)
\ No newline at end of file
add_example_executable
(
example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp
)
target_compile_options
(
example_gemm_multiply_multiply_xdl_int8 PRIVATE -Wno-gnu-line-marker -save-temps=$PWD
)
\ No newline at end of file
example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp
View file @
b3e5048f
...
...
@@ -37,8 +37,8 @@ using A0DataType = I8;
using
B0DataType
=
I8
;
using
AccDataType
=
I32
;
using
CShuffleDataType
=
I32
;
using
D0DataType
=
F
32
;
using
D1DataType
=
F
32
;
using
D0DataType
=
F
16
;
using
D1DataType
=
F
16
;
using
DsDataType
=
ck
::
Tuple
<
D0DataType
,
D1DataType
>
;
using
EDataType
=
F16
;
...
...
@@ -74,6 +74,16 @@ struct MultiplyMultiply
e
=
ck
::
type_convert
<
ck
::
half_t
>
(
x0_f
);
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
ck
::
half_t
,
int
,
ck
::
half_t
,
ck
::
half_t
>
(
ck
::
half_t
&
e
,
const
int
&
c
,
const
ck
::
half_t
&
d0
,
const
ck
::
half_t
&
d1
)
const
{
const
ck
::
half_t
x0_f
=
ck
::
type_convert
<
ck
::
half_t
>
(
c
)
*
d0
*
d1
;
e
=
x0_f
;
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
ck
::
bhalf_t
,
int
,
float
,
float
>
(
ck
::
bhalf_t
&
e
,
const
int
&
c
,
const
float
&
d0
,
const
float
&
d1
)
const
...
...
@@ -91,7 +101,7 @@ using AElementOp = PassThrough;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
MultiplyMultiply
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
using
DeviceOpInstance
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultiD_Xdl_CShuffle_V3
// clang-format off
...
...
@@ -102,7 +112,17 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
///###### RRR
///< Row, Row, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>;
///###### RCR
<
Row
,
Col
,
DsLayout
,
ELayout
,
A0DataType
,
B0DataType
,
DsDataType
,
EDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmSpec
,
256
,
256
,
128
,
64
,
16
,
16
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
0
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
S
<
8
,
8
,
1
>
,
ck
::
BlockGemmPipelineScheduler
::
Interwave
,
ck
::
BlockGemmPipelineVersion
::
v1
,
I8
>
;
<
Row
,
Col
,
DsLayout
,
ELayout
,
A0DataType
,
B0DataType
,
DsDataType
,
EDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmSpec
,
256
,
128
,
128
,
128
,
16
,
16
,
32
,
32
,
2
,
2
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
0
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
0
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
S
<
8
,
8
,
1
>
,
ck
::
BlockGemmPipelineScheduler
::
Intrawave
,
ck
::
BlockGemmPipelineVersion
::
v3
,
I8
>
;
// clang-format on
int
main
(
int
argc
,
char
*
argv
[])
...
...
@@ -251,7 +271,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem"
);
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
,
2
0
,
50
});
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
,
0
,
20
,
50
,
true
,
50
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
...
...
include/ck/host_utility/flush_cache.hpp
View file @
b3e5048f
...
...
@@ -93,8 +93,11 @@ struct RotatingMemWrapperMultiD
}
void
Print
()
{
std
::
cout
<<
"RotatingMemWrapperMultiD: { size_a: "
<<
size_a
<<
", size_b: "
<<
size_b
<<
", rotating_count: "
<<
rotating_count
<<
"}"
<<
std
::
endl
;
std
::
cout
<<
"RotatingMemWrapperMultiD: { size_a: "
<<
size_a
<<
", size_b: "
<<
size_b
;
static_for
<
0
,
NumDs
,
1
>
{}([
&
](
auto
j
)
{
std
::
cout
<<
", size_d"
<<
j
.
value
<<
": "
<<
size_ds
[
j
];
});
std
::
cout
<<
", rotating_count: "
<<
rotating_count
<<
"}"
<<
std
::
endl
;
}
~
RotatingMemWrapperMultiD
()
{
...
...
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
View file @
b3e5048f
...
...
@@ -66,6 +66,12 @@ struct PassThrough
y
=
type_convert
<
half_t
>
(
x
);
}
template
<
>
__host__
__device__
void
operator
()
<
half_t
,
int32_t
>
(
half_t
&
y
,
const
int32_t
&
x
)
const
{
y
=
type_convert
<
half_t
>
(
x
);
}
template
<
>
__host__
__device__
void
operator
()
<
bhalf_t
,
bhalf_t
>
(
bhalf_t
&
y
,
const
bhalf_t
&
x
)
const
{
...
...
profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
View file @
b3e5048f
...
...
@@ -188,7 +188,9 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
// profile device GEMM instances
for
(
auto
&
op_ptr
:
op_ptrs
)
{
std
::
vector
<
int
>
kbatch_list
=
{
1
,
2
,
4
,
8
,
16
,
19
,
32
,
38
};
// Seems like when performance measurement has bug when spiltK is large
// std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};
std
::
vector
<
int
>
kbatch_list
=
{
1
,
2
,
4
};
if
(
KBatch
>
0
)
{
...
...
profiler/src/CMakeLists.txt
View file @
b3e5048f
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment