Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
4c850c90
Unverified
Commit
4c850c90
authored
Jun 19, 2024
by
Rostyslav Geyyer
Committed by
GitHub
Jun 19, 2024
Browse files
Merge branch 'develop' into lwpck-1815
parents
ce30621d
1973903f
Changes
52
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
787 additions
and
0 deletions
+787
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v1_mnpadding_instance.cpp
...tiply_bf16_i8_bf16_mk_kn_mn_mem_v1_mnpadding_instance.cpp
+36
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_default_instance.cpp
...ultiply_bf16_i8_bf16_mk_kn_mn_mem_v2_default_instance.cpp
+36
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
...ltiply_bf16_i8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+36
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
...iply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+36
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnpadding_instance.cpp
...tiply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnpadding_instance.cpp
+36
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bias_bf16_i8_bf16_mk_kn_mn_instance.cpp
...ile_loop_multiply_bias_bf16_i8_bf16_mk_kn_mn_instance.cpp
+40
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
...multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
+41
-0
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
...loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
+39
-0
profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
...profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
+347
-0
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+1
-0
profiler/src/profile_grouped_gemm_multiply_tile_loop.cpp
profiler/src/profile_grouped_gemm_multiply_tile_loop.cpp
+133
-0
test/contraction/test_contraction_xdl.cpp
test/contraction/test_contraction_xdl.cpp
+6
-0
No files found.
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v1_mnpadding_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v1_mnpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
Multiply
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
Multiply
,
GemmMNPadding
,
Intrawave
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_default_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
Multiply
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
Multiply
,
GemmDefault
,
Interwave
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
Multiply
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
Multiply
,
GemmKPadding
,
Interwave
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
Multiply
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
Multiply
,
GemmMNKPadding
,
Interwave
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnpadding_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_mem_v2_mnpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
Multiply
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
Multiply
,
GemmMNPadding
,
Interwave
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bias_bf16_i8_bf16_mk_kn_mn_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_bias_bf16_i8_bf16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
,
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
,
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
MultiplyAdd
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances
<
ck
::
Tuple
<
Row
,
Row
>
,
ck
::
Tuple
<
BF16
,
BF16
>
,
MultiplyAdd
>
{});
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
,
Row
>
,
ck
::
Tuple
<
BF16
,
BF16
>
,
MultiplyAdd
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
,
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
,
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
MultiplyAddFastGelu
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances
<
ck
::
Tuple
<
Row
,
Row
>
,
ck
::
Tuple
<
BF16
,
BF16
>
,
MultiplyAddFastGelu
>
{});
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
,
Row
>
,
ck
::
Tuple
<
BF16
,
BF16
>
,
MultiplyAddFastGelu
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_grouped_gemm_xdl_tile_loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemmTileLoop
<
Row
,
Row
,
ck
::
Tuple
<
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
MultiplyFastGelu
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
MultiplyFastGelu
>
{});
add_device_operation_instances
(
instances
,
device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
MultiplyFastGelu
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_tile_loop_multiply.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
ADataType
,
typename
BDataType
,
typename
DDataType
,
typename
EDataType
,
typename
AccDataType
,
typename
ALayout
,
typename
BLayout
,
typename
DLayout
,
typename
ELayout
>
bool
profile_grouped_gemm_multiply_tile_loop_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
const
std
::
vector
<
int
>&
Ms
,
const
std
::
vector
<
int
>&
Ns
,
const
std
::
vector
<
int
>&
Ks
,
const
std
::
vector
<
int
>&
StrideAs
,
const
std
::
vector
<
int
>&
StrideBs
,
const
std
::
vector
<
int
>&
StrideDs
,
const
std
::
vector
<
int
>&
StrideEs
,
int
n_warmup
=
10
,
int
n_iter
=
50
)
{
using
CDataType
=
EDataType
;
bool
pass
=
true
;
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
is_same
<
decltype
(
layout
),
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
}
};
std
::
size_t
group_count
=
Ms
.
size
();
if
(
!
(
group_count
==
Ns
.
size
()
&&
group_count
==
Ks
.
size
()
&&
group_count
==
StrideAs
.
size
()
&&
group_count
==
StrideBs
.
size
()
&&
group_count
==
StrideEs
.
size
()))
{
throw
std
::
runtime_error
(
"wrong! inconsistent M/N/Ks, StrideA/B/Cs size
\n
"
);
}
std
::
vector
<
Tensor
<
ADataType
>>
a_m_k
;
std
::
vector
<
Tensor
<
BDataType
>>
b_k_n
;
std
::
vector
<
Tensor
<
DDataType
>>
d_m_n
;
std
::
vector
<
Tensor
<
CDataType
>>
e_m_n_host_results
;
std
::
vector
<
Tensor
<
CDataType
>>
e_m_n_device_results
;
for
(
std
::
size_t
i
=
0
;
i
<
group_count
;
i
++
)
{
a_m_k
.
push_back
(
Tensor
<
ADataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ks
[
i
],
StrideAs
[
i
],
ALayout
{})));
b_k_n
.
push_back
(
Tensor
<
BDataType
>
(
f_host_tensor_descriptor
(
Ks
[
i
],
Ns
[
i
],
StrideBs
[
i
],
BLayout
{})));
d_m_n
.
push_back
(
Tensor
<
DDataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideDs
[
i
],
DLayout
{})));
e_m_n_device_results
.
push_back
(
Tensor
<
CDataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideEs
[
i
],
ELayout
{})));
e_m_n_host_results
.
push_back
(
Tensor
<
CDataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideEs
[
i
],
ELayout
{})));
if
(
ck
::
EnvIsEnabled
(
CK_ENV
(
CK_LOGGING
)))
{
std
::
cout
<<
"group: "
<<
i
<<
" a_m_k["
<<
i
<<
"]:"
<<
a_m_k
[
i
].
mDesc
<<
", b_k_n["
<<
i
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", e_m_n_device_results["
<<
i
<<
"]:"
<<
e_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
}
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
5
,
5
}(
a_m_k
[
i
]);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
5
,
5
}(
b_k_n
[
i
]);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
DDataType
>
{
-
5
,
5
}(
d_m_n
[
i
]);
break
;
case
2
:
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
.0
,
1.
}(
a_m_k
[
i
]);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
0.5
,
0.5
}(
b_k_n
[
i
]);
ck
::
utils
::
FillUniformDistribution
<
DDataType
>
{
-
0.5
,
0.5
}(
d_m_n
[
i
]);
break
;
default:
ck
::
utils
::
FillConstant
<
ADataType
>
{
1
}(
a_m_k
[
i
]);
ck
::
utils
::
FillConstant
<
BDataType
>
{
1
}(
b_k_n
[
i
]);
ck
::
utils
::
FillConstant
<
DDataType
>
{
1
}(
d_m_n
[
i
]);
}
}
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CDEElementOp
=
ck
::
tensor_operation
::
element_wise
::
Multiply
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
c_element_op
=
CElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{};
using
DeviceMemPtr
=
std
::
unique_ptr
<
DeviceMem
>
;
std
::
vector
<
DeviceMemPtr
>
a_device_buf
,
b_device_buf
,
d_device_buf
,
e_device_buf
;
a_device_buf
.
reserve
(
group_count
);
b_device_buf
.
reserve
(
group_count
);
d_device_buf
.
reserve
(
group_count
);
e_device_buf
.
reserve
(
group_count
);
std
::
vector
<
const
void
*>
p_a
,
p_b
,
p_d
;
constexpr
ck
::
index_t
NumDTensor
=
1
;
auto
p_ds
=
std
::
vector
<
std
::
array
<
const
void
*
,
NumDTensor
>>
{};
std
::
vector
<
void
*>
p_e
;
p_a
.
reserve
(
group_count
);
p_b
.
reserve
(
group_count
);
p_ds
.
reserve
(
group_count
);
p_e
.
reserve
(
group_count
);
using
KernelArguments
=
ck
::
tensor_operation
::
device
::
GroupedGemmTileLoopKernelArguments
<
NumDTensor
>
;
std
::
vector
<
ck
::
tensor_operation
::
device
::
GemmDesc
>
gemm_descs
;
std
::
vector
<
KernelArguments
>
gemm_kargs
;
gemm_descs
.
reserve
(
group_count
);
gemm_kargs
.
reserve
(
group_count
);
for
(
std
::
size_t
i
=
0
;
i
<
group_count
;
i
++
)
{
a_device_buf
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
ADataType
)
*
a_m_k
[
i
].
mDesc
.
GetElementSpaceSize
()));
b_device_buf
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
BDataType
)
*
b_k_n
[
i
].
mDesc
.
GetElementSpaceSize
()));
d_device_buf
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
DDataType
)
*
d_m_n
[
i
].
mDesc
.
GetElementSpaceSize
()));
e_device_buf
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
CDataType
)
*
e_m_n_device_results
[
i
].
mDesc
.
GetElementSpaceSize
()));
a_device_buf
[
i
]
->
ToDevice
(
a_m_k
[
i
].
mData
.
data
());
b_device_buf
[
i
]
->
ToDevice
(
b_k_n
[
i
].
mData
.
data
());
d_device_buf
[
i
]
->
ToDevice
(
d_m_n
[
i
].
mData
.
data
());
e_device_buf
[
i
]
->
SetZero
();
p_a
.
push_back
(
a_device_buf
[
i
]
->
GetDeviceBuffer
());
p_b
.
push_back
(
b_device_buf
[
i
]
->
GetDeviceBuffer
());
p_ds
.
push_back
({
d_device_buf
[
i
]
->
GetDeviceBuffer
()});
p_e
.
push_back
(
e_device_buf
[
i
]
->
GetDeviceBuffer
());
gemm_descs
.
push_back
(
{
0
,
Ns
[
i
],
Ks
[
i
],
StrideAs
[
i
],
StrideBs
[
i
],
StrideEs
[
i
],
{
StrideDs
[
i
]}});
gemm_kargs
.
push_back
({
a_device_buf
[
i
]
->
GetDeviceBuffer
(),
b_device_buf
[
i
]
->
GetDeviceBuffer
(),
{
d_device_buf
[
i
]
->
GetDeviceBuffer
()},
e_device_buf
[
i
]
->
GetDeviceBuffer
(),
Ms
[
i
],
Ns
[
i
],
Ks
[
i
],
StrideAs
[
i
],
StrideBs
[
i
],
{
StrideDs
[
i
]},
StrideEs
[
i
]});
}
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGroupedGemmTileLoop
<
ALayout
,
BLayout
,
ck
::
Tuple
<
DLayout
>
,
ELayout
,
ADataType
,
BDataType
,
ck
::
Tuple
<
DDataType
>
,
EDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
>
;
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
if
(
op_ptrs
.
size
()
<=
0
)
{
throw
std
::
runtime_error
(
"wrong! no device GEMM instance found"
);
}
std
::
string
best_gemm_name
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
if
(
do_verification
)
{
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
Tensor
<
CDataType
>
c_m_n
({
Ms
[
i
],
Ns
[
i
]});
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
[
i
],
b_k_n
[
i
],
c_m_n
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
for
(
int
m
=
0
;
m
<
Ms
[
i
];
++
m
)
{
for
(
int
n
=
0
;
n
<
Ns
[
i
];
++
n
)
{
cde_element_op
(
e_m_n_host_results
[
i
](
m
,
n
),
c_m_n
(
m
,
n
),
d_m_n
[
i
](
m
,
n
));
}
}
}
}
// profile device GEMM instances
for
(
auto
&
gemm_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
gemm_ptr
->
MakeArgumentPointer
(
p_a
,
p_b
,
p_ds
,
p_e
,
gemm_descs
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
cde_element_op
);
auto
invoker_ptr
=
gemm_ptr
->
MakeInvokerPointer
();
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
DeviceMem
gemm_arg_dev_mem
(
gemm_ptr
->
GetDeviceKernelArgSize
(
argument_ptr
.
get
()));
hip_check_error
(
hipMemcpy
(
gemm_arg_dev_mem
.
GetDeviceBuffer
(),
gemm_kargs
.
data
(),
gemm_ptr
->
GetDeviceKernelArgSize
(
argument_ptr
.
get
()),
hipMemcpyHostToDevice
));
gemm_ptr
->
SetDeviceKernelArgs
(
argument_ptr
.
get
(),
gemm_arg_dev_mem
.
GetDeviceBuffer
());
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
,
0
,
n_warmup
,
n_iter
});
if
(
do_verification
)
{
bool
instance_pass
=
true
;
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
e_device_buf
[
i
]
->
FromDevice
(
e_m_n_device_results
[
i
].
mData
.
data
());
instance_pass
=
instance_pass
&&
ck
::
utils
::
check_err
(
e_m_n_device_results
[
i
],
e_m_n_host_results
[
i
]);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"a : "
,
a_m_k
[
i
].
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"b: "
,
b_k_n
[
i
].
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"e_device: "
,
e_m_n_device_results
[
i
].
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"e_host : "
,
e_m_n_host_results
[
i
].
mData
,
","
)
<<
std
::
endl
;
}
}
std
::
cout
<<
"Instance: "
<<
gemm_name
<<
" verification "
<<
(
instance_pass
?
"SUCCEED"
:
"FAILED"
)
<<
std
::
endl
;
pass
=
pass
&&
instance_pass
;
}
if
(
time_kernel
)
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
,
0
,
n_warmup
,
n_iter
});
std
::
size_t
flop
=
0
,
num_btype
=
0
;
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
flop
+=
std
::
size_t
(
2
)
*
Ms
[
i
]
*
Ns
[
i
]
*
Ks
[
i
];
num_btype
+=
sizeof
(
ADataType
)
*
Ms
[
i
]
*
Ks
[
i
]
+
sizeof
(
BDataType
)
*
Ks
[
i
]
*
Ns
[
i
]
+
sizeof
(
EDataType
)
*
Ms
[
i
]
*
Ns
[
i
]
+
// D matrix
sizeof
(
EDataType
)
*
Ms
[
i
]
*
Ns
[
i
];
}
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
best_gemm_name
=
gemm_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
}
else
{
std
::
cout
<<
"Instance: "
<<
gemm_name
<<
", does not support this GEMM problem"
<<
std
::
endl
;
}
}
if
(
time_kernel
)
{
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
}
return
pass
;
}
}
// namespace profiler
}
// namespace ck
profiler/src/CMakeLists.txt
View file @
4c850c90
...
@@ -43,6 +43,7 @@ if(GPU_TARGETS MATCHES "gfx9")
...
@@ -43,6 +43,7 @@ if(GPU_TARGETS MATCHES "gfx9")
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp
)
endif
()
endif
()
list
(
APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm.cpp
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm.cpp
)
...
...
profiler/src/profile_grouped_gemm_multiply_tile_loop.cpp
0 → 100644
View file @
4c850c90
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp"
#include "profiler_operation_registry.hpp"
enum
struct
GemmMatrixLayout
{
MK_KN_MN
,
// 0
};
enum
struct
GemmDataType
{
BF16_INT8_BF16_BF16
,
// 0
};
#define OP_NAME "grouped_gemm_multiply_tile_loop"
#define OP_DESC "Grouped GEMM Multiply Multiple D Tile Loop"
namespace
{
std
::
vector
<
int
>
argToIntArray
(
char
*
input
)
{
std
::
vector
<
int
>
out
;
std
::
istringstream
in
(
input
);
std
::
string
item
;
while
(
std
::
getline
(
in
,
item
,
','
))
{
out
.
push_back
(
std
::
stoi
(
item
));
}
return
out
;
}
int
profile_grouped_gemm_tile_loop
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<
14
)
{
std
::
cout
<<
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
<<
"arg2: data type (0: bf16@int8)
\n
"
<<
"arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n]);
\n
"
<<
"arg4: verification (0: no; 1: yes)
\n
"
<<
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg6: print tensor value (0: no; 1: yes)
\n
"
<<
"arg7: time kernel (0=n0, 1=yes)
\n
"
<<
"arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)
\n
"
<<
"optional:
\n
"
<<
"arg14: number of warm-up cycles (default 1)
\n
"
<<
"arg15: number of iterations (default 10)
\n
"
<<
std
::
endl
;
exit
(
1
);
}
const
auto
data_type
=
static_cast
<
GemmDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
auto
Ms
=
argToIntArray
(
argv
[
8
]);
const
auto
Ns
=
argToIntArray
(
argv
[
9
]);
const
auto
Ks
=
argToIntArray
(
argv
[
10
]);
auto
StrideAs
=
argToIntArray
(
argv
[
11
]);
auto
StrideBs
=
argToIntArray
(
argv
[
12
]);
auto
StrideCs
=
argToIntArray
(
argv
[
13
]);
const
int
DefaultStrideA
=
Ks
[
0
];
const
int
DefaultStrideB
=
Ns
[
0
];
const
int
DefaultStrideC
=
Ns
[
0
];
for
(
size_t
i
=
0
;
i
<
Ms
.
size
();
++
i
)
{
StrideAs
[
i
]
=
StrideAs
[
i
]
==
-
1
?
DefaultStrideA
:
StrideAs
[
i
];
StrideBs
[
i
]
=
StrideBs
[
i
]
==
-
1
?
DefaultStrideB
:
StrideBs
[
i
];
StrideCs
[
i
]
=
StrideCs
[
i
]
==
-
1
?
DefaultStrideC
:
StrideCs
[
i
];
}
std
::
vector
<
int
>
StrideDs
(
StrideCs
);
int
n_warmup
=
10
;
int
n_iter
=
50
;
if
(
argc
==
16
)
{
n_warmup
=
std
::
stoi
(
argv
[
14
]);
n_iter
=
std
::
stoi
(
argv
[
15
]);
}
if
(
data_type
==
GemmDataType
::
BF16_INT8_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_multiply_tile_loop_impl
<
ck
::
bhalf_t
,
int8_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
Ms
,
Ns
,
Ks
,
StrideAs
,
StrideBs
,
StrideDs
,
StrideCs
,
n_warmup
,
n_iter
);
}
else
{
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
0
;
}
}
// anonymous namespace
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
profile_grouped_gemm_tile_loop
);
test/contraction/test_contraction_xdl.cpp
View file @
4c850c90
...
@@ -212,4 +212,10 @@ TYPED_TEST(TestContractionScaleMixedPrecision, scale)
...
@@ -212,4 +212,10 @@ TYPED_TEST(TestContractionScaleMixedPrecision, scale)
this
->
template
Run
<
6
>({{
1
,
1
,
1
,
3
,
2
,
3
},
{
1
,
1
,
1
,
3
,
2
,
3
},
{
1
,
1
,
1
,
2
,
2
,
4
}});
this
->
template
Run
<
6
>({{
1
,
1
,
1
,
3
,
2
,
3
},
{
1
,
1
,
1
,
3
,
2
,
3
},
{
1
,
1
,
1
,
2
,
2
,
4
}});
this
->
template
Run
<
2
>({{
16
,
8
},
{
16
,
8
},
{
16
,
8
}});
this
->
template
Run
<
2
>({{
16
,
8
},
{
16
,
8
},
{
16
,
8
}});
this
->
template
Run
<
2
>({{
8
,
16
},
{
16
,
8
},
{
8
,
16
}});
this
->
template
Run
<
2
>({{
8
,
16
},
{
16
,
8
},
{
8
,
16
}});
// special cases
this
->
template
Run
<
2
>({{
1
,
1
},
{
16
,
8
},
{
8
,
16
}});
this
->
template
Run
<
2
>({{
8
,
16
},
{
16
,
8
},
{
1
,
1
}});
this
->
template
Run
<
2
>({{
8
,
16
},
{
1
,
1
},
{
8
,
16
}});
this
->
template
Run
<
2
>({{
1
,
1
},
{
1
,
1
},
{
1
,
1
}});
}
}
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment