gaoqiong / composable_kernel_ROCM · Commits

Unverified commit 4396a224, authored Apr 16, 2024 by Harisankar Sadasivan, committed by GitHub on Apr 16, 2024.

    Merge branch 'develop' into mi300_time_measurement

Parents: 0a27f07e, 501a6b68
Changes: 187 files in the merge; showing 20 changed files below, with 3113 additions and 0 deletions (+3113 −0).
include/ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp  (+18 −0)
include/ck_tile/ops/gemm/warp/warp_gemm.hpp  (+105 −0)
include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp  (+471 −0)
include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp  (+379 −0)
include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp  (+65 −0)
include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp  (+74 −0)
include/ck_tile/ops/reduce.hpp  (+7 −0)
include/ck_tile/ops/reduce/block/block_reduce.hpp  (+211 −0)
include/ck_tile/remod.py  (+88 −0)
library/include/ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp  (+468 −0)
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multi_abd_fixed_nk.hpp  (+470 −0)
library/src/tensor_operation_instance/gpu/gemm_multi_abd/CMakeLists.txt  (+10 −0)
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_common.hpp  (+101 −0)
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp  (+101 −0)
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp  (+101 −0)
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_km_kn_mn_v1_instance.cpp  (+115 −0)
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp  (+115 −0)
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp  (+115 −0)
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk_multi_abd/CMakeLists.txt  (+10 −0)
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk_multi_abd/device_grouped_gemm_xdl_fixed_nk_bf16_i8_bf16_km_kn_mn_common.hpp  (+89 −0)
include/ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

template <index_t kMPerTile, index_t kNPerTile, index_t kKPerTile>
struct TileGemmShape
{
    static constexpr index_t kM = kMPerTile;
    static constexpr index_t kN = kNPerTile;
    static constexpr index_t kK = kKPerTile;
};

} // namespace ck_tile
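
Note: TileGemmShape only carries the block-tile GEMM extents as compile-time constants. A minimal usage sketch (not part of the diff; the 128x128x32 values are illustrative):

    // Hypothetical instantiation: a 128x128x32 block tile.
    using Shape = ck_tile::TileGemmShape<128, 128, 32>;
    static_assert(Shape::kM == 128 && Shape::kN == 128 && Shape::kK == 32);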
include/ck_tile/ops/gemm/warp/warp_gemm.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"

namespace ck_tile {

// fp16
using WarpGemmMfmaF16F16F32M32N32K8 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;

using WarpGemmMfmaF16F16F32M16N16K16 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;

using WarpGemmMfmaF16F16F32M32N32K16 =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>;

using WarpGemmMfmaF16F16F32M16N16K32 =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M16N16K16, 2>>;

using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;

using WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M16N16K16>>;

using WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
        WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
        2>>;

using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
        WarpGemmAttributeMfmaImplF16F16F32M16N16K16,
        2>>;

using WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
        WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
        2>>;

// bf16
using WarpGemmMfmaBf16Bf16F32M32N32K8 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;

using WarpGemmMfmaBf16Bf16F32M16N16K16 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;

using WarpGemmMfmaBf16Bf16F32M32N32K16 =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>;

using WarpGemmMfmaBf16Bf16F32M16N16K32 =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16, 2>>;

using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;

using WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16>>;

using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
        2>>;

using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16,
        2>>;

using WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
        2>>;

// fp8
using WarpGemmMfma_f32_32x32x16_fp8_fp8 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;

using WarpGemmMfma_f32_32x32x16_fp8_bf8 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;

using WarpGemmMfma_f32_32x32x16_bf8_fp8 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;

using WarpGemmMfma_f32_32x32x16_bf8_bf8 =
    WarpGemmImpl<WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;

using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8>>;

using WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8>>;

using WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8>>;

using WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8>>;

} // namespace ck_tile
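
Note: each alias above encodes the operand/accumulator types and the per-warp macro-tile size MxNxK; the IterateK variants reach the larger K by issuing the smaller-K MFMA several times. A sketch of what that composition implies (not part of the diff):

    // The fp16 32x32x16 warp GEMM wraps the 32x32x8 MFMA with kKIter = 2,
    // so it still reports a 32x32 tile but with K doubled to 16.
    using WG = ck_tile::WarpGemmMfmaF16F16F32M32N32K16;
    static_assert(WG::kM == 32 && WG::kN == 32 && WG::kK == 16);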
include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"

namespace ck_tile {

template <typename WarpGemmAttributeMfmaImpl_>
struct WarpGemmAtrributeMfma
{
    using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;

    using ADataType = typename Impl::ADataType;
    using BDataType = typename Impl::BDataType;
    using CDataType = typename Impl::CDataType;

    using AVecType = typename Impl::AVecType;
    using BVecType = typename Impl::BVecType;
    using CVecType = typename Impl::CVecType;

    static constexpr index_t kM = Impl::kM;
    static constexpr index_t kN = Impl::kN;
    static constexpr index_t kK = Impl::kK;

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using BWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using CWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
              sequence<Impl::kCNLane>>,
        tuple<sequence<1, 2>>,
        tuple<sequence<1, 0>>,
        sequence<1, 1>,
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
        Impl{}(c_vec, a_vec, b_vec);
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
        return Impl{}(a_vec, b_vec);
    }
};

template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter>
struct WarpGemmAtrributeMfmaIterateK
{
    static_assert(kKIter > 0, "wrong!");

    using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;

    using ADataType = typename Impl::ADataType;
    using BDataType = typename Impl::BDataType;
    using CDataType = typename Impl::CDataType;

    using AVecType =
        ext_vector_t<ADataType, vector_traits<typename Impl::AVecType>::vector_size * kKIter>;
    using BVecType =
        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
    using CVecType = typename Impl::CVecType;

    static constexpr index_t kM = Impl::kM;
    static constexpr index_t kN = Impl::kN;
    static constexpr index_t kK = Impl::kK * kKIter;

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using BWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using CWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>,
              sequence<Impl::kCNLane>>,
        tuple<sequence<1, 2>>,
        tuple<sequence<1, 0>>,
        sequence<1, 1>,
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;

        static_for<0, kKIter, 1>{}([&](auto iKIter) {
            Impl{}(c_vec,
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter],
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter]);
        });
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
        constexpr auto I0 = number<0>{};

        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;

        // c = a * b
        auto c_vec = Impl{}(
            reinterpret_cast<const buf_a&>(a_vec).template get_as<typename Impl::AVecType>()[I0],
            reinterpret_cast<const buf_b&>(b_vec).template get_as<typename Impl::BVecType>()[I0]);

        // c += a * b
        static_for<1, kKIter, 1>{}([&](auto iKIter) {
            Impl{}(c_vec,
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter],
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter]);
        });

        return c_vec;
    }
};

template <typename WarpGemmAttributeMfmaImpl_>
struct WarpGemmAtrributeMfmaTransposedCDistribution
{
    using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;

    using ADataType = typename Impl::BDataType;
    using BDataType = typename Impl::ADataType;
    using CDataType = typename Impl::CDataType;

    using AVecType = typename Impl::BVecType;
    using BVecType = typename Impl::AVecType;
    using CVecType = typename Impl::CVecType;

    static constexpr index_t kM = Impl::kN;
    static constexpr index_t kN = Impl::kM;
    static constexpr index_t kK = Impl::kK;

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using BWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using CWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kCNLane>,
              sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<1, 0>>,
        sequence<2, 2>,
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
        // swap A and B
        Impl{}(c_vec, b_vec, a_vec);
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
        // swap A and B
        return Impl{}(b_vec, a_vec);
    }
};

template <typename WarpGemmAttributeMfmaImpl_>
struct WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB
{
    using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;

    using ADataType = typename Impl::BDataType;
    using BDataType = typename Impl::ADataType;
    using CDataType = typename Impl::CDataType;

    using AVecType = typename Impl::BVecType;
    using BVecType = typename Impl::AVecType;
    using CVecType = typename Impl::CVecType;

    static constexpr index_t kM = Impl::kN;
    static constexpr index_t kN = Impl::kM;
    static constexpr index_t kK = Impl::kK;

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using BWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kAMLane / (Impl::kABKPerLane * Impl::kABKLane * 2),
                       Impl::kABKLane,
                       2,
                       Impl::kABKPerLane>,
              sequence<Impl::kABKLane, Impl::kABKPerLane>>,
        tuple<sequence<2, 1, 1, 1, 1>>,
        tuple<sequence<0, 0, 2, 1, 3>>,
        sequence<2>,
        sequence<1>>;

    using CWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kCNLane>,
              sequence<Impl::kCM0PerLane / 2, Impl::kCMLane, Impl::kCM1PerLane * 2>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<1, 0>>,
        sequence<2, 2>,
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
        // swap A and B
        Impl{}(c_vec, b_vec, a_vec);
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
        // swap A and B
        return Impl{}(b_vec, a_vec);
    }
};

template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter>
struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
{
    using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;

    // swap A and B
    using ADataType = typename Impl::BDataType;
    using BDataType = typename Impl::ADataType;
    using CDataType = typename Impl::CDataType;

    using AVecType =
        ext_vector_t<ADataType, vector_traits<typename Impl::AVecType>::vector_size * kKIter>;
    using BVecType =
        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
    using CVecType = typename Impl::CVecType;

    static constexpr index_t kM = Impl::kN;
    static constexpr index_t kN = Impl::kM;
    static constexpr index_t kK = Impl::kK * kKIter;

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using BWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kAMLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

    using CWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kCNLane>,
              sequence<Impl::kCM0PerLane, Impl::kCMLane, Impl::kCM1PerLane>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<1, 0>>,
        sequence<2, 2>,
        sequence<0, 2>>;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;

        // swap A and B, value and type
        static_for<0, kKIter, 1>{}([&](auto iKIter) {
            Impl{}(c_vec,
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter],
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter]);
        });
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
        constexpr auto I0 = number<0>{};

        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;

        // swap A and B, value and type
        auto c_vec = Impl{}(
            reinterpret_cast<const buf_b&>(b_vec).template get_as<typename Impl::BVecType>()[I0],
            reinterpret_cast<const buf_a&>(a_vec).template get_as<typename Impl::AVecType>()[I0]);

        static_for<1, kKIter, 1>{}([&](auto iKIter) {
            Impl{}(c_vec,
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter],
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter]);
        });

        return c_vec;
    }
};

template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter, index_t SFactor_ = 2>
struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
{
    using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;

    // swap A and B
    using ADataType = typename Impl::BDataType;
    using BDataType = typename Impl::ADataType;
    using CDataType = typename Impl::CDataType;

    using AVecType =
        ext_vector_t<ADataType, vector_traits<typename Impl::AVecType>::vector_size * kKIter>;
    using BVecType =
        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
    using CVecType = typename Impl::CVecType;

    static constexpr index_t kM = Impl::kN;
    static constexpr index_t kN = Impl::kM;
    static constexpr index_t kK = Impl::kK * kKIter;

    static constexpr index_t SFactor = SFactor_; // group how many CM1 together

    using AWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<0, 0>>,
        sequence<2>,
        sequence<1>>;

#if 0
    using BWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kAMLane / (Impl::kABKPerLane * Impl::kABKLane * 2),
                       Impl::kABKLane,
                       2,
                       Impl::kABKPerLane>,
              sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
        tuple<sequence<2, 1, 1, 1, 1>>,
        tuple<sequence<0, 0, 2, 1, 3>>,
        sequence<2>,
        sequence<1>>;

    using CWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kCNLane>,
              sequence<Impl::kCM0PerLane / 2, Impl::kCMLane, Impl::kCM1PerLane * 2>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<1, 0>>,
        sequence<2, 2>,
        sequence<0, 2>>;
#else
    // TODO: more test not only 32x32
    using BWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kAMLane / (Impl::kCMLane * SFactor * Impl::kCM1PerLane),
                       Impl::kCMLane,
                       SFactor,
                       Impl::kCM1PerLane>,
              sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
        tuple<sequence<2, 1, 1, 1, 1>>,
        tuple<sequence<0, 0, 2, 1, 3>>,
        sequence<2>,
        sequence<1>>;

    using CWarpDstrEncoding = tile_distribution_encoding<
        sequence<>,
        tuple<sequence<Impl::kCNLane>,
              sequence<Impl::kCM0PerLane / SFactor, Impl::kCMLane, Impl::kCM1PerLane * SFactor>>,
        tuple<sequence<2, 1>>,
        tuple<sequence<1, 0>>,
        sequence<2, 2>,
        sequence<0, 2>>;
#endif

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;

        // swap A and B, value and type
        static_for<0, kKIter, 1>{}([&](auto iKIter) {
            Impl{}(c_vec,
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter],
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter]);
        });
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;

        constexpr auto I0 = number<0>{};

        // swap A and B, value and type
        auto c_vec = Impl{}(
            reinterpret_cast<const buf_b&>(b_vec).template get_as<typename Impl::BVecType>()[I0],
            reinterpret_cast<const buf_a&>(a_vec).template get_as<typename Impl::AVecType>()[I0]);

        static_for<1, kKIter, 1>{}([&](auto iKIter) {
            Impl{}(c_vec,
                   reinterpret_cast<const buf_b&>(b_vec)
                       .template get_as<typename Impl::BVecType>()[iKIter],
                   reinterpret_cast<const buf_a&>(a_vec)
                       .template get_as<typename Impl::AVecType>()[iKIter]);
        });

        return c_vec;
    }
};

} // namespace ck_tile
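
Note: the wrappers in this header change the interface, not the math. IterateK widens the per-lane A/B vectors by kKIter and loops the underlying MFMA over the extra K slices; the TransposedCDistribution variants feed B where the impl expects A (and vice versa), so the accumulator comes out with M and N exchanged. A sketch of the effect on types (not part of the diff):

    // With the fp16 32x32x8 impl (4 fp16 per lane) and kKIter = 2, the wrapped
    // attribute consumes 8 fp16 per lane and advertises K = 16.
    using WGAttr = ck_tile::WarpGemmAtrributeMfmaIterateK<
        ck_tile::WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>;
    static_assert(WGAttr::kK == 16);
    static_assert(ck_tile::vector_traits<WGAttr::AVecType>::vector_size == 8);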
include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

// FP16
struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
{
    using ADataType = fp16_t;
    using BDataType = fp16_t;
    using CDataType = float;

    using AVecType = ext_vector_t<fp16_t, 4>;
    using BVecType = ext_vector_t<fp16_t, 4>;
    using CVecType = ext_vector_t<float, 16>;

    static constexpr index_t kM = 32;
    static constexpr index_t kN = 32;
    static constexpr index_t kK = 8;

    static constexpr index_t kAMLane     = 32;
    static constexpr index_t kBNLane     = 32;
    static constexpr index_t kABKLane    = 2;
    static constexpr index_t kABKPerLane = 4;

    static constexpr index_t kCMLane     = 2;
    static constexpr index_t kCNLane     = 32;
    static constexpr index_t kCM0PerLane = 4;
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
    defined(__gfx942__)
        c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
#else
        ck_tile::ignore = c_vec;
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
#endif
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
    defined(__gfx942__)
        return bit_cast<CVecType>(
            __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0));
#else
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
        return CVecType{0.f};
#endif
    }
};

struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
{
    using ADataType = fp16_t;
    using BDataType = fp16_t;
    using CDataType = float;

    using AVecType = ext_vector_t<fp16_t, 4>;
    using BVecType = ext_vector_t<fp16_t, 4>;
    using CVecType = ext_vector_t<float, 4>;

    static constexpr index_t kM = 16;
    static constexpr index_t kN = 16;
    static constexpr index_t kK = 16;

    static constexpr index_t kAMLane     = 16;
    static constexpr index_t kBNLane     = 16;
    static constexpr index_t kABKLane    = 4;
    static constexpr index_t kABKPerLane = 4;

    static constexpr index_t kCMLane     = 4;
    static constexpr index_t kCNLane     = 16;
    static constexpr index_t kCM0PerLane = 1;
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
    defined(__gfx942__)
        c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
#else
        ck_tile::ignore = c_vec;
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
#endif
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
    defined(__gfx942__)
        return bit_cast<CVecType>(
            __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
#else
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
        return CVecType{0.f};
#endif
    }
};

// Bf16
struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
{
    using ADataType = bf16_t;
    using BDataType = bf16_t;
    using CDataType = float;

    using AVecType = ext_vector_t<bf16_t, 4>;
    using BVecType = ext_vector_t<bf16_t, 4>;
    using CVecType = ext_vector_t<float, 16>;

    static constexpr index_t kM = 32;
    static constexpr index_t kN = 32;
    static constexpr index_t kK = 8;

    static constexpr index_t kAMLane     = 32;
    static constexpr index_t kBNLane     = 32;
    static constexpr index_t kABKLane    = 2;
    static constexpr index_t kABKPerLane = 4;

    static constexpr index_t kCMLane     = 2;
    static constexpr index_t kCNLane     = 32;
    static constexpr index_t kCM0PerLane = 4;
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
        c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
#elif defined(__gfx908__)
        static_for<0, 2, 1>{}([&](auto k) {
            c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                c_vec,
                0,
                0,
                0);
        });
#else
        ck_tile::ignore = c_vec;
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
#endif
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
        return bit_cast<CVecType>(
            __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, fp32x16_t{0.f}, 0, 0, 0));
#elif defined(__gfx908__)
        CVecType c_vec{0.f};
        static_for<0, 2, 1>{}([&](auto k) {
            c_vec = __builtin_amdgcn_mfma_f32_32x32x4bf16(
                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                c_vec,
                0,
                0,
                0);
        });
        return c_vec;
#else
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
        return CVecType{0.f};
#endif
    }
};

struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
{
    using ADataType = bf16_t;
    using BDataType = bf16_t;
    using CDataType = float;

    using AVecType = ext_vector_t<bf16_t, 4>;
    using BVecType = ext_vector_t<bf16_t, 4>;
    using CVecType = ext_vector_t<float, 4>;

    static constexpr index_t kM = 16;
    static constexpr index_t kN = 16;
    static constexpr index_t kK = 16;

    static constexpr index_t kAMLane     = 16;
    static constexpr index_t kBNLane     = 16;
    static constexpr index_t kABKLane    = 4;
    static constexpr index_t kABKPerLane = 4;

    static constexpr index_t kCMLane     = 4;
    static constexpr index_t kCNLane     = 16;
    static constexpr index_t kCM0PerLane = 1;
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
        c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
#elif defined(__gfx908__)
        static_for<0, 2, 1>{}([&](auto k) {
            c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                c_vec,
                0,
                0,
                0);
        });
#else
        ck_tile::ignore = c_vec;
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
#endif
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
        return bit_cast<CVecType>(
            __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, fp32x4_t{0.f}, 0, 0, 0));
#elif defined(__gfx908__)
        CVecType c_vec{0.f};
        static_for<0, 2, 1>{}([&](auto k) {
            c_vec = __builtin_amdgcn_mfma_f32_16x16x8bf16(
                reinterpret_cast<const thread_buffer<ADataType, 4>&>(a_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                reinterpret_cast<const thread_buffer<BDataType, 4>&>(b_vec)
                    .template get_as<ext_vector_t<bf16_t, 2>>()[number<k>{}],
                c_vec,
                0,
                0,
                0);
        });
        return c_vec;
#else
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
        return CVecType{0.f};
#endif
    }
};

// FP8
template <typename AType_, typename BType_>
struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
{
    using ADataType = AType_;
    using BDataType = BType_;
    using CDataType = float;

    using AVecType = ext_vector_t<ADataType, 8>;
    using BVecType = ext_vector_t<BDataType, 8>;
    using CVecType = ext_vector_t<CDataType, 16>;

    static constexpr index_t kM = 32;
    static constexpr index_t kN = 32;
    static constexpr index_t kK = 16;

    static constexpr index_t kAMLane     = 32;
    static constexpr index_t kBNLane     = 32;
    static constexpr index_t kABKLane    = 2;
    static constexpr index_t kABKPerLane = 8;

    static constexpr index_t kCMLane     = 2;
    static constexpr index_t kCNLane     = 32;
    static constexpr index_t kCM0PerLane = 4;
    static constexpr index_t kCM1PerLane = 4;

    // c_vec += a_vec * b_vec
    CK_TILE_DEVICE void
    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
#elif defined(__gfx908__) || defined(__gfx90a__)
        static_for<0, 8, 1>{}([&](auto k) {
            float a_f32 =
                type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
                                        .template get_as<ADataType>()[number<k>{}]);
            float b_f32 =
                type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
                                        .template get_as<BDataType>()[number<k>{}]);

            c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
        });
#else
        ck_tile::ignore = c_vec;
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
#endif
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
#elif defined(__gfx908__) || defined(__gfx90a__)
        CVecType c_vec{0.f};
        static_for<0, 8, 1>{}([&](auto k) {
            float a_f32 =
                type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
                                        .template get_as<ADataType>()[number<k>{}]);
            float b_f32 =
                type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
                                        .template get_as<BDataType>()[number<k>{}]);

            c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
        });
        return c_vec;
#else
        ck_tile::ignore = a_vec;
        ck_tile::ignore = b_vec;
        return CVecType{0.f};
#endif
    }
};

using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8 =
    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t>;
using WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_bf8 =
    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, bf8_t>;
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_fp8 =
    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, fp8_t>;
using WarpGemmAttributeMfmaImpl_f32_32x32x16_bf8_bf8 =
    WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<bf8_t, bf8_t>;

} // namespace ck_tile
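
Note: each impl struct is a thin wrapper over one MFMA compiler builtin, with emulation fallbacks (gfx908 splits a bf16 K step into two smaller MFMAs; pre-gfx940 targets emulate fp8 element-wise through the f32 MFMA) and a no-op ignore path for unsupported targets. A minimal device-side sketch of invoking an impl directly (not part of the diff):

    // Illustrative only: one 32x32x8 fp16 MFMA accumulating into c_vec.
    // Each lane supplies 4 fp16 values of A and B and holds 16 floats of C.
    CK_TILE_DEVICE void mfma_once(ck_tile::ext_vector_t<ck_tile::fp16_t, 4> a_vec,
                                  ck_tile::ext_vector_t<ck_tile::fp16_t, 4> b_vec,
                                  ck_tile::ext_vector_t<float, 16>& c_vec)
    {
        ck_tile::WarpGemmAttributeMfmaImplF16F16F32M32N32K8{}(c_vec, a_vec, b_vec);
    }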
include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"

namespace ck_tile {

namespace impl {

template <typename AType,
          typename BType,
          typename CType,
          index_t MPerWave,
          index_t NPerWave,
          index_t KPerWave,
          bool TransposeC>
struct WarpGemmMfmaDispatcher;

// clang-format off
// fp16
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, true>  { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaF16F16F32M16N16K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 16, true>  { using Type = WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaF16F16F32M16N16K32; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true>  { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
// bf16
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true>  { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true>  { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true>  { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
// fp8
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfma_f32_32x32x16_fp8_bf8_CTransposed; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfma_f32_32x32x16_bf8_fp8_CTransposed; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 16, false> { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8; };
template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32, 16, true>  { using Type = WarpGemmMfma_f32_32x32x16_bf8_bf8_CTransposed; };
// clang-format on

} // namespace impl

template <typename AType,
          typename BType,
          typename CType,
          index_t MPerWave,
          index_t NPerWave,
          index_t KPerWave,
          bool TransposeC>
using WarpGemmMfmaDispatcher = typename impl::
    WarpGemmMfmaDispatcher<AType, BType, CType, MPerWave, NPerWave, KPerWave, TransposeC>::Type;

} // namespace ck_tile
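
Note: the dispatcher is a pure compile-time lookup from (AType, BType, CType, M, N, K, TransposeC) to one of the warp_gemm.hpp aliases. Usage sketch (not part of the diff):

    #include <type_traits>

    // (half, half, float, 32x32x8, non-transposed C) resolves to the plain
    // fp16 32x32x8 warp GEMM, per the specialization above.
    using WG = ck_tile::WarpGemmMfmaDispatcher<
        ck_tile::half_t, ck_tile::half_t, float, 32, 32, 8, false>;
    static_assert(std::is_same_v<WG, ck_tile::WarpGemmMfmaF16F16F32M32N32K8>);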
include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

template <typename WarpGemmAttribute_>
struct WarpGemmImpl
{
    using WarpGemmAttribute = remove_cvref_t<WarpGemmAttribute_>;

    static constexpr index_t kM = WarpGemmAttribute::kM;
    static constexpr index_t kN = WarpGemmAttribute::kN;
    static constexpr index_t kK = WarpGemmAttribute::kK;

    using ADataType = typename WarpGemmAttribute::ADataType;
    using BDataType = typename WarpGemmAttribute::BDataType;
    using CDataType = typename WarpGemmAttribute::CDataType;

    using AWarpDstrEncoding = typename WarpGemmAttribute::AWarpDstrEncoding;
    using BWarpDstrEncoding = typename WarpGemmAttribute::BWarpDstrEncoding;
    using CWarpDstrEncoding = typename WarpGemmAttribute::CWarpDstrEncoding;

    using AWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(AWarpDstrEncoding{}))>;
    using BWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(BWarpDstrEncoding{}))>;
    using CWarpDstr = remove_cvref_t<decltype(make_static_tile_distribution(CWarpDstrEncoding{}))>;

    using AWarpTensor = static_distributed_tensor<ADataType, AWarpDstr>;
    using BWarpTensor = static_distributed_tensor<BDataType, BWarpDstr>;
    using CWarpTensor = static_distributed_tensor<CDataType, CWarpDstr>;

    CK_TILE_DEVICE void operator()(CWarpTensor& c, const AWarpTensor& a, const BWarpTensor& b) const
    {
        using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
        using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
        using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;

        constexpr auto I0 = number<0>{};

        const auto a_vec = a.get_thread_buffer().template get_as<AVec>()[I0];
        const auto b_vec = b.get_thread_buffer().template get_as<BVec>()[I0];
        auto c_vec       = c.get_thread_buffer().template get_as<CVec>()[I0];

        // c_vec += a_vec * b_vec
        WarpGemmAttribute{}(c_vec, a_vec, b_vec);

        c.get_thread_buffer().template set_as<CVec>(I0, c_vec);
    }

    CK_TILE_DEVICE auto operator()(const AWarpTensor& a, const BWarpTensor& b) const
    {
        CWarpTensor c;

        using AVec = ext_vector_t<ADataType, AWarpTensor::get_thread_buffer_size()>;
        using BVec = ext_vector_t<BDataType, BWarpTensor::get_thread_buffer_size()>;
        using CVec = ext_vector_t<CDataType, CWarpTensor::get_thread_buffer_size()>;

        constexpr auto I0 = number<0>{};

        const auto a_vec = a.get_thread_buffer().template get_as<AVec>()[I0];
        const auto b_vec = b.get_thread_buffer().template get_as<BVec>()[I0];

        // c_vec = a_vec * b_vec
        auto c_vec = WarpGemmAttribute{}(a_vec, b_vec);

        c.get_thread_buffer().template set_as<CVec>(I0, c_vec);

        return c;
    }
};

} // namespace ck_tile
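
Note: WarpGemmImpl is the user-facing layer; it derives the A/B/C tile distributions from the attribute's encodings and moves data between the distributed tensors' thread buffers and the raw MFMA vectors. Device-side usage sketch (not part of the diff):

    // Illustrative only: accumulate one warp tile; a, b, c must already be
    // distributed with WG::AWarpDstr / BWarpDstr / CWarpDstr respectively.
    template <typename WG>
    CK_TILE_DEVICE void warp_gemm_once(typename WG::CWarpTensor& c,
                                       const typename WG::AWarpTensor& a,
                                       const typename WG::BWarpTensor& b)
    {
        WG{}(c, a, b); // c += a * b
    }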
include/ck_tile/ops/reduce.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/reduce/block/block_reduce.hpp  (new file)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

// synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
                                           const ReduceFunc& reduce_func,
                                           bool_constant<WithBroadcast> = {})
{
    using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
    using DstrEncode       = typename Dstr::DstrEncode;
    using DstrEncodeDetail = typename DstrEncode::detail;

    constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
    constexpr index_t NDimR = Dstr::get_num_of_dimension_r();

    constexpr index_t idim_p_lane = NDimP - 1;

    const auto ps_idx = make_array<index_t>(get_block_id(), get_lane_id());
    const auto rs_idx = acc_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);

    constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();

    // loop over thread data
    static_for<0, thread_buf_size, 1>{}([&](auto i) {
        auto v_local = acc_tensor.get_thread_buffer()[i];

        // cross-lane reduce for replication
        // only reduce on R dimension correspond to lane
        // (lane id maps to this R dimension)
        static_for<0, NDimR, 1>{}([&](auto idim_r) {
            // FIXME: nasty to use does_p_own_r_
            if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
            {
                constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];

                constexpr index_t lid_over_rid_derivative =
                    DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];

                static_assert(is_power_of_two_integer(r_length),
                              "wrong! only support power of 2 reduction");

                constexpr index_t nstage = integer_log2_floor(r_length);

                // reduction sweep forward
                static_for<0, nstage, 1>{}([&](auto istage) {
                    constexpr index_t lid_delta =
                        lid_over_rid_derivative * (1 << (nstage - istage - 1));

                    // pull data from remote lane
                    const auto v_remote = warp_shuffle_down(v_local, lid_delta);

                    // reduce
                    v_local = reduce_func(v_local, v_remote);
                });
            }
        });

        if constexpr(WithBroadcast)
        {
            // cross-lane broadcast for replication
            // only broadcast on R dimension correspond to lane
            // (lane id maps to this R dimension)
            static_for<0, NDimR, 1>{}([&](auto idim_r) {
                // FIXME: nasty to use does_p_own_r_
                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
                {
                    const index_t r_id = rs_idx[idim_r];

                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];

                    constexpr index_t lid_over_rid_derivative =
                        DstrEncodeDetail::ps_over_rs_derivative_[NDimP - 1][idim_r];

                    static_assert(is_power_of_two_integer(r_length),
                                  "wrong! only support power of 2 reduction");

                    constexpr index_t nstage = integer_log2_floor(r_length);

                    // broadcast sweep backward
                    static_for<0, nstage, 1>{}([&](auto istage) {
                        // do I hold reduced data?
                        const bool do_i_hold_reduced_data = r_id < (1 << istage);

                        constexpr index_t lid_delta = lid_over_rid_derivative * (1 << istage);

                        // pull data from remote lane
                        const auto v_remote = warp_shuffle_up(v_local, lid_delta);

                        // decide whether to update local data with remote data
                        v_local = do_i_hold_reduced_data ? v_local : v_remote;
                    });
                }
            });
        }

        acc_tensor.get_thread_buffer()(i) = v_local;
    });
}

// FIXME: this is for 2D to 1D reduce only, need to support n-D
template <typename AccDistributedTensor_,
          typename InDistributedTensor_,
          index_t... InReduceDims,
          typename ReduceFunc>
CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
                                      const InDistributedTensor_& in_tensor,
                                      sequence<InReduceDims...>,
                                      const ReduceFunc& reduce_func)
{
    constexpr auto I0 = number<0>{};
    constexpr auto I1 = number<1>{};

#if 0
    constexpr auto in_reduce_dims = sequence<InReduceDims...>{};

    constexpr index_t ndim_in        = InDistributedTensor_::get_num_of_dimension();
    constexpr index_t ndim_in_reduce = in_reduce_dims.size();
    constexpr index_t ndim_in_free   = ndim_in - ndim_in_reduce;

    constexpr auto in_free_dims_arr = [&] {
        array<bool, ndim_free> is_free_dims{true};

        for(index_t i = 0; i < ndim_reduce; i++)
        {
            is_free_dims(in_reduce_dims[i]) = false;
        }

        array<index_t, ndim_free> in_free_dims{-1};

        index_t cnt = 0;

        for(index_t i = 0; i < ndim_in; i++)
        {
            if(is_free_dims[i])
            {
                in_free_dims(cnt) = i;
                cnt++
            }
        }

        return is_free_dims;
    }();

    constexpr auto in_free_dims = TO_SEQUENCE(is_free_dims_arr, ndim_in_free);
#else
    constexpr auto spans = InDistributedTensor_::get_distributed_spans();

    // in-thread reduction
    // FIXME: hard coded to be 2D to 1D reduction
    sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
        constexpr auto acc_dstr_idx = make_tuple(dstr_idx_i0);

        auto acc = acc_tensor[acc_dstr_idx];

        // FIXME
        sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
            constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1);

            const auto in = in_tensor[in_dstr_idx];

            acc = reduce_func(acc, in);
        });

        acc_tensor(acc_dstr_idx) = acc;
    });
#endif
}

template <typename AccDataType_,
          typename InDistributedTensor_,
          index_t... InReduceDims,
          typename ReduceFunc,
          typename InDataType_>
CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
                                      sequence<InReduceDims...> in_reduce_dims,
                                      const ReduceFunc& reduce_func,
                                      const InDataType_& reduce_init)
{
    using InDataType  = typename InDistributedTensor_::DataType;
    using AccDataType = remove_cvref_t<AccDataType_>;

    static_assert(std::is_same_v<InDataType, remove_cvref_t<InDataType_>>, "wrong!");

    // declare acc_tensor
    constexpr auto acc_dstr =
        make_static_tile_distribution(ck_tile::detail::make_reduce_tile_distribution_encoding(
            InDistributedTensor_::get_tile_distribution().get_static_tile_distribution_encoding(),
            sequence<InReduceDims...>{}));

    auto acc_tensor = make_static_distributed_tensor<AccDataType>(acc_dstr);

    // init acc_tensor
    tile_elementwise_inout([&](auto& acc) { acc = type_convert<AccDataType>(reduce_init); },
                           acc_tensor);

    // warp reduce
    block_tile_reduce(acc_tensor, in_tensor, in_reduce_dims, reduce_func);

    return acc_tensor;
}

} // namespace ck_tile
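
Note: a reduction runs in two phases: block_tile_reduce folds the reduced dimension within each thread, then block_tile_reduce_sync combines the partial results across lanes (and, by default, broadcasts them back). A sketch of a row-wise max (not part of the diff; assumes in_tensor is a float 2D static_distributed_tensor reduced along dimension 1, matching the 2D-to-1D limitation noted above):

    #include <limits>

    template <typename InTensor>
    CK_TILE_DEVICE auto row_max_sketch(const InTensor& in_tensor)
    {
        const auto f_max = [](auto a, auto b) { return a > b ? a : b; };

        // phase 1: in-thread partial reduction over dimension 1
        auto acc = ck_tile::block_tile_reduce<float>(
            in_tensor, ck_tile::sequence<1>{}, f_max, std::numeric_limits<float>::lowest());

        // phase 2: cross-lane reduction + broadcast of the result
        ck_tile::block_tile_reduce_sync(acc, f_max);

        return acc;
    }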
include/ck_tile/remod.py  (new file)

import pathlib
from pathlib import Path
import subprocess
import os
import copy

NS = 'ck_tile'
OPS = 'ops'
OPS_COMMON = 'common'

# common header will be duplicated into ops/* other module
HEADER_COMMON = """// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
\n
"""

# aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp)
def get_module(f, level=0):
    all_parts = f.parts
    return str(all_parts[level])

all_files = []
for p in sorted(Path("./").rglob("*")):
    if p.suffix == '.hpp':
        all_files.append(pathlib.PurePath(p))

class submodule_t:
    def __init__(self):
        self.m = dict()

    def push(self, f):
        if len(f.parents) != 1:  # ignore ./xxx.hpp
            mod = get_module(f)
            if mod == OPS:
                if mod not in self.m.keys():
                    self.m[mod] = dict()
                mod2 = get_module(f, 1)
                if Path(mod2).suffix != '.hpp':  # ignore ops/xxx.hpp
                    if mod2 not in self.m[mod].keys():
                        self.m[mod][mod2] = list()
                    self.m[mod][mod2].append(f)
            else:
                if mod not in self.m.keys():
                    self.m[mod] = list()
                self.m[mod].append(f)

    def gen(self):
        def gen_header(hpath, include_list):
            # print(hpath)
            if os.path.exists(str(hpath)):
                os.remove(str(hpath))
            with hpath.open('w') as f:
                f.write(HEADER_COMMON)
                f.write('#pragma once\n')
                f.write('\n')
                for individual_header in include_list:
                    header_path = NS + '/' + str(individual_header)
                    f.write(f'#include \"{header_path}\"\n')
                # f.write('\n') # otherwise clang-format will complain
        # print(self.m)
        # restructure common
        for k, v in self.m.items():
            if k == OPS and OPS_COMMON in v.keys():
                common_list = copy.deepcopy(v[OPS_COMMON])
                # v.pop(OPS_COMMON)
                for km in v.keys():
                    if km != OPS_COMMON:
                        v[km].extend(common_list)
        for k, v in self.m.items():
            if k == OPS:
                for km, kv in v.items():
                    gen_header(Path(k) / (f'{km}.hpp'), kv)
            else:
                gen_header(Path(f'{k}.hpp'), v)

submodule = submodule_t()

# formatting
for x in all_files:
    subprocess.Popen(f'dos2unix {str(x)}', shell=True)
    cmd = f'clang-format-12 -style=file -i {str(x)}'
    # for xp in x.parents:
    # print(get_file_base(x))
    subprocess.Popen(cmd, shell=True)
    submodule.push(x)

submodule.gen()
# print(all_files)
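
Note: as written, remod.py is meant to be run from inside include/ck_tile. It converts every .hpp to Unix line endings via dos2unix, formats it with clang-format-12, and regenerates the umbrella headers (<module>.hpp at the top level and ops/<submodule>.hpp) so that each one re-includes its module's individual headers, with the ops/common headers duplicated into every other ops submodule.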
library/include/ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp
0 → 100644
View file @
4396a224
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <vector>
#include <memory>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Scales      = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;

#ifdef CK_ENABLE_INT8
// RRR
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
                              PassThrough, Scales, AddFastGelu>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
                              PassThrough, Scales, Add>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
                              PassThrough, Scales, FastGelu>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
                              PassThrough, Scales, PassThrough>>>& instances);

// RCR
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<Row>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
                              PassThrough, Scales, AddFastGelu>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<Row>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
                              PassThrough, Scales, Add>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
                              PassThrough, Scales, FastGelu>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
                              PassThrough, Scales, PassThrough>>>& instances);

// CRR
void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
                              PassThrough, Scales, AddFastGelu>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
                              PassThrough, Scales, Add>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
                              PassThrough, Scales, FastGelu>>>& instances);

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
                              PassThrough, Scales, PassThrough>>>& instances);
#endif

// GEMM + Add + Gelu
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleABD<
    AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
    PassThrough, Scales, AddFastGelu>>
{
    using DeviceOp = DeviceGemmMultipleABD<AsLayout, BsLayout, DsLayout, ELayout, AsDataType,
                                           BsDataType, DsDataType, EDataType, PassThrough,
                                           Scales, AddFastGelu>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

#ifdef CK_ENABLE_INT8
        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
                    op_ptrs);
            }
        }
#endif

        return op_ptrs;
    }
};

// GEMM + Add
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleABD<
    AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
    PassThrough, Scales, Add>>
{
    using DeviceOp = DeviceGemmMultipleABD<AsLayout, BsLayout, DsLayout, ELayout, AsDataType,
                                           BsDataType, DsDataType, EDataType, PassThrough,
                                           Scales, Add>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

#ifdef CK_ENABLE_INT8
        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(op_ptrs);
            }
        }
#endif

        return op_ptrs;
    }
};

// GEMM + Gelu
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleABD<
    AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
    PassThrough, Scales, FastGelu>>
{
    using DeviceOp = DeviceGemmMultipleABD<AsLayout, BsLayout, DsLayout, ELayout, AsDataType,
                                           BsDataType, DsDataType, EDataType, PassThrough,
                                           Scales, FastGelu>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

#ifdef CK_ENABLE_INT8
        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(op_ptrs);
            }
        }
#endif

        return op_ptrs;
    }
};

// GEMM
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleABD<
    AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
    PassThrough, Scales, PassThrough>>
{
    using DeviceOp = DeviceGemmMultipleABD<AsLayout, BsLayout, DsLayout, ELayout, AsDataType,
                                           BsDataType, DsDataType, EDataType, PassThrough,
                                           Scales, PassThrough>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

#ifdef CK_ENABLE_INT8
        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(op_ptrs);
            }
        }
#endif

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
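As a usage note (not part of the diff): client code normally reaches these instances through the factory rather than calling the add_* functions directly. A minimal sketch, where the local aliases, the variable names, and the choice of the RRR bias+gelu problem type are illustrative only:

#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp"

int main()
{
    using Row  = ck::tensor_layout::gemm::RowMajor;
    using BF16 = ck::bhalf_t;
    using I8   = int8_t;
    namespace ew = ck::tensor_operation::element_wise;

    // The RRR "GEMM + bias + FastGelu" problem type matching the declarations above.
    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleABD<
        ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        ew::PassThrough, ew::Scales, ew::AddFastGelu>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << '\n'; // one line per tuning configuration
}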
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multi_abd_fixed_nk.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <vector>
#include <memory>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using Scales      = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;

// RRR
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        PassThrough, Scales, AddFastGelu>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        PassThrough, Scales, Add>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
        PassThrough, Scales, FastGelu>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
        PassThrough, Scales, PassThrough>>>& instances);

// RCR
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        PassThrough, Scales, AddFastGelu>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        PassThrough, Scales, Add>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
        PassThrough, Scales, FastGelu>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
        PassThrough, Scales, PassThrough>>>& instances);

// CRR
void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        PassThrough, Scales, AddFastGelu>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
        PassThrough, Scales, Add>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
        PassThrough, Scales, FastGelu>>>& instances);

void add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemmMultiABDFixedNK<
        ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<>, Row,
        ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
        PassThrough, Scales, PassThrough>>>& instances);

// GEMM + Add + Gelu
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<
        AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
        PassThrough, Scales, AddFastGelu>>
{
    using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout, BsLayout, DsLayout, ELayout,
                                                      AsDataType, BsDataType, DsDataType,
                                                      EDataType, PassThrough, Scales,
                                                      AddFastGelu>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_instances(
                    op_ptrs);
            }
        }

        return op_ptrs;
    }
};

// GEMM + Add
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<
        AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
        PassThrough, Scales, Add>>
{
    using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout, BsLayout, DsLayout, ELayout,
                                                      AsDataType, BsDataType, DsDataType,
                                                      EDataType, PassThrough, Scales, Add>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<BF16>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_bias_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<Row>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_instances(
                    op_ptrs);
            }
        }

        return op_ptrs;
    }
};

// GEMM + Gelu
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<
        AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
        PassThrough, Scales, FastGelu>>
{
    using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout, BsLayout, DsLayout, ELayout,
                                                      AsDataType, BsDataType, DsDataType,
                                                      EDataType, PassThrough, Scales, FastGelu>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_instances(
                    op_ptrs);
            }
        }

        return op_ptrs;
    }
};

// GEMM
template <typename AsLayout, typename BsLayout, typename DsLayout, typename ELayout,
          typename AsDataType, typename BsDataType, typename DsDataType, typename EDataType>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<
        AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, DsDataType, EDataType,
        PassThrough, Scales, PassThrough>>
{
    using DeviceOp = DeviceGroupedGemmMultiABDFixedNK<AsLayout, BsLayout, DsLayout, ELayout,
                                                      AsDataType, BsDataType, DsDataType,
                                                      EDataType, PassThrough, Scales,
                                                      PassThrough>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<AsDataType, ck::Tuple<BF16>> &&
                     is_same_v<BsDataType, ck::Tuple<I8, BF16>> &&
                     is_same_v<DsDataType, ck::Tuple<>> && is_same_v<EDataType, BF16>)
        {
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_kn_mn_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Col>> &&
                         is_same_v<BsLayout, ck::Tuple<Row, Row>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances(
                    op_ptrs);
            }
            if constexpr(is_same_v<AsLayout, ck::Tuple<Row>> &&
                         is_same_v<BsLayout, ck::Tuple<Col, Col>> &&
                         is_same_v<DsLayout, ck::Tuple<>> && is_same_v<ELayout, Row>)
            {
                add_device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_mk_nk_mn_instances(
                    op_ptrs);
            }
        }

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
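The grouped fixed-NK factory is consumed the same way as the non-grouped one; only the operation type changes. A minimal sketch (the RCR PassThrough variant here is an illustrative choice, and Row/Col/BF16/I8/PassThrough/Scales are assumed to be the same aliases used throughout these headers):

using GroupedOp = ck::tensor_operation::device::DeviceGroupedGemmMultiABDFixedNK<
    ck::Tuple<Row>, ck::Tuple<Col, Col>, ck::Tuple<>, Row,
    ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<>, BF16,
    PassThrough, Scales, PassThrough>;

const auto grouped_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<GroupedOp>::GetInstances();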
library/src/tensor_operation_instance/gpu/gemm_multi_abd/CMakeLists.txt
0 → 100644
# ONLY XDL_KERNELS
set(GEMM_MULTI_ABD_INSTANCES)

list(APPEND GEMM_MULTI_ABD_INSTANCES
    device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
    device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
    device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_km_kn_mn_v1_instance.cpp)

add_instance_library(device_gemm_multi_abd_instance ${GEMM_MULTI_ABD_INSTANCES})
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_common.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using BF16 = ck::bhalf_t;
using I8   = int8_t;
using F32  = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using A0DataType       = BF16;
using AsDataType       = ck::Tuple<A0DataType>;
using B0DataType       = I8;
using B1DataType       = BF16;
using BsDataType       = ck::Tuple<B0DataType, B1DataType>;
using AccDataType      = F32;
using CShuffleDataType = BF16;
using D0DataType       = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;

using A0Layout = Col;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
// using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;

using Scales      = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
using Add         = ck::tensor_operation::element_wise::Add;

using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;

static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding  = ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
template <typename DsLayout,
          typename DsDataType,
          typename CDEElementOp,
          ck::tensor_operation::device::GemmSpecialization GemmSpec,
          ck::PipelineVersion PipVer,
          ck::LoopScheduler LoopSche>
using device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances = std::tuple<
    // clang-format off
    //###############################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| K0Per| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
    //###############################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
    //###############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
    //###############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
    //PipelineVersion::v1
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 48, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 24, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 32, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 16, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 4, LoopSche, PipVer>
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
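For context, the per-layout *_instance.cpp files in this directory (listed in the CMakeLists.txt above) materialize the tuple alias for one concrete epilogue. A minimal sketch of that pattern for the CRR bias+gelu variant; the actual file likely instantiates several GemmSpec/pipeline/scheduler combinations, so the single combination shown here is illustrative:

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
    std::vector<std::unique_ptr<
        DeviceGemmMultipleABD<ck::Tuple<Col>, ck::Tuple<Row, Row>, ck::Tuple<Row>, Row,
                              ck::Tuple<BF16>, ck::Tuple<I8, BF16>, ck::Tuple<BF16>, BF16,
                              PassThrough, Scales, AddFastGelu>>>& instances)
{
    // Register every entry of the tuple alias above for one
    // (DsLayout, DsDataType, epilogue, spec, pipeline, scheduler) combination.
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<
            ck::Tuple<Row>,  // DsLayout: row-major bias
            ck::Tuple<BF16>, // DsDataType
            AddFastGelu,     // CDEElementOp
            GemmMNKPadding,  // GemmSpec
            ck::PipelineVersion::v1,
            ck::LoopScheduler::Default>{});
}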
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using BF16 = ck::bhalf_t;
using I8   = int8_t;
using F32  = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using A0DataType       = BF16;
using AsDataType       = ck::Tuple<A0DataType>;
using B0DataType       = I8;
using B1DataType       = BF16;
using BsDataType       = ck::Tuple<B0DataType, B1DataType>;
using AccDataType      = F32;
using CShuffleDataType = BF16;
using D0DataType       = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;

using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
// using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;

using Scales      = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
using Add         = ck::tensor_operation::element_wise::Add;

using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;

static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding  = ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
template <typename DsLayout,
          typename DsDataType,
          typename CDEElementOp,
          ck::tensor_operation::device::GemmSpecialization GemmSpec,
          ck::PipelineVersion PipVer,
          ck::LoopScheduler LoopSche>
using device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances = std::tuple<
    // clang-format off
    //###############################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| K0Per| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
    //###############################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
    //###############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
    //###############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
    //PipelineVersion::v1
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 48, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 24, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 32, 32, 32, 8, 8, 32, 32, 1, 1, S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 64, 16, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 4, LoopSche, PipVer>
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
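Decoding one tuple entry against the column banner above: the first mk_kn_mn configuration reads as follows. This is a hypothetical concrete alias (it would have to sit inside the same instance namespace); the Add epilogue, default specialization, v1 pipeline, and default scheduler are illustrative choices, and the comments simply restate the banner's column names:

using FirstMkKnMnInstance = DeviceGemmMultipleABD_Xdl_CShuffle<
    AsLayout, BsLayout, ck::Tuple<Row>, ELayout,   // layouts; Ds = row-major bias
    AsDataType, BsDataType, AccDataType, CShuffleDataType,
    ck::Tuple<BF16>, EDataType,                    // DsData, EData
    AElementOp, BElementOp, Add, GemmDefault,
    1,                  // NumGemmK prefetch stage
    256,                // block size (threads per workgroup)
    256, 128, 32,       // MPerBlock, NPerBlock, K0PerBlock
    8, 8,               // AK1, BK1
    32, 32,             // MPerXDL, NPerXDL
    4, 2,               // MXdlPerWave, NXdlPerWave
    S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // A block transfer
    S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, // B block transfer
    1, 1,               // CShuffle M/NXdlPerWavePerShuffle
    S<1, 32, 1, 8>, 8,  // C block transfer cluster lengths, scalar per vector
    ck::LoopScheduler::Default, ck::PipelineVersion::v1>;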
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using BF16 = ck::bhalf_t;
using I8   = int8_t;
using F32  = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using A0DataType       = BF16;
using AsDataType       = ck::Tuple<A0DataType>;
using B0DataType       = I8;
using B1DataType       = BF16;
using BsDataType       = ck::Tuple<B0DataType, B1DataType>;
using AccDataType      = F32;
using CShuffleDataType = BF16;
using D0DataType       = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;

using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Col;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
// using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;

using Scales      = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
using Add         = ck::tensor_operation::element_wise::Add;

using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;

static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding  = ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
template <typename DsLayout,
          typename DsDataType,
          typename CDEElementOp,
          ck::tensor_operation::device::GemmSpecialization GemmSpec,
          ck::PipelineVersion PipVer,
          ck::LoopScheduler LoopSche>
using device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances = std::tuple<
    // clang-format off
    //###############################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| K0Per| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
    //###############################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
    //###############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
    //###############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
    //PipelineVersion::v1
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 48, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 192, 64, 32, 8, 8, 32, 32, 3, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 192, 32, 8, 8, 32, 32, 1, 3, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 24, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 192, 32, 32, 8, 8, 32, 32, 3, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 64, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 32, 32, 8, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, LoopSche, PipVer>,
    DeviceGemmMultipleABD_Xdl_CShuffle<AsLayout, BsLayout, DsLayout, ELayout,
AsDataType
,
BsDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmSpec
,
1
,
64
,
32
,
32
,
32
,
8
,
8
,
32
,
32
,
1
,
1
,
S
<
2
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
LoopSche
,
PipVer
>
,
DeviceGemmMultipleABD_Xdl_CShuffle
<
AsLayout
,
BsLayout
,
DsLayout
,
ELayout
,
AsDataType
,
BsDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmSpec
,
1
,
64
,
16
,
32
,
32
,
8
,
8
,
16
,
16
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
LoopSche
,
PipVer
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_km_kn_mn_v1_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"

#include "device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_common.hpp"

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_gelu_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<D0Layout>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<D0DataType>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      AddFastGelu>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<D0Layout>,
                                                                  ck::Tuple<D0DataType>,
                                                                  AddFastGelu,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_bias_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<D0Layout>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<D0DataType>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      Add>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<D0Layout>,
                                                                  ck::Tuple<D0DataType>,
                                                                  Add,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<>,
                                                                  ck::Tuple<>,
                                                                  PassThrough,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_gelu_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      FastGelu>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_instances<ck::Tuple<>,
                                                                  ck::Tuple<>,
                                                                  FastGelu,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
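For orientation, a minimal consumption sketch (not part of this commit, and assuming the type aliases from the common header above are visible): client code typically collects the registered instances into a vector keyed on the abstract `DeviceGemmMultipleABD` interface and filters them per problem size.

```cpp
// Sketch only: gather the km_kn_mn v1 instances registered above.
#include <memory>
#include <vector>

using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleABD<
    AsLayout, BsLayout, ck::Tuple<>, ELayout,
    AsDataType, BsDataType, ck::Tuple<>, EDataType,
    AElementOp, BElementOp, PassThrough>;

std::vector<std::unique_ptr<DeviceOp>> ops;
ck::tensor_operation::device::instance::
    add_device_gemm_xdl_multi_abd_bf16_i8_bf16_km_kn_mn_v1_instances(ops);

// Each instance exposes MakeArgumentPointer()/MakeInvokerPointer(); callers
// are expected to check IsSupportedArgument() before launching a kernel.
```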
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_kn_mn_v1_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"

#include "device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_common.hpp"

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_gelu_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<D0Layout>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<D0DataType>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      AddFastGelu>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<D0Layout>,
                                                                  ck::Tuple<D0DataType>,
                                                                  AddFastGelu,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_bias_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<D0Layout>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<D0DataType>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      Add>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<D0Layout>,
                                                                  ck::Tuple<D0DataType>,
                                                                  Add,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<>,
                                                                  ck::Tuple<>,
                                                                  PassThrough,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_gelu_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      FastGelu>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_kn_mn_instances<ck::Tuple<>,
                                                                  ck::Tuple<>,
                                                                  FastGelu,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/gemm_multi_abd/device_gemm_xdl_multi_abd_bias_gelu_bf16_i8_bf16_mk_nk_mn_v1_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"

#include "device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_common.hpp"

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_gelu_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<D0Layout>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<D0DataType>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      AddFastGelu>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<D0Layout>,
                                                                  ck::Tuple<D0DataType>,
                                                                  AddFastGelu,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_bias_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<D0Layout>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<D0DataType>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      Add>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<D0Layout>,
                                                                  ck::Tuple<D0DataType>,
                                                                  Add,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      PassThrough>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<>,
                                                                  ck::Tuple<>,
                                                                  PassThrough,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

void add_device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_gelu_v1_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleABD<AsLayout,
                                                      BsLayout,
                                                      ck::Tuple<>,
                                                      ELayout,
                                                      AsDataType,
                                                      BsDataType,
                                                      ck::Tuple<>,
                                                      EDataType,
                                                      AElementOp,
                                                      BElementOp,
                                                      FastGelu>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_gemm_xdl_multi_abd_bf16_i8_bf16_mk_nk_mn_instances<ck::Tuple<>,
                                                                  ck::Tuple<>,
                                                                  FastGelu,
                                                                  GemmMNKPadding,
                                                                  PipelineVersion::v1,
                                                                  LoopScheduler::Default>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk_multi_abd/CMakeLists.txt
0 → 100644
# ONLY XDL_KERNELS
set(GROUPED_GEMM_FIXED_NK_MULTI_ABD_INSTANCES)

list(APPEND GROUPED_GEMM_FIXED_NK_MULTI_ABD_INSTANCES
     device_grouped_gemm_xdl_fixed_nk_bias_gelu_bf16_i8_bf16_mk_kn_mn_instance.cpp
     device_grouped_gemm_xdl_fixed_nk_bias_gelu_bf16_i8_bf16_mk_nk_mn_instance.cpp
     device_grouped_gemm_xdl_fixed_nk_bias_gelu_bf16_i8_bf16_km_kn_mn_instance.cpp)

add_instance_library(device_grouped_gemm_fixed_nk_multi_abd_instance
                     ${GROUPED_GEMM_FIXED_NK_MULTI_ABD_INSTANCES})
library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk_multi_abd/device_grouped_gemm_xdl_fixed_nk_bf16_i8_bf16_km_kn_mn_common.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multi_abd.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using BF16 = ck::bhalf_t;
using I8   = int8_t;
using F32  = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;

using B0DataType = I8;
using B1DataType = BF16;
using BsDataType = ck::Tuple<B0DataType, B1DataType>;

using AccDataType      = F32;
using CShuffleDataType = BF16;
using D0DataType       = BF16;
// using DsDataType = ck::Tuple<D0DataType>;
using EDataType = BF16;

using A0Layout = Col;
using AsLayout = ck::Tuple<A0Layout>;

using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;

using D0Layout = Row;
// using DsLayout = ck::Tuple<Row>;
using ELayout = Row;

using Scales      = ck::tensor_operation::element_wise::Scales;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using Add         = ck::tensor_operation::element_wise::Add;
using FastGelu    = ck::tensor_operation::element_wise::FastGelu;

using AElementOp = PassThrough;
using BElementOp = Scales;
// using CDEElementOp = AddFastGelu;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmMNPadding =
    ck::tensor_operation::device::GemmSpecialization::MNPadding;
static constexpr auto GemmMNKPadding =
    ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <typename DsLayout,
          typename DsDataType,
          typename CDEElementOp,
          ck::tensor_operation::device::GemmSpecialization GemmSpec>
using device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances = std::tuple<
    // clang-format off
        //######################################| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM|NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
        //######################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization|Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
        //######################################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
        //######################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 32, 2>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 2>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>,
        DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK< AsLayout, BsLayout, DsLayout, ELayout, AsDataType, BsDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
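For orientation (a sketch, not part of this commit): a concrete instance list for the grouped bias+GELU case would specialize the alias above in the same way the gemm_multi_abd instance .cpp files earlier in this diff specialize their common-header aliases. All names below come from this header.

```cpp
// Sketch only: specialize the km_kn_mn grouped-GEMM instance alias for a
// single row-major bf16 bias tensor fused with FastGelu on the output.
using BiasGeluInstances =
    device_grouped_gemm_xdl_fixed_nk_multi_abd_bf16_i8_bf16_km_kn_mn_instances<
        ck::Tuple<D0Layout>,   // DsLayout: one row-major D tensor (the bias)
        ck::Tuple<D0DataType>, // DsDataType: bf16 bias
        AddFastGelu,           // CDEElementOp: E = FastGelu(C + D0)
        GemmMNKPadding>;       // GemmSpec: pad M, N and K
```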