gaoqiong / composable_kernel_ROCM / Commits / 7572a691

Commit 7572a691, authored Feb 15, 2025 by coderfeli
merge develop
Parents: 7796fc73, 6b6fcd37
Changes: 452
Showing 20 changed files with 935 additions and 178 deletions (+935 −178).
include/ck_tile/core/numeric/vector_type.hpp                               +18  −1
include/ck_tile/core/tensor/tile_window.hpp                                +20  −3
include/ck_tile/core/tensor/transpose_tile.hpp                             +202 −0
include/ck_tile/core/utility/amd_address_space.hpp                         +0   −37
include/ck_tile/core/utility/transpose_vectors.hpp                         +73  −43
include/ck_tile/core/utility/type_traits.hpp                               +18  −0
include/ck_tile/core/utility/unary_element_function.hpp                    +9   −7
include/ck_tile/host.hpp                                                   +3   −1
include/ck_tile/host/check_err.hpp                                         +119 −3
include/ck_tile/host/concat.hpp                                            +122 −0
include/ck_tile/host/convolution_host_tensor_descriptor_helper.hpp         +27  −57
include/ck_tile/host/host_tensor.hpp                                       +40  −1
include/ck_tile/host/reference/reference_batched_transpose.hpp             +59  −0
include/ck_tile/host/reference/reference_fused_moe.hpp                     +25  −16
include/ck_tile/host/reference/reference_gemm.hpp                          +3   −2
include/ck_tile/host/reference/reference_moe_sorting.hpp                   +24  −2
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp                 +30  −4
include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp                              +2   −1
include/ck_tile/ops/batched_transpose.hpp                                  +12  −0
include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp  +129 −0
Too many changes to show. To preserve performance only 452 of 452+ files are displayed.
include/ck_tile/core/numeric/vector_type.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 ...
@@ -200,4 +200,21 @@ using bf8x32_t = bf8_t __attribute((ext_vector_type(32)));
 using bf8x64_t = bf8_t __attribute((ext_vector_type(64)));
 #endif

+CK_TILE_HOST fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
+{
+    fp16x2_t vector_res;
+    vector_res.x = x.x + y.x;
+    vector_res.y = x.y + y.y;
+    return vector_res;
+}
+
+CK_TILE_DEVICE fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
+{
+    fp16x2_t c;
+    asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(x), "v"(y));
+    return c;
+}
 } // namespace ck_tile
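A minimal usage sketch of the new packed-half add (not part of the commit). It assumes the clang ext_vector element accessors `.x`/`.y` on the host, which is what the host overload above relies on:

    // host side: element-wise packed fp16 add
    ck_tile::fp16x2_t a, b;
    a.x = 1.0f;  a.y = 2.0f;   // float literals convert to the fp16 element type
    b.x = 0.5f;  b.y = 0.25f;
    ck_tile::fp16x2_t c = ck_tile::pk_add_f16(a, b); // c.x == 1.5, c.y == 2.25
    // on device the same call lowers to a single v_pk_add_f16 instruction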
include/ck_tile/core/tensor/tile_window.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 ...
@@ -18,8 +18,17 @@
 namespace ck_tile {

-// Note: this tile window do not support single issue
-// you need to use tile_window_linear structure for this purpose
+/**
+ * @brief This class provides a tile (windowed) view of, and access to, device memory.
+ *
+ * @note This tile window does not support single issue; use the tile_window_linear
+ * structure for that purpose.
+ *
+ * @tparam BottomTensorView_        Class describing & holding device tensor memory.
+ * @tparam WindowLengths_           Spatial sizes of the windowed view on the tensor.
+ * @tparam StaticTileDistribution_  Thread distribution (mapping) into tile dimensions.
+ * @tparam NumCoord                 TBD
+ */
 template <typename BottomTensorView_,
           typename WindowLengths_,
           typename StaticTileDistribution_,
 ...
@@ -1009,6 +1018,14 @@ CK_TILE_DEVICE void move_tile_window(
     window.move(step);
 }

+/**
+ * @brief This class provides a description of a tile windowed view on device memory.
+ *
+ * @note This class does not provide any functions to read or modify device memory.
+ *
+ * @tparam BottomTensorView_ Class describing & holding device tensor memory.
+ * @tparam WindowLengths_    Spatial sizes of the windowed view on the tensor.
+ */
 template <typename BottomTensorView_, typename WindowLengths_>
 struct tile_window_with_static_lengths
 {
 ...
include/ck_tile/core/tensor/transpose_tile.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/algorithm/coordinate_transform.hpp"
#include "ck_tile/core/algorithm/space_filling_curve.hpp"
#include "ck_tile/core/container/container_helper.hpp"
#include "ck_tile/core/container/thread_buffer.hpp"
#include "ck_tile/core/container/statically_indexed_array.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
#include "ck_tile/core/tensor/tile_elementwise.hpp"
#include "ck_tile/core/utility/transpose_vectors.hpp"

namespace ck_tile {
namespace detail {

template <typename OutTensor, typename InTensor>
CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
                                                    const InTensor& in_tensor)
{
    constexpr auto I0 = number<0>{};

    static_assert(std::is_same_v<typename InTensor::DataType, typename OutTensor::DataType>,
                  "Data type for InTensor and OutTensor must be the same!");

    using DataType = typename InTensor::DataType;

    constexpr auto y_in_desc  = InTensor::get_tile_distribution().get_ys_to_d_descriptor();
    constexpr auto y_out_desc = OutTensor::get_tile_distribution().get_ys_to_d_descriptor();

    // y_dim_out_to_in
    // For the swapped-Hs tile case we only need get_rh_minor_to_y,
    // since rh_major are already swapped due to swapped Hs.
    constexpr auto get_rh_minor_to_y = [](auto dstr_tensor) {
        using DstrEncode = typename decltype(dstr_tensor.get_tile_distribution())::DstrEncode;

        map<index_t, index_t> rh_minor_to_y_;

        static_for<0, DstrEncode::NDimY, 1>{}([&](auto i) {
            constexpr index_t rh_minor = DstrEncode::ys_to_rhs_minor_[i];
            rh_minor_to_y_(rh_minor)   = i;
        });

        return rh_minor_to_y_;
    };

    // In the swapped-Hs case <Y,X> -> <X,Y> tile
    // we have the same rh_major, but reversed rh_minor!
    constexpr auto rh_minor_to_y_in  = get_rh_minor_to_y(InTensor{});
    constexpr auto rh_minor_to_y_out = get_rh_minor_to_y(OutTensor{});

    // Is this really needed?? Should we have a simple reverse here??
    constexpr auto y_dim_out_to_in = [&] {
        map<index_t, index_t> y_dim_out_to_in_;

        for(const auto& [rh_minor, y_out] : rh_minor_to_y_out)
        {
            y_dim_out_to_in_(y_out) = rh_minor_to_y_in[rh_minor];
        }

        return y_dim_out_to_in_;
    }();

    constexpr index_t NDimY = InTensor::get_tile_distribution().get_num_of_dimension_y();

    constexpr auto y_lengths = to_sequence(y_in_desc.get_lengths());

    // input and output vector dim in the order of input Y dims
    constexpr index_t y_dim_vec_in  = NDimY - 1;
    constexpr index_t y_dim_vec_out = y_dim_out_to_in[NDimY - 1];

    // vector lengths
    constexpr index_t vec_length_in  = y_lengths[y_dim_vec_in];
    constexpr index_t vec_length_out = y_lengths[y_dim_vec_out];

    // # of vectors
    constexpr index_t num_vec_in  = vec_length_out;
    constexpr index_t num_vec_out = vec_length_in;

    using InVec  = array<DataType, vec_length_in>;
    using OutVec = array<DataType, vec_length_out>;

    // SFC
    constexpr auto scalars_per_access_arr = generate_array(
        [&](auto i) { return (i == y_dim_vec_in or i == y_dim_vec_out) ? y_lengths[i] : 1; },
        number<NDimY>{});

    constexpr auto scalars_per_access = TO_SEQUENCE(scalars_per_access_arr, NDimY);

    using SFC_Y = space_filling_curve<decltype(y_lengths),
                                      typename arithmetic_sequence_gen<0, NDimY, 1>::type,
                                      decltype(scalars_per_access)>;

    constexpr index_t num_access = SFC_Y::get_num_of_access();

    static_assert(num_access > 0, "wrong! num_access should be larger than 0");

    // in/out vectors to be transposed
    thread_buffer<InVec, num_vec_in>   in_vectors;
    thread_buffer<OutVec, num_vec_out> out_vectors;

    // loop over SFC and do transpose
    static_for<0, num_access, 1>{}([&](auto iAccess) {
        // data index [y0, y1, ...] in the order of the input tensor
        constexpr auto idx_y_start = SFC_Y::get_index(iAccess);

        // get input vectors
        static_for<0, num_vec_in, 1>{}([&](auto i) {
            constexpr auto idx_y_in = generate_tuple(
                [&](auto ii) {
                    return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
                },
                number<NDimY>{});

            constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
            static_assert(in_offset % vec_length_in == 0);

            in_vectors(i).template get_as<InVec>()(I0) =
                in_tensor.get_thread_buffer()
                    .template get_as<InVec>()[number<in_offset / vec_length_in>{}];
        });

        // transpose
        transpose_vectors<DataType, num_vec_in, num_vec_out>{}(in_vectors, out_vectors);

        // set output vectors
        static_for<0, num_vec_out, 1>{}([&](auto i) {
            constexpr auto idx_y_out_tmp = generate_array(
                [&](auto ii) {
                    return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii];
                },
                number<NDimY>{});

            constexpr auto idx_y_out =
                container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);

            constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
            static_assert(out_offset % vec_length_out == 0);

            out_tensor.get_thread_buffer().template set_as<OutVec>(
                number<out_offset / vec_length_out>{},
                out_vectors[i].template get_as<OutVec>()[I0]);
        });
    });
}

} // namespace detail

template <typename OutTensor, typename InTensor>
CK_TILE_DEVICE void transpose_tile2d(OutTensor& out, const InTensor& in)
{
    using InDataType  = typename InTensor::DataType;
    using OutDataType = typename OutTensor::DataType;

    using InTileDistr  = typename InTensor::StaticTileDistribution;
    using OutTileDistr = typename OutTensor::StaticTileDistribution;

    using InDstrEncode  = typename InTileDistr::DstrEncode;
    using OutDstrEncode = typename OutTileDistr::DstrEncode;

    using InThreadTensorDesc  = typename InTensor::ThreadTensorDesc;
    using OutThreadTensorDesc = typename OutTensor::ThreadTensorDesc;

    // Ys:
    constexpr auto in_thread_desc_lengths  = InThreadTensorDesc{}.get_lengths();
    constexpr auto out_thread_desc_lengths = OutThreadTensorDesc{}.get_lengths();

    // type convert
    const auto in_tmp = [&]() {
        if constexpr(std::is_same_v<OutDataType, InDataType>)
        {
            return in;
        }
        else
        {
            return tile_elementwise_in(type_convert<OutDataType, InDataType>, in);
        }
    }();

    // Scenario where we switch from tile <Y, X> -> <X, Y> - only 2D tiles!
    // We preserve Ps but swap Ys: <Y1, Y0> -> <Y0, Y1>
    if constexpr(InDstrEncode::rs_lengths_ == OutDstrEncode::rs_lengths_ &&
                 InDstrEncode::hs_lengthss_ == tuple_reverse(OutDstrEncode::hs_lengthss_) &&
                 InDstrEncode::NDimY == OutDstrEncode::NDimY &&
                 InDstrEncode::NDimY == 2 &&
                 in_thread_desc_lengths == tuple_reverse(out_thread_desc_lengths))
    // Any condition on Ps ??
    // InDstrEncode::ps_to_rhss_major_ == OutDstrEncode::ps_to_rhss_major_ &&
    // InDstrEncode::ps_to_rhss_minor_ == OutDstrEncode::ps_to_rhss_minor_ &&
    {
        detail::transpose_tile2d_impl_in_thread(out, in_tmp);
    }
    else
    {
        static_assert(false, "Provided tensors could not be transposed!");
    }
}

} // namespace ck_tile
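A worked illustration of the vector bookkeeping above (my reading of the code, not part of the commit): if a thread's input Y-descriptor has lengths {2, 8}, then the input vector dimension is the last Y dim, so vec_length_in = 8; for the 2D swap y_dim_out_to_in maps the output vector dim back to the first input Y dim, so vec_length_out = 2. The transpose therefore consumes num_vec_in = 2 input vectors of length 8 and produces num_vec_out = 8 output vectors of length 2 per space-filling-curve step.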
include/ck_tile/core/utility/amd_address_space.hpp (deleted, 100644 → 0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core/config.hpp"

// Address Space for AMDGCN
// https://llvm.org/docs/AMDGPUUsage.html#address-space

namespace ck_tile {

#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))

template <typename T>
__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
    // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
    // only c-style pointer cast seems be able to be compiled
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
    return (T*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}

template <typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
{
    // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
    // only c-style pointer cast seems be able to be compiled
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
    return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
} // namespace ck_tile
include/ck_tile/core/utility/transpose_vectors.hpp

...
@@ -68,52 +68,82 @@ struct transpose_vectors
     }
     else if constexpr(sizeof(S) == 1)
     {
-        static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+        static_assert(((NX % 4 == 0 && NY % 4 == 0) || (NX % 2 == 0 && NY % 2 == 0)), "wrong!");

         using S4 = array<S, 4>; // typename array<S, 4>::type;
+        using S2 = array<S, 2>; // typename array<S, 4>::type;

+        if constexpr(NX % 4 == 0 && NY % 4 == 0)
+        {
             // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
             // (the 4x4 body below is unchanged by this commit apart from being
             //  re-indented under the new if constexpr branch)
             static_for<0, NY, 4>{}([&](auto iy) {
                 static_for<0, NX, 4>{}([&](auto ix) {
                     // 4 int8x4 data from vx_tuple
                     const int32_t x_s4_0 = bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
                     const int32_t x_s4_1 = bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
                     const int32_t x_s4_2 = bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
                     const int32_t x_s4_3 = bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);

                     // transpose
                     int32_t t_s4_0, t_s4_1;
                     int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;

                     constexpr int32_t m0 = 0x05010400;
                     constexpr int32_t m1 = 0x05040100;
                     constexpr int32_t m2 = 0x07060302;
                     constexpr int32_t m3 = 0x07030602;

                     // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
                     //                   -- -- -- --     -- -- -- --      -  -  -  -
                     //             index  7  6  5  4      3  2  1  0     33 77 44 88
                     // index is reversed because of little endianness (least significant bits first)
                     t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
                     t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
                     y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
                     y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
                     t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
                     t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
                     y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
                     y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);

                     // 4 int8x4 data from vy_tuple
                     vy_tuple(iy).template get_as<S4>()(ix / I4)      = bit_cast<S4>(y_s4_0);
                     vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
                     vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
                     vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
                 });
             });
+        }
+        else if constexpr(NX % 2 == 0 && NY % 2 == 0)
+        {
+            static_for<0, NY, 2>{}([&](auto ix) {
+                static_for<0, NX, 2>{}([&](auto iy) {
+                    const int16_t x_s2_0 = bit_cast<int16_t>(vx_tuple[ix].template get_as<S2>()[iy / I2]);
+                    const int16_t x_s2_1 = bit_cast<int16_t>(vx_tuple[ix + I1].template get_as<S2>()[iy / I2]);
+
+                    constexpr int32_t m0 = 0x05040100;
+                    constexpr int32_t m1 = 0x07060302;
+
+                    const int32_t x0_32 = static_cast<int32_t>(x_s2_0 & 0xFFFF);
+                    const int32_t x1_32 = static_cast<int32_t>(x_s2_1 & 0xFFFF);
+
+                    const int32_t y_s2_0 = __builtin_amdgcn_perm(x1_32, x0_32, m0);
+                    const int32_t y_s2_1 = __builtin_amdgcn_perm(x1_32, x0_32, m1);
+
+                    vy_tuple(iy).template get_as<S2>()[ix / I2] =
+                        bit_cast<S2>(static_cast<int16_t>(y_s2_0 & 0xFFFF));
+                    vy_tuple(iy + I1).template get_as<S2>()[ix / I2] =
+                        bit_cast<S2>(static_cast<int16_t>(y_s2_1 & 0xFFFF));
+                });
+            });
+        }
     }
     else
     {
...
...
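A host-side sketch (not part of the commit) that emulates the byte-select semantics described in the v_perm_b32 comment above; it is useful for checking masks such as m0..m3 on the CPU. It only models selector values 0..7 (the hardware also has special selectors 8..15 for sign/constant bytes, which the masks here do not use):

    #include <cstdint>

    // each selector byte i of 'sel' picks byte sel[i] from the little-endian
    // 8-byte concatenation {hi:lo}, exactly as in the example
    // perm(0x11223344, 0x55667788, 0x05010400) -> 0x33774488
    static std::uint32_t perm_b32_emulated(std::uint32_t hi, std::uint32_t lo, std::uint32_t sel)
    {
        const std::uint64_t src = (static_cast<std::uint64_t>(hi) << 32) | lo;
        std::uint32_t dst       = 0;
        for(int i = 0; i < 4; ++i)
        {
            const unsigned byte_idx = (sel >> (8 * i)) & 0x7u;         // selector for dst byte i
            const std::uint32_t b   = (src >> (8 * byte_idx)) & 0xFFu; // picked source byte
            dst |= b << (8 * i);
        }
        return dst;
    }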
include/ck_tile/core/utility/type_traits.hpp

...
@@ -109,4 +109,22 @@ CK_TILE_HOST_DEVICE PY c_style_pointer_cast(PX p_x)
#pragma clang diagnostic pop
 }

+template <typename CompareTo, typename... Rest>
+struct is_any_of : std::false_type
+{
+};
+
+template <typename CompareTo, typename FirstType>
+struct is_any_of<CompareTo, FirstType> : std::is_same<CompareTo, FirstType>
+{
+};
+
+template <typename CompareTo, typename FirstType, typename... Rest>
+struct is_any_of<CompareTo, FirstType, Rest...>
+    : std::integral_constant<bool,
+                             std::is_same<CompareTo, FirstType>::value ||
+                                 is_any_of<CompareTo, Rest...>::value>
+{
+};
 } // namespace ck_tile
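A minimal sketch of the new trait (not part of the commit), using type names that appear elsewhere in this diff:

    static_assert(ck_tile::is_any_of<float, ck_tile::fp8_t, ck_tile::half_t, float>::value,
                  "float is in the list");
    static_assert(!ck_tile::is_any_of<double, float, int>::value,
                  "double is not in the list");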
include/ck_tile/core/utility/unary_element_function.hpp

...
@@ -51,16 +51,18 @@ struct composes<F>
 template <typename... Ts>
 __host__ __device__ composes(Ts&&...) -> composes<remove_cvref_t<Ts>...>;

-template <typename To>
+template <typename SaturateType>
 struct saturates
 {
-    template <typename From>
-    CK_TILE_HOST_DEVICE constexpr auto operator()(const From& from) const
-        -> std::enable_if_t<std::is_arithmetic_v<From>, From>
+    // NOTE: this function does not return a SaturateType value;
+    // it is the user's responsibility to do a further cast or not
+    template <typename AccType>
+    CK_TILE_HOST_DEVICE constexpr auto operator()(const AccType& a_) const
+        -> std::enable_if_t<std::is_arithmetic_v<AccType>, AccType>
     {
-        return clamp(from,
-                     type_convert<From>(numeric<To>::lowest()),
-                     type_convert<From>(numeric<To>::max()));
+        return clamp(a_,
+                     type_convert<AccType>(numeric<SaturateType>::lowest()),
+                     type_convert<AccType>(numeric<SaturateType>::max()));
     }
 };
...
...
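A short usage sketch of the renamed functor (not part of the commit), assuming ck_tile::numeric<int8_t> provides lowest()/max() as it does for the types used in this diff:

    float acc     = 300.f;
    float clamped = ck_tile::saturates<int8_t>{}(acc); // 127.f; note the result stays float,
                                                       // the caller decides whether to cast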
include/ck_tile/host.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "ck_tile/host/arg_parser.hpp"
 #include "ck_tile/host/check_err.hpp"
+#include "ck_tile/host/concat.hpp"
 #include "ck_tile/host/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck_tile/host/convolution_parameter.hpp"
 #include "ck_tile/host/device_memory.hpp"
 ...
@@ -20,6 +21,7 @@
 #include "ck_tile/host/reference/reference_batched_masking.hpp"
 #include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp"
 #include "ck_tile/host/reference/reference_batched_softmax.hpp"
+#include "ck_tile/host/reference/reference_batched_transpose.hpp"
 #include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "ck_tile/host/reference/reference_fused_moe.hpp"
 #include "ck_tile/host/reference/reference_gemm.hpp"
 ...
include/ck_tile/host/check_err.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 ...
@@ -18,6 +18,114 @@
 namespace ck_tile {

+template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
+double get_relative_threshold(const int number_of_accumulations = 1)
+{
+    using F8   = ck_tile::fp8_t;
+    using BF8  = ck_tile::bf8_t;
+    using F16  = ck_tile::half_t;
+    using BF16 = ck_tile::bf16_t;
+    using F32  = float;
+    using I8   = int8_t;
+    using I32  = int32_t;
+
+    static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
+
+    double compute_error = 0;
+    if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
+    {
+        return 0;
+    }
+    else
+    {
+        compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
+    }
+
+    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+                  "Warning: Unhandled OutDataType for setting up the relative threshold!");
+
+    double output_error = 0;
+    if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
+    {
+        return 0;
+    }
+    else
+    {
+        output_error = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
+    }
+    double midway_error = std::max(compute_error, output_error);
+
+    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+                  "Warning: Unhandled AccDataType for setting up the relative threshold!");
+
+    double acc_error = 0;
+    if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
+    {
+        return 0;
+    }
+    else
+    {
+        acc_error = std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
+    }
+    return std::max(acc_error, midway_error);
+}
+
+template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
+double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
+{
+    using F8   = ck_tile::fp8_t;
+    using BF8  = ck_tile::bf8_t;
+    using F16  = ck_tile::half_t;
+    using BF16 = ck_tile::bf16_t;
+    using F32  = float;
+    using I8   = int8_t;
+    using I32  = int32_t;
+
+    static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+                  "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
+
+    auto expo = std::log2(std::abs(max_possible_num));
+
+    double compute_error = 0;
+    if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
+    {
+        return 0;
+    }
+    else
+    {
+        compute_error = std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
+    }
+
+    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+                  "Warning: Unhandled OutDataType for setting up the absolute threshold!");
+
+    double output_error = 0;
+    if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
+    {
+        return 0;
+    }
+    else
+    {
+        output_error = std::pow(2, expo - numeric_traits<OutDataType>::mant) * 0.5;
+    }
+    double midway_error = std::max(compute_error, output_error);
+
+    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+                  "Warning: Unhandled AccDataType for setting up the absolute threshold!");
+
+    double acc_error = 0;
+    if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
+    {
+        return 0;
+    }
+    else
+    {
+        acc_error = std::pow(2, expo - numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
+    }
+    return std::max(acc_error, midway_error);
+}
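A usage sketch of the new threshold helpers (not part of the commit); K stands for the number of accumulated products, e.g. the GEMM reduction length. How the thresholds are passed on to check_err is an assumption here, since check_err's parameter list is not shown in this diff:

    const int K   = 512;
    double rtol   = ck_tile::get_relative_threshold<ck_tile::half_t, ck_tile::half_t, float>(K);
    double atol   = ck_tile::get_absolute_threshold<ck_tile::half_t, ck_tile::half_t, float>(
        /*max_possible_num=*/static_cast<double>(K), K);
    // rtol/atol would then feed the check_err tolerances used below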
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
...
...
@@ -337,7 +445,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
     }
     if(!res)
     {
-        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        const float error_percent =
+            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
+        std::cerr << "max err: " << max_err;
+        std::cerr << ", number of errors: " << err_count;
+        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
     }
     return res;
 }
...
@@ -391,7 +503,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
     }
     if(!res)
     {
-        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        const float error_percent =
+            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
+        std::cerr << "max err: " << max_err;
+        std::cerr << ", number of errors: " << err_count;
+        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
     }
     return res;
 }
...
include/ck_tile/host/concat.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"

namespace ck_tile {

template <typename T>
struct IsCharArray : std::false_type
{
};

template <std::size_t N>
struct IsCharArray<char[N]> : std::true_type
{
};

template <std::size_t N>
struct IsCharArray<const char[N]> : std::true_type
{
};

template <std::size_t N>
struct IsCharArray<char (&)[N]> : std::true_type
{
};

template <std::size_t N>
struct IsCharArray<const char (&)[N]> : std::true_type
{
};

template <typename... Ts>
inline constexpr bool AllConvertibleToStringView =
    ((std::is_convertible_v<Ts, std::string_view> || IsCharArray<Ts>::value ||
      std::is_same_v<Ts, char>) &&
     ...);

template <typename... Ts>
[[nodiscard]] auto concat(const Ts&... xs)
    -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
{
    using ::operator<<;
    thread_local std::ostringstream oss;
    oss.str("");
    (oss << ... << xs);
    return oss.str();
}

template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(char (&)[N]) noexcept
{
    return N;
}

template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(const char (&)[N]) noexcept
{
    return N;
}

[[nodiscard]] constexpr inline std::size_t getSize(const char* s) noexcept
{
    const char* end = s;
    while(*end++ != 0) {}
    return end - s - 1;
}

[[nodiscard]] constexpr inline std::size_t getSize(const char&) noexcept { return 1; }

[[nodiscard]] inline std::size_t getSize(const std::string& s) noexcept { return s.size(); }

[[nodiscard]] constexpr inline std::size_t getSize(const std::string_view& s) noexcept
{
    return s.size();
}

template <typename... Ts>
auto concatInto(std::string& result, const Ts&... xs)
    -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
{
    const std::size_t space = (1 + ... + getSize(xs));
    result.reserve(result.size() + space);
    ((result += xs), ...);
}

template <typename... Ts>
[[nodiscard]] auto concat(const Ts&... xs)
    -> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
{
    std::string result;
    concatInto(result, xs...);
    return result;
}

// Function for types convertible to std::string_view
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
    -> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string>
{
    std::string result;
    result += first;
    ((result += sep, result += rest), ...);
    return result;
}

// Function for other types
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
    -> std::enable_if_t<!AllConvertibleToStringView<First, Rest...>, std::string>
{
    using ::operator<<;
    thread_local std::ostringstream oss;
    oss.str("");
    oss << first;
    ((oss << sep << rest), ...);
    return oss.str();
}

} // namespace ck_tile
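A short sketch of the two overload families above (not part of the commit):

    // falls back to the ostringstream overload because 128/256 are not string-view convertible
    std::string a = ck_tile::concat("gemm_", 128, "x", 256);        // "gemm_128x256"

    // separator overload, all arguments convertible to std::string_view (or char)
    std::string b = ck_tile::concat('_', "fp16", "row", "col");     // "fp16_row_col"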
include/ck_tile/host/convolution_host_tensor_descriptor_helper.hpp

(In every hunk below, the removed lines were the equivalent chains of
std::is_same_v<..., ck_tile::tensor_layout::convolution::X> checks over the same layouts;
only the new is_any_of form and the added using-directive are shown.)

...
@@ -14,57 +14,41 @@ namespace detail {
 template <typename OldLayout>
 CK_TILE_HOST std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
 {
+    using namespace ck_tile::tensor_layout::convolution;
+
+    if constexpr(is_any_of<OldLayout, GNCW, GKCX, GNKW>::value)
     {
         return {0, 1, 2, 3};
     }
+    else if constexpr(is_any_of<OldLayout, GNCHW, GKCYX, GNKHW>::value)
     {
         return {0, 1, 2, 3, 4};
     }
+    else if constexpr(is_any_of<OldLayout, GNCDHW, GKCZYX, GNKDHW>::value)
     {
         return {0, 1, 2, 3, 4, 5};
     }
+    if constexpr(is_any_of<OldLayout, GNWC, GKXC, GNWK>::value)
     {
         return {0, 1, 3, 2};
     }
+    else if constexpr(is_any_of<OldLayout, GNHWC, GKYXC, GNHWK>::value)
     {
         return {0, 1, 4, 2, 3};
     }
+    else if constexpr(is_any_of<OldLayout, GNDHWC, GKZYXC, GNDHWK>::value)
     {
         return {0, 1, 5, 2, 3, 4};
     }
+    else if constexpr(is_any_of<OldLayout, NWGC, KXGC, NWGK>::value)
     {
         return {2, 0, 3, 1};
     }
+    else if constexpr(is_any_of<OldLayout, NHWGC, KYXGC, NHWGK>::value)
     {
         return {3, 0, 4, 1, 2};
     }
+    else if constexpr(is_any_of<OldLayout, NDHWGC, KZYXGC, NDHWGK>::value)
     {
         return {4, 0, 5, 1, 2, 3};
     }
 ...
@@ -83,11 +67,11 @@ template <typename InLayout>
 CK_TILE_HOST HostTensorDescriptor
 make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param)
 {
+    using namespace ck_tile::tensor_layout::convolution;
+
     std::vector<std::size_t> physical_lengths;

+    if constexpr(is_any_of<InLayout, GNCW, GNCHW, GNCDHW>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                     static_cast<std::size_t>(param.N_),
 ...
@@ -97,9 +81,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
                                 param.input_spatial_lengths_.begin(),
                                 param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
     }
+    else if constexpr(is_any_of<InLayout, GNWC, GNHWC, GNDHWC>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                     static_cast<std::size_t>(param.N_),
 ...
@@ -109,9 +91,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
                                 param.input_spatial_lengths_.begin(),
                                 param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
     }
+    else if constexpr(is_any_of<InLayout, NWGC, NHWGC, NDHWGC>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                     static_cast<std::size_t>(param.G_),
 ...
@@ -139,11 +119,11 @@ template <typename WeiLayout>
 CK_TILE_HOST HostTensorDescriptor
 make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param)
 {
+    using namespace ck_tile::tensor_layout::convolution;
+
     std::vector<std::size_t> physical_lengths;

+    if constexpr(is_any_of<WeiLayout, KXC, KYXC, KZYXC>::value)
     {
         if(param.G_ != 1)
         {
 ...
@@ -157,9 +137,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
                                 param.filter_spatial_lengths_.begin(),
                                 param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
     }
+    else if constexpr(is_any_of<WeiLayout, GKCX, GKCYX, GKCZYX>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                     static_cast<std::size_t>(param.K_),
 ...
@@ -169,9 +147,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
                                 param.filter_spatial_lengths_.begin(),
                                 param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
     }
+    else if constexpr(is_any_of<WeiLayout, GKXC, GKYXC, GKZYXC>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                     static_cast<std::size_t>(param.K_),
 ...
@@ -181,9 +157,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
                                 param.filter_spatial_lengths_.begin(),
                                 param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
     }
+    else if constexpr(is_any_of<WeiLayout, KXGC, KYXGC, KZYXGC>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
                                                     static_cast<std::size_t>(param.G_),
 ...
@@ -211,11 +185,11 @@ template <typename OutLayout>
 CK_TILE_HOST HostTensorDescriptor
 make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param)
 {
+    using namespace ck_tile::tensor_layout::convolution;
+
     std::vector<std::size_t> physical_lengths;

+    if constexpr(is_any_of<OutLayout, GNKW, GNKHW, GNKDHW>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                     static_cast<std::size_t>(param.N_),
 ...
@@ -226,9 +200,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
                                 param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
     }
     // separate from legacy code above
+    else if constexpr(is_any_of<OutLayout, GNWK, GNHWK, GNDHWK>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
                                                     static_cast<std::size_t>(param.N_),
 ...
@@ -238,9 +210,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
                                 param.output_spatial_lengths_.begin(),
                                 param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
     }
+    else if constexpr(is_any_of<OutLayout, NWGK, NHWGK, NDHWGK>::value)
     {
         physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
                                                     static_cast<std::size_t>(param.G_),
 ...
include/ck_tile/host/host_tensor.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 ...
@@ -678,4 +678,43 @@ struct HostTensor
     Descriptor mDesc;
     Data mData;
 };

+template <bool is_row_major>
+auto host_tensor_descriptor(std::size_t row,
+                            std::size_t col,
+                            std::size_t stride,
+                            bool_constant<is_row_major>)
+{
+    using namespace ck_tile::literals;
+
+    if constexpr(is_row_major)
+    {
+        return HostTensorDescriptor({row, col}, {stride, 1_uz});
+    }
+    else
+    {
+        return HostTensorDescriptor({row, col}, {1_uz, stride});
+    }
+}
+
+template <bool is_row_major>
+auto get_default_stride(std::size_t row,
+                        std::size_t col,
+                        std::size_t stride,
+                        bool_constant<is_row_major>)
+{
+    if(stride == 0)
+    {
+        if constexpr(is_row_major)
+        {
+            return col;
+        }
+        else
+        {
+            return row;
+        }
+    }
+    else
+        return stride;
+}
 } // namespace ck_tile
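A minimal sketch of the two new helpers (not part of the commit), building a row-major MxN descriptor:

    constexpr auto row_major = ck_tile::bool_constant<true>{};
    std::size_t M = 64, N = 128;
    std::size_t stride = ck_tile::get_default_stride(M, N, /*stride=*/0, row_major); // -> 128
    auto desc = ck_tile::host_tensor_descriptor(M, N, stride, row_major);            // lengths {64,128}, strides {128,1}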
include/ck_tile/host/reference/reference_batched_transpose.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>

namespace ck_tile {

template <typename Type>
CK_TILE_HOST void reference_batched_transpose(const HostTensor<Type>& x,
                                              HostTensor<Type>& y,
                                              std::string layout_in  = "NCHW",
                                              std::string layout_out = "NHWC")
{
    const int N = x.mDesc.get_lengths()[0];

    auto f = [&](auto batch) {
        if(layout_in == "NCHW" && layout_out == "NHWC")
        {
            const int C = x.mDesc.get_lengths()[1];
            const int H = x.mDesc.get_lengths()[2];
            const int W = x.mDesc.get_lengths()[3];

            for(int c = 0; c < C; ++c)
            {
                for(int h = 0; h < H; ++h)
                {
                    for(int w = 0; w < W; ++w)
                    {
                        Type v_x          = x(batch, c, h, w);
                        y(batch, h, w, c) = v_x;
                    }
                }
            }
        }
        else if(layout_in == "NHWC" && layout_out == "NCHW")
        {
            const int H = x.mDesc.get_lengths()[1];
            const int W = x.mDesc.get_lengths()[2];
            const int C = x.mDesc.get_lengths()[3];

            for(int h = 0; h < H; ++h)
            {
                for(int w = 0; w < W; ++w)
                {
                    for(int c = 0; c < C; ++c)
                    {
                        Type v_x          = x(batch, h, w, c);
                        y(batch, c, h, w) = v_x;
                    }
                }
            }
        }
    };

    make_ParallelTensorFunctor(f, N)(std::thread::hardware_concurrency());
}
} // namespace ck_tile
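A usage sketch of the new host reference (not part of the commit), assuming the lengths-only HostTensor constructor:

    ck_tile::HostTensor<float> x({/*N=*/2, /*C=*/3, /*H=*/4, /*W=*/5});
    ck_tile::HostTensor<float> y({2, 4, 5, 3});        // NHWC output shape
    ck_tile::reference_batched_transpose(x, y);        // defaults: "NCHW" -> "NHWC"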
include/ck_tile/host/reference/reference_fused_moe.hpp

...
@@ -73,7 +73,7 @@ void reference_fused_moe(
                          ck_tile::index_t tokens,
                          ck_tile::index_t experts,
                          ck_tile::index_t hidden_size,
-                         ck_tile::index_t intermediate_size, // this size is for gate/up
+                         ck_tile::index_t intermediate_size, // this size is for gate/up/down
                          ck_tile::index_t topk,
                          ck_tile::index_t gate_only)
 {
 ...
@@ -82,19 +82,8 @@ void reference_fused_moe(
     assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
     assert(num_sorted_tiles_host.get_element_size() == 1);

     ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m;
-    ck_tile::index_t intermediate_size_0 = intermediate_size;
-    ck_tile::index_t intermediate_size_1 = intermediate_size / (gate_only ? 1 : 2);
-
-    // TODO: better remove this in the future, or modify the token_id value
-    auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
-        for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
-        {
-            if(token_ids_host(token_id_, i_) == expert_id_)
-                return i_;
-        }
-        throw std::runtime_error("not correct token/expert pair\n");
-        return -1; // TODO: not correct!!
-    };
+    ck_tile::index_t intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2);
+    ck_tile::index_t intermediate_size_1 = intermediate_size;

     ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});
 ...
@@ -105,11 +94,31 @@ (updated token/topk lookup inside the per-tile lambda)
         if(i_tile >= num_sorted_tiles)
             return;
         ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
         ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
         ck_tile::index_t i_topk  = i_token >> 24;
         i_token &= 0xffffff;
         if(i_token >= tokens)
             return;
         (void)token_ids_host;
#else
         // TODO: better remove this in the future, or modify the token_id value
         auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
             for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
             {
                 if(token_ids_host(token_id_, i_) == expert_id_)
                     return i_;
             }
             throw std::runtime_error("not correct token/expert pair\n");
             return -1; // TODO: not correct!!
         };
         ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
         if(i_token >= tokens)
             return;
         ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
#endif
         auto weight = sorted_weight_host.mData[i_flatten];

         ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
         // first gemm
 ...
View file @
7572a691
...
...
@@ -80,13 +80,14 @@ __global__ void naive_gemm_kernel(ADataType* A,
int
b_index
=
(
std
::
is_same_v
<
LayoutB
,
tensor_layout
::
gemm
::
ColumnMajor
>
)
?
col
*
strideB
+
k
:
k
*
strideB
+
col
;
acc
+=
static_cast
<
AccDataType
>
(
A
[
a_index
])
*
static_cast
<
AccDataType
>
(
B
[
b_index
]);
acc
+=
ck_tile
::
type_convert
<
AccDataType
>
(
A
[
a_index
])
*
ck_tile
::
type_convert
<
AccDataType
>
(
B
[
b_index
]);
}
int
c_index
=
(
std
::
is_same_v
<
LayoutC
,
tensor_layout
::
gemm
::
RowMajor
>
)
?
row
*
strideC
+
col
:
col
*
strideC
+
row
;
C
[
c_index
]
=
acc
;
C
[
c_index
]
=
ck_tile
::
type_convert
<
CDataType
>
(
acc
)
;
}
}
...
...
include/ck_tile/host/reference/reference_moe_sorting.hpp

...
@@ -14,12 +14,15 @@ namespace ck_tile {
 template <typename WeightType, typename IndexType = index_t>
 CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
                                         const HostTensor<WeightType>& weights,
+                                        const HostTensor<IndexType>& local_expert_mask,
                                         HostTensor<IndexType>& p_sorted_token_ids,
                                         HostTensor<WeightType>& sorted_weight,
                                         HostTensor<IndexType>& sorted_expert_ids,
                                         index_t& unit_cnt,
                                         const index_t experts,
-                                        const index_t unit_size)
+                                        const index_t unit_size,
+                                        bool local_expert_masking,
+                                        bool skip_experts_with_zero_token = true)
 {
     const index_t num_token = topk_ids.mDesc.get_lengths()[0];
     const index_t topk      = topk_ids.mDesc.get_lengths()[1];
 ...
@@ -33,8 +36,11 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
#endif
     std::vector<std::vector<WeightType>> expert_token_weights(
         experts, std::vector<WeightType>(unit_size, 0));
+    // count number of unit-size slices in this expert
     std::vector<IndexType> expert_slices(experts, 1);
+    // count the tokens used in this expert
     std::vector<IndexType> expert_slice_idxs(experts, 0);
+    // TODO: above 2 buffer seems duplicated
     for(index_t t = 0; t < num_token; t++)
     {
 ...
@@ -72,8 +78,23 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
     IndexType* out_tokens    = p_sorted_token_ids.data();
     WeightType* out_weights  = sorted_weight.data();
     IndexType* out_expert_id = sorted_expert_ids.data();
+    int curr_expert_id       = 0;
     for(index_t e = 0; e < experts; e++)
     {
+        if(local_expert_masking)
+        {
+            if(local_expert_mask(e) == 0)
+                continue;
+        }
+        if(skip_experts_with_zero_token)
+        {
+            if(expert_slice_idxs[e] == 0)
+            {
+                curr_expert_id++;
+                continue;
+            }
+        }
         memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
         out_tokens += expert_slices[e] * unit_size;
         memcpy(out_weights,
 ...
@@ -83,10 +104,11 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
         for(index_t s = 0; s < expert_slices[e]; s++)
         {
-            out_expert_id[s] = e;
+            out_expert_id[s] = curr_expert_id;
             unit_cnt++;
         }
         out_expert_id += expert_slices[e];
+        curr_expert_id++;
     }
     unit_cnt *= unit_size;
     return;
...
include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp

...
@@ -8,16 +8,40 @@
 namespace ck_tile {

+// Note: for simplicity, each functor only cares about a single M
+struct reference_rmsnorm2d_default_epilogue
+{
+    template <typename OutDataType, typename AccDataType>
+    void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
+    {
+        const int N = acc.mDesc.get_lengths()[1];
+        for(int n = 0; n < N; ++n)
+        {
+            o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
+        }
+    }
+
+    template <typename OutDataType, typename AccDataType>
+    auto operator()(int m, const HostTensor<AccDataType>& acc)
+    {
+        HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
+        operator()(m, o, acc);
+        return o;
+    }
+};
+
 template <typename XDataType,
           typename GammaDataType,
           typename ComputeDataType,
           typename YDataType,
-          typename InvRmsDataType>
+          typename InvRmsDataType,
+          typename Epilogue = reference_rmsnorm2d_default_epilogue>
 void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
                              const HostTensor<GammaDataType>& gamma_n,
                              HostTensor<YDataType>& y_m_n,
                              HostTensor<InvRmsDataType>& invRms_m,
-                             ComputeDataType epsilon)
+                             ComputeDataType epsilon,
+                             Epilogue epilogue_functor = {})
 {
     auto rmsnorm2d_fwd_func = [&](auto m) {
         const int N = x_m_n.mDesc.get_lengths()[1];
 ...
@@ -37,13 +61,15 @@ void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
         if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
             invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);

+        HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
         for(int n = 0; n < N; ++n)
         {
             ComputeDataType x     = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
             ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
-            auto y      = x * divisor * gamma;
-            y_m_n(m, n) = ck_tile::type_convert<YDataType>(y);
+            acc(m, n) = x * divisor * gamma;
         }
+        epilogue_functor(m, y_m_n, acc);
     };

     make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
 ...
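A sketch of a custom epilogue (not part of the commit) with the same call shape as reference_rmsnorm2d_default_epilogue above; the fused scale here is hypothetical, purely for illustration:

    struct scale_epilogue // hypothetical
    {
        float scale;

        template <typename OutDataType, typename AccDataType>
        void operator()(int m,
                        ck_tile::HostTensor<OutDataType>& o,
                        const ck_tile::HostTensor<AccDataType>& acc)
        {
            const int N = acc.mDesc.get_lengths()[1];
            for(int n = 0; n < N; ++n)
                o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n) * scale);
        }
    };

    // reference_rmsnorm2d_fwd(x_m_n, gamma_n, y_m_n, invRms_m, eps, scale_epilogue{2.f});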
include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 ...
@@ -10,3 +10,4 @@
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"
include/ck_tile/ops/batched_transpose.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"
include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>

namespace ck_tile {

struct BatchedTransposeHostArgs
{
    const void* p_input;
    void* p_output;
    index_t batch;
    index_t height;
    index_t width;
    // index_t dim_blocks;
    index_t dim_stride;
    index_t dim_block_h;
    index_t dim_block_w;
};

template <typename Pipeline_>
struct BatchedTransposeKernel
{
    using Pipeline = remove_cvref_t<Pipeline_>;
    using Problem  = remove_cvref_t<typename Pipeline::Problem>;
    using Type     = typename Problem::InputType;

    struct BatchedTransposeKargs
    {
        const void* p_input;
        void* p_output;
        index_t batch;
        index_t height;
        index_t width;
        index_t dim_stride;
    };

    using Kargs = BatchedTransposeKargs;
    using Hargs = BatchedTransposeHostArgs;

    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
    {
        size_t grid_size_x = (h.width + h.dim_block_w - 1) / h.dim_block_w;
        size_t grid_size_y = (h.height + h.dim_block_h - 1) / h.dim_block_h;
        size_t grid_size_z = h.batch;
        return dim3(grid_size_x, grid_size_y, grid_size_z);
    }

    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
    {
        Kargs k;
        k.p_input    = h.p_input;
        k.p_output   = h.p_output;
        k.batch      = h.batch;
        k.height     = h.height;
        k.width      = h.width;
        k.dim_stride = h.dim_stride;
        return k;
    }

    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::kBlockSize; }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock;
        static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock;

        static constexpr bool kPadM = Problem::kPadM;
        static constexpr bool kPadN = Problem::kPadN;

        static constexpr ck_tile::index_t kMPerThread = Problem::kMPerThread;
        static constexpr ck_tile::index_t kNPerThread = Problem::kNPerThread;

        static_assert(kMPerThread == 1 && kNPerThread == 1);

        const auto iDim = blockIdx.z;

        const auto x_m_n = [&]() {
            const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                static_cast<const Type*>(kargs.p_input) + iDim * kargs.dim_stride,
                make_tuple(kargs.height, kargs.width),
                make_tuple(kargs.width, 1),
                number<kNPerThread>{}, // TODO thread load value
                number<1>{});

            return pad_tensor_view(x_dram_naive,
                                   make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
                                   sequence<kPadM, kPadN>{});
        }();

        const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
        const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock);

        const auto y_n_m = [&]() {
            const auto y_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                static_cast<Type*>(kargs.p_output) + iDim * kargs.dim_stride,
                make_tuple(kargs.width, kargs.height),
                make_tuple(kargs.height, 1),
                number<kMPerThread>{},
                number<1>{});

            return pad_tensor_view(y_dram_naive,
                                   make_tuple(number<kNPerBlock>{}, number<kMPerBlock>{}),
                                   sequence<kPadN, kPadM>{});
        }();

        auto x_block_window =
            make_tile_window(x_m_n,
                             make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
                             {static_cast<ck_tile::index_t>(iM * kMPerBlock),
                              static_cast<ck_tile::index_t>(iN * kNPerBlock)});

        auto y_block_window =
            make_tile_window(y_n_m,
                             make_tuple(number<kNPerBlock>{}, number<kMPerBlock>{}),
                             {static_cast<ck_tile::index_t>(iN * kNPerBlock),
                              static_cast<ck_tile::index_t>(iM * kMPerBlock)});

        Pipeline{}(x_block_window, y_block_window);
    }
};
} // namespace ck_tile
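A host-side sketch (not part of the commit) showing how the helpers above fit together. "MyPipeline", in_dev_ptr and out_dev_ptr are placeholders; the actual launch would go through ck_tile's usual kernel-launch helpers, which are not shown in this diff:

    ck_tile::BatchedTransposeHostArgs h{};
    h.p_input     = in_dev_ptr;          // device pointer to the batched input
    h.p_output    = out_dev_ptr;         // device pointer to the transposed output
    h.batch       = 8;
    h.height      = 64;
    h.width       = 128;
    h.dim_stride  = h.height * h.width;  // elements per batch slice
    h.dim_block_h = 32;
    h.dim_block_w = 32;

    using Kernel = ck_tile::BatchedTransposeKernel<MyPipeline>;
    dim3 grid    = Kernel::GridSize(h);  // (ceil(W/32), ceil(H/32), batch)
    auto kargs   = Kernel::MakeKargs(h); // kernel-side argument struct passed at launch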