gaoqiong / composable_kernel

Commit b3782d46, authored Sep 07, 2022 by Po-Yen, Chen
Parent: 6ab0e31f

Add 'BlockSize' parameter to 'DevicePermute'
Showing 3 changed files, with 91 additions and 18 deletions:

- example/36_permute/permute_fp16.cpp (+5 / -5)
- include/ck/tensor_operation/gpu/device/device_permute.hpp (+9 / -13)
- include/ck/tensor_operation/gpu/grid/gridwise_copy.hpp (+77 / -0)
example/36_permute/permute_fp16.cpp

```diff
@@ -8,11 +8,11 @@ using BDataType = F16;
 
 // clang-format off
 using DevicePermuteInstance = ck::tensor_operation::device::DevicePermute
-// ######| InData| OutData| Elementwise| NumDim| NPer| HPer| WPer|MPerThread| InScalar| OutScalar|
-// ######|   Type|    Type|   Operation|       | Block| Block| Block|         | PerVector| PerVector|
-// ######|       |        |            |       |      |      |      |         |          |          |
-// ######|       |        |            |       |      |      |      |         |          |          |
-         < ADataType, BDataType, PassThrough, 4, 128, 128, 128, 8, 8, 1>;
+// ######| InData| OutData| Elementwise| NumDim| Block| NPer| HPer| WPer|MPerThread| InScalar| OutScalar|
+// ######|   Type|    Type|   Operation|       |  Size| Block| Block| Block|         | PerVector| PerVector|
+// ######|       |        |            |       |      |      |      |      |         |          |          |
+// ######|       |        |            |       |      |      |      |      |         |          |          |
+         < ADataType, BDataType, PassThrough, 4, 256, 128, 128, 128, 8, 8, 1>;
 // clang-format on
 
 #include "run_permute_example.inc"
```
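For readability, here is the same updated instance with every template argument annotated. This restates the diff above and introduces nothing new apart from the comments:

```cpp
// The instance from the example above, one argument per line. The fifth
// argument (256) is the BlockSize parameter added by this commit, inserted
// between NumDim and NPerBlock.
using DevicePermuteInstance =
    ck::tensor_operation::device::DevicePermute<ADataType,   // InDataType
                                                BDataType,   // OutDataType
                                                PassThrough, // ElementwiseOperation
                                                4,           // NumDim
                                                256,         // BlockSize (new)
                                                128,         // NPerBlock
                                                128,         // HPerBlock
                                                128,         // WPerBlock
                                                8,           // MPerThread
                                                8,           // InScalarPerVector
                                                1>;          // OutScalarPerVector
```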
include/ck/tensor_operation/gpu/device/device_permute.hpp

```diff
@@ -10,6 +10,7 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/sequence.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_copy.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -77,6 +78,7 @@ template <typename InDataType,
           typename OutDataType,
           typename ElementwiseOperation,
           index_t NumDim,
+          index_t BlockSize,
           index_t NPerBlock,
           index_t HPerBlock,
           index_t WPerBlock,
@@ -87,6 +89,7 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                                                                OutDataType,
                                                                ElementwiseOperation,
                                                                NumDim,
+                                                               BlockSize,
                                                                NPerBlock,
                                                                HPerBlock,
                                                                WPerBlock,
@@ -148,19 +151,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                                       Sequence<NumDim - 1>{}),
                            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
 
-#if 0
-        const index_t N = std::accumulate(begin(lengths), std::prev(end(lengths), 2), index_t{1}, std::multiplies<index_t>{});
-        const auto desc_m = transform_tensor_descriptor(
-            desc_n_h_w,
-            make_tuple(make_merge_transform(make_tuple(N, H, W))),
-            make_tuple(Sequence<0, 1, 2>{}),
-            make_tuple(Sequence<0>{})
-        );
-        return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
-#else
-        return PadDescriptor_M_1d(desc_n_h_w, gridSize, blockSize);
-#endif
+        return PadTensorDescriptor(
+            desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{});
     }
 
     using InGrid1dDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}, 1, 1));
@@ -171,6 +163,10 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                                        InDataTypePointer,
                                        OutDataTypePointer,
                                        ElementwiseOperation,
+                                       BlockSize,
+                                       NPerBlock,
+                                       HPerBlock,
+                                       WPerBlock,
                                        MPerThread,
                                        InScalarPerVector,
                                        OutScalarPerVector>;
```
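The deleted `PadDescriptor_M_1d` call padded a single flattened length up to a multiple of `gridSize * blockSize`; the new `PadTensorDescriptor` call instead pads the N, H, and W lengths independently, one per-block tile each. Below is a minimal standalone sketch of that per-dimension rounding, assuming `PadTensorDescriptor` with `Sequence<true, true, true>` rounds each flagged length up to the next multiple of its tile (the behavior of CK's matrix_padder utilities); the input lengths here are hypothetical:

```cpp
// Standalone illustration (not CK code) of padding each dimension up to a
// whole number of {N, H, W}PerBlock tiles.
#include <array>
#include <cstdio>

int main()
{
    constexpr int NPerBlock = 128, HPerBlock = 128, WPerBlock = 128;
    const std::array<int, 3> lengths{100, 300, 200}; // hypothetical N, H, W
    const std::array<int, 3> tiles{NPerBlock, HPerBlock, WPerBlock};

    for(int i = 0; i < 3; ++i)
    {
        // Round up to the next multiple of the per-block tile length.
        const int padded = (lengths[i] + tiles[i] - 1) / tiles[i] * tiles[i];
        std::printf("dim %d: %d -> %d (+%d pad)\n",
                    i, lengths[i], padded, padded - lengths[i]);
    }
}
```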
include/ck/tensor_operation/gpu/grid/gridwise_copy.hpp

```diff
@@ -5,6 +5,7 @@
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -31,18 +32,40 @@ template <typename InGrid1dDesc,
           typename InDataTypePointer,
           typename OutDataTypePointer,
           typename ElementwiseOperation,
+          index_t BlockSize,
+          index_t NPerBlock,
+          index_t HPerBlock,
+          index_t WPerBlock,
           index_t MPerThread,
           index_t InScalarPerVector,
           index_t OutScalarPerVector>
 struct GridwiseCopy
 {
+    static_assert(InGrid1dDesc::GetNumOfDimension() == 3 &&
+                  OutGrid1dDesc::GetNumOfDimension() == 3);
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
     static constexpr auto thread_buffer_desc_m =
         make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));
 
     using PassThroughOp = tensor_operation::element_wise::PassThrough;
 
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        constexpr index_t ABlockLdsExtraM = 0;
+
+        // A matrix in LDS memory, dst of blockwise copy
+        return make_naive_tensor_descriptor(
+            make_tuple(Number<NPerBlock>{}, Number<HPerBlock>{}, Number<WPerBlock>{}),
+            make_tuple(Number<NPerBlock + ABlockLdsExtraM>{} * Number<HPerBlock>{},
+                       Number<HPerBlock>{},
+                       I1));
+    }
+
     __device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
                                const OutGrid1dDesc out_grid_1d_desc,
                                const InDataTypePointer p_in_global,
@@ -71,6 +94,7 @@ struct GridwiseCopy
         const index_t loop_step = blockPerGrid * blockSize * MPerThread;
         const auto loop_step_index = make_multi_index(loop_step);
 
+#if 1
         auto in_global_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
                                                                InDataType,
@@ -82,7 +106,60 @@ struct GridwiseCopy
                                                                InScalarPerVector, // ScalarPerVector
                                                                1,     // SrcScalarStrideInVector
                                                                false>{in_grid_1d_desc,
                                                                       thread_global_offset};
+#else
+        // const auto block_work_idx =
+        //     block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock * HPerBlock);
+
+        // const index_t n_block_data_idx_on_grid =
+        //     __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // // B matrix in LDS memory, dst of blockwise copy
+        // constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        using SliceLengths = Sequence<NPerBlock, HPerBlock, WPerBlock>;
+        using ABlockTransferThreadClusterLengths_AK0_M_AK1 = Sequence<4, 64, 1>;
+        using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 0, 2>;
+        using ABlockTransferSrcAccessOrder = int;
+
+        constexpr index_t ABlockTransferSrcVectorDim = 2;
+        constexpr index_t ABlockTransferSrcScalarPerVector = 1;
+        constexpr index_t ABlockTransferDstScalarPerVector = 1;
+
+        auto in_global_load = ThreadGroupTensorSliceTransfer_v4r1<
+            ThisThreadBlock,
+            ElementwiseOperation,
+            ck::tensor_operation::element_wise::PassThrough,
+            InMemoryDataOperationEnum::Set,
+            SliceLengths,
+            ABlockTransferThreadClusterLengths_AK0_M_AK1,
+            ABlockTransferThreadClusterArrangeOrder,
+            InDataType,
+            InDataType,
+            decltype(in_grid_1d_desc),
+            decltype(a_block_desc_ak0_m_ak1),
+            ABlockTransferSrcAccessOrder,
+            Sequence<1, 0, 2>,
+            ABlockTransferSrcVectorDim,
+            2,
+            ABlockTransferSrcScalarPerVector,
+            ABlockTransferDstScalarPerVector,
+            1,
+            1,
+            true,
+            true>(in_grid_1d_desc,
+                  make_multi_index(0, m_block_data_idx_on_grid, 0),
+                  element_op,
+                  a_block_desc_ak0_m_ak1,
+                  make_multi_index(0, 0, 0),
+                  ck::tensor_operation::element_wise::PassThrough{});
+#endif
 
         auto out_global_store = ThreadwiseTensorSliceTransfer_v1r3<OutDataType,
                                                                    OutDataType,
```
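The active `#if 1` path keeps the original per-thread copy, where `loop_step = blockPerGrid * blockSize * MPerThread` advances every thread by one full grid's worth of elements per iteration. Below is a plain HIP/CUDA-style sketch of that grid-stride shape, not CK code; `grid_stride_copy` and its parameters are illustrative only:

```cpp
// Each thread copies MPerThread contiguous elements, then the whole grid
// strides forward by gridDim.x * blockDim.x * MPerThread (the analogue of
// loop_step above) until the buffer is exhausted.
template <int MPerThread, typename T>
__global__ void grid_stride_copy(const T* in, T* out, int total)
{
    const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    const int loop_step = gridDim.x * blockDim.x * MPerThread;

    for(int base = thread_id * MPerThread; base < total; base += loop_step)
    {
        for(int m = 0; m < MPerThread; ++m)
        {
            // Guard the tail in case total is not a multiple of MPerThread.
            if(base + m < total)
                out[base + m] = in[base + m];
        }
    }
}
```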