gaoqiong / composable_kernel / Commits

Commit 7a6dbadc
authored Sep 06, 2022 by Po-Yen, Chen

Add 'GridwisePermute' kernel

This kernel is a clone of 'GridwiseElementwise_1D'

parent fa21bcde
Showing 2 changed files with 184 additions and 65 deletions

include/ck/tensor_operation/gpu/device/device_permute.hpp   +44 -65
include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp   +140 -0
include/ck/tensor_operation/gpu/device/device_permute.hpp

@@ -10,7 +10,7 @@
 #include "ck/utility/math.hpp"
 #include "ck/utility/sequence.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_permute.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -90,8 +90,8 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                       NumOutput == OutScalarPerVectorSeq::Size(),
                   "Tuple size is inconsistent with the number of in/out!");
 
-    using InDataTypePointerTuple  = Tuple<const InDataType*>;
-    using OutDataTypePointerTuple = Tuple<OutDataType*>;
+    using InDataTypePointer  = const InDataType*;
+    using OutDataTypePointer = OutDataType*;
 
     template <typename Desc_M>
     static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
@@ -147,13 +147,13 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
         };
     };
 
-    using InGrid1dDescTuple  = Tuple<decltype(GenerateInOutGrid1dDesc())>;
-    using OutGrid1dDescTuple = Tuple<decltype(GenerateInOutGrid1dDesc())>;
+    using InGrid1dDesc  = decltype(GenerateInOutGrid1dDesc());
+    using OutGrid1dDesc = decltype(GenerateInOutGrid1dDesc());
 
-    using GridwiseElementwise = GridwiseElementwise_1D<InGrid1dDescTuple,
-                                                       OutGrid1dDescTuple,
-                                                       InDataTypePointerTuple,
-                                                       OutDataTypePointerTuple,
+    using GridwisePermute = GridwisePermute<InGrid1dDesc,
+                                            OutGrid1dDesc,
+                                            InDataTypePointer,
+                                            OutDataTypePointer,
                                             ElementwiseOperation,
                                             MPerThread,
                                             InScalarPerVectorSeq,
@@ -170,49 +170,30 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                  ElementwiseOperation elementwise_op)
         : blockSize_(256),
           gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future
+          in_dev_buffer_(static_cast<InDataTypePointer>(in_dev_buffer)),
+          out_dev_buffer_(static_cast<OutDataTypePointer>(out_dev_buffer)),
+          in_grid_1d_desc_(MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_)),
+          out_grid_1d_desc_(MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_)),
           inLengths_(inLengths),
           axes_(axes),
-          inStridesArray_({inStrides}),
-          outStridesArray_({outStrides}),
+          inStrides_(inStrides),
+          outStrides_(outStrides),
           elementwise_op_(elementwise_op)
     {
-        in_dev_buffers_ = generate_tuple(
-            [&](auto) {
-                using DataType = InDataType;
-                return static_cast<const DataType*>(in_dev_buffer);
-            },
-            Number<NumInput>{});
-
-        out_dev_buffers_ = generate_tuple(
-            [&](auto) {
-                using DataType = OutDataType;
-                return static_cast<DataType*>(out_dev_buffer);
-            },
-            Number<NumOutput>{});
-
-        in_grid_1d_desc_tuple_ = generate_tuple(
-            [&](auto) { return MakeDescriptor_M(inLengths, inStrides, gridSize_, blockSize_); },
-            Number<NumInput>{});
-
-        out_grid_1d_desc_tuple_ = generate_tuple(
-            [&](auto) { return MakeDescriptor_M(inLengths, outStrides, gridSize_, blockSize_); },
-            Number<NumOutput>{});
     }
 
     index_t blockSize_;
     index_t gridSize_;
 
-    InDataTypePointerTuple in_dev_buffers_;
-    OutDataTypePointerTuple out_dev_buffers_;
-    InGrid1dDescTuple in_grid_1d_desc_tuple_;
-    OutGrid1dDescTuple out_grid_1d_desc_tuple_;
+    InDataTypePointer in_dev_buffer_;
+    OutDataTypePointer out_dev_buffer_;
+    InGrid1dDesc in_grid_1d_desc_;
+    OutGrid1dDesc out_grid_1d_desc_;
 
     std::array<index_t, NumDim> inLengths_;
     std::array<index_t, NumDim> axes_;
-    std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
-    std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;
+    std::array<index_t, NumDim> inStrides_;
+    std::array<index_t, NumDim> outStrides_;
 
     ElementwiseOperation elementwise_op_;
 };
@@ -221,11 +202,11 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
     {
         static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
-                                                      InGrid1dDescTuple,
-                                                      OutGrid1dDescTuple,
-                                                      InDataTypePointerTuple,
-                                                      OutDataTypePointerTuple,
+            const auto kernel = kernel_permute<GridwisePermute,
+                                               InGrid1dDesc,
+                                               OutGrid1dDesc,
+                                               InDataTypePointer,
+                                               OutDataTypePointer,
                                                ElementwiseOperation>;
 
             float elapsed_time = launch_and_time_kernel(stream_config,
@@ -233,10 +214,10 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
                                                         dim3(arg.gridSize_),
                                                         dim3(arg.blockSize_),
                                                         0,
-                                                        arg.in_grid_1d_desc_tuple_,
-                                                        arg.out_grid_1d_desc_tuple_,
-                                                        arg.in_dev_buffers_,
-                                                        arg.out_dev_buffers_,
+                                                        arg.in_grid_1d_desc_,
+                                                        arg.out_grid_1d_desc_,
+                                                        arg.in_dev_buffer_,
+                                                        arg.out_dev_buffer_,
                                                         arg.elementwise_op_);
 
             return elapsed_time;
         }
@@ -262,17 +243,15 @@ struct DevicePermute : detail::DevicePermuteBase<DevicePermute<InDataType,
         };
 
         bool valid = true;
-        static_for<0, NumInput, 1>{}([&](auto I) {
-            if(!IsScalarPerVectorValid(
-                   arg.inLengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
+        if(!IsScalarPerVectorValid(arg.inLengths_, arg.inStrides_, InScalarPerVectorSeq::At(0)))
         {
             valid = false;
         }
-        });
 
-        static_for<0, NumOutput, 1>{}([&](auto I) {
-            if(!IsScalarPerVectorValid(
-                   arg.inLengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
+        if(!IsScalarPerVectorValid(arg.inLengths_, arg.outStrides_, OutScalarPerVectorSeq::At(0)))
         {
             valid = false;
         }
-        });
 
         return valid;
     };
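
Note on the descriptor setup above: the Argument now builds in_grid_1d_desc_ / out_grid_1d_desc_ directly through MakeDescriptor_M, which flattens the tensor into a 1-D descriptor and passes it through PadDescriptor_M_1d(desc_m, gridSize, blockSize). The body of PadDescriptor_M_1d is outside this diff; the sketch below only illustrates the padding arithmetic it is assumed to perform (rounding the flattened length up to a multiple of gridSize * blockSize * MPerThread), which is what lets GridwisePermute::Run divide M evenly by its loop step. The helper name pad_length_1d and the MPerThread value are hypothetical.

#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical sketch of the assumed padding arithmetic (not code from this commit).
std::int64_t pad_length_1d(std::int64_t m,
                           std::int64_t grid_size,
                           std::int64_t block_size,
                           std::int64_t m_per_thread)
{
    const std::int64_t loop_step = grid_size * block_size * m_per_thread;
    // Round m up to the next multiple of loop_step so that, with
    // num_iter = m_padded / loop_step, the kernel's do { ... } while(--num_iter)
    // loop covers every element at least once.
    return ((m + loop_step - 1) / loop_step) * loop_step;
}

int main()
{
    // Matches the defaults in the diff: gridSize_ = 120, blockSize_ = 256.
    const auto padded = pad_length_1d(1000000, 120, 256, 8); // MPerThread = 8 is an assumption
    assert(padded % (120 * 256 * 8) == 0);
    std::cout << padded << '\n'; // prints 1228800
}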
include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp
new file mode 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {

template <typename GridwisePermuteFunctor,
          typename InGrid1dDesc,
          typename OutGrid1dDesc,
          typename InDataTypePointer,
          typename OutDataTypePointer,
          typename ElementwiseOperation>
__global__ void kernel_permute(const InGrid1dDesc in_grid_1d_desc,
                               const OutGrid1dDesc out_grid_1d_desc,
                               const InDataTypePointer p_in_global,
                               const OutDataTypePointer p_out_global,
                               const ElementwiseOperation elementwise_op)
{
    GridwisePermuteFunctor::Run(
        in_grid_1d_desc, out_grid_1d_desc, p_in_global, p_out_global, elementwise_op);
}
template <typename InGrid1dDesc,
          typename OutGrid1dDesc,
          typename InDataTypePointer,
          typename OutDataTypePointer,
          typename ElementwiseOperation,
          index_t MPerThread,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
struct GridwisePermute
{
    static constexpr index_t NumInput  = 1;
    static constexpr index_t NumOutput = 1;

    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
                      NumOutput == OutScalarPerVectorSeq::Size(),
                  "Tuple size is inconsistent with the number of in/out!");

    static constexpr auto I0 = Number<0>{};

    static constexpr auto thread_buffer_desc_m =
        make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    __device__ static void Run(const InGrid1dDesc in_grid_1d_desc,
                               const OutGrid1dDesc out_grid_1d_desc,
                               const InDataTypePointer p_in_global,
                               const OutDataTypePointer p_out_global,
                               const ElementwiseOperation elementwise_op)
    {
        const index_t thread_global_id = get_thread_global_1d_id();

        using InDataType = remove_cv_t<remove_pointer_t<InDataTypePointer>>;
        auto in_thread_buf = StaticBuffer<AddressSpaceEnum::Vgpr, InDataType, MPerThread, true>{};

        using OutDataType = remove_cv_t<remove_pointer_t<OutDataTypePointer>>;
        auto out_thread_buf =
            StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MPerThread, true>{};

        auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_1d_desc.GetElementSpaceSize());
        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_1d_desc.GetElementSpaceSize());

        const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);

        const index_t blockSize    = get_block_size();
        const index_t blockPerGrid = get_grid_size();
        const auto M               = in_grid_1d_desc.GetLength(I0);
        const index_t loop_step    = blockPerGrid * blockSize * MPerThread;
        const auto loop_step_index = make_multi_index(loop_step);

        auto in_global_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             InDataType,
                                             decltype(in_grid_1d_desc),
                                             decltype(thread_buffer_desc_m),
                                             Sequence<MPerThread>,        // SliceLengths
                                             Sequence<0>,                 // DimAccessOrder
                                             0,                           // SrcVectorDim
                                             InScalarPerVectorSeq::At(0), // ScalarPerVector
                                             1,     // SrcScalarStrideInVector
                                             false>{in_grid_1d_desc, thread_global_offset};

        auto out_global_store =
            ThreadwiseTensorSliceTransfer_v1r3<OutDataType,
                                               OutDataType,
                                               decltype(thread_buffer_desc_m),
                                               decltype(out_grid_1d_desc),
                                               PassThroughOp,
                                               Sequence<MPerThread>, // SliceLengths
                                               Sequence<0>,          // DimAccessOrder
                                               0,                    // SrcVectorDim
                                               OutScalarPerVectorSeq::At(0),
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               false>(
                out_grid_1d_desc, thread_global_offset, PassThroughOp{});

        index_t num_iter = M / (loop_step);
        do
        {
            in_global_load.Run(in_grid_1d_desc,
                               in_global_buf,
                               thread_buffer_desc_m,
                               make_tuple(I0),
                               in_thread_buf);

            in_global_load.MoveSrcSliceWindow(in_grid_1d_desc, loop_step_index);

            static_for<0, MPerThread, 1>{}([&](auto iM) {
                // get reference to in data
                const auto& in_data_ref = in_thread_buf(iM);

                // get reference to dst data
                auto& out_data_ref = out_thread_buf(iM);

                elementwise_op(out_data_ref, in_data_ref);
            });

            out_global_store.Run(thread_buffer_desc_m,
                                 make_tuple(I0),
                                 out_thread_buf,
                                 out_grid_1d_desc,
                                 out_global_buf);

            out_global_store.MoveDstSliceWindow(out_grid_1d_desc, loop_step_index);
        } while(--num_iter);
    }
};

} // namespace ck
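
For readers unfamiliar with the ck infrastructure, the kernel above is essentially a 1-D grid-stride, element-wise copy: each thread loads MPerThread contiguous elements into registers, applies elementwise_op, stores them, and advances its window by gridDim * blockDim * MPerThread until the (padded) length M is exhausted. The standalone sketch below shows the same loop with the descriptor and ThreadwiseTensorSliceTransfer machinery removed. It is an illustration only, not code from this repository: the kernel name permute_like_copy, the local PassThrough functor, and MPerThread = 8 are made up, and it bounds-checks instead of relying on a padded descriptor. It is written against the CUDA runtime; the equivalent HIP calls work the same way.

// Standalone sketch (not part of this commit): the grid-stride,
// MPerThread-elements-per-thread loop that GridwisePermute::Run performs.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

struct PassThrough // analogue of tensor_operation::element_wise::PassThrough
{
    __device__ void operator()(float& y, const float& x) const { y = x; }
};

template <int MPerThread, typename ElementwiseOp>
__global__ void permute_like_copy(const float* p_in, float* p_out, int m, ElementwiseOp op)
{
    const int thread_global_id = blockIdx.x * blockDim.x + threadIdx.x;
    const int loop_step        = gridDim.x * blockDim.x * MPerThread; // elements per iteration

    // Each thread owns an MPerThread-wide slice and strides by loop_step,
    // mirroring MoveSrcSliceWindow / MoveDstSliceWindow in the kernel above.
    for(int base = thread_global_id * MPerThread; base < m; base += loop_step)
    {
        float in_buf[MPerThread];  // analogue of the Vgpr StaticBuffer for loads
        float out_buf[MPerThread]; // analogue of the Vgpr StaticBuffer for stores

        for(int i = 0; i < MPerThread; ++i)
            in_buf[i] = (base + i < m) ? p_in[base + i] : 0.f;

        for(int i = 0; i < MPerThread; ++i)
            op(out_buf[i], in_buf[i]); // elementwise_op(out_data_ref, in_data_ref)

        for(int i = 0; i < MPerThread; ++i)
            if(base + i < m)
                p_out[base + i] = out_buf[i];
    }
}

int main()
{
    const int m = 1 << 20;
    std::vector<float> host_in(m, 1.f), host_out(m, 0.f);

    float *dev_in = nullptr, *dev_out = nullptr;
    cudaMalloc(&dev_in, m * sizeof(float));
    cudaMalloc(&dev_out, m * sizeof(float));
    cudaMemcpy(dev_in, host_in.data(), m * sizeof(float), cudaMemcpyHostToDevice);

    // Same launch shape as the device op in the diff: 120 blocks of 256 threads.
    permute_like_copy<8><<<120, 256>>>(dev_in, dev_out, m, PassThrough{});

    cudaMemcpy(host_out.data(), dev_out, m * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("out[0] = %f\n", host_out[0]);

    cudaFree(dev_in);
    cudaFree(dev_out);
}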