Project: yangql/composable_kernel-1

Commit df0d6810, authored Sep 20, 2021 by Chao Liu

    Merge remote-tracking branch 'origin/develop' into CK_upstream

Parents: 31b40352, 9e80cdce

Showing 20 changed files with 5166 additions and 72 deletions
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp (+613 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp (+491 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp (+532 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp (+376 / -0)
composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp (+271 / -0)
composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp (+141 / -0)
composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp (+371 / -0)
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp (+2 / -4)
composable_kernel/include/utility/amd_buffer_addressing.hpp (+109 / -66)
composable_kernel/include/utility/data_type.hpp (+11 / -0)
composable_kernel/include/utility/dynamic_buffer.hpp (+11 / -2)
composable_kernel/include/utility/reduction_common.hpp (+104 / -0)
composable_kernel/include/utility/reduction_functions_binop.hpp (+100 / -0)
composable_kernel/include/utility/reduction_operator.hpp (+420 / -0)
composable_kernel/include/utility/type.hpp (+3 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp (+317 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp (+318 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp (+323 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp (+323 / -0)
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp (+330 / -0)
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp (new file, 0 → 100644) — diff collapsed, contents not shown
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp (new file, 0 → 100644) — diff collapsed, contents not shown
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp (new file, 0 → 100644) — diff collapsed, contents not shown
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {

template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType, // not used together with the beta input
          typename compType,
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_multiblock
{
    using opReduce       = typename reduce_binary_operator<compType, op>::opType;
    using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
    using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;

    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));

    using blockwise_reduce =
        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;

    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();

    static constexpr auto I0 = Number<0>{};

    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               int BlkGroupSize,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               srcDataType* const __restrict__ ws_values_global,
                               int* const __restrict__ ws_indices_global);

    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  int BlkGroupSize,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  srcDataType* const __restrict__ ws_values_global,
                                  int* const __restrict__ ws_indices_global)
    {
        (void)ws_indices_global;
        (void)alpha; // unused
        (void)beta;  // unused

        auto zeroVal = opReduce::GetZeroVal();

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));

        auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto in_block_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / BlkGroupSize;
        const index_t block_local_id  = block_global_id % BlkGroupSize;

        const index_t reduceSizePerBlock =
            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
             BlockBufferSize) *
            BlockBufferSize;

        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(
                src2dDesc,
                make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
                in_block_desc,
                make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_buf);

            __syncthreads();

            // do element-wise pre-reduction operation
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        const auto workspace_desc =
            make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            auto threadwise_workspace_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   srcDataType,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            threadwise_workspace_store.Run(ReducedDataDesc,
                                           make_tuple(I0),
                                           accuValue_buf,
                                           workspace_desc,
                                           workspace_global_buf);
        }
    };

    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  int BlkGroupSize,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  srcDataType* const __restrict__ ws_values_global,
                                  int* const __restrict__ ws_indices_global)
    {
        (void)alpha; // unused
        (void)beta;  // unused

        auto zeroVal = opReduce::GetZeroVal();

        // LDS
        __shared__ compType p_in_block_values_buffer[BlockBufferSize];
        __shared__ int p_in_block_indices_buffer[BlockBufferSize];

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));

        auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto in_block_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
            p_in_block_values_buffer, BlockBufferSize);
        auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
            p_in_block_indices_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / BlkGroupSize;
        const index_t block_local_id  = block_global_id % BlkGroupSize;

        const index_t reduceSizePerBlock =
            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
             BlockBufferSize) *
            BlockBufferSize;

        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_load =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
                                            InMemoryDataOperationEnum_t::Set,
                                            Sequence<1, BlockBufferSize>,
                                            ThreadSliceLengths,
                                            ThreadClusterLengths,
                                            Sequence<0, 1>,
                                            srcDataType,
                                            compType,
                                            src2dDescType,
                                            decltype(in_block_desc),
                                            Sequence<0, 1>,
                                            Sequence<0, 1>,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            1,
                                            false,
                                            true>(
                src2dDesc,
                make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
                in_block_desc,
                make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

        int indexOffset = block_local_id * reduceSizePerBlock;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);

            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);

            __syncthreads();

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce2(in_block_val_buf,
                                      in_block_idx_buf,
                                      BlocksInOneOp,
                                      accuValue_buf(I0),
                                      accuIndex_buf(I0));

            indexOffset += BlockBufferSize;

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        const auto workspace_desc =
            make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            auto threadwise_workspace_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   srcDataType,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            auto threadwise_workspace_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            threadwise_workspace_val_store.Run(ReducedDataDesc,
                                               make_tuple(I0),
                                               accuValue_buf,
                                               workspace_desc,
                                               workspace_global_val_buf);

            threadwise_workspace_idx_store.Run(ReducedDataDesc,
                                               make_tuple(I0),
                                               accuIndex_buf,
                                               workspace_desc,
                                               workspace_global_idx_buf);
        }
    };
};
} // namespace ck
#endif
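A note on the index math above: reduceSizePerBlock first splits the reduced dimension across the BlkGroupSize blocks that cooperate on one output element, then rounds each block's share up to a whole number of block buffers. A small host-side sketch with made-up sizes (the 4100 / 8 / 1024 / 256 values are illustrative only, not taken from the kernel) shows the rounding; it is not part of composable_kernel:

#include <cstdio>

// Illustrative stand-in for the device-side index math in Run<1>/Run<2>.
int main()
{
    const int toReduceLength  = 4100; // length of the reduced dimension (example value)
    const int BlkGroupSize    = 8;    // blocks cooperating on one output row (example value)
    const int BlockBufferSize = 1024; // GredAccessesPerThreadInBlock * BlockSize (example value)
    const int BlockSize       = 256;  // threads per block (example value)

    // Same formula as in the kernel: split the row across BlkGroupSize blocks,
    // then round each share up to a whole number of block buffers.
    const int reduceSizePerBlock =
        (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
         BlockBufferSize) *
        BlockBufferSize;

    const int toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

    // 4100 / 8 -> 513 (rounded up), then rounded up to 1024, so reduceSizePerBlock = 1024
    // and toReduceBlocks = 4.
    std::printf("reduceSizePerBlock = %d, toReduceBlocks = %d\n", reduceSizePerBlock, toReduceBlocks);
    return 0;
}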
composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {

template <typename buffer2dDescType,
          bool blockIsOneRow,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct BlockwiseReduction_2d_block_buffer
{
    using compType = typename opReduce::dataType;

    static constexpr auto buffer2dDesc = buffer2dDescType{};

    static constexpr index_t BlockSize =
        blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
    static constexpr index_t NumBlocks =
        blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface does not accumulate on indices
    template <typename BufferType>
    __device__ static void
    Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
        compType lAccuData            = opReduce::GetZeroVal();

        index_t offset;

        for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
        {
            offset = blockIsOneRow
                         ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
                         : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));

            compType opData = type_convert<compType>{}(block_buffer[offset]);

            binop::calculate(lAccuData, opData);
        }

        offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
                               : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));

        block_buffer(offset) = lAccuData;

        __syncthreads();

        for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
        {
            if(thread_local_id < indOffset)
            {
                index_t offset1 =
                    blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
                                  : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
                index_t offset2 =
                    blockIsOneRow
                        ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset))
                        : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));

                compType opData1 = type_convert<compType>{}(block_buffer[offset1]);
                compType opData2 = type_convert<compType>{}(block_buffer[offset2]);

                binop::calculate(opData1, opData2);

                block_buffer(offset1) = type_convert<compType>{}(opData1);
            }
            __syncthreads();
        }

        if(thread_local_id == 0)
        {
            compType tmpVal = type_convert<compType>{}(block_buffer[0]);

            binop::calculate(accuData, tmpVal);
        }
    };

    // This interface accumulates on both data values and indices
    template <typename BufferType, typename IdxBufferType>
    __device__ static void Reduce2(BufferType& block_buffer,
                                   IdxBufferType& block_indices_buffer,
                                   index_t toReduceBlocks,
                                   compType& accuData,
                                   int& accuIndex)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
        compType lAccuData            = opReduce::GetZeroVal();
        int lAccuIndex                = 0;

        if constexpr(blockIsOneRow)
        {
            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
            {
                for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
                {
                    if(thread_local_id % (indOffset * 2) == 0)
                    {
                        index_t offset1 =
                            buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id));
                        index_t offset2 = buffer2dDesc.CalculateOffset(
                            make_tuple(otherDimInd, thread_local_id + indOffset));

                        compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
                        compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
                        int currIndex1    = block_indices_buffer[offset1];
                        int currIndex2    = block_indices_buffer[offset2];

                        binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

                        block_buffer(offset1)         = type_convert<compType>{}(currVal1);
                        block_indices_buffer(offset1) = currIndex1;
                    }
                    __syncthreads();
                }
            }

            if(thread_local_id == 0)
            {
                for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
                {
                    index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));

                    compType tmpVal = type_convert<compType>{}(block_buffer[offset]);
                    int tmpIndex    = block_indices_buffer[offset];

                    binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
                }

                binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
            }
        }
        else
        {
            index_t offset;

            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
            {
                offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));

                compType currVal = type_convert<compType>{}(block_buffer[offset]);
                int currIndex    = block_indices_buffer[offset];

                binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
            }

            offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));

            block_buffer(offset)         = lAccuData;
            block_indices_buffer(offset) = lAccuIndex;

            __syncthreads();

            for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
            {
                if(thread_local_id % (indOffset * 2) == 0)
                {
                    index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
                    index_t offset2 =
                        buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));

                    compType currVal1 = type_convert<compType>{}(block_buffer[offset1]);
                    compType currVal2 = type_convert<compType>{}(block_buffer[offset2]);
                    int currIndex1    = block_indices_buffer[offset1];
                    int currIndex2    = block_indices_buffer[offset2];

                    binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

                    block_buffer(offset1)         = type_convert<compType>{}(currVal1);
                    block_indices_buffer(offset1) = currIndex1;
                }
                __syncthreads();
            }

            if(thread_local_id == 0)
            {
                compType tmpVal = type_convert<compType>{}(block_buffer[0]);
                int tmpIndex    = block_indices_buffer[0];

                binop::calculate(accuData, tmpVal, accuIndex, tmpIndex);
            }
        }
    };

    template <typename BufferType>
    __device__ static void set_buffer_value(BufferType& block_buffer, compType value)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset = blockIsOneRow
                                 ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                                 : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_buffer(offset) = value;

            __syncthreads();
        }
    };

    // Initialize the block-wise indices buffer, the index for each element in the block-wise data
    // buffer is calculated according to its position in the buffer and the global starting index
    template <typename IdxBufferType>
    __device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset = blockIsOneRow
                                 ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                                 : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_indices_buffer(offset) = offset + indexStart;

            __syncthreads();
        }
    };

    // Execute unary operation on the block buffer elements
    template <typename unary_op_type, typename BufferType>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset = blockIsOneRow
                                 ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                                 : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_buffer(offset) = unary_op(block_buffer[offset]);

            __syncthreads();
        }
    };
};
}; // end of namespace ck
#endif
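Reduce() above follows the standard stride-halving pattern: each thread first folds its strided elements into a private accumulator, the partials go to LDS, and then the number of active threads is halved each round until thread 0 holds the block-wide result. A minimal stand-alone HIP sketch of that pattern (plain float sum, hypothetical kernel name block_sum, assuming a 256-thread launch) is shown below as an illustration of the pattern only, not as part of composable_kernel:

#include <hip/hip_runtime.h>

// Minimal stride-halving block reduction (sum), mirroring the shape of
// BlockwiseReduction_2d_block_buffer::Reduce. Launch with 256 threads per block.
__global__ void block_sum(const float* in, float* out, int n)
{
    __shared__ float lds[256]; // one partial result per thread

    const int tid = threadIdx.x;

    // Step 1: every thread accumulates a strided slice of the input.
    float acc = 0.0f;
    for(int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x)
        acc += in[i];

    lds[tid] = acc;
    __syncthreads();

    // Step 2: halve the number of active threads each round,
    // combining lds[tid] with lds[tid + stride].
    for(int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
        if(tid < stride)
            lds[tid] += lds[tid + stride];
        __syncthreads();
    }

    // Step 3: thread 0 owns the block-wide result.
    if(tid == 0)
        out[blockIdx.x] = lds[0];
}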
composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {

template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface does not accumulate on indices
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(accuData, thread_buffer[I]); });
    };

    // This interface accumulates on both data values and indices and
    // is called by Direct_ThreadWise reduction method at first-time reduction
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = I + indexStart;
            binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex);
        });
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
    };
};

template <typename BufferType,
          typename IdxBufferType,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Thread-wise reduction needs use StaticBuffer for indices!");

    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");

    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface accumulates on both data values and indices and
    // is called by Direct_ThreadWise reduction method at second-time reduction
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]);
        });
    };

    // Set the elements in the per-thread buffer to a specific value
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
    };
};
}; // end of namespace ck
#endif
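ThreadReduce relies on ck::static_for to unroll the per-thread accumulation at compile time. For readers unfamiliar with that utility, a rough standard-C++ analogue of what Reduce() does over a fixed-size buffer can be written with std::index_sequence and a fold expression; this is only an illustration of the unrolling idea under that assumption, not the CK implementation:

#include <array>
#include <cstdio>
#include <utility>

// Compile-time-unrolled accumulation over a fixed-size per-thread buffer,
// roughly what ThreadReduce::Reduce does with ck::static_for and binop::calculate.
template <typename T, std::size_t N, std::size_t... Is>
T reduce_unrolled(const std::array<T, N>& buf, T acc, std::index_sequence<Is...>)
{
    // The comma fold plays the role of the unrolled binop::calculate chain (here: max).
    ((acc = acc > buf[Is] ? acc : buf[Is]), ...);
    return acc;
}

int main()
{
    std::array<float, 4> thread_buffer{1.0f, 7.0f, 3.0f, 5.0f};

    float acc = thread_buffer[0];
    acc       = reduce_unrolled(thread_buffer, acc, std::make_index_sequence<4>{});

    std::printf("max = %f\n", acc); // prints 7.0
    return 0;
}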
composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {

template <typename BufferType, index_t BlockSize, typename opReduce, NanPropagation_t nanPropaOpt>
struct WarpReduce
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");

    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
                  "the compType!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    static constexpr bool have_builtin_shuffle =
        std::is_same<compType, float>::value || std::is_same<compType, double>::value;

    // This interface does not accumulate on indices
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        if constexpr(have_builtin_shuffle)
            ReduceImpl1(thread_buffer, accuData);
        else
            ReduceImpl2(thread_buffer, accuData);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
    {
        compType lAccuData = opReduce::GetZeroVal();

        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });

        // synchronize among all threads in this warp
        __all(1);

        for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);

            binop::calculate(lAccuData, tmpVal);
            __all(1);
        }

        binop::calculate(accuData, lAccuData);
    };

    // This interface implementation does not use HIP built-in device shuffling functions
    // since for fp16, built-in shuffling functions is not provided by HIP
    __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
    {
        compType lAccuData = opReduce::GetZeroVal();

        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });

        __syncthreads();

        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        __shared__ compType shuffle_buffer[BlockSize];

        compType* myBuffer = &shuffle_buffer[warpId * warpSize];

        myBuffer[thread_inwarp_id] = lAccuData;

        __syncthreads();

        for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
        {
            if(thread_inwarp_id < stride)
            {
                compType currVal1 = myBuffer[thread_inwarp_id];
                compType currVal2 = myBuffer[thread_inwarp_id + stride];

                binop::calculate(currVal1, currVal2);

                myBuffer[thread_inwarp_id] = currVal1;
            }
            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myBuffer[0]);
    };

    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
    // reduction method at first-time reduction
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        if constexpr(have_builtin_shuffle)
            Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
        else
            Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void Reduce2Impl1(const BufferType& thread_buffer,
                                        compType& accuData,
                                        int& accuIndex,
                                        int indexStart)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
        });

        // synchronize among all threads in this warp
        __all(1);

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);

            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
            __all(1);
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
    };

    // This interface implementation does not use HIP built-in device shuffling functions since for
    // fp16, built-in shuffling functions is not provided by HIP
    __device__ static void Reduce2Impl2(const BufferType& thread_buffer,
                                        compType& accuData,
                                        int& accuIndex,
                                        int indexStart)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
        });

        __shared__ compType shuffle_data_buffer[BlockSize];
        __shared__ int shuffle_indices_buffer[BlockSize];

        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

        myDataBuffer[thread_inwarp_id]    = lAccuData;
        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

        __syncthreads();

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType currVal1 = myDataBuffer[thread_inwarp_id];
            compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
            int currIndex1    = myIndicesBuffer[thread_inwarp_id];
            int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

            binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

            myDataBuffer[thread_inwarp_id]    = currVal1;
            myIndicesBuffer[thread_inwarp_id] = currIndex1;

            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
    };

    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

        __all(1);
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

        __all(1);
    };
};

template <typename BufferType,
          typename IdxBufferType,
          index_t BlockSize,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct WarpReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!");

    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
                  "the compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!");

    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    static constexpr bool have_builtin_shuffle =
        std::is_same<compType, float>::value || std::is_same<compType, double>::value;

    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
    // reduction method at second-time reduction
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        if constexpr(have_builtin_shuffle)
            ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
        else
            ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
    };

    // This interface implementation uses HIP built-in device shuffling functions
    __device__ static void ReduceImpl1(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData = opReduce::GetZeroVal();
        int lAccuIndex     = 0;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        // synchronize among all threads in this warp
        __all(1);

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);

            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
            __all(1);
        }

        binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
    };

    // This interface implementation does not use HIP built-in device shuffling functions
    // since for fp16, built-in shuffling functions is not provided by HIP
    __device__ static void ReduceImpl2(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData       = opReduce::GetZeroVal();
        int lAccuIndex           = 0;
        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        __shared__ compType shuffle_data_buffer[BlockSize];
        __shared__ int shuffle_indices_buffer[BlockSize];

        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

        myDataBuffer[thread_inwarp_id]    = lAccuData;
        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

        __syncthreads();

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType currVal1 = myDataBuffer[thread_inwarp_id];
            compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
            int currIndex1    = myIndicesBuffer[thread_inwarp_id];
            int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

            binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

            myDataBuffer[thread_inwarp_id]    = currVal1;
            myIndicesBuffer[thread_inwarp_id] = currIndex1;

            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
    };

    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

        __all(1);
    };

    // Execute unary operation on the per-thread buffer elements
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

        __all(1);
    };
};
}; // end of namespace ck
#endif
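ReduceImpl1 above is the classic shuffle-based warp reduction: in every round each lane pulls the partial result held by the lane `stride` positions higher via __shfl_down and folds it in, so after log2(warpSize) rounds lane 0 of the warp holds the warp-wide value. A self-contained HIP sketch of the same pattern for a float sum (hypothetical function name warp_sum) follows; fp16 takes the LDS path in ReduceImpl2 because, as the comment in the file notes, HIP does not provide built-in shuffles for it:

#include <hip/hip_runtime.h>

// Shuffle-based warp reduction (sum), mirroring the loop in WarpReduce::ReduceImpl1.
// After the loop, lane 0 of the warp holds the warp-wide sum.
__device__ float warp_sum(float v)
{
    for(int stride = warpSize / 2; stride > 0; stride /= 2)
    {
        // Read the partial sum held by the lane `stride` positions above this one.
        v += __shfl_down(v, stride, warpSize);
    }
    return v; // valid in lane 0
}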
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp (modified)

@@ -709,9 +709,6 @@ struct ThreadwiseTensorSliceTransfer_v3
         : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
           dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
     {
-        // TODO: fix this
-        static_assert(is_same<SrcData, DstData>::value,
-                      "wrong! current implementation assume SrcData and DstData are same type");
     }
 
     __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -981,7 +978,8 @@ struct ThreadwiseTensorSliceTransfer_v3
             constexpr index_t buffer_offset =
                 buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);
 
-            dst_tmp_vector.template AsType<DstData>()(i) = buffer_[Number<buffer_offset>{}];
+            dst_tmp_vector.template AsType<DstData>()(i) =
+                type_convert<DstData>{}(buffer_[Number<buffer_offset>{}]);
         });
 
         using dst_vector_t = typename decltype(dst_tmp_vector)::type;
composable_kernel/include/utility/amd_buffer_addressing.hpp (modified)

@@ -209,13 +209,49 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
                                                                   index_t src_wave_addr_offset)
 {
-    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
-                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
-                  "wrong! not implemented");
+    static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+                  "wrong! not implemented");
 
-    if constexpr(is_same<T, float>::value)
+    if constexpr(is_same<T, double>::value)
+    {
+        // use fp32 load to mimic fp64 load
+        if constexpr(N == 1)
+        {
+            const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            return as_type<double>(tmp);
+        }
+        else if constexpr(N == 2)
+        {
+            const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            return as_type<double2_t>(tmp);
+        }
+        else if constexpr(N == 4)
+        {
+            const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            const float4_t f32_1 =
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset + 4 * sizeof(float),
+                                                   0);
+
+            vector_type<double, 4> tmp;
+
+            tmp.AsType<double2_t>()(Number<0>{}) = as_type<double2_t>(f32_0);
+            tmp.AsType<double2_t>()(Number<1>{}) = as_type<double2_t>(f32_1);
+
+            return tmp.AsType<double4_t>()(Number<0>{});
+        }
+    }
+    else if constexpr(is_same<T, float>::value)
     {
         if constexpr(N == 1)
         {
@@ -267,25 +303,11 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
         }
         else if constexpr(N == 8)
         {
-#if 0
-            // use fp32 load to mimic fp16 load
-            vector_type<half_t, 8> tmp;
-
-            tmp.AsType<half4_t>()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp16x4(
-                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
-
-            tmp.AsType<half4_t>()(Number<1>{}) =
-                llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
-                                                   src_thread_addr_offset,
-                                                   src_wave_addr_offset + 4 * sizeof(half_t),
-                                                   0);
-
-            return tmp.AsType<half8_t>()(Number<0>{});
-#else
             float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
                 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
 
             return as_type<half8_t>(tmp);
-#endif
         }
     }
     else if constexpr(is_same<T, int32_t>::value)
@@ -417,13 +439,34 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                        index_t dst_wave_addr_offset)
 {
-    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
-                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)),
-                  "wrong! not implemented");
+    static_assert((is_same<T, double>::value && (N == 1 || N == 2)) ||
+                      (is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+                  "wrong! not implemented");
 
-    if constexpr(is_same<T, float>::value)
+    if constexpr(is_same<T, double>::value)
+    {
+        // use fp32 store to mimic fp64 store
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32x2(as_type<float2_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data),
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+    }
+    else if constexpr(is_same<T, float>::value)
     {
         if constexpr(N == 1)
         {
@@ -450,6 +493,49 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 0);
         }
     }
+    else if constexpr(is_same<T, half_t>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
+                                              dst_wave_buffer_resource,
+                                              dst_thread_addr_offset,
+                                              dst_wave_addr_offset,
+                                              0);
+        }
+        else if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+        else if constexpr(N == 4)
+        {
+            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+        }
+        else if constexpr(N == 8)
+        {
+            vector_type<half_t, 8> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset,
+                                                0);
+
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
+                                                dst_wave_buffer_resource,
+                                                dst_thread_addr_offset,
+                                                dst_wave_addr_offset + 4 * sizeof(half_t),
+                                                0);
+        }
+    }
     else if constexpr(is_same<T, int32_t>::value)
     {
         if constexpr(N == 1)
@@ -536,49 +622,6 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
                                                 0);
         }
     }
-    else if constexpr(is_same<T, half_t>::value)
-    {
-        if constexpr(N == 1)
-        {
-            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
-                                              dst_wave_buffer_resource,
-                                              dst_thread_addr_offset,
-                                              dst_wave_addr_offset,
-                                              0);
-        }
-        else if constexpr(N == 2)
-        {
-            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                0);
-        }
-        else if constexpr(N == 4)
-        {
-            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                0);
-        }
-        else if constexpr(N == 8)
-        {
-            vector_type<half_t, 8> tmp{src_thread_data};
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                0);
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<1>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset + 4 * sizeof(half_t),
-                                                0);
-        }
-    }
 }
 
 // buffer_load requires:
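The new double paths above do not use a dedicated fp64 buffer intrinsic; they move the same bytes through the existing fp32x2/fp32x4 intrinsics and reinterpret them, which is safe because only the bit pattern is transferred. A small host-side sketch of the same reinterpretation idea (using memcpy instead of the device-side as_type helper) illustrates why two floats can carry one double exactly:

#include <cstdio>
#include <cstring>

int main()
{
    // One double occupies the same 8 bytes as two floats; moving the bytes and
    // reinterpreting them preserves the value exactly (no numeric conversion happens).
    double d = 3.141592653589793;

    float as_two_floats[2];
    std::memcpy(as_two_floats, &d, sizeof(d)); // "store" through a 2 x fp32 view

    double back;
    std::memcpy(&back, as_two_floats, sizeof(back)); // "load" it back as fp64

    std::printf("%.15f\n", back); // prints 3.141592653589793
    return 0;
}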
composable_kernel/include/utility/data_type.hpp (modified)

@@ -73,6 +73,13 @@ struct scalar_type<vector_type<T, N>>
 };
 
 //
+template <>
+struct scalar_type<double>
+{
+    using type                           = double;
+    static constexpr index_t vector_size = 1;
+};
+
 template <>
 struct scalar_type<float>
 {
@@ -864,6 +871,10 @@ struct vector_type<T, 256>
     }
 };
 
+// fp64
+using double2_t = typename vector_type<double, 2>::type;
+using double4_t = typename vector_type<double, 4>::type;
+
 // fp32
 using float2_t = typename vector_type<float, 2>::type;
 using float4_t = typename vector_type<float, 4>::type;
composable_kernel/include/utility/dynamic_buffer.hpp (modified)

@@ -38,6 +38,10 @@ struct DynamicBuffer
         return BufferAddressSpace;
     }
 
+    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
+
+    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
+
     template <typename X,
               typename enable_if<
                   is_same<typename scalar_type<remove_cv_t<remove_reference_t<X>>>::type,
@@ -234,9 +238,14 @@ __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize el
     return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
 }
 
-template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
+template <AddressSpaceEnum_t BufferAddressSpace,
+          typename T,
+          typename ElementSpaceSize,
+          typename X,
+          typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type =
+              false>
 __host__ __device__ constexpr auto
-make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, T invalid_element_value)
+make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
 {
     return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false>{
         p, element_space_size, invalid_element_value};
composable_kernel/include/utility/reduction_common.hpp (new file, 0 → 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_COMMON_HPP
#define CK_REDUCTION_COMMON_HPP
// this enumerate should be synchronized with include/miopen/reduce_common.hpp
namespace
ck
{
enum
class
ReductionMethod_t
{
DirectThreadWise
=
1
,
DirectWarpWise
=
2
,
BlockWise
=
3
,
MultiBlock
=
4
};
// end of namespace ck
enum
class
ReduceTensorOp_t
{
ADD
=
0
,
MUL
=
1
,
MIN
=
2
,
MAX
=
3
,
AMAX
=
4
,
AVG
=
5
,
NORM1
=
6
,
NORM2
=
7
,
// MUL_NO_ZEROS = 8,
};
enum
class
NanPropagation_t
{
NOT_PROPAGATE_NAN
=
0
,
PROPAGATE_NAN
=
1
,
};
enum
class
ReduceTensorIndices_t
{
NO_INDICES
=
0
,
FLATTENED_INDICES
=
1
,
};
enum
class
IndicesType_t
{
INDICES_32BIT
=
0
,
INDICES_64BIT
=
1
,
INDICES_16BIT
=
2
,
INDICES_8BIT
=
3
,
};
struct
float_equal_one
{
template
<
class
T
>
__device__
static
inline
bool
apply
(
T
x
)
{
return
x
<=
type_convert
<
T
>
{}(
1.0
f
)
and
x
>=
type_convert
<
T
>
{}(
1.0
f
);
}
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
{
return
(
float_equal_one
::
apply
(
x
));
};
};
struct
float_equal_zero
{
template
<
class
T
>
__device__
static
inline
bool
apply
(
T
x
)
{
return
x
<=
type_convert
<
T
>
{}(
0.0
f
)
and
x
>=
type_convert
<
T
>
{}(
0.0
f
);
}
template
<
class
T
>
__device__
inline
bool
operator
()(
T
x
)
{
return
(
float_equal_zero
::
apply
(
x
));
};
};
};
// end of namespace ck
#endif
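float_equal_one and float_equal_zero test "x <= c and x >= c" around a fixed constant, which is an exact-equality check spelled without operator==, presumably to sidestep direct floating-point equality comparisons; type_convert produces the constant in the element type T. A standalone host-side sketch of the same idiom, reduced to plain float with a hypothetical name:

#include <cassert>

template <class T>
bool float_equal_zero_sketch(T x)
{
    // "x <= 0 and x >= 0" holds exactly when x compares equal to zero;
    // a NaN input fails both comparisons and therefore returns false.
    return x <= static_cast<T>(0.0f) && x >= static_cast<T>(0.0f);
}

int main()
{
    assert(float_equal_zero_sketch(0.0f));
    assert(!float_equal_zero_sketch(1e-7f));
    return 0;
}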
composable_kernel/include/utility/reduction_functions_binop.hpp
0 → 100644
/* MIT License, Copyright (c) 2020 Advanced Micro Devices, Inc. (same license header as above) */
#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
#define CK_REDUCTION_FUNCTIONS_BINOP_HPP

#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"

namespace ck {
namespace detail {

static inline __device__ bool isnan(half_t x) { return __hisnan(x); };

template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
struct binop_with_nan_check;

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
{
    // cppcheck-suppress constParameter
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    // cppcheck-suppress constParameter
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        bool changed = false;

        opReduce{}(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    };
};

template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
{
    __device__ static inline void calculate(compType& accuVal, compType currVal)
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce{}(accuVal, currVal);
    };

    // The method is called when the opReduce is indexable and the user asked for indices
    __device__ static inline void
    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed = false;

            opReduce{}(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        }
    };
};
}; // namespace detail
}; // end of namespace ck
#endif
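binop_with_nan_check dispatches on NanPropagation_t: in the PROPAGATE_NAN specialization a NaN input overwrites the accumulator (and its index) unconditionally, otherwise the plain binary op runs. The following is a host-side sketch of the same dispatch with plain float, std::isnan and a stand-in Max functor; binop_sketch and MaxOp are hypothetical names, not the CK types.

#include <cassert>
#include <cmath>

struct MaxOp
{
    void operator()(float& a, float b) const { if(a < b) a = b; }
};

template <bool PropagateNan, typename Op>
struct binop_sketch
{
    static void calculate(float& accu, float curr)
    {
        if(PropagateNan && std::isnan(curr))
            accu = curr;      // NaN wins and is carried forward in the accumulator
        else
            Op{}(accu, curr); // normal accumulation path
    }
};

int main()
{
    float acc = 1.0f;
    binop_sketch<true, MaxOp>::calculate(acc, std::nanf(""));
    assert(std::isnan(acc));

    acc = 1.0f;
    binop_sketch<false, MaxOp>::calculate(acc, 3.0f);
    assert(acc == 3.0f);
    return 0;
}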
composable_kernel/include/utility/reduction_operator.hpp
0 → 100644
/* MIT License, Copyright (c) 2020 Advanced Micro Devices, Inc. (same license header as above) */
#ifndef CK_REDUCTION_OPERATOR_HPP
#define CK_REDUCTION_OPERATOR_HPP

#include "reduction_common.hpp"

namespace ck {
namespace reduce {

// Every binary operator used in reduction is represented by a templated functor class. Each functor
// class must provide at least three members:
// 1) GetZeroVal() -- the interface to return the "identity element" for the binary operator,
//    "identity element" is the unique element in the algebraic space that doesn't affect the value of
//    other elements when operated with any of them.
// 2) indexable -- boolean value indicating whether indices of the operated elements could be
//    recorded. Usually, Min/Max operator could need to record the indices of elements. For operator
//    like Add/Mul, no need to record the indices.
// 3) operator() -- the first argument of the operator must be both an input & output, and the
//    corresponding variable usually stores the accumulated result of many operator() calls; the
//    second argument is only an input. For indexable binary operator, the second version of
//    operator() has third argument (which is an output) to indicate whether the accumulated value
//    (the first argument) has changed, in which case the recorded accumulated index also need be
//    changed.

template <class T>
struct Add
{
    using dataType = T;

    __device__ static T GetZeroVal() { return type_convert<T>{}(0.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }

    static constexpr bool indexable = false;
};

template <class T>
struct Mul
{
    using dataType = T;

    __device__ static T GetZeroVal() { return type_convert<T>{}(1.0f); };

    __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }

    static constexpr bool indexable = false;
};

template <class T>
struct Max
{
    using dataType = T;

    __device__ static T GetZeroVal() { return std::numeric_limits<T>::min(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a < b)
            a = b;
    }

    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

template <class T>
struct Min
{
    using dataType = T;

    __device__ static T GetZeroVal() { return std::numeric_limits<T>::max(); };

    __device__ inline constexpr void operator()(T& a, T b) const
    {
        if(a > b)
            a = b;
    }

    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
    {
        if(a > b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

template <>
__device__ half_t Max<half_t>::GetZeroVal()
{
    return type_convert<half_t>{}(std::numeric_limits<float>::min());
};

template <>
__device__ half_t Min<half_t>::GetZeroVal()
{
    return type_convert<half_t>{}(std::numeric_limits<float>::max());
};

// Unary operators are usually called element-wisely before the reduction is executed on the elements.
// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2

template <class T, bool hasDividing>
struct unary_identic
{
    __device__ unary_identic(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const { return a * type_convert<T>{}(scaler); };

    float scaler = 1.0f;
};

template <class T>
struct unary_identic<T, false>
{
    __device__ unary_identic(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return a; };
};

template <class T, bool hasDividing>
struct unary_square
{
    __device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const
    {
        a = a * a;
        return a * type_convert<T>{}(scaler);
    };

    float scaler = 1.0f;
};

template <class T>
struct unary_square<T, false>
{
    __device__ unary_square(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return a * a; };
};

template <class T, bool hasDividing>
struct unary_abs
{
    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline constexpr T operator()(T a) const
    {
        a = abs(a);
        return a * type_convert<T>{}(scaler);
    };

    float scaler = 1.0f;
};

template <class T>
struct unary_abs<T, false>
{
    __device__ unary_abs(const int divider = 1) { (void)divider; };

    __device__ inline constexpr T operator()(T a) const { return abs(a); };
};

// We know for sure that 4.0 has __habs(), but 3.0 does not have it.
// Let's assume that __habs() exists since 3.5.
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
inline __device__ __half __habs(__half x)
{
    union
    {
        __half half;
        unsigned short u16;
    } val;

    val.half = x;
    val.u16  = val.u16 & 0x7fff;
    return val.half;
}
#endif

template <bool hasDividing>
struct unary_abs<half_t, hasDividing>
{
    __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };

    __device__ inline half_t operator()(half_t a) const
    {
        a = static_cast<half_t>(__habs(a));
        return a * type_convert<half_t>{}(scaler);
    };

    float scaler = 1.0f;
};

template <>
struct unary_abs<half_t, false>
{
    __device__ unary_abs(const int divider = 1) { (void)divider; };

    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
};

template <class T>
struct unary_sqrt
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; };

    __device__ inline T operator()(T a) const { return sqrtf(a); };
};

template <>
struct unary_sqrt<half_t>
{
    __device__ unary_sqrt(const int divider = 1) { (void)divider; };

    __device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
};

}; // end of namespace reduce

// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The "GetZeroVal()" interface and boolean member "indexable" are also provided in
// reduce_binary_operactor for easier checking by the upper-layer codes in the kernels.

template <typename T, ReduceTensorOp_t op>
struct reduce_binary_operator;

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
    using opType   = reduce::Mul<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Mul<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Mul<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
    using opType   = reduce::Min<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Min<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Min<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Max<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Max<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Max<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    __device__ static T GetZeroVal() { return reduce::Add<T>::GetZeroVal(); };

    static constexpr bool indexable = reduce::Add<T>::indexable;
};

// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are called before and afer the Reduction is executed respectively

template <typename T, ReduceTensorOp_t op, bool isFirsReduce, bool isLastReduce>
struct reduce_unary_operator
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T, bool isFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_identic<T, true>;
};

template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
{
    using preUnaryOp = reduce::unary_abs<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
{
    using preUnaryOp = reduce::unary_abs<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
    using preUnaryOp = reduce::unary_square<T, false>;
    using posUnaryOp = reduce::unary_identic<T, false>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
    using preUnaryOp = reduce::unary_square<T, false>;
    using posUnaryOp = reduce::unary_sqrt<T>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
    using preUnaryOp = reduce::unary_identic<T, false>;
    using posUnaryOp = reduce::unary_sqrt<T>;
};

} // end of namespace ck
#endif
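The header comment above spells out the three-member contract every binary reduction functor must follow: GetZeroVal() for the identity element, operator() that accumulates into its first argument (with an optional "changed" flag), and a static "indexable" flag. A host-side sketch of that contract, with a simplified Max-style functor and a small driver loop mirroring how the indexed calculate() path uses the flag; MaxSketch is a hypothetical name and std::numeric_limits<float>::lowest() is used here as the identity.

#include <cassert>
#include <limits>

struct MaxSketch
{
    // identity element: never beats a real value
    static float GetZeroVal() { return std::numeric_limits<float>::lowest(); }

    void operator()(float& a, float b) const
    {
        if(a < b)
            a = b;
    }

    void operator()(float& a, float b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }

    static constexpr bool indexable = true;
};

int main()
{
    const float data[4] = {1.0f, 7.0f, 3.0f, 5.0f};

    float accuVal = MaxSketch::GetZeroVal();
    int accuIndex = 0;

    for(int i = 0; i < 4; ++i)
    {
        bool changed = false;
        MaxSketch{}(accuVal, data[i], changed);
        if(changed)
            accuIndex = i; // same bookkeeping as the indexed calculate() above
    }

    assert(accuVal == 7.0f && accuIndex == 1);
    return 0;
}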
composable_kernel/include/utility/type.hpp
@@ -22,6 +22,9 @@ using remove_reference_t = typename std::remove_reference<T>::type;
template <typename T>
using remove_cv_t = typename std::remove_cv<T>::type;

template <typename T>
using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;

template <typename T>
inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
...
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
0 → 100644
/* MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc. (same license header as above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimension are to be reduced
static_assert(dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};

    // don't have to use accurate strides to get an expected referrence type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;

template <ReductionMethod_t impl, bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
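This wrapper is driven in two steps: gridwise_generic_reduce_1_prepare writes the src2d/dst1d tensor descriptors into the workspace (byte offsets 0 and 2048 above), and gridwise_generic_reduce_1 then reads them back and runs the blockwise reduction. Below is a hypothetical host-side launch sketch, not part of the commit: grid/block sizes, the example lengths/strides and the workspace size are made-up placeholders, and the wrapper itself is still configured at compile time through the CK_PARAM_* macros; only the kernel names and argument lists come from the code above.

#include <hip/hip_runtime.h>

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int, int,
    int, int, int, int, int, int,
    int, int, int, int, int, int,
    int, int, int, int, int, int,
    int, int, int, int, int, int,
    void* __restrict__);

extern "C" __global__ void gridwise_generic_reduce_1(
    int, int, float, const void* __restrict__, float, void* __restrict__,
    void* __restrict__, long, void* __restrict__);

// Assumed: this translation unit is compiled with hipcc and linked against the wrapper above.
void run_blockwise_reduce(const void* src, void* dst, void* ws, int gridSize, int blockSize,
                          int origReduceLen, hipStream_t stream)
{
    // Descriptors are stored at ws + 0 and ws + 2048, so ws must be at least 4096 bytes here.
    hipLaunchKernelGGL(gridwise_generic_reduce_1_prepare, dim3(1), dim3(blockSize), 0, stream,
                       gridSize, 1,
                       /* inLength0..5  */ 8, 8, 8, 8, 1, 1,
                       /* inStride0..5  */ 512, 64, 8, 1, 1, 1,
                       /* outLength0..5 */ 1, 1, 1, 1, 1, 1,
                       /* outStride0..5 */ 1, 1, 1, 1, 1, 1,
                       ws);

    // alpha = 1, beta = 0, no second workspace buffer, no indices output.
    hipLaunchKernelGGL(gridwise_generic_reduce_1, dim3(gridSize), dim3(blockSize), 0, stream,
                       origReduceLen, 1, 1.0f, src, 0.0f, dst, ws, 0L, nullptr);
}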
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp
0 → 100644
/* MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc. (same license header as above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimension are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths),
                   make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected referrence type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp
0 → 100644
/* MIT License, Copyright (c) 2021 Advanced Micro Devices, Inc. (same license header as above) */
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"

using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<Sequence<>, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimension are to be reduced
static_assert(dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable

// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};

extern "C" __global__ void gridwise_generic_reduce_1_prepare(
    int GridSize, int BlkGroupSize,
    int inLength0, int inLength1, int inLength2, int inLength3, int inLength4, int inLength5,
    int inStride0, int inStride1, int inStride2, int inStride3, int inStride4, int inStride5,
    int outLength0, int outLength1, int outLength2, int outLength3, int outLength4, int outLength5,
    int outStride0, int outStride1, int outStride2, int outStride3, int outStride4, int outStride5,
    void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};

template <index_t srcDims, index_t dstDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 1>::type{};

    // don't have to use accurate strides to get an expected referrence type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
    void* ws_buf1_global    = static_cast<char*>(ws_global) + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
};
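Unlike the blockwise wrapper, the MultiBlock first call does not write p_dst_global at all: it accumulates partial results into a workspace buffer at ws_global + 4096 (ws_buf1) and, when indices are requested, into a second buffer placed ws_buf2_bytes_offset bytes after ws_buf1. A hypothetical host-side helper (not in the commit) that only captures this pointer arithmetic:

#include <cstddef>

struct MultiBlockWorkspaceView
{
    void* p_src2dDesc; // descriptor blob written by gridwise_generic_reduce_1_prepare
    void* p_dst1dDesc; // second descriptor blob, 2048 bytes after the first
    void* ws_buf1;     // blockwise partial results consumed by the second reduction call
    void* ws_buf2;     // flattened indices, only when the op is indexable and indices are requested
};

inline MultiBlockWorkspaceView view_workspace(void* ws_global, long ws_buf2_bytes_offset)
{
    char* base = static_cast<char*>(ws_global);

    MultiBlockWorkspaceView v{};
    v.p_src2dDesc = base;
    v.p_dst1dDesc = base + 2048;
    v.ws_buf1     = base + 4096;
    v.ws_buf2     = ws_buf2_bytes_offset > 0
                        ? static_cast<void*>(base + 4096 + ws_buf2_bytes_offset)
                        : nullptr; // offset <= 0 means "no index buffer"
    return v;
}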
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp
0 → 100644
View file @
df0d6810
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;

using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

using toReduceDims  = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

////////////////////////////////////////////////////////////////////////////////////////

using specDims = typename sequence_merge<invariantDims, toReduceDims>::type;

static_assert(is_valid_sequence_map<specDims>::value && specDims::Size() == srcDims,
              "Wrong invariant and/or toReduce dimensions!");

// The number of invariant dimensions can be zero if all dimensions are to be reduced
static_assert(invariantDims::Size() > 0 || dstDims == 1,
              "If all source dimensions are reduced, the dest should have only one dimension !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
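// Editorial illustration (not part of the original file): for a hypothetical 3-D tensor with
// lengths = {4, 8, 16}, make_tuple_from_array(lengths, Number<3>{}) expands to
// make_tuple(4, 8, 16), while make_tuple_from_array_and_index_seq(lengths, Sequence<0, 2>{})
// picks out make_tuple(4, 16), and make_tuple_from_seq(Sequence<2, 5>{}) yields make_tuple(2, 5).
// The prepare kernel below uses these helpers to turn the runtime length/stride arguments
// into tuples consumable by make_naive_tensor_descriptor.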
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outLength0,
                                                             int outLength1,
                                                             int outLength2,
                                                             int outLength3,
                                                             int outLength4,
                                                             int outLength5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstLengths[6] = {outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
    const int dstStrides[6] = {outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths = make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    auto src2dDesc =
        transform_tensor_descriptor(srcDesc,
                                    make_tuple(make_merge_transform(invariantDimLengths),
                                               make_merge_transform(toReduceDimLengths)),
                                    make_tuple(invariantDims{}, toReduceDims{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}));

    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    if constexpr(src2d_need_padding)
    {
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(hipThreadIdx_x == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(hipThreadIdx_x == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
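// Editorial worked example (illustrative values only, not part of the original file): with
// BlockSize = 256, GredAccessesPerThreadInBlock = 2, BlkGroupSize = 4 and toReduceLen = 3000,
// copySliceLen = 512 and reduceSizePerBlock = ceil(ceil(3000 / 4) / 512) * 512 = 1024, so the
// prepare kernel above pads the reduce dimension by srcPad = 1024 * 4 - 3000 = 1096 elements,
// giving every block group a full multiple of copySliceLen to work on.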
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(
        transform_tensor_descriptor(ref_src2dDesc,
                                    make_tuple(make_pass_through_transform(ref_invariantLen),
                                               make_pad_transform(ref_toReduceLen, 0, 2)),
                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                                    make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dst1dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};

using refType_src2dDesc = typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
    refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
    refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc_padded;

template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
    else
        return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};

template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(need_padding)
        return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
    else
        return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};

extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     void* __restrict__ ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    const void* p_src2dDesc = ws_global;
    const void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
    void* ws_buf1_global    = static_cast<char*>(ws_global) + 4096;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
};
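The prepare/reduce kernel pair above relies on the fact that the descriptor's C++ type is fully determined by the compile-time dimension and transform parameters, so the object written into ws_global by gridwise_generic_reduce_1_prepare can be recovered in gridwise_generic_reduce_1 simply by casting the same bytes back to that type. Below is a minimal standalone sketch of this store-then-recover-by-type pattern; it uses a hypothetical trivially copyable PodDesc stand-in rather than the real descriptor, and memcpy in place of the in-kernel pointer cast.

#include <cstring>
#include <iostream>

struct PodDesc // hypothetical stand-in for the compile-time tensor descriptor
{
    int invariantLen;
    int toReduceLen;
};

int main()
{
    alignas(PodDesc) unsigned char workspace[2048] = {}; // stand-in for ws_global

    // "prepare" step: build the descriptor and store it at a fixed byte offset (0 here)
    const PodDesc built{64, 3000};
    std::memcpy(workspace, &built, sizeof(built));

    // "reduce" step: recover the object purely by knowing its type and byte offset
    PodDesc recovered{};
    std::memcpy(&recovered, workspace, sizeof(recovered));

    std::cout << recovered.invariantLen << " x " << recovered.toReduceLen << '\n';
    return 0;
}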
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp
0 → 100644