gaoqiong / composable_kernel · Commits · 9dce6851

Commit 9dce6851, authored Mar 10, 2022 by Jing Zhang
merge develop
Parents: 3cc57101, 5d37d7bf

Changes: 473 changed files in this commit; showing 20 changed files with 3424 additions and 0 deletions (+3424, -0).
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp        +419  -0
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp                       +355  -0
include/ck/tensor_operation/gpu/device/gemm_specialization.hpp                            +0    -0
include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp                     +169  -0
include/ck/tensor_operation/gpu/device/tensor_layout.hpp                                  +3    -0
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp                        +336  -0
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp                  +925  -0
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp      +268  -0
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp  +514  -0
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp                 +435  -0
include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_xdlops_v2r3.hpp                +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp                  +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp                         +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp                         +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp                           +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp                           +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp                        +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp                        +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp                        +0    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp                      +0    -0
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp  (new file, mode 100644)
#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP

#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          int Rank,
          typename ReduceDims,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan,
          bool NeedIndices,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceReduceMultiBlockPartialReduce
    : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");
    static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");

    using IndexDataType = int32_t;

    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());

    static constexpr index_t srcDims = Rank;
    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);

    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths) override
    {
        size_t invariant_total_length;
        size_t reduce_total_length;

        std::tie(invariant_total_length, reduce_total_length) =
            get_2d_lengths<Rank, ReduceDims>(inLengths);

        int iterations = 1;
        while(true)
        {
            int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                   (K_BlockTileSize * iterations);

            // we want blkGroupSize to be no more than 128
            if(testBlkGroupSize <= 128)
                break;

            iterations++;
        };

        int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                           (K_BlockTileSize * iterations);

        size_t workspace_size = invariant_total_length * blkGroupSize;

        size_t wsSizeInBytes =
            !NeedIndices ? workspace_size * sizeof(AccDataType)
                         : workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int);

        return (wsSizeInBytes);
    };

    bool HasFurtherCall() override { return (true); };

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                    const std::vector<int>& inStrides,
                                    int blkGroupSize,
                                    int kBlockTileIterations)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDims)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                const auto toReduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(toReduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});

        const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
        const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(outerLen, inPad_M),
                       make_right_pad_transform(innerLen, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize)
    {
        auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize));

        const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;

        auto ws_desc_m_k_padded = transform_tensor_descriptor(
            ws_desc_m_k,
            make_tuple(make_right_pad_transform(outerLen, wsPad),
                       make_pass_through_transform(blkGroupSize)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (ws_desc_m_k_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<index_t>& inLengths,
                 const std::vector<index_t>& inStrides,
                 const std::vector<index_t>& outLengths,
                 const std::vector<index_t>& outStrides,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_indices_dev,
                 AccDataType* workspace_dev,
                 const InElementwiseOperation& in_elementwise_op,
                 const AccElementwiseOperation& acc_elementwise_op)
            : in_dev_{in_dev},
              out_dev_{out_dev},
              out_indices_dev_{out_indices_dev},
              workspace_dev_{workspace_dev}
        {
            inLengths_  = inLengths;
            inStrides_  = inStrides;
            outLengths_ = outLengths;
            outStrides_ = outStrides;

            in_elementwise_op_  = in_elementwise_op;
            acc_elementwise_op_ = acc_elementwise_op;

            alpha_ = static_cast<AccDataType>(alpha);
            beta_  = static_cast<OutDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, ReduceDims>(inLengths);

            if constexpr(InvariantDims::Size() == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];

            reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];

            int iterations = 1;
            while(true)
            {
                int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                       (K_BlockTileSize * iterations);

                // we want blkGroupSize to be no more than 128
                if(testBlkGroupSize <= 128)
                    break;

                iterations++;
            };

            blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                           (K_BlockTileSize * iterations);

            kBlockTileIterations = iterations;

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize * blkGroupSize;

            size_t ws_buf2_bytes_offset = math::integer_least_multiple(
                invariant_total_length * blkGroupSize * sizeof(AccDataType), 64);

            if constexpr(NeedIndices)
                workspace_indices_dev_ = reinterpret_cast<int*>(
                    reinterpret_cast<char*>(workspace_dev_) + ws_buf2_bytes_offset);
            else
                workspace_indices_dev_ = nullptr;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        OutDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;
        AccDataType* workspace_dev_;
        IndexDataType* workspace_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        index_t blkGroupSize;
        index_t kBlockTileIterations;
        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, int nrepeat = 1)
        {
            const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
            const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor(
                arg.invariant_total_length, arg.blkGroupSize);
            using InGridDesc_M_K    = decltype(in_grid_desc_m_k);
            using WorkspaceDesc_M_K = decltype(ws_desc_m_k);

            using GridwiseReduce = GridwiseReduction_mk_to_mk_multiblock_partial_reduce<
                InDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, WorkspaceDesc_M_K,
                ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
                PropagateNan, BlockSize,
                MThreadClusterSize, KThreadClusterSize,
                MThreadSliceSize, KThreadSliceSize,
                InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_partial_reduce_multiblock<GridwiseReduce,
                                                                 NeedIndices,
                                                                 InDataType,
                                                                 AccDataType,
                                                                 IndexDataType,
                                                                 InGridDesc_M_K,
                                                                 WorkspaceDesc_M_K,
                                                                 InElementwiseOperation,
                                                                 AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(arg.gridSize),
                                              dim3(BlockSize),
                                              0,
                                              in_grid_desc_m_k,
                                              ws_desc_m_k,
                                              arg.in_elementwise_op_,
                                              arg.acc_elementwise_op_,
                                              arg.blkGroupSize,
                                              arg.kBlockTileIterations,
                                              arg.in_dev_,
                                              arg.workspace_dev_,
                                              arg.workspace_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(OutDstVectorSize != 1)
            return (false);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(InvariantDims::Size() == 0)
                return (false);

            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
                return (false);

            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                return (false);
        }
        else
        {
            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // cases with small reduce_total_length should be handled by the BlockWise method
        if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
            return (false);

        return (true);
    };

    std::vector<int> GetWorkspace2dLengths(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        return (std::vector<int>{static_cast<int>(pArg->invariant_total_length),
                                 pArg->blkGroupSize});
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<int>& inLengths,
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
                        float alpha,
                        float beta,
                        const void* in_dev,
                        void* out_dev,
                        void* out_indices_dev,
                        void* workspace_dev,
                        const InElementwiseOperation& in_elementwise_op,
                        const AccElementwiseOperation& acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          outLengths,
                                          outStrides,
                                          alpha,
                                          beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op,
                                          acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize
            << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
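
For reference, the workspace sizing in GetWorkspaceSizeInBytes() above grows the per-block iteration count until the derived block-group count (the number of blocks cooperating on one reduced row) drops to 128 or fewer, then allocates one partial accumulator per (output element, block group) pair, plus an aligned index buffer when indices are needed. Below is a minimal host-only sketch of that arithmetic; the helper name and example numbers are illustrative only and are not part of the composable_kernel API.

#include <cstddef>
#include <cstdio>

// Hypothetical standalone mirror of GetWorkspaceSizeInBytes():
// find the smallest iteration count that keeps blkGroupSize <= 128,
// then size the partial-reduce workspace accordingly.
static std::size_t partial_reduce_workspace_bytes(std::size_t invariant_total_length,
                                                  std::size_t reduce_total_length,
                                                  int k_block_tile_size,
                                                  bool need_indices,
                                                  std::size_t acc_type_bytes)
{
    int iterations = 1;
    while(true)
    {
        std::size_t tile         = static_cast<std::size_t>(k_block_tile_size) * iterations;
        std::size_t blkGroupSize = (reduce_total_length + tile - 1) / tile;

        if(blkGroupSize <= 128)
        {
            std::size_t workspace_size = invariant_total_length * blkGroupSize;
            return !need_indices
                       ? workspace_size * acc_type_bytes
                       : workspace_size * (acc_type_bytes + sizeof(int)) + 64 + sizeof(int);
        }
        ++iterations;
    }
}

int main()
{
    // Example: 64 output elements, reduce length 1,000,000, K_BlockTileSize = 256, float accumulators.
    std::printf("%zu bytes\n",
                partial_reduce_workspace_bytes(64, 1000000, 256, false, sizeof(float)));
    return 0;
}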
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp  (new file, mode 100644)
#ifndef DEVICE_REDUCE_THREADWISE_HPP
#define DEVICE_REDUCE_THREADWISE_HPP

#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_threadwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          index_t Rank,
          typename ReduceDims,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename OutElementwiseOperation,
          bool PropagateNan,
          bool NeedIndices,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1),
                  "Threadwise can only be called with KThreadClusterSize be 1 !");

    using IndexDataType = int32_t;

    static constexpr bool BetaIsZero = NeedIndices;

    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());

    static constexpr index_t srcDims = Rank;
    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);

    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                    const std::vector<int>& inStrides)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDims)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                const auto toReduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(toReduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});

        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(outerLen, inPad_M),
                       make_right_pad_transform(innerLen, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                    const std::vector<int>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});

        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(outerLen, outPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));

        return (out_grid_desc_m_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<int>& inLengths,
                 const std::vector<int>& inStrides,
                 const std::vector<int>& outLengths,
                 const std::vector<int>& outStrides,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_indices_dev,
                 AccDataType* workspace_dev,
                 const InElementwiseOperation& in_elementwise_op,
                 const OutElementwiseOperation& acc_elementwise_op)
            : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
        {
            (void)workspace_dev;

            inLengths_  = inLengths;
            inStrides_  = inStrides;
            outLengths_ = outLengths;
            outStrides_ = outStrides;

            in_elementwise_op_  = in_elementwise_op;
            acc_elementwise_op_ = acc_elementwise_op;

            alpha_ = static_cast<AccDataType>(alpha);
            beta_  = static_cast<OutDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, ReduceDims>(inLengths);

            if constexpr(InvariantDims::Size() == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];

            reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        OutDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        OutElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, int nrepeat = 1)
        {
            const auto in_grid_desc_m_k =
                DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m =
                DeviceReduceThreadWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);

            using GridwiseReduce = GridwiseReduction_mk_to_m_threadwise<
                InDataType, OutDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, OutGridDesc_M,
                ReduceOperation, InElementwiseOperation, OutElementwiseOperation,
                PropagateNan, BetaIsZero, BlockSize,
                MThreadClusterSize, KThreadClusterSize,
                MThreadSliceSize, KThreadSliceSize,
                InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_reduce_threadwise<GridwiseReduce,
                                                         NeedIndices,
                                                         InDataType,
                                                         OutDataType,
                                                         AccDataType,
                                                         IndexDataType,
                                                         InGridDesc_M_K,
                                                         OutGridDesc_M,
                                                         InElementwiseOperation,
                                                         OutElementwiseOperation>;

            avg_time = launch_and_time_kernel(kernel,
                                              nrepeat,
                                              dim3(arg.gridSize),
                                              dim3(BlockSize),
                                              0,
                                              in_grid_desc_m_k,
                                              out_grid_desc_m,
                                              arg.in_elementwise_op_,
                                              arg.acc_elementwise_op_,
                                              arg.alpha_,
                                              arg.in_dev_,
                                              arg.beta_,
                                              arg.out_dev_,
                                              arg.out_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(InvariantDims::Size() == 0)
                return (false);

            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
                return (false);

            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                return (false);
        }
        else
        {
            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        // TODO: remove this. It should return true as long as this DeviceOP instance supports the
        // case; for bigger reduce_total_length sizes, the BlockWise method is supposed to be used
        // for better performance.
        if(pArg->reduce_total_length / KThreadSliceSize >= 32)
            return (false);

        return (true);
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<int>& inLengths,
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
                        float alpha,
                        float beta,
                        const void* in_dev,
                        void* out_dev,
                        void* out_indices_dev,
                        void* workspace_dev,
                        const InElementwiseOperation& in_elementwise_op,
                        const OutElementwiseOperation& acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          outLengths,
                                          outStrides,
                                          alpha,
                                          beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op,
                                          acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceThreadWise<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize
            << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
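
Both device ops above first flatten an arbitrary-rank reduction into a 2D (M, K) view: the invariant dimensions merge into one M length, the reduced dimensions into one K length, and the grid then covers M in tiles of M_BlockTileSize (this is what get_2d_lengths and the merge transforms in MakeSrc2dDescriptor accomplish). The following is a minimal standalone sketch of that arithmetic with made-up lengths and a hypothetical tile size, not CK code.

#include <array>
#include <cstdio>

int main()
{
    constexpr int Rank = 4;
    std::array<long, Rank> inLengths{16, 64, 32, 10};
    std::array<int, 2> reduceDims{1, 3}; // reduce over dimensions 1 and 3

    // Merge invariant dims into M and reduced dims into K.
    long invariant_total_length = 1, reduce_total_length = 1;
    for(int d = 0; d < Rank; ++d)
    {
        bool is_reduced = (d == reduceDims[0]) || (d == reduceDims[1]);
        (is_reduced ? reduce_total_length : invariant_total_length) *= inLengths[d];
    }

    // gridSize = integer_least_multiple(M, M_BlockTileSize) / M_BlockTileSize, i.e. ceil(M / tile).
    constexpr long M_BlockTileSize = 128; // MThreadClusterSize * MThreadSliceSize (illustrative)
    long gridSize = (invariant_total_length + M_BlockTileSize - 1) / M_BlockTileSize;

    std::printf("M = %ld, K = %ld, gridSize = %ld\n",
                invariant_total_length, reduce_total_length, gridSize);
    return 0;
}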
device_operation/include/gemm_specialization.hpp → include/ck/tensor_operation/gpu/device/gemm_specialization.hpp  (file moved)
include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp  (new file, mode 100644)
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
#define CK_REDUCTION_OPERATOR_MAPPING_HPP

#include "reduction_operator.hpp"
#include "reduction_enums.hpp"
#include "element_wise_operation.hpp"

namespace ck {

// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The boolean member "indexable" is also provided in reduce_binary_operator for
// easier checking by the upper-layer codes in the kernels.

template <typename T, ReduceTensorOp_t Op>
struct reduce_binary_operator;

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = false;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
    using opType   = reduce::Mul<T>;
    using dataType = T;

    static constexpr bool indexable = false;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
    using opType   = reduce::Min<T>;
    using dataType = T;

    static constexpr bool indexable = true;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
    using opType   = reduce::Max<T>;
    using dataType = T;

    static constexpr bool indexable = true;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
    using opType   = reduce::AMax<T>;
    using dataType = T;

    static constexpr bool indexable = true;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = false;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = false;
};

template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
    using opType   = reduce::Add<T>;
    using dataType = T;

    static constexpr bool indexable = false;
};

// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are called before and after the Reduction is executed, respectively.

template <typename T, ReduceTensorOp_t Op, bool IsFirstReduce, bool IsLastReduce>
struct reduce_unary_operator
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};

template <typename T, bool IsFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, IsFirstReduce, true>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T, true>;
};

template <typename T, bool IsLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, IsLastReduce>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};

template <typename T, bool IsLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, IsLastReduce>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
};

template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
};

} // end of namespace ck
#endif
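
To make the mapping concrete, the scalar sketch below walks through the NORM2 case of reduce_unary_operator when IsFirstReduce and IsLastReduce are both true: UnarySquare is applied to each element before accumulation, reduce::Add accumulates, and UnarySqrt is applied once afterwards. This is a plain-CPU illustration with hypothetical local names, not CK code.

#include <cmath>
#include <cstdio>

int main()
{
    const float in[4] = {3.0f, 4.0f, 12.0f, 0.0f};

    float acc = 0.0f; // reduce::Add starts from its zero value
    for(float x : in)
    {
        float y = x * x; // InElementwiseOperation plays the role of UnarySquare
        acc += y;        // ReduceOperation plays the role of reduce::Add
    }
    float result = std::sqrt(acc); // AccElementwiseOperation plays the role of UnarySqrt

    std::printf("NORM2 = %f\n", result); // prints 13.000000 for this input
    return 0;
}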
device_operation/include/tensor_layout.hpp → include/ck/tensor_operation/gpu/device/tensor_layout.hpp  (file moved, with changes)
@@ -87,14 +87,17 @@ struct NKHW : public BaseTensorLayout

struct NDHWC : public BaseTensorLayout
{
    static constexpr const char* name = "NDHWC";
};

struct KZYXC : public BaseTensorLayout
{
    static constexpr const char* name = "KZYXC";
};

struct NDHWK : public BaseTensorLayout
{
    static constexpr const char* name = "NDHWK";
};

} // namespace convolution
composable_kernel/include/tensor_operation/element_wise_operation.hpp → include/ck/tensor_operation/gpu/element/element_wise_operation.hpp  (file moved, with changes)
#ifndef CK_ELEMENT_WISE_OPERATION_HPP
#define CK_ELEMENT_WISE_OPERATION_HPP

#include "data_type.hpp"

@@ -13,7 +14,7 @@ struct PassThrough

    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; }

    __host__ __device__ void operator()(ushort& y, const ushort& x) const { y = x; }

    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { y = x; }

    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }

@@ -143,6 +144,192 @@ struct AddHardswishAdd
    }
};

struct RequantReluRequant
{
    // FIXME: We just need one scale for Relu / Leaky Relu / PRelu
    RequantReluRequant(float scaleGemm, float scaleRelu)
        : scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
    {
    }

    __host__ __device__ constexpr void operator()(int8_t& y, const int& x) const
    {
        float gemm_requant = scaleGemm_ * static_cast<float>(x);
        float relu         = gemm_requant > 0 ? gemm_requant : 0;
        float relu_requant = scaleRelu_ * relu;
        y = static_cast<int8_t>(relu_requant > 127 ? 127
                                                   : relu_requant < -128 ? -128 : relu_requant);
    }

    // for reference_gemm
    __host__ __device__ constexpr void operator()(float& y, const float& x) const
    {
        float gemm_requant = scaleGemm_ * x;
        float relu         = gemm_requant > 0 ? gemm_requant : 0;
        float relu_requant = scaleRelu_ * relu;
        y = static_cast<float>(relu_requant > 127 ? 127
                                                  : relu_requant < -128 ? -128 : relu_requant);
    }

    float scaleGemm_;
    float scaleRelu_;
};

// Unary operators are usually called element-wise before/after the reduction is executed on the
// elements. They are needed for easy implementation of the AVG, NRM1, NRM2 reduction types.

template <typename Y, typename X, bool HasDividing = false>
struct UnaryIdentic;

template <>
struct UnaryIdentic<float, float, false>
{
    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(float& y, const float& x) const { y = x; };
};

template <>
struct UnaryIdentic<float, float, true>
{
    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };

    __host__ __device__ void operator()(float& y, const float& x) const
    {
        y = x / type_convert<float>(divider_);
    };

    int32_t divider_ = 1;
};

template <>
struct UnaryIdentic<half_t, half_t, false>
{
    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; };
};

template <>
struct UnaryIdentic<double, double, false>
{
    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(double& y, const double& x) const { y = x; };
};

template <>
struct UnaryIdentic<double, double, true>
{
    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };

    __host__ __device__ void operator()(double& y, const double& x) const
    {
        y = x / type_convert<double>(divider_);
    };

    int32_t divider_ = 1;
};

template <>
struct UnaryIdentic<int32_t, int32_t, false>
{
    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
};

template <typename Y, typename X, bool HasDividing = false>
struct UnarySquare;

template <>
struct UnarySquare<float, float, false>
{
    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(float& y, const float& x) const { y = x * x; };
};

template <>
struct UnarySquare<float, float, true>
{
    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };

    __host__ __device__ void operator()(float& y, const float& x) const
    {
        y = x * x / type_convert<float>(divider_);
    };

    int32_t divider_ = 1;
};

template <>
struct UnarySquare<double, double, false>
{
    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(double& y, const double& x) const { y = x * x; };
};

template <>
struct UnarySquare<double, double, true>
{
    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };

    __host__ __device__ void operator()(double& y, const double& x) const
    {
        y = x * x / type_convert<double>(divider_);
    };

    int32_t divider_ = 1;
};

template <typename Y, typename X>
struct UnaryAbs;

template <>
struct UnaryAbs<float, float>
{
    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
};

template <>
struct UnaryAbs<half_t, half_t>
{
    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
};

template <>
struct UnaryAbs<double, double>
{
    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
};

template <typename Y, typename X>
struct UnarySqrt;

template <>
struct UnarySqrt<float, float>
{
    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
};

template <>
struct UnarySqrt<double, double>
{
    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };

    __host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); };
};

} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
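
As a small illustration of the HasDividing = true specializations added above, the sketch below mimics UnaryIdentic<float, float, true> as it is used for an AVG reduction: elements are summed unchanged, and the post-reduction functor divides once by the element count passed in as the divider. The stand-in struct and variable names are hypothetical, plain-CPU code rather than CK code.

#include <cstdio>

// Stand-in for UnaryIdentic<float, float, true>: divide by "divider" on application.
struct UnaryIdenticDiv
{
    explicit UnaryIdenticDiv(int divider = 1) : divider_(divider) {}
    void operator()(float& y, const float& x) const { y = x / static_cast<float>(divider_); }
    int divider_ = 1;
};

int main()
{
    const float in[5] = {1.0f, 2.0f, 3.0f, 4.0f, 10.0f};

    float acc = 0.0f;
    for(float x : in)
        acc += x; // the pre-reduction op is the identity for AVG

    float avg = 0.0f;
    UnaryIdenticDiv post(5); // divider = number of reduced elements
    post(avg, acc);

    std::printf("AVG = %f\n", avg); // prints 4.000000
    return 0;
}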
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp  (new file, mode 100644)
/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP

#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"

namespace ck {

template <typename GridwiseReduction,
          bool NeedIndices,
          typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename InElementwiseOperation,
          typename OutElementwiseOperation>
__global__ void kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k,
                                        const OutGridDesc_M out_grid_desc_m,
                                        const InElementwiseOperation in_elementwise_op,
                                        const OutElementwiseOperation acc_elementwise_op,
                                        AccDataType alpha,
                                        const InDataType* const __restrict__ p_in_global,
                                        OutDataType beta,
                                        OutDataType* const __restrict__ p_out_global,
                                        const IndexDataType* const __restrict__ p_ws_indices_global,
                                        IndexDataType* const __restrict__ p_indices_global)
{
    if constexpr(!NeedIndices)
    {
        GridwiseReduction::Run(in_grid_desc_m_k,
                               out_grid_desc_m,
                               in_elementwise_op,
                               acc_elementwise_op,
                               alpha,
                               p_in_global,
                               beta,
                               p_out_global,
                               p_ws_indices_global,
                               p_indices_global);
    }
    else
    {
        GridwiseReduction::RunWithIndex(in_grid_desc_m_k,
                                        out_grid_desc_m,
                                        in_elementwise_op,
                                        acc_elementwise_op,
                                        alpha,
                                        p_in_global,
                                        beta,
                                        p_out_global,
                                        p_ws_indices_global,
                                        p_indices_global);
    };
};

template <typename GridwiseReduction,
          bool NeedIndices,
          typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename InElementwiseOperation,
          typename OutElementwiseOperation>
__global__ void
kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k,
                                    const OutGridDesc_M out_grid_desc_m,
                                    const InElementwiseOperation in_elementwise_op,
                                    const OutElementwiseOperation acc_elementwise_op,
                                    AccDataType alpha,
                                    const InDataType* const __restrict__ p_in_global,
                                    OutDataType beta,
                                    OutDataType* const __restrict__ p_out_global,
                                    const IndexDataType* const __restrict__ p_ws_indices_global,
                                    IndexDataType* const __restrict__ p_indices_global)
{
    if constexpr(!NeedIndices)
    {
        GridwiseReduction::Run(in_grid_desc_m_k,
                               out_grid_desc_m,
                               in_elementwise_op,
                               acc_elementwise_op,
                               alpha,
                               p_in_global,
                               beta,
                               p_out_global,
                               p_ws_indices_global,
                               p_indices_global);
    }
    else
    {
        GridwiseReduction::RunSecondCallWithIndex(in_grid_desc_m_k,
                                                  out_grid_desc_m,
                                                  in_elementwise_op,
                                                  acc_elementwise_op,
                                                  alpha,
                                                  p_in_global,
                                                  beta,
                                                  p_out_global,
                                                  p_ws_indices_global,
                                                  p_indices_global);
    };
};

template <typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename OutElementwiseOperation,
          bool PropagateNan,
          bool BetaIsZero,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_blockwise
{
    static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

    static constexpr auto buffer_1d_desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));

    template <typename T>
    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;

    static constexpr auto I0 = Number<0>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                               const OutGridDesc_M& out_grid_desc_m,
                               const InElementwiseOperation& in_elementwise_op,
                               const OutElementwiseOperation& acc_elementwise_op,
                               AccDataType alpha,
                               const InDataType* const __restrict__ p_in_global,
                               OutDataType beta,
                               OutDataType* const __restrict__ p_out_global,
                               const IndexDataType* const __restrict__ p_ws_indices_global,
                               IndexDataType* const __restrict__ p_indices_global)
    {
        using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer<
            decltype(buffer_1d_desc), AccDataType, BlockSize,
            MThreadClusterSize, KThreadClusterSize,
            reorder_thread_cluster, ReduceOperation, PropagateNan>;

        using Accumulation =
            detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

        (void)p_ws_indices_global;
        (void)p_indices_global;

        // LDS
        __shared__ AccDataType p_block_reduce_buffer[BlockSize];

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());

        auto block_reduce_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_buf;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        const index_t thread_m_cluster_id =
            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
        const index_t thread_k_cluster_id =
            reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
                                   : thread_local_id % KThreadClusterSize;

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType,
            AccDataType,
            InGridDesc_M_K,
            decltype(thread_buffer_desc),
            ThreadBufferLengths,
            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
            InSrcVectorDim,
            InSrcVectorSize,
            1,
            false>(in_grid_desc_m_k,
                   make_multi_index(block_global_1d_id * M_BlockTileSize +
                                        thread_m_cluster_id * MThreadSliceSize,
                                    thread_k_cluster_id * KThreadSliceSize));

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;

        index_t reducedTiles = 0;
        do
        {
            threadwise_src_load.Run(in_grid_desc_m_k,
                                    in_global_buf,
                                    thread_buffer_desc,
                                    make_tuple(I0, I0),
                                    in_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                // do element-wise pre-reduction operation
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
                });

                // reduce on each thread-local slice
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
                    Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
                });
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedTiles++;
        } while(reducedTiles < toReduceTiles);

        constexpr auto reduced_data_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(reorder_thread_cluster)
            {
                block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
                    accu_value_buf[I];
            }
            else
                block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
                    accu_value_buf[I];

            accu_value_buf(I) = zeroVal;

            __syncthreads();

            BlockwiseReduce::Reduce(
                block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
        });

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if(thread_k_cluster_id == 0)
            {
                acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

                accu_value_buf(I) *= alpha;
            }
        });

        if(thread_k_cluster_id == 0)
        {
            if constexpr(!BetaIsZero)
            {
                if(!float_equal_zero{}(beta))
                {
                    StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
                        priorDstValueBuf;

                    auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<
                        OutDataType, OutDataType, OutGridDesc_M, decltype(reduced_data_desc),
                        Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize, 1, false>(
                        out_grid_desc_m,
                        make_multi_index(block_global_1d_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize));

                    threadwise_dst_load.Run(out_grid_desc_m,
                                            out_global_buf,
                                            reduced_data_desc,
                                            make_tuple(I0),
                                            priorDstValueBuf);

                    static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                        accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I] * beta);
                    });
                };
            };

            auto threadwise_dst_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M,
                PassThroughOp<AccDataType>, Sequence<MThreadSliceSize>, Sequence<0>, 0,
                OutDstVectorSize, InMemoryDataOperationEnum_t::Set, 1, true>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp<AccDataType>{});

            threadwise_dst_store.Run(
                reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf);
        }
    };

    __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
                                        const OutGridDesc_M& out_grid_desc_m,
                                        const InElementwiseOperation& in_elementwise_op,
                                        const OutElementwiseOperation& acc_elementwise_op,
                                        AccDataType alpha,
                                        const InDataType* const __restrict__ p_in_global,
                                        OutDataType beta,
                                        OutDataType* const __restrict__ p_out_global,
                                        const IndexDataType* const __restrict__ p_ws_indices_global,
                                        IndexDataType* const __restrict__ p_indices_global)
    {
        using BlockwiseReduceWithIndex = PartitionedBlockwiseReductionWithIndexOn1dBuffer<
            decltype(buffer_1d_desc), AccDataType, IndexDataType, BlockSize,
            MThreadClusterSize, KThreadClusterSize,
            reorder_thread_cluster, ReduceOperation, PropagateNan>;

        using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                             ReduceOperation,
                                                                             AccDataType,
                                                                             IndexDataType>;

        (void)p_ws_indices_global;

        // LDS
        __shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
        __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize];

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());
        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_indices_global, out_grid_desc_m.GetElementSpaceSize());

        auto block_reduce_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
        auto block_reduce_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_idx_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, index_t, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_idx_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        const index_t thread_m_cluster_id =
            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
        const index_t thread_k_cluster_id =
            reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
                                   : thread_local_id % KThreadClusterSize;

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType,
            AccDataType,
            InGridDesc_M_K,
            decltype(thread_buffer_desc),
            ThreadBufferLengths,
            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
            InSrcVectorDim,
            InSrcVectorSize,
            1,
            false>(in_grid_desc_m_k,
                   make_multi_index(block_global_1d_id * M_BlockTileSize +
                                        thread_m_cluster_id * MThreadSliceSize,
                                    thread_k_cluster_id * KThreadSliceSize));

        index_t indexOffset = 0;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            accu_value_buf(I) = zeroVal;
            accu_index_buf(I) = 0;
        });

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;

        index_t reducedTiles = 0;
        do
        {
            // load the thread slice
            threadwise_src_load.Run(in_grid_desc_m_k,
                                    in_global_buf,
                                    thread_buffer_desc,
                                    make_tuple(I0, I0),
                                    in_thread_val_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    // initialize the indices for the per-thread to-reduce values
                    in_thread_idx_buf(offset) =
                        indexOffset + thread_k_cluster_id * KThreadSliceSize + J();

                    // do element-wise pre-reduction operation
                    in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset));
                });

                AccDataType tmpValue   = zeroVal;
                IndexDataType tmpIndex = 0;

                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    // reduce on the dim1 thread slice
                    AccumulationWithIndex::Calculate(
                        tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
                });

                // store thread local value to LDS for parallel reduction
                if constexpr(reorder_thread_cluster)
                {
                    block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
                                         thread_m_cluster_id) = tmpValue;
                    block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
                                         thread_m_cluster_id) = tmpIndex;
                }
                else
                {
                    block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
                                         thread_k_cluster_id) = tmpValue;
                    block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
                                         thread_k_cluster_id) = tmpIndex;
                }

                __syncthreads();

                BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
                                                 block_reduce_idx_buf,
                                                 tmpValue,
                                                 tmpIndex,
                                                 thread_m_cluster_id,
                                                 thread_k_cluster_id);

                AccumulationWithIndex::Calculate(
                    accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            indexOffset += K_BlockTileSize;
            reducedTiles++;
        } while(reducedTiles < toReduceTiles);

        constexpr auto reduced_data_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if(thread_k_cluster_id == 0)
            {
                // for the indexed operation, acc_elementwise_op should do nothing
                acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

                accu_value_buf(I) *= alpha;
            }
        });

        if(thread_k_cluster_id == 0)
        {
            if constexpr(!BetaIsZero)
            {
                if(!float_equal_zero{}(beta))
                {
                    StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
                        priorDstValueBuf;

                    auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<
                        OutDataType, OutDataType, OutGridDesc_M, decltype(reduced_data_desc),
                        Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize, 1, false>(
                        out_grid_desc_m,
                        make_multi_index(block_global_1d_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize));

                    threadwise_dst_load.Run(out_grid_desc_m,
                                            out_global_val_buf,
                                            reduced_data_desc,
                                            make_tuple(I0),
                                            priorDstValueBuf);

                    static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                        accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I] * beta);
                    });
                };
            };

            auto threadwise_dst_val_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M,
                PassThroughOp<AccDataType>, Sequence<MThreadSliceSize>, Sequence<0>, 0,
                OutDstVectorSize, InMemoryDataOperationEnum_t::Set, 1, false>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp<AccDataType>{});

            auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3<
                IndexDataType, IndexDataType, decltype(reduced_data_desc), OutGridDesc_M,
                PassThroughOp<index_t>, Sequence<MThreadSliceSize>, Sequence<0>, 0,
                OutDstVectorSize, InMemoryDataOperationEnum_t::Set, 1, false>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp<index_t>{});

            threadwise_dst_val_store.Run(reduced_data_desc,
                                         make_tuple(I0),
                                         accu_value_buf,
                                         out_grid_desc_m,
                                         out_global_val_buf);
            threadwise_dst_idx_store.Run(reduced_data_desc,
                                         make_tuple(I0),
                                         accu_index_buf,
                                         out_grid_desc_m,
                                         out_global_idx_buf);
        }
    };

    __device__ static void
    RunSecondCallWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
                           const OutGridDesc_M& out_grid_desc_m,
                           const InElementwiseOperation in_elementwise_op,
                           const OutElementwiseOperation acc_elementwise_op,
                           AccDataType alpha,
                           const InDataType* const __restrict__ p_ws_values_global,
                           OutDataType beta,
                           OutDataType* const __restrict__ p_out_global,
                           const IndexDataType* const __restrict__ p_ws_indices_global,
                           IndexDataType* const __restrict__ p_indices_global)
    {
        using BlockwiseReduceWithIndex = PartitionedBlockwiseReductionWithIndexOn1dBuffer<
            decltype(buffer_1d_desc), AccDataType, IndexDataType, BlockSize,
            MThreadClusterSize, KThreadClusterSize,
            reorder_thread_cluster, ReduceOperation, PropagateNan>;

        using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                             ReduceOperation,
                                                                             AccDataType,
                                                                             IndexDataType>;

        (void)in_elementwise_op;

        // LDS
        __shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
        __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize];

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_ws_values_global,
            in_grid_desc_m_k.GetElementSpaceSize(),
            type_convert<InDataType>(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize());
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());
        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_indices_global, out_grid_desc_m.GetElementSpaceSize());

        auto block_reduce_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
        auto block_reduce_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t
::
Lds
>
(
p_block_reduce_idx_buffer
,
BlockSize
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
in_thread_val_buf
;
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
IndexDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
in_thread_idx_buf
;
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
IndexDataType
,
MThreadSliceSize
,
true
>
accu_index_buf
;
const
auto
toReduceLength
=
in_grid_desc_m_k
.
GetLength
(
Number
<
1
>
{});
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_1d_id
=
get_block_1d_id
();
const
index_t
thread_m_cluster_id
=
reorder_thread_cluster
?
thread_local_id
%
MThreadClusterSize
:
((
thread_local_id
/
KThreadClusterSize
)
%
MThreadClusterSize
);
const
index_t
thread_k_cluster_id
=
reorder_thread_cluster
?
((
thread_local_id
/
MThreadClusterSize
)
%
KThreadClusterSize
)
:
thread_local_id
%
KThreadClusterSize
;
using
ThreadBufferLengths
=
Sequence
<
MThreadSliceSize
,
KThreadSliceSize
>
;
constexpr
auto
thread_buffer_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{},
Number
<
KThreadSliceSize
>
{}));
auto
threadwise_src_val_load
=
ThreadwiseTensorSliceTransfer_v2
<
InDataType
,
AccDataType
,
InGridDesc_M_K
,
decltype
(
thread_buffer_desc
),
ThreadBufferLengths
,
typename
conditional
<
InSrcVectorDim
==
0
,
Sequence
<
1
,
0
>
,
Sequence
<
0
,
1
>>::
type
,
InSrcVectorDim
,
InSrcVectorSize
,
1
,
false
>
(
in_grid_desc_m_k
,
make_multi_index
(
block_global_1d_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
,
thread_k_cluster_id
*
KThreadSliceSize
));
auto
threadwise_src_idx_load
=
ThreadwiseTensorSliceTransfer_v2
<
IndexDataType
,
IndexDataType
,
InGridDesc_M_K
,
decltype
(
thread_buffer_desc
),
ThreadBufferLengths
,
typename
conditional
<
InSrcVectorDim
==
0
,
Sequence
<
1
,
0
>
,
Sequence
<
0
,
1
>>::
type
,
InSrcVectorDim
,
InSrcVectorSize
,
1
,
false
>
(
in_grid_desc_m_k
,
make_multi_index
(
block_global_1d_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
,
thread_k_cluster_id
*
KThreadSliceSize
));
// index_t indexOffset = 0;
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
zeroVal
;
accu_index_buf
(
I
)
=
0
;
});
constexpr
auto
in_thread_copy_step
=
make_multi_index
(
0
,
K_BlockTileSize
);
const
index_t
toReduceTiles
=
(
toReduceLength
+
K_BlockTileSize
-
1
)
/
K_BlockTileSize
;
index_t
reducedTiles
=
0
;
do
{
// load the thread slice
threadwise_src_val_load
.
Run
(
in_grid_desc_m_k
,
src_global_val_buf
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
in_thread_val_buf
);
threadwise_src_idx_load
.
Run
(
in_grid_desc_m_k
,
src_global_idx_buf
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
in_thread_idx_buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
AccDataType
tmpValue
=
zeroVal
;
IndexDataType
tmpIndex
=
0
;
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
J
)
{
constexpr
auto
offset
=
I
*
Number
<
KThreadSliceSize
>
{}
+
J
;
// reduce on the dim1 thread slice
AccumulationWithIndex
::
Calculate
(
tmpValue
,
in_thread_val_buf
[
offset
],
tmpIndex
,
in_thread_idx_buf
[
offset
]);
});
// store thread local value to LDS for parallel reduction
if
constexpr
(
reorder_thread_cluster
)
{
block_reduce_val_buf
(
thread_k_cluster_id
*
MThreadClusterSize
+
thread_m_cluster_id
)
=
tmpValue
;
block_reduce_idx_buf
(
thread_k_cluster_id
*
MThreadClusterSize
+
thread_m_cluster_id
)
=
tmpIndex
;
}
else
{
block_reduce_val_buf
(
thread_m_cluster_id
*
KThreadClusterSize
+
thread_k_cluster_id
)
=
tmpValue
;
block_reduce_idx_buf
(
thread_m_cluster_id
*
KThreadClusterSize
+
thread_k_cluster_id
)
=
tmpIndex
;
}
__syncthreads
();
BlockwiseReduceWithIndex
::
Reduce
(
block_reduce_val_buf
,
block_reduce_idx_buf
,
tmpValue
,
tmpIndex
,
thread_m_cluster_id
,
thread_k_cluster_id
);
AccumulationWithIndex
::
Calculate
(
accu_value_buf
(
I
),
tmpValue
,
accu_index_buf
(
I
),
tmpIndex
);
});
threadwise_src_val_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_step
);
threadwise_src_idx_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_step
);
// indexOffset += K_BlockTileSize;
reducedTiles
++
;
}
while
(
reducedTiles
<
toReduceTiles
);
constexpr
auto
reduced_data_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{}));
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
if
(
thread_k_cluster_id
==
0
)
{
// for indiced operation, acc_elementwise_op shoud do nothing
acc_elementwise_op
(
accu_value_buf
(
I
),
accu_value_buf
(
I
));
accu_value_buf
(
I
)
*=
alpha
;
}
});
if
(
thread_k_cluster_id
==
0
)
{
if
constexpr
(
!
BetaIsZero
)
{
if
(
!
float_equal_zero
{}(
beta
))
{
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
OutDataType
,
MThreadSliceSize
,
true
>
priorDstValueBuf
;
auto
threadwise_dst_load
=
ThreadwiseTensorSliceTransfer_v2
<
OutDataType
,
OutDataType
,
OutGridDesc_M
,
decltype
(
reduced_data_desc
),
Sequence
<
MThreadSliceSize
>
,
Sequence
<
0
>
,
0
,
OutDstVectorSize
,
1
,
true
>
(
out_grid_desc_m
,
make_multi_index
(
block_global_1d_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
));
threadwise_dst_load
.
Run
(
out_grid_desc_m
,
out_global_val_buf
,
reduced_data_desc
,
make_tuple
(
I0
),
priorDstValueBuf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
+=
type_convert
<
AccDataType
>
(
priorDstValueBuf
[
I
]
*
beta
);
});
};
};
auto
threadwise_dst_val_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
AccDataType
,
OutDataType
,
decltype
(
reduced_data_desc
),
OutGridDesc_M
,
PassThroughOp
<
AccDataType
>
,
Sequence
<
MThreadSliceSize
>
,
Sequence
<
0
>
,
0
,
OutDstVectorSize
,
InMemoryDataOperationEnum_t
::
Set
,
1
,
true
>
(
out_grid_desc_m
,
make_multi_index
(
block_global_1d_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
),
PassThroughOp
<
AccDataType
>
{});
auto
threadwise_dst_idx_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
IndexDataType
,
IndexDataType
,
decltype
(
reduced_data_desc
),
OutGridDesc_M
,
PassThroughOp
<
IndexDataType
>
,
Sequence
<
MThreadSliceSize
>
,
Sequence
<
0
>
,
0
,
OutDstVectorSize
,
InMemoryDataOperationEnum_t
::
Set
,
1
,
true
>
(
out_grid_desc_m
,
make_multi_index
(
block_global_1d_id
*
M_BlockTileSize
+
thread_m_cluster_id
*
MThreadSliceSize
),
PassThroughOp
<
index_t
>
{});
threadwise_dst_val_store
.
Run
(
reduced_data_desc
,
make_tuple
(
I0
),
accu_value_buf
,
out_grid_desc_m
,
out_global_val_buf
);
threadwise_dst_idx_store
.
Run
(
reduced_data_desc
,
make_tuple
(
I0
),
accu_index_buf
,
out_grid_desc_m
,
out_global_idx_buf
);
}
};
};
}
// namespace ck
#endif
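The two branches on reorder_thread_cluster above choose between a K-major and an M-major placement of the per-thread partial results in the BlockSize LDS slots. A small worked example of that index math, assuming hypothetical cluster sizes MThreadClusterSize = 4 and KThreadClusterSize = 2 (values not taken from this commit):

// reorder_thread_cluster == false (InSrcVectorDim != 0):
//   thread_m_cluster_id = (tid / KThreadClusterSize) % MThreadClusterSize
//   thread_k_cluster_id =  tid % KThreadClusterSize
//   LDS slot            =  thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id
// reorder_thread_cluster == true (InSrcVectorDim == 0):
//   thread_m_cluster_id =  tid % MThreadClusterSize
//   thread_k_cluster_id = (tid / MThreadClusterSize) % KThreadClusterSize
//   LDS slot            =  thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id
// With MThreadClusterSize = 4, KThreadClusterSize = 2 and tid = 5:
//   non-reordered: (m, k) = (2, 1), slot = 2 * 2 + 1 = 5
//   reordered:     (m, k) = (1, 1), slot = 1 * 4 + 1 = 5
// Either way each thread owns one distinct LDS slot; only the m/k role assigned to that slot changes.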
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp 0 → 100644 View file @ 9dce6851
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP
#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_ATOMIC_ADD_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {

template <typename GridwiseReduction,
          typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename InElementwiseOperation,
          typename AccElementwiseOperation>
__global__ void kernel_reduce_multiblock_atocmi_add(const InGridDesc_M_K in_grid_desc_m_k,
                                                    const OutGridDesc_M out_grid_desc_m,
                                                    const InElementwiseOperation in_elementwise_op,
                                                    const AccElementwiseOperation acc_elementwise_op,
                                                    index_t block_group_size,
                                                    index_t num_k_block_tile_iteration,
                                                    AccDataType alpha,
                                                    const InDataType* const __restrict__ p_in_global,
                                                    OutDataType* const __restrict__ p_out_global)
{
    GridwiseReduction::Run(in_grid_desc_m_k,
                           out_grid_desc_m,
                           in_elementwise_op,
                           acc_elementwise_op,
                           block_group_size,
                           num_k_block_tile_iteration,
                           alpha,
                           p_in_global,
                           p_out_global);
};

template <typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_multiblock_atomic_add
{
    static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

    static constexpr auto buffer_1d_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));

    using blockwise_reduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer_1d_desc),
                                                                     AccDataType,
                                                                     BlockSize,
                                                                     MThreadClusterSize,
                                                                     KThreadClusterSize,
                                                                     reorder_thread_cluster,
                                                                     ReduceOperation,
                                                                     PropagateNan>;

    template <typename T>
    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;

    static constexpr auto I0 = Number<0>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

    __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                               const OutGridDesc_M& out_grid_desc_m,
                               const InElementwiseOperation& in_elementwise_op,
                               const AccElementwiseOperation& acc_elementwise_op,
                               index_t block_group_size,
                               index_t num_k_block_tile_iteration,
                               AccDataType alpha,
                               const InDataType* const __restrict__ p_in_global,
                               OutDataType* const __restrict__ p_out_global)
    {
        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        // LDS
        __shared__ AccDataType p_block_reduce_buffer[BlockSize];

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto out_global_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_out_global, out_grid_desc_m.GetElementSpaceSize());

        auto block_reduce_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true> in_thread_buf;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / block_group_size;
        const index_t block_local_id  = block_global_id % block_group_size;

        const index_t thread_m_cluster_id =
            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
        const index_t thread_k_cluster_id =
            reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
                                   : thread_local_id % KThreadClusterSize;

        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc), ThreadBufferLengths,
            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
            InSrcVectorDim, InSrcVectorSize, 1, false>(
            in_grid_desc_m_k,
            make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                             block_local_id * reduceSizePerBlock + thread_k_cluster_id * KThreadSliceSize));

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        index_t reducedTiles = 0;

        do
        {
            threadwise_src_load.Run(
                in_grid_desc_m_k, in_global_buf, thread_buffer_desc, make_tuple(I0, I0), in_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                // do element-wise pre-reduction operation
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
                });

                // reduce on each thread-local slice
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
                });
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedTiles++;
        } while(reducedTiles < num_k_block_tile_iteration);

        constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));

        // Each block executes multiple parallel reductions on the LDS and atomic-adds its reduced
        // output to the global location corresponding to each invariant dimension, so that a
        // consistent reduced result is obtained for that invariant dimension. Due to the use of
        // vector_load, each block/thread is involved in multiple invariant dimensions.
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(reorder_thread_cluster)
            {
                block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
                    accu_value_buf[I];
            }
            else
                block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
                    accu_value_buf[I];

            accu_value_buf(I) = zeroVal;

            __syncthreads();

            blockwise_reduce::Reduce(block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
        });

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if(thread_k_cluster_id == 0)
            {
                acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

                accu_value_buf(I) *= alpha;
            }
        });

        if(thread_k_cluster_id == 0)
        {
            auto threadwise_dst_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M, PassThroughOp<AccDataType>,
                Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
                InMemoryDataOperationEnum_t::AtomicAdd, 1, true>(
                out_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp<AccDataType>{});

            threadwise_dst_store.Run(
                reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf);
        }
    };
};

} // namespace ck
#endif
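The kernel entry point above takes the grid descriptors and element-wise operators by value, so a host-side launch only has to forward them together with the pointers and the scaling factor. A minimal sketch of such a launch using hipLaunchKernelGGL from the HIP runtime, assuming the instantiated GridwiseReduction type, the descriptors/operators and grid_size have already been prepared by the corresponding device_reduce_* wrapper (none of those names come from this page):

#include <hip/hip_runtime.h>

// Sketch only: all upper-case-style template arguments and the lower-case objects below are
// placeholders produced elsewhere by the device-level wrapper; they are not defined on this page.
const auto kernel = kernel_reduce_multiblock_atocmi_add<GridwiseReduction,
                                                        InDataType,
                                                        OutDataType,
                                                        AccDataType,
                                                        InGridDesc_M_K,
                                                        OutGridDesc_M,
                                                        InElementwiseOperation,
                                                        AccElementwiseOperation>;

// p_out_global presumably has to be pre-initialized (e.g. zeroed for a sum-like reduction) before
// the launch, since every block accumulates into it with InMemoryDataOperationEnum_t::AtomicAdd.
hipLaunchKernelGGL(kernel,
                   dim3(grid_size),
                   dim3(BlockSize),
                   0,        // no dynamic LDS; the kernel declares its own __shared__ buffer
                   nullptr,  // default stream
                   in_grid_desc_m_k,
                   out_grid_desc_m,
                   in_elementwise_op,
                   acc_elementwise_op,
                   block_group_size,
                   num_k_block_tile_iteration,
                   alpha,
                   p_in_global,
                   p_out_global);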
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp 0 → 100644 View file @ 9dce6851
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP
#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {

template <typename GridwiseReduction,
          bool NeedIndices,
          typename InDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename WorkspaceDesc_M_K,
          typename InElementwiseOperation,
          typename AccElementwiseOperation>
__global__ void kernel_partial_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k,
                                                 const WorkspaceDesc_M_K workspace_desc_m_k,
                                                 const InElementwiseOperation in_elementwise_op,
                                                 const AccElementwiseOperation acc_elementwise_op,
                                                 index_t block_group_size,
                                                 index_t num_k_block_tile_iteration,
                                                 const InDataType* const __restrict__ p_src_global,
                                                 AccDataType* const __restrict__ p_ws_values_global,
                                                 IndexDataType* const __restrict__ p_ws_indices_global)
{
    if constexpr(!NeedIndices)
    {
        GridwiseReduction::Run(in_grid_desc_m_k,
                               workspace_desc_m_k,
                               in_elementwise_op,
                               acc_elementwise_op,
                               block_group_size,
                               num_k_block_tile_iteration,
                               p_src_global,
                               p_ws_values_global,
                               p_ws_indices_global);
    }
    else
    {
        GridwiseReduction::RunWithIndex(in_grid_desc_m_k,
                                        workspace_desc_m_k,
                                        in_elementwise_op,
                                        acc_elementwise_op,
                                        block_group_size,
                                        num_k_block_tile_iteration,
                                        p_src_global,
                                        p_ws_values_global,
                                        p_ws_indices_global);
    };
};

template <typename InDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename WorkspaceDesc_M_K,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
{
    static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

    static constexpr auto buffer1dDesc = make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));

    template <typename T>
    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;

    static constexpr auto I0 = Number<0>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                               const WorkspaceDesc_M_K& workspace_desc_m_k,
                               const InElementwiseOperation& in_elementwise_op,
                               const AccElementwiseOperation& acc_elementwise_op,
                               index_t block_group_size,
                               index_t num_k_block_tile_iteration,
                               const InDataType* const __restrict__ p_src_global,
                               AccDataType* const __restrict__ p_ws_values_global,
                               IndexDataType* const __restrict__ p_ws_indices_global)
    {
        using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer1dDesc),
                                                                        AccDataType,
                                                                        BlockSize,
                                                                        MThreadClusterSize,
                                                                        KThreadClusterSize,
                                                                        reorder_thread_cluster,
                                                                        ReduceOperation,
                                                                        PropagateNan>;

        using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

        (void)p_ws_indices_global;
        (void)acc_elementwise_op;

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        // LDS
        __shared__ AccDataType p_block_reduce_buffer[BlockSize];

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize());

        auto block_reduce_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true> in_thread_buf;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / block_group_size;
        const index_t block_local_id  = block_global_id % block_group_size;

        const index_t thread_m_cluster_id =
            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
        const index_t thread_k_cluster_id =
            reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
                                   : thread_local_id % KThreadClusterSize;

        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc), ThreadBufferLengths,
            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
            InSrcVectorDim, InSrcVectorSize, 1, false>(
            in_grid_desc_m_k,
            make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                             block_local_id * reduceSizePerBlock + thread_k_cluster_id * KThreadSliceSize));

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        index_t reducedTiles = 0;

        do
        {
            threadwise_src_load.Run(
                in_grid_desc_m_k, in_global_buf, thread_buffer_desc, make_tuple(I0, I0), in_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                // do element-wise pre-reduction operation
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
                });

                // reduce on each thread-local slice
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
                });
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedTiles++;
        } while(reducedTiles < num_k_block_tile_iteration);

        constexpr auto reduced_data_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        // Each block executes multiple parallel reductions on the LDS, and due to the use of
        // vector_load, each block/thread is involved in multiple invariant dimensions.
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(reorder_thread_cluster)
            {
                block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
                    accu_value_buf[I];
            }
            else
                block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
                    accu_value_buf[I];

            accu_value_buf(I) = zeroVal;

            __syncthreads();

            BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
        });

        if(thread_k_cluster_id == 0)
        {
            auto threadwise_workspace_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, AccDataType, decltype(reduced_data_desc), WorkspaceDesc_M_K, PassThroughOp<AccDataType>,
                Sequence<MThreadSliceSize, 1>, Sequence<0, 1>, 1, 1,
                InMemoryDataOperationEnum_t::Set, 1, true>(
                workspace_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp<AccDataType>{});

            threadwise_workspace_store.Run(
                reduced_data_desc, make_tuple(I0, I0), accu_value_buf, workspace_desc_m_k, workspace_global_buf);
        }
    };

    __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
                                        const WorkspaceDesc_M_K& workspace_desc_m_k,
                                        const InElementwiseOperation& in_elementwise_op,
                                        const AccElementwiseOperation& acc_elementwise_op,
                                        index_t block_group_size,
                                        index_t num_k_block_tile_iteration,
                                        const InDataType* const __restrict__ p_src_global,
                                        AccDataType* const __restrict__ p_ws_values_global,
                                        IndexDataType* const __restrict__ p_ws_indices_global)
    {
        using BlockwiseReduceWithIndex = PartitionedBlockwiseReductionWithIndexOn1dBuffer<
            decltype(buffer1dDesc), AccDataType, IndexDataType, BlockSize,
            MThreadClusterSize, KThreadClusterSize, reorder_thread_cluster, ReduceOperation, PropagateNan>;

        using AccumulationWithIndex =
            detail::AccumulateWithIndexAndNanCheck<PropagateNan, ReduceOperation, AccDataType, IndexDataType>;

        (void)acc_elementwise_op;

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        // LDS
        __shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
        __shared__ index_t p_block_reduce_idx_buffer[BlockSize];

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_ws_values_global, workspace_desc_m_k.GetElementSpaceSize());
        auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_ws_indices_global, workspace_desc_m_k.GetElementSpaceSize());

        auto block_reduce_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
        auto block_reduce_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_idx_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true> in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize * KThreadSliceSize, true> in_thread_idx_buf;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / block_group_size;
        const index_t block_local_id  = block_global_id % block_group_size;

        const index_t thread_m_cluster_id =
            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
        const index_t thread_k_cluster_id =
            reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
                                   : thread_local_id % KThreadClusterSize;

        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc), ThreadBufferLengths,
            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
            InSrcVectorDim, InSrcVectorSize, 1, false>(
            in_grid_desc_m_k,
            make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                             block_local_id * reduceSizePerBlock + thread_k_cluster_id * KThreadSliceSize));

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        index_t indexOffset = block_local_id * reduceSizePerBlock;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            accu_value_buf(I) = zeroVal;
            accu_index_buf(I) = 0;
        });

        index_t reducedTiles = 0;

        do
        {
            // load the thread slice
            threadwise_src_load.Run(
                in_grid_desc_m_k, in_global_buf, thread_buffer_desc, make_tuple(I0, I0), in_thread_val_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    // initialize the indices for the per-thread to-reduce values
                    in_thread_idx_buf(offset) = indexOffset + thread_k_cluster_id * KThreadSliceSize + J();

                    // do element-wise pre-reduction operation
                    in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset));
                });

                AccDataType tmpValue   = zeroVal;
                IndexDataType tmpIndex = 0;

                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    // reduce on the dim1 thread slice
                    AccumulationWithIndex::Calculate(
                        tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
                });

                // store the thread-local value to LDS for parallel reduction
                if constexpr(reorder_thread_cluster)
                {
                    block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) = tmpValue;
                    block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) = tmpIndex;
                }
                else
                {
                    block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) = tmpValue;
                    block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) = tmpIndex;
                }

                __syncthreads();

                BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf, block_reduce_idx_buf,
                                                 tmpValue, tmpIndex,
                                                 thread_m_cluster_id, thread_k_cluster_id);

                AccumulationWithIndex::Calculate(accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            indexOffset += K_BlockTileSize;
            reducedTiles++;
        } while(reducedTiles < num_k_block_tile_iteration);

        constexpr auto reduced_data_desc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        if(thread_k_cluster_id == 0)
        {
            auto threadwise_workspace_val_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, AccDataType, decltype(reduced_data_desc), WorkspaceDesc_M_K, PassThroughOp<AccDataType>,
                Sequence<MThreadSliceSize, 1>, Sequence<0, 1>, 1, 1,
                InMemoryDataOperationEnum_t::Set, 1, true>(
                workspace_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp<AccDataType>{});

            auto threadwise_workspace_idx_store = ThreadwiseTensorSliceTransfer_v1r3<
                IndexDataType, IndexDataType, decltype(reduced_data_desc), WorkspaceDesc_M_K, PassThroughOp<IndexDataType>,
                Sequence<MThreadSliceSize, 1>, Sequence<0, 1>, 1, 1,
                InMemoryDataOperationEnum_t::Set, 1, true>(
                workspace_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp<IndexDataType>{});

            threadwise_workspace_val_store.Run(
                reduced_data_desc, make_tuple(I0, I0), accu_value_buf, workspace_desc_m_k, workspace_global_val_buf);

            threadwise_workspace_idx_store.Run(
                reduced_data_desc, make_tuple(I0, I0), accu_index_buf, workspace_desc_m_k, workspace_global_idx_buf);
        }
    };
};

} // namespace ck
#endif
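In the partial-reduce variant above, each block writes its per-row partial result (a slice of shape MThreadSliceSize x 1) into the workspace at column block_local_id, so the tensor described by WorkspaceDesc_M_K has block_group_size entries along its second dimension for every invariant (M) position; the blockwise second call then reduces along that dimension. A minimal host-side sizing sketch under that assumption (the helper and parameter names are hypothetical, not taken from this commit):

#include <cstddef>

// Elements needed in the value workspace consumed by the second (blockwise) reduction call;
// when indices are requested, an index workspace with the same element count is needed as well.
std::size_t GetPartialReduceWorkspaceElementCount(std::size_t invariant_total_length, // total M length
                                                  std::size_t block_group_size)       // blocks per M tile row
{
    return invariant_total_length * block_group_size;
}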
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp 0 → 100644 View file @ 9dce6851
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
#define CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {

template <typename GridwiseReduction,
          bool NeedIndices,
          typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename InElementwiseOperation,
          typename AccElementwiseOperation>
__global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k,
                                         const OutGridDesc_M out_grid_desc_m,
                                         const InElementwiseOperation in_elementwise_op,
                                         const AccElementwiseOperation acc_elementwise_op,
                                         AccDataType alpha,
                                         const InDataType* const __restrict__ p_in_global,
                                         OutDataType beta,
                                         OutDataType* const __restrict__ p_out_global,
                                         IndexDataType* const __restrict__ p_indices_global)
{
    if constexpr(!NeedIndices)
    {
        GridwiseReduction::Run(in_grid_desc_m_k,
                               out_grid_desc_m,
                               in_elementwise_op,
                               acc_elementwise_op,
                               alpha,
                               p_in_global,
                               beta,
                               p_out_global,
                               p_indices_global);
    }
    else
    {
        GridwiseReduction::RunWithIndices(in_grid_desc_m_k,
                                          out_grid_desc_m,
                                          in_elementwise_op,
                                          acc_elementwise_op,
                                          alpha,
                                          p_in_global,
                                          beta,
                                          p_out_global,
                                          p_indices_global);
    };
};

template <typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan,
          bool BetaIsZero,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_threadwise
{
    template <typename T>
    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;

    static constexpr auto I0 = Number<0>{};

    __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                               const OutGridDesc_M& out_grid_desc_m,
                               const InElementwiseOperation& in_elementwise_op,
                               const AccElementwiseOperation& acc_elementwise_op,
                               AccDataType alpha,
                               const InDataType* const __restrict__ p_in_global,
                               OutDataType beta,
                               OutDataType* const __restrict__ p_out_global,
                               IndexDataType* const __restrict__ p_indices_global)
    {
        using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

        (void)p_indices_global;

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto dst_global_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_out_global, out_grid_desc_m.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true> in_thread_buf;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc), ThreadBufferLengths,
            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
            InSrcVectorDim, InSrcVectorSize, 1, false>(
            in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);

        index_t reducedLength = 0;

        do
        {
            threadwise_src_load.Run(
                in_grid_desc_m_k, in_global_buf, thread_buffer_desc, make_tuple(I0, I0), in_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                // do element-wise pre-reduction operation
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
                });

                // reduce on each thread-local slice
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
                });
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedLength += KThreadSliceSize;
        } while(reducedLength < toReduceLength);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

            accu_value_buf(I) *= alpha;
        });

        constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));

        if constexpr(!BetaIsZero)
        {
            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<
                    OutDataType, OutDataType, OutGridDesc_M, decltype(reduced_data_desc),
                    Sequence<MThreadSliceSize>, Sequence<0>, 0, 1, 1, true>(
                    out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true> priorDstValue_buf;

                threadwise_dst_load.Run(
                    out_grid_desc_m, dst_global_buf, reduced_data_desc, make_tuple(I0), priorDstValue_buf);

                static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                    accu_value_buf(I) += type_convert<AccDataType>(priorDstValue_buf[I] * beta);
                });
            };
        };

        auto threadwise_dst_store = ThreadwiseTensorSliceTransfer_v1r3<
            AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M, PassThroughOp<AccDataType>,
            Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
            InMemoryDataOperationEnum_t::Set, 1, false>(
            out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), PassThroughOp<AccDataType>{});

        threadwise_dst_store.Run(
            reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf);
    };

    __device__ static void RunWithIndices(const InGridDesc_M_K& in_grid_desc_m_k,
                                          const OutGridDesc_M& out_grid_desc_m,
                                          const InElementwiseOperation& in_elementwise_op,
                                          const AccElementwiseOperation& acc_elementwise_op,
                                          AccDataType alpha,
                                          const InDataType* const __restrict__ p_in_global,
                                          OutDataType beta,
                                          OutDataType* const __restrict__ p_out_global,
                                          IndexDataType* const __restrict__ p_indices_global)
    {
        using AccumulationWithIndex =
            detail::AccumulateWithIndexAndNanCheck<PropagateNan, ReduceOperation, AccDataType, IndexDataType>;

        (void)acc_elementwise_op;

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto out_global_val_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_out_global, out_grid_desc_m.GetElementSpaceSize());
        auto out_global_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Global>(p_indices_global, out_grid_desc_m.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true> in_thread_buf;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            accu_value_buf(I) = zeroVal;
            accu_index_buf(I) = 0;
        });

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc), ThreadBufferLengths,
            typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
            InSrcVectorDim, InSrcVectorSize, 1, false>(
            in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);

        index_t indexStart    = 0;
        index_t reducedLength = 0;

        do
        {
            threadwise_src_load.Run(
                in_grid_desc_m_k, in_global_buf, thread_buffer_desc, make_tuple(I0, I0), in_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                // do element-wise pre-reduction operation
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
                });

                // reduce on each thread-local slice
                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;

                    AccumulationWithIndex::Calculate(
                        accu_value_buf(I), in_thread_buf[offset], accu_index_buf(I), indexStart + J);
                });
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            indexStart += KThreadSliceSize;
            reducedLength += KThreadSliceSize;
        } while(reducedLength < toReduceLength);

        // for indexed operations, acc_elementwise_op should do nothing
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

            accu_value_buf(I) *= alpha;
        });

        constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));

        if constexpr(!BetaIsZero)
        {
            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<
                    OutDataType, OutDataType, OutGridDesc_M, decltype(reduced_data_desc),
                    Sequence<MThreadSliceSize>, Sequence<0>, 0, 1, 1, false>(
                    out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true> priorDstValue_buf;

                threadwise_dst_load.Run(
                    out_grid_desc_m, out_global_val_buf, reduced_data_desc, make_tuple(I0), priorDstValue_buf);

                static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                    accu_value_buf(I) += type_convert<AccDataType>(priorDstValue_buf[I] * beta);
                });
            };
        };

        auto threadwise_dst_val_store = ThreadwiseTensorSliceTransfer_v1r3<
            AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M, PassThroughOp<AccDataType>,
            Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
            InMemoryDataOperationEnum_t::Set, 1, false>(
            out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), PassThroughOp<AccDataType>{});

        auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3<
            IndexDataType, IndexDataType, decltype(reduced_data_desc), OutGridDesc_M, PassThroughOp<IndexDataType>,
            Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
            InMemoryDataOperationEnum_t::Set, 1, false>(
            out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), PassThroughOp<IndexDataType>{});

        threadwise_dst_val_store.Run(
            reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_val_buf);

        threadwise_dst_idx_store.Run(
            reduced_data_desc, make_tuple(I0), accu_index_buf, out_grid_desc_m, out_global_idx_buf);
    };
};

} // namespace ck
#endif
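The threadwise kernel above gives every thread its own MThreadSliceSize output rows (its source window starts at thread_global_1d_id * MThreadSliceSize) and lets that thread walk the whole K length alone, so a launch only has to cover the invariant M length with threads. A sketch of the grid-size arithmetic under that reading, with a hypothetical helper name and parameter names (the real device-level wrapper is not part of this page):

#include <cstdint>

// Number of thread blocks needed so that grid_size * block_size * m_thread_slice_size >= total M length.
int64_t CalculateThreadwiseGridSize(int64_t invariant_total_length, // total M length
                                    int64_t block_size,             // BlockSize template parameter
                                    int64_t m_thread_slice_size)    // MThreadSliceSize template parameter
{
    const int64_t num_threads = (invariant_total_length + m_thread_slice_size - 1) / m_thread_slice_size;
    return (num_threads + block_size - 1) / block_size;
}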
composable_kernel/include/tensor_operation/gridwise_batched_gemm_xdlops_v2r3.hpp → include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_xdlops_v2r3.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp → include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v3.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_pipeline_v1.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r4.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp View file @ 9dce6851 File moved
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r4r2.hpp → include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp View file @ 9dce6851 File moved