gaoqiong / composable_kernel · Commit b7a6f810

Authored Mar 10, 2022 by Chao Liu

Merge remote-tracking branch 'origin/develop' into fix_threadwise_copy_error_in_reduction

Parents: b29dfd70, 827301d9

Changes: 70 files — showing 20 changed files with 711 additions and 601 deletions (+711, -601)
Changed files shown in this view:

- example/12_reduce/README.md (+60, -0)
- example/12_reduce/reduce_blockwise.cpp (+11, -38)
- example/13_pool2d_fwd/README.md (+55, -0)
- include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp (+56, -71)
- include/ck/tensor_operation/gpu/device/device_reduce.hpp (+3, -2)
- include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp (+25, -16)
- include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp (+19, -12)
- include/ck/tensor_operation/gpu/device/device_reduce_common.hpp (+40, -17)
- include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp (+24, -16)
- include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp (+28, -21)
- include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp (+25, -15)
- include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp (+129, -154)
- include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp (+42, -38)
- include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp (+71, -84)
- include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp (+25, -22)
- library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp (+34, -31)
- library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp (+19, -19)
- library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp (+9, -9)
- library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp (+27, -27)
- library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp (+9, -9)
example/12_reduce/README.md  (new file, mode 100644)

# Instructions for ```reduce_blockwise``` Example

## Docker script
```bash
docker run                                      \
-it                                             \
--rm                                            \
--privileged                                    \
--group-add sudo                                \
-w /root/workspace                              \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace   \
rocm/tensorflow:rocm4.3.1-tf2.6-dev             \
/bin/bash
```

## Build ```reduce_blockwise```
```bash
mkdir build && cd build
```

```bash
# Need to specify target ID, example below is gfx908
cmake                                                                 \
-D BUILD_DEV=OFF                                                      \
-D CMAKE_BUILD_TYPE=Release                                           \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "  \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
-D CMAKE_PREFIX_PATH=/opt/rocm                                        \
..
```

```bash
make -j reduce_blockwise
```

## Run ```reduce_blockwise```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x>   : verification (0=no, 1=yes)
# arg1: initialization (0=no init, 1=integer value, 2=decimal value)
# arg2: run kernel # of times (>1)
./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
```

Result
```
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 3 times...
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 529, 529
root@dc-smc-18:/data/composable_kernel/Build3# bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 10 times...
Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 528, 528
```
example/12_reduce/reduce_blockwise.cpp

@@ -14,6 +14,7 @@
 #include "device_reduce_blockwise.hpp"
 #include "host_reduce_util.hpp"
 #include "host_generic_reduction.hpp"
+#include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"

@@ -29,7 +30,7 @@ using kOutDataType = ck::half_t;
 using kAccDataType = float;

 constexpr int Rank = 4;
-using ReduceDims_  = ck::Sequence<0, 1, 2>;
+constexpr int NumReduceDim = 3;

 constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::NORM2;
 constexpr NanPropagation_t NanOpt     = NanPropagation_t::PROPAGATE_NAN;

@@ -46,7 +47,7 @@ using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
                                                    kAccDataType,
                                                    kOutDataType,
                                                    Rank,
-                                                   ReduceDims_,
+                                                   NumReduceDim,
                                                    ReduceOperation,
                                                    InElementwiseOperation,
                                                    AccElementwiseOperation,

@@ -192,39 +193,13 @@ class SimpleAppArgs
     };
 };

-template <int Rank, typename ReduceDims>
-static std::vector<int> get_reduce_dims()
-{
-    std::vector<int> resDims;
-
-    static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
-
-    return (resDims);
-};
-
-template <int Rank, typename ReduceDims>
-static std::vector<int> get_invariant_dims()
-{
-    std::vector<int> resDims;
-    unsigned int incFlag = 0;
-
-    static_for<0, ReduceDims::Size(), 1>{}(
-        [&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
-
-    for(int dim = 0; dim < Rank; dim++)
-    {
-        if(incFlag & (0x1 << dim))
-            continue;
-        resDims.push_back(dim);
-    };
-
-    return (resDims);
-};
-
 int main(int argc, char* argv[])
 {
     using namespace ck::host_reduce;

+    const std::vector<int> reduceDims{0, 1, 2};
+    const std::vector<int> invariantDims{3};
+
     SimpleAppArgs args;

     if(args.processArgs(argc, argv) < 0)

@@ -260,15 +235,12 @@ int main(int argc, char* argv[])
     Tensor<InDataType> in(args.inLengths);

-    const std::vector<int> InvariantDims = get_invariant_dims<Rank, ReduceDims_>();
-    const std::vector<int> ReduceDims    = get_reduce_dims<Rank, ReduceDims_>();
-
     std::vector<size_t> outLengths;

-    if(InvariantDims.empty())
+    if(invariantDims.empty())
         outLengths.push_back(1);
     else
-        for(auto dim : InvariantDims)
+        for(auto dim : invariantDims)
             outLengths.push_back(args.inLengths[dim]);

     Tensor<OutDataType> out_ref(outLengths);

@@ -328,7 +300,7 @@ int main(int argc, char* argv[])
     if(args.do_verification)
     {
         ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
-            hostReduce(in.mDesc, out_ref.mDesc, InvariantDims, ReduceDims);
+            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

         hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());

@@ -350,6 +322,7 @@ int main(int argc, char* argv[])
                                               i_inStrides,
                                               i_outLengths,
                                               i_outStrides,
+                                              reduceDims,
                                               alpha,
                                               beta,
                                               in_dev.GetDeviceBuffer(),
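With the reduce dimensions now passed at runtime, the example hard-codes the matching `invariantDims{3}` for host-side verification. As a hypothetical helper (not part of this commit), the invariant dimensions could instead be derived from `Rank` and `reduceDims`; this is essentially the logic of the removed `get_invariant_dims` restated at runtime:

```cpp
// Hypothetical runtime helper, shown only to illustrate the relationship between
// reduceDims and invariantDims in the example above.
#include <iostream>
#include <vector>

std::vector<int> invariant_dims(int rank, const std::vector<int>& reduceDims)
{
    unsigned flag = 0;
    for(int d : reduceDims)
        flag |= 1u << d; // mark each reduce dimension

    std::vector<int> result;
    for(int d = 0; d < rank; ++d)
        if((flag & (1u << d)) == 0) // keep the unmarked dimensions
            result.push_back(d);
    return result;
}

int main()
{
    for(int d : invariant_dims(4, {0, 1, 2}))
        std::cout << d << '\n'; // prints 3, matching invariantDims{3} in the example
}
```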
example/13_pool2d_fwd/README.md  (new file, mode 100644)

# Instructions for ```pool2d_fwd``` Example

## Docker script
```bash
docker run                                      \
-it                                             \
--rm                                            \
--privileged                                    \
--group-add sudo                                \
-w /root/workspace                              \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace   \
rocm/tensorflow:rocm4.3.1-tf2.6-dev             \
/bin/bash
```

## Build ```pool2d_fwd```
```bash
mkdir build && cd build
```

```bash
# Need to specify target ID, example below is gfx908
cmake                                                                 \
-D BUILD_DEV=OFF                                                      \
-D CMAKE_BUILD_TYPE=Release                                           \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "  \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
-D CMAKE_PREFIX_PATH=/opt/rocm                                        \
..
```

```bash
make -j pool2d_fwd
```

## Run ```pool2d_fwd```
```bash
# arg1: verification (0=no, 1=yes)
# arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# arg3: run kernel # of times (>1)
# arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./example/pool2d_fwd 1 1 10
```

Result
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up
Start running 10 times...
Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
error: 0
max_diff: 0, 1, 1
```
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp

@@ -32,57 +32,53 @@
 #include "reduction_operator.hpp"
 #include "reduction_functions_accumulate.hpp"
+#include "cluster_descriptor.hpp"

 namespace ck {

-template <typename Buffer1dDescType,
-          typename AccDataType,
+template <typename AccDataType,
           index_t BlockSize,
-          index_t MThreadClusterSize,
-          index_t KThreadClusterSize,
-          bool ReorderThreadClusters,
+          typename ThreadClusterLengths_M_K,
+          typename ThreadClusterArrangeOrder,
           typename OpReduce,
           bool PropagateNan>
-struct PartitionedBlockwiseReductionOn1dBuffer
+struct PartitionedBlockwiseReduction
 {
-    static constexpr auto buffer_1d_desc = Buffer1dDescType{};
-
-    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
+    static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
                   "The product of cluster lengths should be same as BlockSize!");
-    static_assert(KThreadClusterSize > 1, "Parallel reduction need work on at least two elements");
-
-    static_assert(buffer_1d_desc.GetElementSize() == BlockSize,
-                  "The buffer size should be the same as BlockSize!");
+
+    static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0);
+    static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1);
+
+    static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements");
+
+    static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<BufferLength_M>{}, Number<BufferLength_K>{}));
+
+    static constexpr auto thread_cluster_desc =
+        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

     using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;

     template <typename BufferType>
-    __device__ static void Reduce(BufferType& block_buffer,
-                                  AccDataType& accuData,
-                                  index_t thread_m_cluster_id,
-                                  index_t thread_k_cluster_id)
+    __device__ static void Reduce(BufferType& block_buffer, AccDataType& accuData)
     {
-        constexpr auto cluster_len_shift = get_shift<KThreadClusterSize>();
+        constexpr auto cluster_len_shift = get_shift<BufferLength_K>();
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id()));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
+        const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];

         static_for<0, cluster_len_shift, 1>{}([&](auto I) {
             constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I());

             if(thread_k_cluster_id < indOffset)
             {
-                // consider the thread clusters order, ensure the contiguous locations are accessed
-                // by contiguous Thread-ID
-                index_t offset1 =
-                    ReorderThreadClusters
-                        ? buffer_1d_desc.CalculateOffset(make_tuple(
-                              thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id))
-                        : buffer_1d_desc.CalculateOffset(make_tuple(
-                              thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id));
-                index_t offset2 =
-                    ReorderThreadClusters
-                        ? buffer_1d_desc.CalculateOffset(
-                              make_tuple((thread_k_cluster_id + indOffset) * MThreadClusterSize +
-                                         thread_m_cluster_id))
-                        : buffer_1d_desc.CalculateOffset(make_tuple(
-                              thread_m_cluster_id * KThreadClusterSize +
-                              (thread_k_cluster_id + indOffset)));
+                index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx);
+                index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
+                                                                     make_tuple(0, indOffset));

                 AccDataType opData1 = type_convert<AccDataType>(block_buffer[offset1]);
                 AccDataType opData2 = type_convert<AccDataType>(block_buffer[offset2]);

@@ -93,34 +89,34 @@ struct PartitionedBlockwiseReductionOn1dBuffer
             __syncthreads();
         });

-        index_t offset =
-            ReorderThreadClusters
-                ? buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id))
-                : buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id * KThreadClusterSize));
+        index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));

         accuData = type_convert<AccDataType>(block_buffer[offset]);
     };
 };

-template <typename Buffer1dDescType,
-          typename AccDataType,
+template <typename AccDataType,
           typename IndexDataType,
           index_t BlockSize,
-          index_t MThreadClusterSize,
-          index_t KThreadClusterSize,
-          bool ReorderThreadClusters,
+          typename ThreadClusterLengths_M_K,
+          typename ThreadClusterArrangeOrder,
           typename OpReduce,
           bool PropagateNan>
-struct PartitionedBlockwiseReductionWithIndexOn1dBuffer
+struct PartitionedBlockwiseReductionWithIndex
 {
-    static constexpr auto buffer_1d_desc = Buffer1dDescType{};
-
-    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
+    static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
                   "The product of cluster lengths should be same as BlockSize!");
-    static_assert(KThreadClusterSize > 1, "Parallel reduction need work on at least two elements");
-
-    static_assert(buffer_1d_desc.GetElementSize() == BlockSize,
-                  "The buffer size should be the same as BlockSize!");
+
+    static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0);
+    static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1);
+
+    static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements");
+
+    static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<BufferLength_M>{}, Number<BufferLength_K>{}));
+
+    static constexpr auto thread_cluster_desc =
+        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

     using Accumulation =
         detail::AccumulateWithIndexAndNanCheck<PropagateNan, OpReduce, AccDataType, IndexDataType>;

@@ -130,32 +126,24 @@ struct PartitionedBlockwiseReductionWithIndexOn1dBuffer
     __device__ static void Reduce(BufferType& block_val_buffer,
                                   IdxBufferType& block_idx_buffer,
                                   AccDataType& accuData,
-                                  IndexDataType& accuIndex,
-                                  index_t thread_m_cluster_id,
-                                  index_t thread_k_cluster_id)
+                                  IndexDataType& accuIndex)
     {
-        constexpr auto cluster_len_shift = get_shift<KThreadClusterSize>();
+        constexpr auto cluster_len_shift = get_shift<BufferLength_K>();
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id()));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
+        const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];

         static_for<0, cluster_len_shift, 1>{}([&](auto I) {
             constexpr index_t indOffset = 1 << I();

             if(thread_k_cluster_id % (indOffset * 2) == 0)
             {
-                // consider the thread clusters order, ensure the contiguous locations are accessed
-                // by contiguous Thread-ID
-                index_t offset1 =
-                    ReorderThreadClusters
-                        ? buffer_1d_desc.CalculateOffset(make_tuple(
-                              thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id))
-                        : buffer_1d_desc.CalculateOffset(make_tuple(
-                              thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id));
-                index_t offset2 =
-                    ReorderThreadClusters
-                        ? buffer_1d_desc.CalculateOffset(
-                              make_tuple((thread_k_cluster_id + indOffset) * MThreadClusterSize +
-                                         thread_m_cluster_id))
-                        : buffer_1d_desc.CalculateOffset(make_tuple(
-                              thread_m_cluster_id * KThreadClusterSize +
-                              (thread_k_cluster_id + indOffset)));
+                index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx);
+                index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
+                                                                     make_tuple(0, indOffset));

                 AccDataType opData1 = type_convert<AccDataType>(block_val_buffer[offset1]);
                 AccDataType opData2 = type_convert<AccDataType>(block_val_buffer[offset2]);

@@ -170,10 +158,7 @@ struct PartitionedBlockwiseReductionWithIndexOn1dBuffer
             __syncthreads();
         });

-        index_t offset =
-            ReorderThreadClusters
-                ? buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id))
-                : buffer_1d_desc.CalculateOffset(make_tuple(thread_m_cluster_id * KThreadClusterSize));
+        index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));

         accuData  = type_convert<AccDataType>(block_val_buffer[offset]);
         accuIndex = block_idx_buffer[offset];
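The loop structure shared by both block-wise reduction structs is easier to see outside the CK template machinery. Below is a small standalone CPU sketch (illustrative only; the names and the power-of-two K requirement are assumptions carried over from the diff) of the pairwise-halving pass that `Reduce` performs over the K dimension of the LDS buffer, with the `__syncthreads()` between steps replaced by sequential loop iterations.

```cpp
// Standalone CPU analogue of the halving loop in PartitionedBlockwiseReduction::Reduce.
#include <cassert>
#include <iostream>
#include <vector>

// Reduce each row of an M x K buffer into column 0; K must be a power of two > 1.
void block_reduce_rows(std::vector<float>& buf, int M, int K)
{
    assert(K > 1 && (K & (K - 1)) == 0);

    int shift = 0;
    while((1 << shift) < K)
        ++shift; // plays the role of get_shift<BufferLength_K>()

    for(int s = 0; s < shift; ++s)
    {
        const int indOffset = 1 << (shift - 1 - s); // active K range halves each step

        // every "thread" (m, k) with k < indOffset accumulates its partner at k + indOffset
        for(int m = 0; m < M; ++m)
            for(int k = 0; k < indOffset; ++k)
                buf[m * K + k] += buf[m * K + (k + indOffset)];
        // on the GPU a __syncthreads() separates these steps
    }
}

int main()
{
    const int M = 2, K = 8;
    std::vector<float> buf(M * K, 1.0f); // each row of ones sums to 8

    block_reduce_rows(buf, M, K);

    for(int m = 0; m < M; ++m)
        std::cout << "row " << m << " -> " << buf[m * K] << '\n';
}
```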
include/ck/tensor_operation/gpu/device/device_reduce.hpp

@@ -36,14 +36,15 @@ struct DeviceReduce : public BaseOperator
                         const std::vector<int>& inStrides,
                         const std::vector<int>& outLengths,
                         const std::vector<int>& outStrides,
+                        const std::vector<int>& reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& inElementwiseOp,
-                        const AccElementwiseOperation& accElementwiseOp) = 0;
+                        const InElementwiseOperation& in_elementwise_op,
+                        const AccElementwiseOperation& acc_elementwise_op) = 0;

     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp

@@ -15,8 +15,8 @@ namespace device {
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          int Rank,
-          typename ReduceDims,
+          index_t Rank,
+          index_t NumReduceDim,
           typename ReduceOperation,
           typename InElementwiseOperation,
           typename AccElementwiseOperation,

@@ -40,7 +40,12 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
     static constexpr bool BetaIsZero = NeedIndices;

-    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
+    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
+
+    using InvariantDims =
+        typename conditional<NumInvariantDim == 0,
+                             Sequence<>,
+                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
+    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

     static constexpr index_t srcDims = Rank;
     static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();

@@ -74,7 +79,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
         }
         else
         {
-            const auto toReduceDimLengths =
+            const auto reduceDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
             const auto invariantDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

@@ -82,7 +87,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
             return transform_tensor_descriptor(
                 inDesc,
                 make_tuple(make_merge_transform(invariantDimLengths),
-                           make_merge_transform(toReduceDimLengths)),
+                           make_merge_transform(reduceDimLengths)),
                 make_tuple(InvariantDims{}, ReduceDims{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));
         }

@@ -136,6 +141,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
                  const std::vector<int>& inStrides,
                  const std::vector<int>& outLengths,
                  const std::vector<int>& outStrides,
+                 const std::vector<int>& reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,

@@ -144,30 +150,31 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
                  AccDataType* workspace_dev,
                  const InElementwiseOperation& in_elementwise_op,
                  const AccElementwiseOperation& acc_elementwise_op)
-            : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
+            : outLengths_{outLengths},
+              outStrides_{outStrides},
+              in_dev_{in_dev},
+              out_dev_{out_dev},
+              out_indices_dev_{out_indices_dev},
+              in_elementwise_op_{in_elementwise_op},
+              acc_elementwise_op_{acc_elementwise_op}
         {
             (void)workspace_dev;

-            inLengths_  = inLengths;
-            inStrides_  = inStrides;
-            outLengths_ = outLengths;
-            outStrides_ = outStrides;
-
-            in_elementwise_op_  = in_elementwise_op;
-            acc_elementwise_op_ = acc_elementwise_op;
+            std::tie(inLengths_, inStrides_) =
+                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);

             alpha_ = static_cast<AccDataType>(alpha);
             beta_  = static_cast<OutDataType>(beta);

             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths);
+                get_2d_lengths<Rank, ReduceDims>(inLengths_);

             if constexpr(InvariantDims::Size() == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];

-            reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];

             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;

@@ -305,6 +312,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
+                       const std::vector<int>& reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,

@@ -318,6 +326,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
                                           inStrides,
                                           outLengths,
                                           outStrides,
+                                          reduceDims,
                                           alpha,
                                           beta,
                                           static_cast<const InDataType*>(in_dev),
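After the shuffle, the first `NumInvariantDim` entries of `inLengths_` belong to the invariant (M) extent and the rest to the reduce (K) extent, which is what `get_2d_lengths<Rank, ReduceDims>(inLengths_)` collapses into the 2-D view. The standalone sketch below (illustrative, not CK code; it reuses the shuffled lengths of the 16x64x32x960 example with reduce dims {0, 1, 2}) makes that split concrete.

```cpp
// Sketch of the M/K split the device op derives from the shuffled lengths.
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

std::pair<std::size_t, std::size_t> get_2d_lengths(const std::vector<int>& shuffledLengths,
                                                   int numInvariantDim)
{
    std::size_t invariant_total_length = 1; // product of the leading invariant lengths (M)
    std::size_t reduce_total_length    = 1; // product of the trailing reduce lengths (K)

    for(std::size_t i = 0; i < shuffledLengths.size(); ++i)
    {
        if(i < static_cast<std::size_t>(numInvariantDim))
            invariant_total_length *= shuffledLengths[i];
        else
            reduce_total_length *= shuffledLengths[i];
    }

    return {invariant_total_length, reduce_total_length};
}

int main()
{
    // lengths after shuffling {16, 64, 32, 960} with reduceDims {0, 1, 2}: {960, 16, 64, 32}
    const auto [m, k] = get_2d_lengths({960, 16, 64, 32}, /*numInvariantDim=*/1);
    std::cout << "M = " << m << ", K = " << k << '\n'; // M = 960, K = 32768
}
```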
include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp

@@ -15,8 +15,8 @@ namespace device {
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          int Rank,
-          typename ReduceDims,
+          index_t Rank,
+          index_t NumReduceDim,
           typename ReduceOperation,
           typename InElementwiseOperation,
           typename AccElementwiseOperation,

@@ -45,7 +45,11 @@ struct DeviceReduceBlockWiseSecondCall
         std::is_same<InDataType, AccDataType>::value,
         "InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!");

-    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
+    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
+
+    using InvariantDims =
+        typename conditional<NumInvariantDim == 0,
+                             Sequence<>,
+                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;

     static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();

@@ -117,16 +121,16 @@ struct DeviceReduceBlockWiseSecondCall
                  AccDataType* workspace_dev,
                  const InElementwiseOperation& in_elementwise_op,
                  const AccElementwiseOperation& acc_elementwise_op)
-            : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
+            : inLengths_(inLengths),
+              inStrides_(inStrides),
+              outLengths_(outLengths),
+              outStrides_(outStrides),
+              in_dev_{in_dev},
+              out_dev_{out_dev},
+              out_indices_dev_{out_indices_dev},
+              in_elementwise_op_(in_elementwise_op),
+              acc_elementwise_op_(acc_elementwise_op)
         {
-            inLengths_  = inLengths;
-            inStrides_  = inStrides;
-            outLengths_ = outLengths;
-            outStrides_ = outStrides;
-
-            in_elementwise_op_  = in_elementwise_op;
-            acc_elementwise_op_ = acc_elementwise_op;
-
             alpha_ = static_cast<AccDataType>(alpha);
             beta_  = static_cast<OutDataType>(beta);

@@ -268,6 +272,7 @@ struct DeviceReduceBlockWiseSecondCall
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
+                       const std::vector<int>& reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,

@@ -277,6 +282,8 @@ struct DeviceReduceBlockWiseSecondCall
                        const InElementwiseOperation& in_elementwise_op,
                        const AccElementwiseOperation& acc_elementwise_op) override
     {
+        (void)reduceDims;
+
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
                                           outLengths,
include/ck/tensor_operation/gpu/device/device_reduce_common.hpp

@@ -2,6 +2,7 @@
 #define DEVICE_REDUCE_COMMON_HPP

 #include <vector>
+#include <cassert>

 #include "common_header.hpp"
 #include "reduction_enums.hpp"

@@ -40,23 +41,6 @@ constexpr bool belong()
     return (inside);
 };

-template <int Rank, typename ReduceDims, int start = 0>
-constexpr auto get_invariant_dims()
-{
-    static_assert(Rank <= 6, "bigger Rank size not supported!");
-
-    if constexpr(start >= Rank)
-        return Sequence<>{};
-    else
-    {
-        if constexpr(!belong<start, ReduceDims>())
-            return merge_sequences(Sequence<start>{},
-                                   get_invariant_dims<Rank, ReduceDims, start + 1>());
-        else
-            return get_invariant_dims<Rank, ReduceDims, start + 1>();
-    };
-};
-
 // helper functions using variadic template arguments
 template <index_t... Ns>
 static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)

@@ -74,6 +58,45 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arrayS
     return make_tuple_from_array_and_index_seq(lengths, index_seq);
 };

+template <index_t Rank, index_t NumReduceDim>
+static inline std::pair<std::vector<int>, std::vector<int>>
+shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
+                          const std::vector<int>& dimStrides,
+                          const std::vector<int>& reduceDims)
+{
+    std::vector<int> newDimLengths;
+    std::vector<int> newDimStrides;
+
+    assert(Rank == dimLengths.size() && Rank == dimStrides.size() &&
+           NumReduceDim == reduceDims.size());
+
+    int reduceFlag = 0;
+
+    // flag the bits for the reduceDims
+    for(int i = 0; i < NumReduceDim; i++)
+    {
+        reduceFlag |= 1 << reduceDims[i];
+    };
+
+    // collect invariant dimensions
+    for(int i = 0; i < Rank; i++)
+        if((reduceFlag & (1 << i)) == 0)
+        {
+            newDimLengths.push_back(dimLengths[i]);
+            newDimStrides.push_back(dimStrides[i]);
+        };
+
+    // collect reduce dimensions
+    for(int i = 0; i < Rank; i++)
+        if((reduceFlag & (1 << i)) > 0)
+        {
+            newDimLengths.push_back(dimLengths[i]);
+            newDimStrides.push_back(dimStrides[i]);
+        };
+
+    return std::make_pair(newDimLengths, newDimStrides);
+};
+
 } // namespace device
 } // namespace tensor_operation
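The new `shuffle_tensor_dimensions` helper is self-contained enough to exercise on the host. The following standalone sketch mirrors its logic (it is not the CK header itself, and the two-pass loop is a compacted restatement of the code above) and applies it to the packed 16x64x32x960 tensor from the reduce_blockwise example, confirming that the single invariant dimension moves to the front and the reduce dimensions follow.

```cpp
// Host-side sketch of shuffle_tensor_dimensions: invariant dims first, reduce dims last.
#include <cassert>
#include <iostream>
#include <utility>
#include <vector>

template <int Rank, int NumReduceDim>
std::pair<std::vector<int>, std::vector<int>>
shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
                          const std::vector<int>& dimStrides,
                          const std::vector<int>& reduceDims)
{
    assert(Rank == static_cast<int>(dimLengths.size()) &&
           Rank == static_cast<int>(dimStrides.size()) &&
           NumReduceDim == static_cast<int>(reduceDims.size()));

    int reduceFlag = 0;
    for(int d : reduceDims)
        reduceFlag |= 1 << d; // flag the bits for the reduce dimensions

    std::vector<int> newLengths, newStrides;
    for(int pass = 0; pass < 2; ++pass)      // pass 0: invariant dims, pass 1: reduce dims
        for(int i = 0; i < Rank; ++i)
            if(((reduceFlag >> i) & 1) == pass)
            {
                newLengths.push_back(dimLengths[i]);
                newStrides.push_back(dimStrides[i]);
            }

    return {newLengths, newStrides};
}

int main()
{
    // 16 x 64 x 32 x 960 tensor with packed (row-major) strides, reducing dims {0, 1, 2}
    const std::vector<int> lengths{16, 64, 32, 960};
    const std::vector<int> strides{1966080, 30720, 960, 1};

    const auto [l, s] = shuffle_tensor_dimensions<4, 3>(lengths, strides, {0, 1, 2});

    for(std::size_t i = 0; i < l.size(); ++i)
        std::cout << l[i] << " (stride " << s[i] << ")\n"; // 960 first, then 16, 64, 32
}
```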
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp

@@ -17,8 +17,8 @@ namespace device {
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          int Rank,
-          typename ReduceDims,
+          index_t Rank,
+          index_t NumReduceDim,
           typename ReduceOperation,
           typename InElementwiseOperation,
           typename AccElementwiseOperation,

@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockAtomicAdd
     using IndexDataType = int32_t;

-    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
+    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
+
+    using InvariantDims =
+        typename conditional<NumInvariantDim == 0,
+                             Sequence<>,
+                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
+    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

     static constexpr index_t srcDims = Rank;
     static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();

@@ -84,7 +89,7 @@ struct DeviceReduceMultiBlockAtomicAdd
         }
         else
         {
-            const auto toReduceDimLengths =
+            const auto reduceDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
             const auto invariantDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

@@ -92,7 +97,7 @@ struct DeviceReduceMultiBlockAtomicAdd
             return transform_tensor_descriptor(
                 inDesc,
                 make_tuple(make_merge_transform(invariantDimLengths),
-                           make_merge_transform(toReduceDimLengths)),
+                           make_merge_transform(reduceDimLengths)),
                 make_tuple(InvariantDims{}, ReduceDims{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));
         }

@@ -147,6 +152,7 @@ struct DeviceReduceMultiBlockAtomicAdd
                  const std::vector<int>& inStrides,
                  const std::vector<int>& outLengths,
                  const std::vector<int>& outStrides,
+                 const std::vector<int>& reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,

@@ -155,31 +161,31 @@ struct DeviceReduceMultiBlockAtomicAdd
                  AccDataType* workspace_dev,
                  const InElementwiseOperation& in_elementwise_op,
                  const AccElementwiseOperation& acc_elementwise_op)
-            : in_dev_{in_dev}, out_dev_{out_dev}
+            : outLengths_{outLengths},
+              outStrides_{outStrides},
+              in_dev_{in_dev},
+              out_dev_{out_dev},
+              in_elementwise_op_{in_elementwise_op},
+              acc_elementwise_op_{acc_elementwise_op}
         {
             (void)out_indices_dev;
             (void)workspace_dev;

-            inLengths_  = inLengths;
-            inStrides_  = inStrides;
-            outLengths_ = outLengths;
-            outStrides_ = outStrides;
-
-            in_elementwise_op_  = in_elementwise_op;
-            acc_elementwise_op_ = acc_elementwise_op;
+            std::tie(inLengths_, inStrides_) =
+                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);

             alpha_ = static_cast<AccDataType>(alpha);
             beta_  = static_cast<OutDataType>(beta);

             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths);
+                get_2d_lengths<Rank, ReduceDims>(inLengths_);

             if constexpr(InvariantDims::Size() == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];

-            reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];

             int iterations = 1;
             while(true)

@@ -369,6 +375,7 @@ struct DeviceReduceMultiBlockAtomicAdd
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
+                       const std::vector<int>& reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,

@@ -382,6 +389,7 @@ struct DeviceReduceMultiBlockAtomicAdd
                                           inStrides,
                                           outLengths,
                                           outStrides,
+                                          reduceDims,
                                           alpha,
                                           beta,
                                           static_cast<const InDataType*>(in_dev),
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp

@@ -15,8 +15,8 @@ namespace device {
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          int Rank,
-          typename ReduceDims,
+          index_t Rank,
+          index_t NumReduceDim,
           typename ReduceOperation,
           typename InElementwiseOperation,
           typename AccElementwiseOperation,

@@ -41,7 +41,12 @@ struct DeviceReduceMultiBlockPartialReduce
     using IndexDataType = int32_t;

-    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
+    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
+
+    using InvariantDims =
+        typename conditional<NumInvariantDim == 0,
+                             Sequence<>,
+                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
+    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

     static constexpr index_t srcDims = Rank;
     static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();

@@ -112,7 +117,7 @@ struct DeviceReduceMultiBlockPartialReduce
         }
         else
         {
-            const auto toReduceDimLengths =
+            const auto reduceDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
             const auto invariantDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

@@ -120,7 +125,7 @@ struct DeviceReduceMultiBlockPartialReduce
             return transform_tensor_descriptor(
                 inDesc,
                 make_tuple(make_merge_transform(invariantDimLengths),
-                           make_merge_transform(toReduceDimLengths)),
+                           make_merge_transform(reduceDimLengths)),
                 make_tuple(InvariantDims{}, ReduceDims{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));
         }

@@ -161,10 +166,11 @@ struct DeviceReduceMultiBlockPartialReduce
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<index_t>& inLengths,
-                 const std::vector<index_t>& inStrides,
-                 const std::vector<index_t>& outLengths,
-                 const std::vector<index_t>& outStrides,
+        Argument(const std::vector<int>& inLengths,
+                 const std::vector<int>& inStrides,
+                 const std::vector<int>& outLengths,
+                 const std::vector<int>& outStrides,
+                 const std::vector<int>& reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,

@@ -173,31 +179,30 @@ struct DeviceReduceMultiBlockPartialReduce
                  AccDataType* workspace_dev,
                  const InElementwiseOperation& in_elementwise_op,
                  const AccElementwiseOperation& acc_elementwise_op)
-            : in_dev_{in_dev},
+            : outLengths_{outLengths},
+              outStrides_{outStrides},
+              in_dev_{in_dev},
               out_dev_{out_dev},
               out_indices_dev_{out_indices_dev},
-              workspace_dev_{workspace_dev}
+              workspace_dev_{workspace_dev},
+              in_elementwise_op_{in_elementwise_op},
+              acc_elementwise_op_{acc_elementwise_op}
         {
-            inLengths_  = inLengths;
-            inStrides_  = inStrides;
-            outLengths_ = outLengths;
-            outStrides_ = outStrides;
-
-            in_elementwise_op_  = in_elementwise_op;
-            acc_elementwise_op_ = acc_elementwise_op;
+            std::tie(inLengths_, inStrides_) =
+                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);

             alpha_ = static_cast<AccDataType>(alpha);
             beta_  = static_cast<OutDataType>(beta);

             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths);
+                get_2d_lengths<Rank, ReduceDims>(inLengths_);

             if constexpr(InvariantDims::Size() == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];

-            reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];

             int iterations = 1;
             while(true)

@@ -370,6 +375,7 @@ struct DeviceReduceMultiBlockPartialReduce
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
+                       const std::vector<int>& reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,

@@ -383,6 +389,7 @@ struct DeviceReduceMultiBlockPartialReduce
                                           inStrides,
                                           outLengths,
                                           outStrides,
+                                          reduceDims,
                                           alpha,
                                           beta,
                                           static_cast<const InDataType*>(in_dev),
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp

@@ -16,7 +16,7 @@ template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
           index_t Rank,
-          typename ReduceDims,
+          index_t NumReduceDim,
           typename ReduceOperation,
           typename InElementwiseOperation,
           typename OutElementwiseOperation,

@@ -40,7 +40,12 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
     static constexpr bool BetaIsZero = NeedIndices;

-    using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
+    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
+
+    using InvariantDims =
+        typename conditional<NumInvariantDim == 0,
+                             Sequence<>,
+                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
+    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

     static constexpr index_t srcDims = Rank;
     static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();

@@ -74,7 +79,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
         }
         else
         {
-            const auto toReduceDimLengths =
+            const auto reduceDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
             const auto invariantDimLengths =
                 make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

@@ -82,7 +87,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
             return transform_tensor_descriptor(
                 inDesc,
                 make_tuple(make_merge_transform(invariantDimLengths),
-                           make_merge_transform(toReduceDimLengths)),
+                           make_merge_transform(reduceDimLengths)),
                 make_tuple(InvariantDims{}, ReduceDims{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));
         }

@@ -136,6 +141,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                  const std::vector<int>& inStrides,
                  const std::vector<int>& outLengths,
                  const std::vector<int>& outStrides,
+                 const std::vector<int>& reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,

@@ -144,30 +150,32 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                  AccDataType* workspace_dev,
                  const InElementwiseOperation& in_elementwise_op,
                  const OutElementwiseOperation& acc_elementwise_op)
-            : in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
+            : outLengths_{outLengths},
+              outStrides_{outStrides},
+              in_dev_{in_dev},
+              out_dev_{out_dev},
+              out_indices_dev_{out_indices_dev},
+              in_elementwise_op_{in_elementwise_op},
+              acc_elementwise_op_{acc_elementwise_op}
        {
             (void)workspace_dev;

-            inLengths_  = inLengths;
-            inStrides_  = inStrides;
-            outLengths_ = outLengths;
-            outStrides_ = outStrides;
-
-            in_elementwise_op_  = in_elementwise_op;
-            acc_elementwise_op_ = acc_elementwise_op;
+            std::tie(inLengths_, inStrides_) =
+                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);

             alpha_ = static_cast<AccDataType>(alpha);
             beta_  = static_cast<OutDataType>(beta);

             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths);
+                get_2d_lengths<Rank, ReduceDims>(inLengths_);

             if constexpr(InvariantDims::Size() == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];

-            reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];

             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;

@@ -306,6 +314,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                        const std::vector<int>& inStrides,
                        const std::vector<int>& outLengths,
                        const std::vector<int>& outStrides,
+                       const std::vector<int>& reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,

@@ -319,6 +328,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                           inStrides,
                                           outLengths,
                                           outStrides,
+                                          reduceDims,
                                           alpha,
                                           beta,
                                           static_cast<const InDataType*>(in_dev),
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp

@@ -31,8 +31,8 @@
 #include "reduction_operator.hpp"
 #include "reduction_functions_accumulate.hpp"
 #include "reduction_functions_blockwise.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "cluster_descriptor.hpp"

 namespace ck {

@@ -158,13 +158,27 @@ struct GridwiseReduction_mk_to_m_blockwise
 {
     static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

-    static constexpr auto buffer_1d_desc =
-        make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
+    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
+
+    using ThreadBufferDimAccessOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    using ThreadClusterArrangeOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    static constexpr auto thread_cluster_desc =
+        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+
+    // For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the
+    // Dim_K as the fastest one
+    static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadClusterSize>{}, Number<KThreadClusterSize>{}));

     template <typename T>
     using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;

+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
     static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

@@ -180,12 +194,10 @@ struct GridwiseReduction_mk_to_m_blockwise
                                const IndexDataType* const __restrict__ p_ws_indices_global,
                                IndexDataType* const __restrict__ p_indices_global)
     {
-        using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer_1d_desc),
-                                                                        AccDataType,
+        using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
                                                               BlockSize,
-                                                              MThreadClusterSize,
-                                                              KThreadClusterSize,
-                                                              reorder_thread_cluster,
+                                                              ThreadClusterLengths_M_K,
+                                                              ThreadClusterArrangeOrder,
                                                               ReduceOperation,
                                                               PropagateNan>;

         using Accumulation =

@@ -221,28 +233,28 @@ struct GridwiseReduction_mk_to_m_blockwise
         const index_t thread_local_id    = get_thread_local_1d_id();
         const index_t block_global_1d_id = get_block_1d_id();

-        const index_t thread_m_cluster_id =
-            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-        const index_t thread_k_cluster_id =
-            reorder_thread_cluster
-                ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                : thread_local_id % KThreadClusterSize;
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];

         using ThreadBufferLengths         = Sequence<MThreadSliceSize, KThreadSliceSize>;
         constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

-        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+        auto threadwise_src_load =
+            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                              AccDataType,
                                              InGridDesc_M_K,
                                              decltype(thread_buffer_desc),
                                              ThreadBufferLengths,
-                                             typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
+                                             ThreadBufferDimAccessOrder,
                                              InSrcVectorDim,
                                              InSrcVectorSize,
                                              1,
-                                             false>(in_grid_desc_m_k,
+                                             false>(
+                in_grid_desc_m_k,
                 make_multi_index(block_global_1d_id * M_BlockTileSize +
                                      thread_m_cluster_id * MThreadSliceSize,
                                  thread_k_cluster_id * KThreadSliceSize));

@@ -283,21 +295,14 @@ struct GridwiseReduction_mk_to_m_blockwise
             make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            if constexpr(reorder_thread_cluster)
-            {
-                block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
-                    accu_value_buf[I];
-            }
-            else
-                block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
-                    accu_value_buf[I];
+            block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
+                accu_value_buf[I];

             accu_value_buf(I) = zeroVal;

             __syncthreads();

-            BlockwiseReduce::Reduce(
-                block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
+            BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I));
         });

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {

@@ -380,13 +385,11 @@ struct GridwiseReduction_mk_to_m_blockwise
                                IndexDataType* const __restrict__ p_indices_global)
     {
         using BlockwiseReduceWithIndex =
-            PartitionedBlockwiseReductionWithIndexOn1dBuffer<decltype(buffer_1d_desc),
-                                                             AccDataType,
+            PartitionedBlockwiseReductionWithIndex<AccDataType,
                                                    IndexDataType,
                                                    BlockSize,
-                                                   MThreadClusterSize,
-                                                   KThreadClusterSize,
-                                                   reorder_thread_cluster,
+                                                   ThreadClusterLengths_M_K,
                                                    ThreadClusterArrangeOrder,
                                                    ReduceOperation,
                                                    PropagateNan>;

@@ -432,28 +435,28 @@ struct GridwiseReduction_mk_to_m_blockwise
         const index_t thread_local_id    = get_thread_local_1d_id();
         const index_t block_global_1d_id = get_block_1d_id();

-        const index_t thread_m_cluster_id =
-            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-        const index_t thread_k_cluster_id =
-            reorder_thread_cluster
-                ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                : thread_local_id % KThreadClusterSize;
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];

         using ThreadBufferLengths         = Sequence<MThreadSliceSize, KThreadSliceSize>;
         constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

-        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+        auto threadwise_src_load =
+            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                              AccDataType,
                                              InGridDesc_M_K,
                                              decltype(thread_buffer_desc),
                                              ThreadBufferLengths,
-                                             typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
+                                             ThreadBufferDimAccessOrder,
                                              InSrcVectorDim,
                                              InSrcVectorSize,
                                              1,
-                                             false>(in_grid_desc_m_k,
+                                             false>(
+                in_grid_desc_m_k,
                 make_multi_index(block_global_1d_id * M_BlockTileSize +
                                      thread_m_cluster_id * MThreadSliceSize,
                                  thread_k_cluster_id * KThreadSliceSize));

@@ -503,29 +506,15 @@ struct GridwiseReduction_mk_to_m_blockwise
             });

             // store thread local value to LDS for parallel reduction
-            if constexpr(reorder_thread_cluster)
-            {
-                block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
-                                     thread_m_cluster_id) = tmpValue;
-                block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
-                                     thread_m_cluster_id) = tmpIndex;
-            }
-            else
-            {
-                block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
-                                     thread_k_cluster_id) = tmpValue;
-                block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
-                                     thread_k_cluster_id) = tmpIndex;
-            }
+            block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpValue;
+            block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpIndex;

             __syncthreads();

-            BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
-                                             block_reduce_idx_buf,
-                                             tmpValue,
-                                             tmpIndex,
-                                             thread_m_cluster_id,
-                                             thread_k_cluster_id);
+            BlockwiseReduceWithIndex::Reduce(
+                block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex);

             AccumulationWithIndex::Calculate(accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);

@@ -648,13 +637,11 @@ struct GridwiseReduction_mk_to_m_blockwise
                                IndexDataType* const __restrict__ p_indices_global)
     {
         using BlockwiseReduceWithIndex =
-            PartitionedBlockwiseReductionWithIndexOn1dBuffer<decltype(buffer_1d_desc),
-                                                             AccDataType,
+            PartitionedBlockwiseReductionWithIndex<AccDataType,
                                                    IndexDataType,
                                                    BlockSize,
-                                                   MThreadClusterSize,
-                                                   KThreadClusterSize,
-                                                   reorder_thread_cluster,
+                                                   Sequence<MThreadClusterSize, KThreadClusterSize>,
                                                    ThreadClusterArrangeOrder,
                                                    ReduceOperation,
                                                    PropagateNan>;

@@ -707,43 +694,45 @@ struct GridwiseReduction_mk_to_m_blockwise
         const index_t thread_local_id    = get_thread_local_1d_id();
         const index_t block_global_1d_id = get_block_1d_id();

-        const index_t thread_m_cluster_id =
-            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-        const index_t thread_k_cluster_id =
-            reorder_thread_cluster
-                ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                : thread_local_id % KThreadClusterSize;
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];

         using ThreadBufferLengths         = Sequence<MThreadSliceSize, KThreadSliceSize>;
         constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

-        auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+        auto threadwise_src_val_load =
+            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                              AccDataType,
                                              InGridDesc_M_K,
                                              decltype(thread_buffer_desc),
                                              ThreadBufferLengths,
-                                             typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
+                                             ThreadBufferDimAccessOrder,
                                              InSrcVectorDim,
                                              InSrcVectorSize,
                                              1,
-                                             false>(in_grid_desc_m_k,
+                                             false>(
+                in_grid_desc_m_k,
                 make_multi_index(block_global_1d_id * M_BlockTileSize +
                                      thread_m_cluster_id * MThreadSliceSize,
                                  thread_k_cluster_id * KThreadSliceSize));

-        auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<IndexDataType,
+        auto threadwise_src_idx_load =
+            ThreadwiseTensorSliceTransfer_v2<IndexDataType,
                                              IndexDataType,
                                              InGridDesc_M_K,
                                              decltype(thread_buffer_desc),
                                              ThreadBufferLengths,
-                                             typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
+                                             ThreadBufferDimAccessOrder,
                                              InSrcVectorDim,
                                              InSrcVectorSize,
                                              1,
-                                             false>(in_grid_desc_m_k,
+                                             false>(
+                in_grid_desc_m_k,
                 make_multi_index(block_global_1d_id * M_BlockTileSize +
                                      thread_m_cluster_id * MThreadSliceSize,
                                  thread_k_cluster_id * KThreadSliceSize));

@@ -787,29 +776,15 @@ struct GridwiseReduction_mk_to_m_blockwise
             });

             // store thread local value to LDS for parallel reduction
-            if constexpr(reorder_thread_cluster)
-            {
-                block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
-                                     thread_m_cluster_id) = tmpValue;
-                block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
-                                     thread_m_cluster_id) = tmpIndex;
-            }
-            else
-            {
-                block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
-                                     thread_k_cluster_id) = tmpValue;
-                block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
-                                     thread_k_cluster_id) = tmpIndex;
-            }
+            block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpValue;
+            block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpIndex;

             __syncthreads();

-            BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
-                                             block_reduce_idx_buf,
-                                             tmpValue,
-                                             tmpIndex,
-                                             thread_m_cluster_id,
-                                             thread_k_cluster_id);
+            BlockwiseReduceWithIndex::Reduce(
+                block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex);

             AccumulationWithIndex::Calculate(accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
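The hand-written `thread_m_cluster_id` / `thread_k_cluster_id` arithmetic removed above is exactly what `make_cluster_descriptor(...).CalculateBottomIndex(...)` now encapsulates. A standalone CPU sketch of the two decompositions follows (illustrative only; the 4x64 cluster shape is an assumption read off the example's M_C4_S1/K_C64_S1 log line, and the function names are mine, not CK's).

```cpp
// Decomposing a linear thread id into (m, k) cluster coordinates for both arrange orders.
#include <iostream>
#include <utility>

// ThreadClusterArrangeOrder == Sequence<0, 1>: K is the fastest-varying coordinate.
std::pair<int, int> decompose_m_k(int tid, int /*MClusterSize*/, int KClusterSize)
{
    return {tid / KClusterSize, tid % KClusterSize};
}

// ThreadClusterArrangeOrder == Sequence<1, 0> (reorder_thread_cluster): M is fastest.
std::pair<int, int> decompose_k_m(int tid, int MClusterSize, int /*KClusterSize*/)
{
    return {tid % MClusterSize, tid / MClusterSize};
}

int main()
{
    const int M = 4, K = 64; // assumed cluster shape, matching the example's log line

    for(int tid : {0, 1, 63, 64, 255})
    {
        const auto [m0, k0] = decompose_m_k(tid, M, K);
        const auto [m1, k1] = decompose_k_m(tid, M, K);
        std::cout << "tid " << tid << ": order<0,1> -> (" << m0 << "," << k0
                  << "), order<1,0> -> (" << m1 << "," << k1 << ")\n";
    }
}
```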
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp

@@ -86,15 +86,26 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
 {
     static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

-    static constexpr auto buffer_1d_desc =
-        make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
+    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

-    using blockwise_reduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer_1d_desc),
-                                                                     AccDataType,
-                                                                     BlockSize,
-                                                                     MThreadClusterSize,
-                                                                     KThreadClusterSize,
-                                                                     reorder_thread_cluster,
-                                                                     ReduceOperation,
-                                                                     PropagateNan>;
+    using ThreadBufferDimAccessOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    using ThreadClusterArrangeOrder =
+        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+    static constexpr auto thread_cluster_desc =
+        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+
+    // For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the
+    // Dim_K as the fastest one
+    static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadClusterSize>{}, Number<KThreadClusterSize>{}));
+
+    using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
+                                                          BlockSize,
+                                                          ThreadClusterLengths_M_K,
+                                                          ThreadClusterArrangeOrder,
+                                                          ReduceOperation,
+                                                          PropagateNan>;

@@ -102,6 +113,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
     using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;

     static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};

     static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

@@ -145,12 +157,12 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
         const index_t block_global_id = get_block_1d_id();
         const index_t blkgroup_id     = block_global_id / block_group_size;
         const index_t block_local_id  = block_global_id % block_group_size;

-        const index_t thread_m_cluster_id =
-            reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                   : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-        const index_t thread_k_cluster_id =
-            reorder_thread_cluster
-                ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                : thread_local_id % KThreadClusterSize;
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[I0];
+        const auto thread_k_cluster_id = thread_cluster_idx[I1];

         const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

@@ -158,13 +170,12 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
         constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

-        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
+        auto threadwise_src_load =
+            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                              AccDataType,
                                              InGridDesc_M_K,
                                              decltype(thread_buffer_desc),
                                              ThreadBufferLengths,
-                                             typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type,
+                                             ThreadBufferDimAccessOrder,
                                              InSrcVectorDim,
                                              InSrcVectorSize,
                                              1,

@@ -212,21 +223,14 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
         // consistent reduced result for that invariant dimension. due to the using of vector_load,
         // each block/thread is involved into multiple invarirant dimensions.
         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            if constexpr(reorder_thread_cluster)
-            {
-                block_reduce_buf(thread_k_cluster_id * MThreadClusterSize + thread_m_cluster_id) =
-                    accu_value_buf[I];
-            }
-            else
-                block_reduce_buf(thread_m_cluster_id * KThreadClusterSize + thread_k_cluster_id) =
-                    accu_value_buf[I];
+            block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
+                accu_value_buf[I];

             accu_value_buf(I) = zeroVal;

             __syncthreads();

-            blockwise_reduce::Reduce(
-                block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
+            BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I));
         });

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp
View file @
b7a6f810
...
...
@@ -30,8 +30,8 @@
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "cluster_descriptor.hpp"
namespace ck {
...
...
@@ -103,13 +103,27 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
{
    static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

-   static constexpr auto buffer1dDesc =
-       make_naive_tensor_descriptor_packed(make_tuple(Number<BlockSize>{}));
+   using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
+
+   using ThreadBufferDimAccessOrder =
+       typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+   using ThreadClusterArrangeOrder =
+       typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
+
+   static constexpr auto thread_cluster_desc =
+       make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+
+   // For laying out the threads that reduce on the LDS buffer, Dim_K is always used as the
+   // fastest-varying dimension.
+   static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
+       make_tuple(Number<MThreadClusterSize>{}, Number<KThreadClusterSize>{}));

    template <typename T>
    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
...
...
@@ -124,12 +138,10 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                                   AccDataType* const __restrict__ p_ws_values_global,
                                   IndexDataType* const __restrict__ p_ws_indices_global)
    {
-       using BlockwiseReduce = PartitionedBlockwiseReductionOn1dBuffer<decltype(buffer1dDesc),
-                                                                       AccDataType,
+       using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
                                                              BlockSize,
                                                              MThreadClusterSize,
                                                              KThreadClusterSize,
                                                              reorder_thread_cluster,
                                                              ThreadClusterLengths_M_K,
                                                              ThreadClusterArrangeOrder,
                                                              ReduceOperation,
                                                              PropagateNan>;
...
...
@@ -168,12 +180,12 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / block_group_size;
        const index_t block_local_id  = block_global_id % block_group_size;

-       const index_t thread_m_cluster_id =
-           reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                  : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-       const index_t thread_k_cluster_id =
-           reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                                  : thread_local_id % KThreadClusterSize;
+       const auto thread_cluster_idx =
+           thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+       const auto thread_m_cluster_id = thread_cluster_idx[I0];
+       const auto thread_k_cluster_id = thread_cluster_idx[I1];

        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
...
...
@@ -181,13 +193,12 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             AccDataType,
                                             InGridDesc_M_K,
                                             decltype(thread_buffer_desc),
                                             ThreadBufferLengths,
-                                            typename conditional<InSrcVectorDim == 0,
-                                                                 Sequence<1, 0>,
-                                                                 Sequence<0, 1>>::type,
+                                            ThreadBufferDimAccessOrder,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1,
...
...
@@ -233,21 +244,14 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
            // Each block executes multiple parallel reductions on the LDS, and due to the use of
            // vector_load, each block/thread is involved in multiple invariant dimensions.

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-               if constexpr(reorder_thread_cluster)
-               {
-                   block_reduce_buf(thread_k_cluster_id * MThreadClusterSize +
-                                    thread_m_cluster_id) = accu_value_buf[I];
-               }
-               else
-                   block_reduce_buf(thread_m_cluster_id * KThreadClusterSize +
-                                    thread_k_cluster_id) = accu_value_buf[I];
+               block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
+                   accu_value_buf[I];

                accu_value_buf(I) = zeroVal;

                __syncthreads();

-               BlockwiseReduce::Reduce(
-                   block_reduce_buf, accu_value_buf(I), thread_m_cluster_id, thread_k_cluster_id);
+               BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I));
            });

            if(thread_k_cluster_id == 0)
...
...
@@ -290,13 +294,11 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                                   IndexDataType* const __restrict__ p_ws_indices_global)
    {
-       using BlockwiseReduceWithIndex =
-           PartitionedBlockwiseReductionWithIndexOn1dBuffer<decltype(buffer1dDesc), AccDataType,
+       using BlockwiseReduceWithIndex =
+           PartitionedBlockwiseReductionWithIndex<AccDataType,
                                                   IndexDataType,
                                                   BlockSize,
                                                   MThreadClusterSize,
                                                   KThreadClusterSize,
                                                   reorder_thread_cluster,
                                                   ThreadClusterLengths_M_K,
                                                   ThreadClusterArrangeOrder,
                                                   ReduceOperation,
                                                   PropagateNan>;
...
...
@@ -346,12 +348,12 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / block_group_size;
        const index_t block_local_id  = block_global_id % block_group_size;

-       const index_t thread_m_cluster_id =
-           reorder_thread_cluster ? thread_local_id % MThreadClusterSize
-                                  : ((thread_local_id / KThreadClusterSize) % MThreadClusterSize);
-       const index_t thread_k_cluster_id =
-           reorder_thread_cluster ? ((thread_local_id / MThreadClusterSize) % KThreadClusterSize)
-                                  : thread_local_id % KThreadClusterSize;
+       const auto thread_cluster_idx =
+           thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
+
+       const auto thread_m_cluster_id = thread_cluster_idx[I0];
+       const auto thread_k_cluster_id = thread_cluster_idx[I1];

        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
...
...
@@ -359,13 +361,12 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             AccDataType,
                                             InGridDesc_M_K,
                                             decltype(thread_buffer_desc),
                                             ThreadBufferLengths,
-                                            typename conditional<InSrcVectorDim == 0,
-                                                                 Sequence<1, 0>,
-                                                                 Sequence<0, 1>>::type,
+                                            ThreadBufferDimAccessOrder,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1,
...
...
@@ -418,29 +419,15 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
            });

            // store thread local value to LDS for parallel reduction
-           if constexpr(reorder_thread_cluster)
-           {
-               block_reduce_val_buf(thread_k_cluster_id * MThreadClusterSize +
-                                    thread_m_cluster_id) = tmpValue;
-               block_reduce_idx_buf(thread_k_cluster_id * MThreadClusterSize +
-                                    thread_m_cluster_id) = tmpIndex;
-           }
-           else
-           {
-               block_reduce_val_buf(thread_m_cluster_id * KThreadClusterSize +
-                                    thread_k_cluster_id) = tmpValue;
-               block_reduce_idx_buf(thread_m_cluster_id * KThreadClusterSize +
-                                    thread_k_cluster_id) = tmpIndex;
-           }
+           block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpValue;
+           block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpIndex;

            __syncthreads();

-           BlockwiseReduceWithIndex::Reduce(block_reduce_val_buf,
-                                            block_reduce_idx_buf,
-                                            tmpValue,
-                                            tmpIndex,
-                                            thread_m_cluster_id,
-                                            thread_k_cluster_id);
+           BlockwiseReduceWithIndex::Reduce(
+               block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex);

            AccumulationWithIndex::Calculate(
                accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
...
...
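The second kernel in gridwise_2d_reduction_multiblock_partial_reduce.hpp carries an index next to every value (`PartitionedBlockwiseReductionWithIndex`), which is what MIN/MAX/AMAX with `IndicesOpt = 1` need. A minimal CPU sketch of that value-plus-index combine step (illustrative only, not CK code; ties simply keep the first operand):

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// A (value, index) pair travels through the reduction together, mirroring the separate
// block_reduce_val_buf / block_reduce_idx_buf buffers used above.
using ValueIndex = std::pair<float, int>;

ValueIndex combine_max(const ValueIndex& a, const ValueIndex& b)
{
    return (b.first > a.first) ? b : a; // ties keep the first operand
}

int main()
{
    std::vector<float> val = {1.0f, 7.0f, 3.0f, 5.0f};
    std::vector<int>   idx = {0, 1, 2, 3};

    // tree reduction over the K partitions of one invariant row
    for(std::size_t stride = val.size() / 2; stride > 0; stride /= 2)
        for(std::size_t k = 0; k < stride; ++k)
        {
            const ValueIndex r =
                combine_max({val[k], idx[k]}, {val[k + stride], idx[k + stride]});
            val[k] = r.first;
            idx[k] = r.second;
        }

    assert(val[0] == 7.0f && idx[0] == 1); // the winning value and its source index stay paired
    return 0;
}
```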
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
View file @ b7a6f810
...
...
@@ -101,6 +101,9 @@ template <typename InDataType,
          index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_threadwise
{
+   using ThreadBufferDimAccessOrder =
+       typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type;
+
    template <typename T>
    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
...
...
@@ -147,17 +150,17 @@ struct GridwiseReduction_mk_to_m_threadwise
        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             AccDataType,
                                             InGridDesc_M_K,
                                             decltype(thread_buffer_desc),
                                             ThreadBufferLengths,
-                                            typename conditional<InSrcVectorDim == 0,
-                                                                 Sequence<1, 0>,
-                                                                 Sequence<0, 1>>::type,
+                                            ThreadBufferDimAccessOrder,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1,
                                             false>(
                in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);
...
...
@@ -299,17 +302,17 @@ struct GridwiseReduction_mk_to_m_threadwise
        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             AccDataType,
                                             InGridDesc_M_K,
                                             decltype(thread_buffer_desc),
                                             ThreadBufferLengths,
-                                            typename conditional<InSrcVectorDim == 0,
-                                                                 Sequence<1, 0>,
-                                                                 Sequence<0, 1>>::type,
+                                            ThreadBufferDimAccessOrder,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1,
                                             false>(
                in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);
...
...
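In gridwise_2d_reduction_threadwise.hpp there is no LDS step at all: each thread reduces its own MThreadSliceSize x KThreadSliceSize slice in registers, advancing the source window by `in_thread_copy_step = (0, KThreadSliceSize)` between loads. A toy CPU sketch of that loop structure (illustrative only, not CK code):

```cpp
#include <cassert>
#include <vector>

int main()
{
    // stand-ins for MThreadSliceSize, KThreadSliceSize and the total reduced length
    const int m_slice = 2, k_slice = 4, k_total = 16;

    std::vector<float> in(m_slice * k_total, 1.0f); // this thread's rows, k_total elements each
    std::vector<float> acc(m_slice, 0.0f);          // per-row accumulators held "in registers"

    for(int k0 = 0; k0 < k_total; k0 += k_slice)    // advance the K window, like in_thread_copy_step
        for(int m = 0; m < m_slice; ++m)
            for(int k = 0; k < k_slice; ++k)
                acc[m] += in[m * k_total + k0 + k]; // accumulate the freshly loaded tile

    for(float v : acc)
        assert(v == float(k_total));
    return 0;
}
```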
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
View file @ b7a6f810
...
...
@@ -57,7 +57,7 @@ template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          int Rank,
-         typename ReduceDims,
+         int NumReduceDim,
          ReduceTensorOp_t ReduceOpId,
          NanPropagation_t NanOpt,
          ReduceTensorIndices_t IndicesOpt>
...
...
@@ -91,7 +91,7 @@ void add_device_reduce_instance_blockwise(
                                     AccDataType,
                                     OutDataType,
                                     Rank,
-                                    ReduceDims,
+                                    NumReduceDim,
                                     ReduceOperation,
                                     InElementwiseOperation,
                                     AccElementwiseOperation,
...
...
@@ -112,18 +112,20 @@ void add_device_reduce_instance_blockwise(
    });
};

-#define ADD_BLOCKWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)  \
+#define ADD_BLOCKWISE_INST_BY_TYPE(                                                              \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                        \
    template void add_device_reduce_instance_blockwise<inT,                                       \
                                                       compT,                                     \
                                                       outT,                                      \
                                                       Rank,                                      \
-                                                      Sequence<__VA_ARGS__>,                     \
+                                                      NumReduceDim,                              \
                                                       ReduceOpId,                                \
                                                       NanOpt,                                    \
                                                       IndicesOpt>(                               \
        std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)

-#define ADD_BLOCKWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)    \
+#define ADD_BLOCKWISE_INST_BY_ID(                                                                \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                        \
    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                                               \
                               compT,                                                             \
                               outT,                                                              \
...
...
@@ -131,15 +133,15 @@ void add_device_reduce_instance_blockwise(
                               static_cast<NanPropagation_t>(NanOpt),                            \
                               static_cast<ReduceTensorIndices_t>(IndicesOpt),                   \
                               Rank,                                                             \
-                              __VA_ARGS__)
+                              NumReduceDim)

 #define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                                          \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                                 \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                        \
    extern template void add_device_reduce_instance_blockwise<inT,                               \
                                                              compT,                             \
                                                              outT,                              \
                                                              Rank,                              \
-                                                             Sequence<__VA_ARGS__>,             \
+                                                             NumReduceDim,                      \
                                                              ReduceOpId,                        \
                                                              NanOpt,                            \
                                                              IndicesOpt>(                       \
...
...
@@ -149,7 +151,8 @@ void add_device_reduce_instance_blockwise(
                                                                  AccElementwiseOperation>> &    \
        device_op_instances)

-#define ADD_BLOCKWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
+#define ADD_BLOCKWISE_INST_REF_BY_ID(                                                             \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                         \
    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                                            \
                                   compT,                                                          \
                                   outT,                                                           \
...
...
@@ -157,7 +160,7 @@ void add_device_reduce_instance_blockwise(
                                   static_cast<NanPropagation_t>(NanOpt),                         \
                                   static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
                                   Rank,                                                          \
-                                  __VA_ARGS__)
+                                  NumReduceDim)

} // namespace device_reduce_instance
} // namespace device
...
...
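The macro rework in device_reduce_instance_blockwise.hpp replaces the `Sequence<__VA_ARGS__>` of concrete reduced dimensions with a single `NumReduceDim` argument, which suggests the concrete dimension ids now travel at run time and only one instance per (Rank, NumReduceDim) pair has to be compiled. A toy sketch of the two parameterization styles (hypothetical names, not the CK API):

```cpp
#include <cstddef>
#include <vector>

// Toy stand-ins only; none of these names are the CK API.
template <int... Dims>
struct DimSequence
{
    static constexpr std::size_t size = sizeof...(Dims);
};

// Old style: the reduced dimensions themselves are template parameters, so reducing
// (0,1,2), (0,1), (0), ... of a rank-4 tensor each needs its own instantiation.
template <typename InT, int Rank, typename ReduceDims>
std::size_t reduced_dim_count_old()
{
    return ReduceDims::size;
}

// New style: only the number of reduced dimensions is a template parameter; the concrete
// dimension ids can be passed at run time.
template <typename InT, int Rank, int NumReduceDim>
std::size_t reduced_dim_count_new(const std::vector<int>& reduce_dims)
{
    return (reduce_dims.size() == static_cast<std::size_t>(NumReduceDim)) ? NumReduceDim : 0;
}

int main()
{
    const auto a = reduced_dim_count_old<float, 4, DimSequence<0, 1, 2>>();
    const auto b = reduced_dim_count_new<float, 4, 3>({0, 1, 2});
    return (a == 3 && b == 3) ? 0 : 1;
}
```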
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
View file @ b7a6f810
...
...
@@ -11,25 +11,25 @@ namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);       //
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
// clang-format on
} // namespace device_reduce_instance
...
...
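For reading the instance tables in these headers: the numeric `ReduceOpId` column corresponds to the operations named in the trailing comments. A hypothetical helper enum reconstructed purely from those comments (the real enumerators are defined by `ReduceTensorOp_t` and may be spelled differently):

```cpp
#include <cstdio>

// Reconstructed from the "// for ..." comments only; treat the names as illustrative.
enum ReduceOpId : int
{
    ADD   = 0,
    MIN   = 2,
    MAX   = 3,
    AMAX  = 4,
    AVG   = 5,
    NORM2 = 7
};

int main()
{
    // Example: ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3) reads as
    // f16 in/acc/out, MIN, NanPropaOpt = 0, IndicesOpt = 0, Rank = 4, NumReduceDim = 3.
    std::printf("ReduceOpId for MIN = %d, NORM2 = %d\n", MIN, NORM2);
    return 0;
}
```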
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
View file @ b7a6f810
...
...
@@ -11,16 +11,16 @@ namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
View file @ b7a6f810
...
...
@@ -11,34 +11,34 @@ namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0);
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
// clang-format on
} // namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
View file @ b7a6f810
...
...
@@ -11,16 +11,16 @@ namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0);
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);       //
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
...
...
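The f32/f64/f32 instances above keep `AccDataType` wider than the input and output type. A minimal CPU illustration (not CK code) of why accumulating float data in a double helps for long reductions:

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t n = 1u << 24; // 16M elements, enough for float accumulation to drift
    std::vector<float> data(n, 0.1f);

    float  acc_f32 = 0.0f; // AccDataType = float
    double acc_f64 = 0.0;  // AccDataType = double, as in the f32_f64_f32 instances
    for(const float v : data)
    {
        acc_f32 += v;
        acc_f64 += static_cast<double>(v);
    }

    const double exact = static_cast<double>(0.1f) * static_cast<double>(n);
    // the double accumulator lands far closer to the exact sum than the float one
    assert(std::abs(acc_f64 - exact) < std::abs(static_cast<double>(acc_f32) - exact));
    return 0;
}
```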