gaoqiong / composable_kernel · Commits · bb1f8082

Commit bb1f8082, authored May 26, 2022 by root
Parents: 97ac5007, 82d7d993

    Merge remote-tracking branch 'origin/develop' into myamlak/cgemm

Changes: 177. Showing 20 changed files with 1301 additions and 2341 deletions (+1301, -2341).
  include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp                                        +7    -9
  include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp                          +3    -4
  include/ck/tensor_operation/gpu/device/device_base.hpp                                                  +2    -0
  include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp                                    +92   -51
  include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp   +51   -0
  include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp                                               +586  -0
  include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp                                              +7    -3
  include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp                                     +6    -0
  include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp                                       +6    -0
  include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp                                      +2    -4
  include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp                                  +22   -23
  include/ck/tensor_operation/gpu/device/device_reduce.hpp                                                +6    -23
  include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp                                      +0    -374
  include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp                          +0    -328
  include/ck/tensor_operation/gpu/device/device_reduce_common.hpp                                         +9    -9
  include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp                                     +200  -113
  include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp                      +0    -440
  include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp                                     +71   -74
  include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp                                             +231  -0
  include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp                                +0    -886
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp → include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp

-#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
-#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
+#pragma once
 #include "common_header.hpp"
 #include "tensor_adaptor.hpp"
-#include "threadwise_tensor_slice_transfer_v2.hpp"
-#include "threadwise_contraction_dlops.hpp"
+#include "threadwise_tensor_slice_transfer_v4r1.hpp"
+#include "threadwise_contraction_dl.hpp"

 namespace ck {

@@ -41,7 +39,7 @@ template <index_t BlockSize,
           typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
                                  BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
                              bool>::type = false>
-struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
+struct BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
 {
     using AIndex = MultiIndex<3>;
     using BIndex = MultiIndex<3>;

@@ -148,7 +146,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
         MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});

     public:
-    __device__ BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
+    __device__ BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
         : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
               get_thread_local_1d_id())},
           a_thread_copy_{

@@ -175,6 +173,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
                       "wrong!");

         // TODO: remove this restriction
-        static_assert(BM0 == 2, "wrong");
+        static_assert(BM0 == 2 && BN0 == 2, "wrong");
     }

@@ -226,7 +225,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
                                       b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());

         constexpr auto threadwise_contraction =
-            ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
+            ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
                 FloatA,
                 FloatB,
                 FloatC,

@@ -407,4 +406,3 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
 };

 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp

@@ -75,14 +75,13 @@ struct BlockwiseTensorSliceTransfer_v5r1
         }
     }

-    template <typename SrcBuffer, typename SrcStepHacks>
-    __device__ void
-    RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
+    template <typename SrcBuffer>
+    __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
            get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks);
+            threadwise_transfer_.RunRead(src_desc, src_buf);
         }
     }
include/ck/tensor_operation/gpu/device/device_base.hpp

@@ -40,6 +40,8 @@ struct BaseOperator
     virtual bool IsSupportedArgument(const BaseArgument*) { return false; }

     virtual std::string GetTypeString() const { return ""; }

+    virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; }
+
     virtual ~BaseOperator() {}
 };
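BaseOperator now exposes a GetWorkSpaceSize hook that returns 0 by default, so only device operators that actually need scratch memory have to override it (the conv backward-weight op further down in this commit does exactly that). A minimal sketch of such an override, assuming device_base.hpp is on the include path; the derived operator, its argument type, and the field name are hypothetical and not part of this commit:

    #include "device_base.hpp"

    // Hypothetical operator that needs element_count_ * sizeof(float) bytes of scratch.
    struct MyOperator : public ck::tensor_operation::device::BaseOperator
    {
        struct MyArgument : public ck::tensor_operation::device::BaseArgument
        {
            ck::index_t element_count_;
        };

        // Override the new hook; operators without scratch needs simply keep the 0 default.
        size_t GetWorkSpaceSize(const ck::tensor_operation::device::BaseArgument* p_arg) const override
        {
            const auto* p = dynamic_cast<const MyArgument*>(p_arg);
            return p ? p->element_count_ * sizeof(float) : 0;
        }
    };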
include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp

@@ -15,91 +15,107 @@ template <typename ADataType,
           typename CDataType,
           typename ComputeDataType,
           typename ElementwiseFunctor,
-          index_t Dim,
-          index_t ScalarPerVector>
+          index_t NDim,
+          index_t MPerThread,
+          index_t AScalarPerVector,
+          index_t BScalarPerVector,
+          index_t CScalarPerVector>
 struct DeviceBinaryElementwise : public BaseOperator
 {
     static constexpr auto I0 = Number<0>{};

-    template <typename Desc_M0>
-    static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
+    template <typename Desc_M>
+    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
     {
-        const auto m0           = desc_m0.GetLength(I0);
-        const index_t loop_step = gridSize * blockSize * ScalarPerVector;
-        const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
-        const auto desc_m0_pad =
-            transform_tensor_descriptor(desc_m0,
-                                        make_tuple(make_right_pad_transform(m0, pad)),
+        const auto M            = desc_m.GetLength(I0);
+        const index_t loop_step = gridSize * blockSize * MPerThread;
+        const auto pad          = math::integer_least_multiple(M, loop_step) - M;
+        const auto desc_m_pad =
+            transform_tensor_descriptor(desc_m,
+                                        make_tuple(make_right_pad_transform(M, pad)),
                                         make_tuple(Sequence<0>{}),
                                         make_tuple(Sequence<0>{}));
-        return desc_m0_pad;
+        return desc_m_pad;
     }

-    static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
-                                  const std::vector<index_t>& stride,
-                                  index_t gridSize,
-                                  index_t blockSize)
+    static auto MakeDescriptor_M(const std::vector<index_t>& lengths,
+                                 const std::vector<index_t>& strides,
+                                 index_t gridSize,
+                                 index_t blockSize)
     {
-        auto tupleOfShape  = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
-        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
+        auto tupleOfShape  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NDim>{});
+        auto tupleOfStride = generate_tuple([&](auto I) { return strides[I]; }, Number<NDim>{});

         // nd desc - [s0, s1, s2, ...]
         const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);

         // merge nd to 1d desc - [s0 * s1 * ...]
-        if constexpr(Dim > 1)
+        if constexpr(NDim > 1)
         {
-            const auto desc_m0 = transform_tensor_descriptor(
+            const auto desc_m = transform_tensor_descriptor(
                 desc,
                 make_tuple(make_merge_transform(tupleOfShape)),
-                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
+                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NDim>{})),
                 make_tuple(Sequence<0>{}));

-            return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
+            return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
         }
         else
-            return PadDescriptor_M0_1d(desc, gridSize, blockSize);
     }

-    using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
+    using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
+    using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
+    using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));

-    using GridwiseBinEltwise =
-        GridwiseBinaryElementwise_1D<ADataType, BDataType, CDataType, ComputeDataType,
-                                     GridDesc_M0, ElementwiseFunctor, ScalarPerVector>;
+    using GridwiseBinEltwise =
+        GridwiseBinaryElementwise_1D<ADataType, BDataType, CDataType, ComputeDataType,
+                                     AGridDesc_M, BGridDesc_M, CGridDesc_M, ElementwiseFunctor,
+                                     MPerThread, AScalarPerVector, BScalarPerVector,
+                                     CScalarPerVector>;

     struct Argument : public BaseArgument
     {
         Argument(const ADataType* p_a,
                  const BDataType* p_b,
                  CDataType* p_c,
-                 const std::vector<index_t>& shape,
-                 const std::vector<index_t>& stride_a,
-                 const std::vector<index_t>& stride_b,
-                 const std::vector<index_t>& stride_c,
+                 const std::vector<index_t>& lengths,
+                 const std::vector<index_t>& a_strides,
+                 const std::vector<index_t>& b_strides,
+                 const std::vector<index_t>& c_strides,
                  ElementwiseFunctor functor)
             : p_a_(p_a),
               p_b_(p_b),
               p_c_(p_c),
-              shape_(shape),
+              lengths_(lengths),
+              a_strides_(a_strides),
+              b_strides_(b_strides),
+              c_strides_(c_strides),
               functor_(functor),
               blockSize_(256),
               gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
         {
-            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
-            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
-            c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, blockSize_);
+            a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_);
+            b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_);
+            c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_);
         }

         const ADataType* p_a_;
         const BDataType* p_b_;
         CDataType* p_c_;
-        std::vector<int> shape_;
-        GridDesc_M0 a_grid_desc_m0_;
-        GridDesc_M0 b_grid_desc_m0_;
-        GridDesc_M0 c_grid_desc_m0_;
+        std::vector<int> lengths_;
+        AGridDesc_M a_grid_desc_m_;
+        BGridDesc_M b_grid_desc_m_;
+        CGridDesc_M c_grid_desc_m_;
+        std::vector<index_t> a_strides_;
+        std::vector<index_t> b_strides_;
+        std::vector<index_t> c_strides_;
         ElementwiseFunctor functor_;
         index_t blockSize_;
         index_t gridSize_;

@@ -113,7 +129,9 @@ struct DeviceBinaryElementwise : public BaseOperator
                                                 ADataType,
                                                 BDataType,
                                                 CDataType,
-                                                GridDesc_M0,
+                                                AGridDesc_M,
+                                                BGridDesc_M,
+                                                CGridDesc_M,
                                                 ElementwiseFunctor>;

             float elapsed_time = launch_and_time_kernel(stream_config,

@@ -124,9 +142,9 @@ struct DeviceBinaryElementwise : public BaseOperator
                                                         arg.p_a_,
                                                         arg.p_b_,
                                                         arg.p_c_,
-                                                        arg.a_grid_desc_m0_,
-                                                        arg.b_grid_desc_m0_,
-                                                        arg.c_grid_desc_m0_,
+                                                        arg.a_grid_desc_m_,
+                                                        arg.b_grid_desc_m_,
+                                                        arg.c_grid_desc_m_,
                                                         arg.functor_);
             return elapsed_time;
         }

@@ -146,7 +164,30 @@ struct DeviceBinaryElementwise : public BaseOperator
         if(pArg == nullptr)
             return false;

-        if(pArg->shape_.back() % ScalarPerVector != 0)
+        if(pArg->lengths_.size() != NDim)
+            return false;
+
+        if(pArg->lengths_.back() % MPerThread != 0)
             return false;

+        auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
+            bool ret = true;
+
+            if(!isLastDimensionCoalesced)
+                ret = scalarPerVector == 1;
+            else
+                ret = MPerThread % scalarPerVector == 0;
+
+            return ret;
+        };
+
+        if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector))
+            return false;
+
+        if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector))
+            return false;
+
+        if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector))
+            return false;
+
         return true;

@@ -155,19 +196,19 @@ struct DeviceBinaryElementwise : public BaseOperator
     std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                       const void* p_b,
                                                       void* p_c,
-                                                      std::vector<index_t> shape,
-                                                      std::vector<index_t> stride_a,
-                                                      std::vector<index_t> stride_b,
-                                                      std::vector<index_t> stride_c,
+                                                      std::vector<index_t> lengths,
+                                                      std::vector<index_t> a_strides,
+                                                      std::vector<index_t> b_strides,
+                                                      std::vector<index_t> c_strides,
                                                       ElementwiseFunctor functor)
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
                                           static_cast<CDataType*>(p_c),
-                                          shape,
-                                          stride_a,
-                                          stride_b,
-                                          stride_c,
+                                          lengths,
+                                          a_strides,
+                                          b_strides,
+                                          c_strides,
                                           functor);
     }

@@ -180,7 +221,7 @@ struct DeviceBinaryElementwise : public BaseOperator
         // clang-format off
         str << "DeviceBinaryElementwise"
             << "<"
-            << " ScalarPerVector = " << ScalarPerVector
+            << " MPerThread = " << MPerThread
            << ">";
        // clang-format on
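The reworked IsSupportedArgument encodes one vectorization rule per tensor: a ScalarPerVector greater than 1 is only allowed when that tensor's fastest-varying dimension is contiguous (stride 1), and the chosen width must divide the per-thread tile MPerThread. The following free-function restatement of that rule is illustrative only (the commit implements it as a lambda inside IsSupportedArgument; the 32-bit index_t alias is an assumption):

    #include <cstdint>
    #include <vector>

    using index_t = std::int32_t; // assumption: ck::index_t is a 32-bit integer

    // Mirrors the IsScalarPerVectorValid lambda added in this commit: vector loads/stores
    // of width scalarPerVector are valid only on a coalesced last dimension, and the width
    // must evenly divide the per-thread tile mPerThread.
    bool IsScalarPerVectorValid(const std::vector<index_t>& strides,
                                index_t scalarPerVector,
                                index_t mPerThread)
    {
        const bool lastDimCoalesced = !strides.empty() && strides.back() == 1;

        if(!lastDimCoalesced)
            return scalarPerVector == 1; // fall back to scalar access
        return mPerThread % scalarPerVector == 0;
    }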
include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp

@@ -1175,6 +1175,57 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
         return str.str();
     }

+    template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
+    static size_t GetWorkSpaceSize(const Argument& arg)
+    {
+        size_t WorkSpaceSize = 0;
+        if(arg.k_batch_ > 1)
+        {
+            if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
+            {
+                WorkSpaceSize =
+                    arg.Conv_K_ * arg.Conv_C_ * arg.filter_spatial_lengths_[0] * sizeof(float);
+            }
+        }
+        return WorkSpaceSize;
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
+    static size_t GetWorkSpaceSize(const Argument& arg)
+    {
+        size_t WorkSpaceSize = 0;
+        if(arg.k_batch_ > 1)
+        {
+            if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
+            {
+                WorkSpaceSize = arg.Conv_K_ * arg.Conv_C_ * arg.filter_spatial_lengths_[0] *
+                                arg.filter_spatial_lengths_[1] * sizeof(float);
+            }
+        }
+        return WorkSpaceSize;
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
+    static size_t GetWorkSpaceSize(const Argument& arg)
+    {
+        size_t WorkSpaceSize = 0;
+        if(arg.k_batch_ > 1)
+        {
+            if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
+            {
+                WorkSpaceSize = arg.Conv_K_ * arg.Conv_C_ * arg.filter_spatial_lengths_[0] *
+                                arg.filter_spatial_lengths_[1] * arg.filter_spatial_lengths_[2] *
+                                sizeof(float);
+            }
+        }
+        return WorkSpaceSize;
+    }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override final
+    {
+        return GetWorkSpaceSize<NumDimSpatial>(*dynamic_cast<const Argument*>(p_arg));
+    }
 };

 } // namespace device
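The three overloads above compute the same quantity for 1D/2D/3D convolutions: when split-K is in use (k_batch_ > 1) and the input type is bhalf_t, the weight gradient is accumulated into a float workspace of K * C * prod(filter spatial lengths) elements. A small free-standing sketch of that arithmetic; the function and parameter names stand in for arg.Conv_K_, arg.Conv_C_ and arg.filter_spatial_lengths_ and are not part of the commit:

    #include <cstddef>
    #include <vector>

    // Illustrative only: mirrors the workspace formula in the new GetWorkSpaceSize overloads.
    std::size_t ConvBwdWeightWorkspaceBytes(int k_batch,
                                            std::size_t k,
                                            std::size_t c,
                                            const std::vector<std::size_t>& filter_spatial_lengths)
    {
        if(k_batch <= 1)
            return 0; // no split-K accumulation buffer needed

        std::size_t elements = k * c;
        for(std::size_t len : filter_spatial_lengths)
            elements *= len; // e.g. Y * X for a 2D filter

        return elements * sizeof(float); // accumulation happens in float
    }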
include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp (new file, 0 → 100644)

#pragma once

#include <iostream>
#include <sstream>

#include "device.hpp"
#include "device_base.hpp"
#include "device_gemm.hpp"
#include "common_header.hpp"
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gemm_specialization.hpp"
#include "element_wise_operation.hpp"
#include "gridwise_gemm_dl_v1r3.hpp"
#include "device_prop.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType,
          typename ALayout, typename BLayout, typename CLayout,
          typename AElementwiseOperation, typename BElementwiseOperation,
          typename CElementwiseOperation,
          GemmSpecialization GemmSpec,
          index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t K0PerBlock, index_t K1,
          index_t M1PerThread, index_t N1PerThread, index_t KPerThread,
          typename M1N1ThreadClusterM1Xs, typename M1N1ThreadClusterN1Xs,
          typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
          typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
          typename ABlockTransferSrcVectorTensorContiguousDimOrder,
          typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
          typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
          typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
          typename BBlockTransferSrcVectorTensorContiguousDimOrder,
          typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
          typename CThreadTransferSrcDstAccessOrder,
          index_t CThreadTransferSrcDstVectorDim,
          index_t CThreadTransferDstScalarPerVector,
          enable_if_t<
              is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
                  is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
                  is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
              bool> = false>
struct DeviceGemmDl
    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};
    static constexpr auto I4 = Number<4>{};
    static constexpr auto I5 = Number<5>{};

    static constexpr auto K1Number = Number<K1>{};

    static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
    {
        assert(K % K1 == 0);

        const index_t K0 = K / K1;

        const auto a_grid_desc_m_k = [&]() {
            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
            }
            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
            }
        }();

        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
        {
            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;

            return transform_tensor_descriptor(
                a_grid_desc_m_k,
                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                           make_right_pad_transform(M, PadM)),
                make_tuple(Sequence<1>{}, Sequence<0>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }
        else
        {
            return transform_tensor_descriptor(
                a_grid_desc_m_k,
                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                           make_pass_through_transform(M)),
                make_tuple(Sequence<1>{}, Sequence<0>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }
    }

    static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
    {
        assert(K % K1 == 0);

        const index_t K0 = K / K1;

        const auto b_grid_desc_k_n = [&]() {
            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
            }
            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
            }
        }();

        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
        {
            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

            return transform_tensor_descriptor(
                b_grid_desc_k_n,
                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                           make_right_pad_transform(N, PadN)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }
        else
        {
            return transform_tensor_descriptor(
                b_grid_desc_k_n,
                make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                           make_pass_through_transform(N)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
        }
    }

    static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
    {
        const auto c_grid_desc_m_n = [&]() {
            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
            }
            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
            }
        }();

        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
        {
            const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
            const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

            return transform_tensor_descriptor(
                c_grid_desc_m_n,
                make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else
        {
            return transform_tensor_descriptor(
                c_grid_desc_m_n,
                make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
    }

    using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
    using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
    using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));

    // GridwiseGemm
    using GridwiseGemm =
        GridwiseGemmDl_km_kn_mn_v1r3<BlockSize, ADataType, AccDataType, CDataType,
                                     InMemoryDataOperationEnum::Set,
                                     AGridDesc_K0_M_K1, BGridDesc_K0_N_K1, CGridDesc_M_N,
                                     MPerBlock, NPerBlock, K0PerBlock,
                                     M1PerThread, N1PerThread, KPerThread,
                                     M1N1ThreadClusterM1Xs, M1N1ThreadClusterN1Xs,
                                     ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
                                     ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
                                     ABlockTransferThreadClusterArrangeOrder,
                                     ABlockTransferSrcAccessOrder,
                                     ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
                                     ABlockTransferSrcVectorTensorContiguousDimOrder,
                                     ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
                                     BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
                                     BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
                                     BBlockTransferThreadClusterArrangeOrder,
                                     BBlockTransferSrcAccessOrder,
                                     BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
                                     BBlockTransferSrcVectorTensorContiguousDimOrder,
                                     BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
                                     CThreadTransferSrcDstAccessOrder,
                                     CThreadTransferSrcDstVectorDim,
                                     CThreadTransferDstScalarPerVector>;

    using AGridDesc_K0_M0_M1_K1 =
        decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
    using BGridDesc_K0_N0_N1_K1 =
        decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{}));
    using CGridDesc_M0_M10_M11_N0_N10_N11 =
        decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{}));
    using DefaultBlock2CTileMap =
        decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{}));

    // Argument
    struct Argument : public BaseArgument
    {
        Argument(const ADataType* p_a_grid, const BDataType* p_b_grid, CDataType* p_c_grid,
                 index_t M, index_t N, index_t K,
                 index_t StrideA, index_t StrideB, index_t StrideC,
                 index_t M01, index_t N01,
                 AElementwiseOperation a_element_op,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op)
            : p_a_grid_{p_a_grid},
              p_b_grid_{p_b_grid},
              p_c_grid_{p_c_grid},
              a_grid_desc_k0_m0_m1_k1_{},
              b_grid_desc_k0_n0_n1_k1_{},
              c_grid_desc_m0_m10_m11_n0_n10_n11_{},
              block_2_ctile_map_{},
              M01_{M01},
              N01_{N01},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              c_element_op_{c_element_op}
        {
            a_grid_desc_k0_m_k1_ = DeviceGemmDl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
            b_grid_desc_k0_n_k1_ = DeviceGemmDl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
            c_grid_desc_m_n_     = DeviceGemmDl::MakeCGridDescriptor_M_N(M, N, StrideC);

            if(GridwiseGemm::CheckValidity(
                   a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_))
            {
                a_grid_desc_k0_m0_m1_k1_ =
                    GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_k0_m_k1_);
                b_grid_desc_k0_n0_n1_k1_ =
                    GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_k0_n_k1_);
                c_grid_desc_m0_m10_m11_n0_n10_n11_ =
                    GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n_);

                block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_);
            }
        }

        //  private:
        const ADataType* p_a_grid_;
        const BDataType* p_b_grid_;
        CDataType* p_c_grid_;

        AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
        BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
        CGridDesc_M_N c_grid_desc_m_n_;

        AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_;
        BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_;
        CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11_;

        DefaultBlock2CTileMap block_2_ctile_map_;

        // TODO: unused, but may be useful in future.
        index_t M01_;
        index_t N01_;

        // TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being.
        AElementwiseOperation a_element_op_;
        BElementwiseOperation b_element_op_;
        CElementwiseOperation c_element_op_;
    };

    // Invoker
    struct Invoker : public BaseInvoker
    {
        using Argument = DeviceGemmDl::Argument;

        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            {
                std::cout << "arg.a_grid_desc_k0_m0_m1_k1_{"
                          << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
                          << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
                          << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;

                std::cout << "arg.b_grid_desc_k0_n0_n1_k1_{"
                          << arg.b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
                          << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
                          << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;

                std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
                          << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
            }

            if(!GridwiseGemm::CheckValidity(
                   arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_))
            {
                throw std::runtime_error(
                    "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdl_v2r3 has invalid setting");
            }

            const index_t grid_size = GridwiseGemm::CalculateGridSize(
                arg.c_grid_desc_m_n_.GetLength(I0), arg.c_grid_desc_m_n_.GetLength(I1));

            const auto K0 = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0);

            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
            const bool has_double_tail_k_block_loop =
                GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0);

            float ave_time = 0;

            if(has_main_k_block_loop && has_double_tail_k_block_loop)
            {
                const auto kernel =
                    kernel_gemm_dl_v1r3<GridwiseGemm, ADataType, CDataType,
                                        remove_reference_t<AGridDesc_K0_M0_M1_K1>,
                                        remove_reference_t<BGridDesc_K0_N0_N1_K1>,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        true, true>;

                ave_time = launch_and_time_kernel(
                    stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0,
                    arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
                    arg.a_grid_desc_k0_m0_m1_k1_, arg.b_grid_desc_k0_n0_n1_k1_,
                    arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, arg.block_2_ctile_map_);
            }
            else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
            {
                const auto kernel =
                    kernel_gemm_dl_v1r3<GridwiseGemm, ADataType, CDataType,
                                        remove_reference_t<AGridDesc_K0_M0_M1_K1>,
                                        remove_reference_t<BGridDesc_K0_N0_N1_K1>,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        true, false>;

                ave_time = launch_and_time_kernel(
                    stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0,
                    arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
                    arg.a_grid_desc_k0_m0_m1_k1_, arg.b_grid_desc_k0_n0_n1_k1_,
                    arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, arg.block_2_ctile_map_);
            }
            else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
            {
                const auto kernel =
                    kernel_gemm_dl_v1r3<GridwiseGemm, ADataType, CDataType,
                                        remove_reference_t<AGridDesc_K0_M0_M1_K1>,
                                        remove_reference_t<BGridDesc_K0_N0_N1_K1>,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        false, true>;

                ave_time = launch_and_time_kernel(
                    stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0,
                    arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
                    arg.a_grid_desc_k0_m0_m1_k1_, arg.b_grid_desc_k0_n0_n1_k1_,
                    arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, arg.block_2_ctile_map_);
            }
            else
            {
                const auto kernel =
                    kernel_gemm_dl_v1r3<GridwiseGemm, ADataType, CDataType,
                                        remove_reference_t<AGridDesc_K0_M0_M1_K1>,
                                        remove_reference_t<BGridDesc_K0_N0_N1_K1>,
                                        remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
                                        remove_reference_t<DefaultBlock2CTileMap>,
                                        false, false>;

                ave_time = launch_and_time_kernel(
                    stream_config, kernel, dim3(grid_size), dim3(BlockSize), 0,
                    arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
                    arg.a_grid_desc_k0_m0_m1_k1_, arg.b_grid_desc_k0_n0_n1_k1_,
                    arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, arg.block_2_ctile_map_);
            }

            return ave_time;
        }

        // polymorphic
        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
        return true;
    }

    static bool IsSupportedArgument(const Argument& arg)
    {
        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030")
        {
            return GridwiseGemm::CheckValidity(
                arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
        }
        else
        {
            return false;
        }
    }

    // polymorphic
    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }

    static auto MakeArgument(const ADataType* p_a, const BDataType* p_b, CDataType* p_c,
                             index_t M, index_t N, index_t K,
                             index_t StrideA, index_t StrideB, index_t StrideC,
                             AElementwiseOperation a_element_op,
                             BElementwiseOperation b_element_op,
                             CElementwiseOperation c_element_op)
    {
        return Argument{p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, 1, 1,
                        a_element_op, b_element_op, c_element_op};
    }

    static auto MakeInvoker() { return Invoker{}; }

    // polymorphic
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a, const void* p_b, void* p_c,
                                                      index_t M, index_t N, index_t K,
                                                      index_t StrideA, index_t StrideB,
                                                      index_t StrideC,
                                                      AElementwiseOperation a_element_op,
                                                      BElementwiseOperation b_element_op,
                                                      CElementwiseOperation c_element_op,
                                                      index_t /* KBatch */ = 1) override
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
                                          static_cast<CDataType*>(p_c),
                                          M, N, K, StrideA, StrideB, StrideC, 1, 1,
                                          a_element_op, b_element_op, c_element_op);
    }

    // polymorphic
    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    // polymorphic
    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceGemmDl"
            << "<"
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
            << K0PerBlock << ", "
            << K1 << ", "
            << M1PerThread << ", "
            << N1PerThread << ", "
            << KPerThread
            << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
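The new DeviceGemmDl follows the same device-operation flow as the existing XDL GEMMs: build an Argument, check IsSupportedArgument, then run through an Invoker. A hedged usage sketch of that flow; `gemm` stands for a concrete DeviceGemmDl<...> instantiation and p_a/p_b/p_c for device buffers already allocated by the caller (the tuning parameters of the instantiation are omitted here):

    // Illustrative call sequence only; names of buffers and the instance are assumptions.
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    auto argument = gemm.MakeArgument(p_a, p_b, p_c,
                                      M, N, K,
                                      StrideA, StrideB, StrideC,
                                      PassThrough{}, PassThrough{}, PassThrough{});

    if(!gemm.IsSupportedArgument(argument))
    {
        // DeviceGemmDl only reports support on gfx906 / gfx1030 and when the
        // GridwiseGemm descriptor check passes.
        return;
    }

    auto invoker   = gemm.MakeInvoker();
    float ave_time = invoker.Run(argument, StreamConfig{}); // returns averaged kernel time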
include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp

-#ifndef DEVICE_GEMM_XDL_HPP
-#define DEVICE_GEMM_XDL_HPP
+#pragma once

 #include <iostream>
 #include <sstream>

@@ -12,6 +11,7 @@
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdlops_v2r3.hpp"
 #include "gemm_specialization.hpp"
+#include "device_prop.hpp"

 namespace ck {
 namespace tensor_operation {

@@ -408,6 +408,11 @@ struct DeviceGemmXdl
     static bool IsSupportedArgument(const Argument& arg)
     {
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        {
+            return false;
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
                                            arg.b_grid_desc_k0_n_k1_,
                                            arg.c_grid_desc_m_n_,

@@ -515,4 +520,3 @@ struct DeviceGemmXdl
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
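The same runtime gate is added to several device ops in this commit: IsSupportedArgument first checks ck::get_device_name() against the architectures the kernel targets (gfx908/gfx90a for the XDL paths, gfx906/gfx1030 for the DL path) before running the descriptor validity check. A minimal sketch of that gating pattern; the helper name is hypothetical and the real code performs the comparison inline inside each IsSupportedArgument:

    #include <string>

    // Illustrative helper mirroring the check added to the XDL device ops.
    bool IsXdlSupportedDevice(const std::string& device_name)
    {
        // XDL (MFMA) kernels are only claimed as supported on gfx908 and gfx90a.
        return device_name == "gfx908" || device_name == "gfx90a";
    }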
include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp

@@ -9,6 +9,7 @@
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdl_cshuffle_v1.hpp"
 #include "tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "device_prop.hpp"

 namespace ck {
 namespace tensor_operation {

@@ -558,6 +559,11 @@ struct DeviceGemm_Xdl_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        {
+            return false;
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
                                            arg.b_grid_desc_bk0_n_bk1_,
                                            arg.c_grid_desc_m_n_,
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp

@@ -12,6 +12,7 @@
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdlops_v2r4.hpp"
 #include "gemm_specialization.hpp"
+#include "device_prop.hpp"

 #ifndef CK_RUN_KERNEL_AND_TIME
 #define CK_RUN_KERNEL_AND_TIME 1

@@ -528,6 +529,11 @@ struct DeviceGemmXdlSplitK
     static bool IsSupportedArgument(const Argument& arg)
     {
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        {
+            return false;
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                            arg.b_grid_desc_kbatch_k0_n_k1_,
                                            arg.c_grid_desc_m_n_,
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp

@@ -346,7 +346,6 @@ struct DeviceGroupedGemmXdl
         return block_2_ctile_map_.CheckValidity(c_grid_desc_m_n);
     }

-    private:
     typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
     ck::index_t BlockStart_;
 };

@@ -418,9 +417,8 @@ struct DeviceGroupedGemmXdl
                 DeviceGroupedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC);

             const index_t grid_size_grp =
-                typename GroupedGemmBlock2CTileMap::UnderlyingBlock2CTileMap(
-                    c_grid_desc_m_n_, M01, N01, 0)
-                    .block_2_ctile_map_.CalculateGridSize(c_grid_desc_m_n_);
+                GroupedGemmBlock2CTileMap(c_grid_desc_m_n_, M01, N01)
+                    .CalculateGridSize(c_grid_desc_m_n_);

             const index_t BlockStart = grid_size_;
             const index_t BlockEnd   = grid_size_ + grid_size_grp;
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp

@@ -17,7 +17,7 @@ template <typename InDataType,
           typename OutDataType,
           typename AccDataType,
           ck::ReduceTensorOp ReduceOpId,
-          bool NeedIndices,
+          bool OuputIndex,
           ck::index_t BlockSize,
           ck::index_t ReduceMThreadClusterSize,
           ck::index_t ReduceKThreadClusterSize,

@@ -44,8 +44,6 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
         typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
             AccElementwiseOperation;

-    static constexpr bool BetaIsZero = true;
-
     static constexpr index_t InSrcOutDstVectorDim =
         0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is
            // not reduced.

@@ -206,28 +204,28 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<
-                InDataType, OutDataType, AccDataType, IndexDataType, AGridDesc_M_K, BGridDesc_M,
-                ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
-                false, // propagate_nan
-                BetaIsZero, BlockSize, ReduceMThreadClusterSize, ReduceKThreadClusterSize,
-                ReduceMThreadSliceSize, ReduceKThreadSliceSize,
-                InSrcOutDstVectorDim, InSrcOutDstVectorSize, InSrcOutDstVectorSize>;
+            using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<
+                InDataType, OutDataType, AccDataType, IndexDataType, AGridDesc_M_K, BGridDesc_M,
+                ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
+                InMemoryDataOperationEnum::Set,
+                false, // propagate_nan
+                BlockSize, ReduceMThreadSliceSize, ReduceKThreadSliceSize,
+                InSrcOutDstVectorDim, InSrcOutDstVectorSize, InSrcOutDstVectorSize>;

-            const auto kernel = kernel_reduce_threadwise<gridwise_reduce,
-                                                         NeedIndices,
+            const auto kernel = kernel_reduce_threadwise<gridwise_reduce,
+                                                         OuputIndex,
+                                                         false, // don't have index input
                                                          InDataType,
                                                          OutDataType,
                                                          AccDataType,

@@ -252,6 +250,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
                                          arg.acc_element_op_,
                                          float(1),
                                          arg.p_in_dev_,
+                                         nullptr,
                                          float(0),
                                          arg.p_out_dev_,
                                          arg.p_out_indices_dev_);
include/ck/tensor_operation/gpu/device/device_reduce.hpp

@@ -16,35 +16,18 @@ namespace device {

 template <typename InElementwiseOperation, typename AccElementwiseOperation>
 struct DeviceReduce : public BaseOperator
 {
-    virtual long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
-                                                 const std::vector<int> reduceDims)
-    {
-        (void)inLengths;
-        (void)reduceDims;
-
-        return (0);
-    };
-
-    virtual bool HasFurtherCall() { return (false); };
-
-    virtual std::vector<int> GetWorkspace2dLengths(const BaseArgument* argPtr)
-    {
-        (void)argPtr;
-
-        return (std::vector<int>{0, 0});
-    };
-
     virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int> inLengths,
-                        const std::vector<int> inStrides,
-                        const std::vector<int> outLengths,
-                        const std::vector<int> outStrides,
+    MakeArgumentPointer(const std::vector<index_t> inLengths,
+                        const std::vector<index_t> inStrides,
+                        const std::vector<index_t> outLengths,
+                        const std::vector<index_t> outStrides,
                         const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
+                        const void* in_index_dev,
                         void* out_dev,
-                        void* out_indices_dev,
-                        void* workspace_dev,
+                        void* out_index_dev,
                         const InElementwiseOperation in_elementwise_op,
                         const AccElementwiseOperation acc_elementwise_op) = 0;
include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
deleted
100644 → 0
View file @
97ac5007
#ifndef DEVICE_REDUCE_BLOCKWISE_HPP
#define DEVICE_REDUCE_BLOCKWISE_HPP
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_blockwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
,
index_t
NumReduceDim
,
typename
ReduceOperation
,
typename
InElementwiseOperation
,
typename
AccElementwiseOperation
,
bool
PropagateNan
,
bool
NeedIndices
,
index_t
BlockSize
,
index_t
MThreadClusterSize
,
index_t
KThreadClusterSize
,
index_t
MThreadSliceSize
,
index_t
KThreadSliceSize
,
index_t
InSrcVectorDim
,
index_t
InSrcVectorSize
,
index_t
OutDstVectorSize
>
struct
DeviceReduceBlockWise
:
public
DeviceReduce
<
InElementwiseOperation
,
AccElementwiseOperation
>
{
static_assert
(
Rank
<=
6
,
"Bigger Rank size is not supported!"
);
static_assert
(
BlockSize
==
MThreadClusterSize
*
KThreadClusterSize
,
"Invalid thread cluster size assignments!"
);
static_assert
(((
InSrcVectorDim
==
0
&&
MThreadSliceSize
%
InSrcVectorSize
==
0
)
||
(
InSrcVectorDim
==
1
&&
KThreadSliceSize
%
InSrcVectorSize
==
0
))
&&
(
MThreadSliceSize
%
OutDstVectorSize
==
0
),
"Invalid thread slice sizes and/or vector sizes configuration, please check!"
);
using
IndexDataType
=
int32_t
;
static
constexpr
bool
BetaIsZero
=
NeedIndices
;
static
constexpr
index_t
NumInvariantDim
=
Rank
-
NumReduceDim
;
static
constexpr
index_t
numSrcDim
=
Rank
;
static
constexpr
index_t
numDstDim
=
(
NumInvariantDim
==
0
)
?
1
:
NumInvariantDim
;
static
constexpr
bool
reduceAllDim
=
(
NumInvariantDim
==
0
);
static
constexpr
int
M_BlockTileSize
=
MThreadClusterSize
*
MThreadSliceSize
;
static
constexpr
int
K_BlockTileSize
=
KThreadClusterSize
*
KThreadSliceSize
;
static
auto
MakeSrc2dDescriptor
(
const
std
::
vector
<
int
>&
inLengths
,
const
std
::
vector
<
int
>&
inStrides
)
{
const
auto
tupleSrcLengths
=
make_tuple_from_array
(
inLengths
,
Number
<
numSrcDim
>
{});
const
auto
tupleSrcStrides
=
make_tuple_from_array
(
inStrides
,
Number
<
numSrcDim
>
{});
const
auto
inDesc
=
make_naive_tensor_descriptor
(
tupleSrcLengths
,
tupleSrcStrides
);
const
auto
in_grid_desc_m_k
=
[
&
]()
{
if
constexpr
(
reduceAllDim
)
{
const
auto
one_dim_inDesc
=
transform_tensor_descriptor
(
inDesc
,
make_tuple
(
make_merge_transform
(
tupleSrcLengths
)),
make_tuple
(
typename
arithmetic_sequence_gen
<
0
,
numSrcDim
,
1
>::
type
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
transform_tensor_descriptor
(
one_dim_inDesc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
1
,
one_dim_inDesc
.
GetLength
(
Number
<
0
>
{})))),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{}));
}
else
{
using
InvariantDims
=
typename
arithmetic_sequence_gen
<
0
,
NumInvariantDim
,
1
>::
type
;
using
ReduceDims
=
typename
arithmetic_sequence_gen
<
NumInvariantDim
,
Rank
,
1
>::
type
;
const
auto
reduceDimLengths
=
make_tuple_from_array_and_index_seq
(
inLengths
,
ReduceDims
{});
const
auto
invariantDimLengths
=
make_tuple_from_array_and_index_seq
(
inLengths
,
InvariantDims
{});
return
transform_tensor_descriptor
(
inDesc
,
make_tuple
(
make_merge_transform
(
invariantDimLengths
),
make_merge_transform
(
reduceDimLengths
)),
make_tuple
(
InvariantDims
{},
ReduceDims
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
}
}();
const
auto
invariantLength
=
in_grid_desc_m_k
.
GetLength
(
Number
<
0
>
{});
const
auto
reduceLength
=
in_grid_desc_m_k
.
GetLength
(
Number
<
1
>
{});
const
auto
inPad_M
=
math
::
integer_least_multiple
(
invariantLength
,
M_BlockTileSize
)
-
invariantLength
;
const
auto
inPad_K
=
math
::
integer_least_multiple
(
reduceLength
,
K_BlockTileSize
)
-
reduceLength
;
auto
in_grid_desc_m_k_padded
=
transform_tensor_descriptor
(
in_grid_desc_m_k
,
make_tuple
(
make_right_pad_transform
(
invariantLength
,
inPad_M
),
make_right_pad_transform
(
reduceLength
,
inPad_K
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
(
in_grid_desc_m_k_padded
);
};
static
auto
MakeDst1dDescriptor
(
const
std
::
vector
<
int
>&
outLengths
,
const
std
::
vector
<
int
>&
outStrides
)
{
const
auto
tupleDstLengths
=
make_tuple_from_array
(
outLengths
,
Number
<
numDstDim
>
{});
const
auto
tupleDstStrides
=
make_tuple_from_array
(
outStrides
,
Number
<
numDstDim
>
{});
auto
outDesc
=
make_naive_tensor_descriptor
(
tupleDstLengths
,
tupleDstStrides
);
auto
out_grid_desc_m
=
transform_tensor_descriptor
(
outDesc
,
make_tuple
(
make_merge_transform
(
tupleDstLengths
)),
make_tuple
(
typename
arithmetic_sequence_gen
<
0
,
numDstDim
,
1
>::
type
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
invariantLength
=
out_grid_desc_m
.
GetLength
(
Number
<
0
>
{});
const
auto
inPad
=
math
::
integer_least_multiple
(
invariantLength
,
M_BlockTileSize
)
-
invariantLength
;
auto
out_grid_desc_m_padded
=
transform_tensor_descriptor
(
out_grid_desc_m
,
make_tuple
(
make_right_pad_transform
(
invariantLength
,
inPad
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
(
out_grid_desc_m_padded
);
};
struct
Argument
:
public
BaseArgument
{
Argument
(
const
std
::
vector
<
int
>
inLengths
,
const
std
::
vector
<
int
>
inStrides
,
const
std
::
vector
<
int
>
outLengths
,
const
std
::
vector
<
int
>
outStrides
,
const
std
::
vector
<
int
>
reduceDims
,
float
alpha
,
float
beta
,
const
InDataType
*
in_dev
,
OutDataType
*
out_dev
,
IndexDataType
*
out_indices_dev
,
AccDataType
*
workspace_dev
,
const
InElementwiseOperation
in_elementwise_op
,
const
AccElementwiseOperation
acc_elementwise_op
)
:
outLengths_
{
outLengths
},
outStrides_
{
outStrides
},
in_dev_
{
in_dev
},
out_dev_
{
out_dev
},
out_indices_dev_
{
out_indices_dev
},
in_elementwise_op_
{
in_elementwise_op
},
acc_elementwise_op_
{
acc_elementwise_op
}
{
(
void
)
workspace_dev
;
inLengths_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(inLengths_);

            if constexpr(NumInvariantDim == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths_[NumInvariantDim - 1];

            reduce_lowest_length = inLengths_[Rank - 1];

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k =
                DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m =
                DeviceReduceBlockWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);

            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);

            using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise<
                InDataType, OutDataType, AccDataType, IndexDataType, InGridDesc_M_K, OutGridDesc_M,
                ReduceOperation, InElementwiseOperation, AccElementwiseOperation, PropagateNan,
                BetaIsZero, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
                KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_reduce_blockwise<
                GridwiseReduce, NeedIndices, InDataType, OutDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, OutGridDesc_M, InElementwiseOperation, AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(stream_config, kernel,
                                              dim3(arg.gridSize), dim3(BlockSize), 0,
                                              in_grid_desc_m_k, out_grid_desc_m,
                                              arg.in_elementwise_op_, arg.acc_elementwise_op_,
                                              arg.alpha_, arg.in_dev_, arg.beta_, arg.out_dev_,
                                              nullptr, arg.out_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return (false);
            }
            else
            {
                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
                    return (false);

                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                    return (false);
            };
        }
        else
        {
            if(pArg->inStrides_[Rank - 1] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        // cases with very small reduce_total_length should be handled by the ThreadWise method
        if(pArg->reduce_total_length / KThreadSliceSize < 2)
            return (false);

        return (true);
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<int> inLengths, const std::vector<int> inStrides,
                        const std::vector<int> outLengths, const std::vector<int> outStrides,
                        const std::vector<int> reduceDims, float alpha, float beta,
                        const void* in_dev, void* out_dev, void* out_indices_dev,
                        void* workspace_dev,
                        const InElementwiseOperation in_elementwise_op,
                        const AccElementwiseOperation acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths, inStrides, outLengths, outStrides, reduceDims,
                                          alpha, beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op, acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceBlockWise<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
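Editor's note: the grid sizing and the argument checks above boil down to a few integer conditions. The standalone sketch below mirrors the core of that logic outside the library; the function and parameter names are illustrative and are not part of composable_kernel.

#include <cstddef>

// Illustrative stand-in (not library code) for the launch sizing and the main
// divisibility checks of DeviceReduceBlockWise::IsSupportedArgument above.
inline std::size_t blockwise_grid_size(std::size_t invariant_total_length, int m_block_tile_size)
{
    // one workgroup per M tile, with the last tile padded
    return (invariant_total_length + m_block_tile_size - 1) / m_block_tile_size;
}

inline bool blockwise_vector_sizes_ok(int invariant_lowest_length, int reduce_lowest_length,
                                      std::size_t reduce_total_length, int in_src_vector_dim,
                                      int in_src_vector_size, int out_dst_vector_size,
                                      int k_thread_slice_size)
{
    // vector loads must evenly divide the fastest-varying length on the vectorized dimension
    const int vectored_length =
        (in_src_vector_dim == 0) ? invariant_lowest_length : reduce_lowest_length;
    if(vectored_length % in_src_vector_size != 0)
        return false;

    // vector stores must evenly divide the invariant lowest length
    if(invariant_lowest_length % out_dst_vector_size != 0)
        return false;

    // very small reductions are left to the threadwise method
    return reduce_total_length / k_thread_slice_size >= 2;
}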
include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp deleted 100644 → 0
View file @ 97ac5007
#ifndef DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP
#define DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP

#include <iostream>
#include <sstream>

#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_blockwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim,
          typename ReduceOperation, typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan, bool NeedIndices,
          index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize,
          index_t MThreadSliceSize, index_t KThreadSliceSize,
          index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize>
struct DeviceReduceBlockWiseSecondCall
    : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");
    static_assert((InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    using IndexDataType = int32_t;

    static constexpr bool BetaIsZero = NeedIndices;

    static_assert(std::is_same<InDataType, AccDataType>::value,
                  "InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!");

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
    static constexpr index_t numDstDim       = (NumInvariantDim == 0) ? 1 : NumInvariantDim;

    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                    const std::vector<int>& inStrides)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<2>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<2>{});

        const auto in_grid_desc_m_k =
            make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const auto inPad_M =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K =
            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                    const std::vector<int>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});

        const auto outPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(invariantLength, outPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));

        return (out_grid_desc_m_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<int>& inLengths, const std::vector<int>& inStrides,
                 const std::vector<int>& outLengths, const std::vector<int>& outStrides,
                 float alpha, float beta,
                 const InDataType* in_dev, OutDataType* out_dev,
                 IndexDataType* out_indices_dev, AccDataType* workspace_dev,
                 const InElementwiseOperation& in_elementwise_op,
                 const AccElementwiseOperation& acc_elementwise_op)
            : inLengths_(inLengths),
              inStrides_(inStrides),
              outLengths_(outLengths),
              outStrides_(outStrides),
              in_dev_{in_dev},
              out_dev_{out_dev},
              out_indices_dev_{out_indices_dev},
              in_elementwise_op_(in_elementwise_op),
              acc_elementwise_op_(acc_elementwise_op)
        {
            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            invariant_total_length = inLengths[0];
            reduce_total_length    = inLengths[1];

            invariant_lowest_length = inLengths[0];
            reduce_lowest_length    = inLengths[1];

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize;

            size_t ws_buf2_bytes_offset = math::integer_least_multiple(
                invariant_total_length * reduce_total_length * sizeof(AccDataType), 64);

            if constexpr(NeedIndices)
                workspace_indices_dev_ = reinterpret_cast<index_t*>(
                    reinterpret_cast<char*>(workspace_dev) + ws_buf2_bytes_offset);
            else
                workspace_indices_dev_ = nullptr;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;
        IndexDataType* workspace_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m = DeviceReduceBlockWiseSecondCall::MakeDst1dDescriptor(
                arg.outLengths_, arg.outStrides_);

            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);

            using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise<
                InDataType, OutDataType, AccDataType, IndexDataType, InGridDesc_M_K, OutGridDesc_M,
                ReduceOperation, InElementwiseOperation, AccElementwiseOperation, PropagateNan,
                BetaIsZero, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
                KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_reduce_blockwise_second_call<
                GridwiseReduce, NeedIndices, InDataType, OutDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, OutGridDesc_M, InElementwiseOperation, AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(stream_config, kernel,
                                              dim3(arg.gridSize), dim3(BlockSize), 0,
                                              in_grid_desc_m_k, out_grid_desc_m,
                                              arg.in_elementwise_op_, arg.acc_elementwise_op_,
                                              arg.alpha_, arg.in_dev_, arg.beta_, arg.out_dev_,
                                              arg.workspace_indices_dev_, arg.out_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
            return (false);

        if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
            return (false);

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        // cases with very small reduce_total_length should be handled by the ThreadWise method
        if(pArg->reduce_total_length / KThreadSliceSize < 2)
            return (false);

        return (true);
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<int> inLengths, const std::vector<int> inStrides,
                        const std::vector<int> outLengths, const std::vector<int> outStrides,
                        const std::vector<int> reduceDims, float alpha, float beta,
                        const void* in_dev, void* out_dev, void* out_indices_dev,
                        void* workspace_dev,
                        const InElementwiseOperation in_elementwise_op,
                        const AccElementwiseOperation acc_elementwise_op) override
    {
        (void)reduceDims;

        return std::make_unique<Argument>(inLengths, inStrides, outLengths, outStrides,
                                          alpha, beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op, acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceBlockWiseSecondCall<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
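Editor's note: the second-call argument above finds the partial-index buffer by offsetting into the caller-provided workspace; the partial values occupy the front of the workspace and the int32_t indices start at the next 64-byte boundary. A standalone mirror of that layout arithmetic follows; the names are illustrative, not library API.

#include <cstddef>
#include <cstdint>

// Illustrative mirror (not library code) of the ws_buf2_bytes_offset computation in the
// Argument above: AccDataType partial values fill the front of the workspace and the
// int32_t index buffer starts at the next 64-byte boundary.
inline std::size_t index_buffer_byte_offset(std::size_t invariant_total_length,
                                            std::size_t reduce_total_length,
                                            std::size_t acc_type_bytes)
{
    const std::size_t value_bytes = invariant_total_length * reduce_total_length * acc_type_bytes;
    return (value_bytes + 63) / 64 * 64; // round up to a multiple of 64 bytes
}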
include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
View file @ bb1f8082
...
@@ -14,13 +14,13 @@ namespace device {

 // here, inLengths[] is already shuffled so that lengths of invariant dims are included before those
 // of reduce dims
-template <int Rank, int NumReduceDim>
-std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
+template <index_t Rank, int NumReduceDim>
+std::pair<long_index_t, long_index_t> get_2d_lengths(const std::vector<index_t>& inLengths)
 {
     static_assert(Rank <= 6, "bigger Rank size not supported!");
-    size_t invariant_total_length = 1;
-    size_t reduce_total_length    = 1;
+    long_index_t invariant_total_length = 1;
+    long_index_t reduce_total_length    = 1;
     constexpr int NumInvariantDim = Rank - NumReduceDim;
@@ -35,13 +35,13 @@ std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)

 // helper functions using variadic template arguments
 template <index_t... Ns>
-auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
+auto make_tuple_from_array_and_index_seq(const std::vector<index_t>& lengths, Sequence<Ns...>)
 {
     return make_tuple(static_cast<index_t>(lengths[Ns])...);
 };

 template <index_t arraySize>
-static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arraySize>)
+auto make_tuple_from_array(const std::vector<index_t>& lengths, Number<arraySize>)
 {
     static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
@@ -51,10 +51,10 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arrayS
 };

 template <index_t Rank, index_t NumReduceDim>
-std::vector<int> shuffle_tensor_dimensions(const std::vector<int>& origLengthsStrides,
-                                           const std::vector<int>& reduceDims)
+std::vector<index_t> shuffle_tensor_dimensions(const std::vector<index_t>& origLengthsStrides,
+                                               const std::vector<int>& reduceDims)
 {
-    std::vector<int> newLengthsStrides;
+    std::vector<index_t> newLengthsStrides;

     assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size());
...
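Editor's note: the helpers changed in this hunk implement the dimension bookkeeping that every reduce instance relies on: reduce dimensions are moved behind the invariant dimensions, and each group is then flattened into one length of a 2D (invariant, reduce) problem. The self-contained sketch below shows the same computation with plain std::vector<int> and std::pair, independent of the library's index_t/long_index_t types; the function name is illustrative.

#include <cstddef>
#include <utility>
#include <vector>

// Self-contained illustration (not the library implementation) of what
// shuffle_tensor_dimensions + get_2d_lengths accomplish together: invariant dims are
// grouped in front of reduce dims, then each group is flattened into one length.
inline std::pair<std::size_t, std::size_t> to_2d_lengths(const std::vector<int>& lengths,
                                                         const std::vector<int>& reduceDims)
{
    std::vector<bool> is_reduce(lengths.size(), false);
    for(int d : reduceDims)
        is_reduce[static_cast<std::size_t>(d)] = true;

    std::size_t invariant_total = 1, reduce_total = 1;
    for(std::size_t d = 0; d < lengths.size(); ++d)
        (is_reduce[d] ? reduce_total : invariant_total) *= static_cast<std::size_t>(lengths[d]);

    return {invariant_total, reduce_total};
}

// e.g. lengths {8, 16, 16, 32} reduced over dims {1, 2} flatten to a
// (8 * 32) x (16 * 16) = 256 x 256 two-dimensional reduction problem.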
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp → include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
View file @ bb1f8082
-#ifndef DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP
-#define DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP
+#ifndef DEVICE_REDUCE_MULTIBLOCK_HPP
+#define DEVICE_REDUCE_MULTIBLOCK_HPP
 #include <iostream>
 #include <sstream>
@@ -7,8 +7,9 @@
 #include "device_base.hpp"
 #include "device_reduce.hpp"
 #include "device_reduce_common.hpp"
-#include "gridwise_2d_reduction_multiblock_atomic_add.hpp"
+#include "gridwise_2d_reduction_multiblock.hpp"
 #include "gridwise_set_buffer_value.hpp"
+#include "reduction_operator.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -22,8 +23,10 @@ template <typename InDataType,
           typename ReduceOperation,
           typename InElementwiseOperation,
           typename AccElementwiseOperation,
+          InMemoryDataOperationEnum OutMemoryDataOperation,
           bool PropagateNan,
-          bool NeedIndices,
+          bool OutputIndex,
+          bool HaveIndexInputIfOutputIndex,
           index_t BlockSize,
           index_t MThreadClusterSize,
           index_t KThreadClusterSize,
@@ -32,8 +35,7 @@ template <typename InDataType,
           index_t InSrcVectorDim,
           index_t InSrcVectorSize,
           index_t OutDstVectorSize>
-struct DeviceReduceMultiBlockAtomicAdd
-    : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
+struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
 {
     static_assert(Rank <= 6, "Bigger Rank size is not supported!");
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
@@ -46,26 +48,40 @@ struct DeviceReduceMultiBlockAtomicAdd
     using IndexDataType = int32_t;

+    static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex;
+
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
     static constexpr index_t numSrcDim = Rank;
     static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
     static constexpr bool reduceAllDim = (NumInvariantDim == 0);

-    static constexpr bool support_AtomicAdd =
-        std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
-
-    static_assert(!NeedIndices && support_AtomicAdd,
-                  "MultiBlockAtomicAdd method can only be used with non-indiced operation and when "
-                  "having float/double output type!");
+    // So far, only AtomicAdd is considered, other Atomic Operation like AtomicMax can be added
+    // later
+    static constexpr bool use_multiblock =
+        (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);
+
+    static constexpr bool out_type_compatible_with_atomic_op =
+        std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
+
+    static_assert(!use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op),
+                  "The OutDataType must support the atomic operation for using MultiBlock reduction");
+
+    static_assert(!use_multiblock || (use_multiblock && !OutputIndex),
+                  "MultiBlock reduction can only be used when outputing index is not required");
+
+    static_assert(ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation),
+                  "The reduction accumulation operation must be compatible with the OutMemoryDataOperation!");

-    static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
-    static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
+    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
+    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

-    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
-                                    const std::vector<int>& inStrides,
+    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
+                                    const std::vector<index_t>& inStrides,
                                     int blkGroupSize,
-                                    int kBlockTileIterations)
+                                    int numBlockTileIteration)
     {
         const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
         const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
@@ -109,7 +125,7 @@ struct DeviceReduceMultiBlockAtomicAdd
         const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
         const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

-        const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
+        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
         const auto inPad_M =
             math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
         const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
@@ -124,8 +140,8 @@ struct DeviceReduceMultiBlockAtomicAdd
         return (in_grid_desc_m_k_padded);
     };

-    static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
-                                    const std::vector<int>& outStrides)
+    static auto MakeDst1dDescriptor(const std::vector<index_t>& outLengths,
+                                    const std::vector<index_t>& outStrides)
     {
         const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
         const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
@@ -151,31 +167,56 @@ struct DeviceReduceMultiBlockAtomicAdd
         return (out_grid_desc_m_padded);
     };

+    static auto MakeDst1dDescriptorForBufferSet(const std::vector<index_t>& outLengths,
+                                                const std::vector<index_t>& outStrides)
+    {
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
+
+        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
+
+        auto out_grid_desc_m = transform_tensor_descriptor(
+            outDesc,
+            make_tuple(make_merge_transform(tupleDstLengths)),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
+            make_tuple(Sequence<0>{}));
+
+        const auto length = out_grid_desc_m.GetLength(Number<0>{});
+
+        const auto pad = math::integer_least_multiple(length, BlockSize) - length;
+
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(length, pad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
+        return (out_grid_desc_m_padded);
+    };
+
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int> inLengths,
-                 const std::vector<int> inStrides,
-                 const std::vector<int> outLengths,
-                 const std::vector<int> outStrides,
+        Argument(const std::vector<index_t> inLengths,
+                 const std::vector<index_t> inStrides,
+                 const std::vector<index_t> outLengths,
+                 const std::vector<index_t> outStrides,
                  const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
+                 const IndexDataType* in_index_dev,
                  OutDataType* out_dev,
-                 IndexDataType* out_indices_dev,
-                 AccDataType* workspace_dev,
+                 IndexDataType* out_index_dev,
                  const InElementwiseOperation in_elementwise_op,
                  const AccElementwiseOperation acc_elementwise_op)
             : outLengths_{outLengths},
               outStrides_{outStrides},
               in_dev_{in_dev},
+              in_index_dev_{in_index_dev},
               out_dev_{out_dev},
+              out_index_dev_{out_index_dev},
               in_elementwise_op_{in_elementwise_op},
               acc_elementwise_op_{acc_elementwise_op}
         {
-            (void)out_indices_dev;
-            (void)workspace_dev;
-
             inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
             inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
@@ -192,23 +233,34 @@ struct DeviceReduceMultiBlockAtomicAdd
             reduce_lowest_length = inLengths_[Rank - 1];

-            int iterations = 1;
-            while(true)
-            {
-                int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
-                                       (K_BlockTileSize * iterations);
-
-                // we want the blkGroupSize be not more than 128
-                if(testBlkGroupSize <= 128)
-                    break;
-
-                iterations++;
-            };
-
-            blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
-                           (K_BlockTileSize * iterations);
-
-            kBlockTileIterations = iterations;
+            if constexpr(use_multiblock)
+            {
+                int iterations = 1;
+                while(true)
+                {
+                    int testBlkGroupSize =
+                        (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
+                        (K_BlockTileSize * iterations);
+
+                    // we want the blkGroupSize be not more than 128
+                    if(testBlkGroupSize <= 128)
+                        break;
+
+                    iterations++;
+                };
+
+                blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
+                               (K_BlockTileSize * iterations);
+                numBlockTileIteration = iterations;
+            }
+            else
+            {
+                blkGroupSize          = 1;
+                numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
+            };

             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize * blkGroupSize;
@@ -217,27 +269,29 @@ struct DeviceReduceMultiBlockAtomicAdd
                 math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize;
         }

-        std::vector<int> inLengths_;
-        std::vector<int> inStrides_;
-        std::vector<int> outLengths_;
-        std::vector<int> outStrides_;
+        std::vector<index_t> inLengths_;
+        std::vector<index_t> inStrides_;
+        std::vector<index_t> outLengths_;
+        std::vector<index_t> outStrides_;

         AccDataType alpha_;
         AccDataType beta_;

         const InDataType* in_dev_;
+        const IndexDataType* in_index_dev_;
         OutDataType* out_dev_;
+        IndexDataType* out_index_dev_;

         InElementwiseOperation in_elementwise_op_;
         AccElementwiseOperation acc_elementwise_op_;

-        int invariant_lowest_length;
-        int reduce_lowest_length;
-        size_t invariant_total_length;
-        size_t reduce_total_length;
+        index_t invariant_lowest_length;
+        index_t reduce_lowest_length;
+        long_index_t invariant_total_length;
+        long_index_t reduce_total_length;

-        index_t blkGroupSize;
-        index_t kBlockTileIterations;
+        int blkGroupSize;
+        int numBlockTileIteration;
         size_t gridSize;

         size_t gridSize_pre;
@@ -247,52 +301,69 @@ struct DeviceReduceMultiBlockAtomicAdd
     {
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor(
-                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
-            const auto out_grid_desc_m = DeviceReduceMultiBlockAtomicAdd::MakeDst1dDescriptor(
-                arg.outLengths_, arg.outStrides_);
+            const auto in_grid_desc_m_k = DeviceReduceMultiBlock::MakeSrc2dDescriptor(
+                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
+            const auto out_grid_desc_m =
+                DeviceReduceMultiBlock::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
+            const auto out_grid_desc_m_2 = DeviceReduceMultiBlock::MakeDst1dDescriptorForBufferSet(
+                arg.outLengths_, arg.outStrides_);

             using InGridDesc_M_K = decltype(in_grid_desc_m_k);
             using OutGridDesc_M  = decltype(out_grid_desc_m);
+            using OutGridDesc_M_2 = decltype(out_grid_desc_m_2);

-            using GridwiseReduce = GridwiseReduction_mk_to_m_multiblock_atomic_add<
-                InDataType, OutDataType, AccDataType, InGridDesc_M_K, OutGridDesc_M,
-                ReduceOperation, InElementwiseOperation, AccElementwiseOperation, PropagateNan,
-                BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
-                KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;
-
-            float avg_time = 0;
-
-            const auto kernel_pre = kernel_buffer_set_value<BlockSize, OutDataType, OutGridDesc_M>;
-
-            const auto kernel_main = kernel_reduce_multiblock_atocmi_add<
-                GridwiseReduce, InDataType, OutDataType, AccDataType, InGridDesc_M_K,
-                OutGridDesc_M, InElementwiseOperation, AccElementwiseOperation>;
-
-            avg_time += launch_and_time_kernel(stream_config, kernel_pre, dim3(arg.gridSize_pre),
-                                               dim3(BlockSize), 0, out_grid_desc_m, arg.out_dev_,
-                                               static_cast<OutDataType>(0.0f));
+            using GridwiseReduce = GridwiseReduction_mk_to_m_multiblock<
+                InDataType, OutDataType, AccDataType, IndexDataType, InGridDesc_M_K, OutGridDesc_M,
+                ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
+                OutMemoryDataOperation, PropagateNan, BlockSize, MThreadClusterSize,
+                KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim,
+                InSrcVectorSize, OutDstVectorSize>;
+
+            const auto kernel_main = kernel_reduce_multiblock<
+                GridwiseReduce, OutputIndex, HaveIndexInput, InDataType, OutDataType, AccDataType,
+                int32_t, InGridDesc_M_K, OutGridDesc_M, InElementwiseOperation,
+                AccElementwiseOperation>;
+
+            float avg_time = 0;
+
+            if constexpr(use_multiblock)
+            {
+                const auto zeroVal =
+                    ck::reduce::GetReductionZeroValueForInMemoryDataOperation<OutDataType>(
+                        OutMemoryDataOperation);
+
+                const auto kernel_pre =
+                    kernel_buffer_set_value<BlockSize, OutDataType, OutGridDesc_M_2>;
+
+                avg_time += launch_and_time_kernel(stream_config, kernel_pre,
                                                   dim3(arg.gridSize_pre), dim3(BlockSize), 0,
+                                                   out_grid_desc_m_2, arg.out_dev_, zeroVal);
+            };

             avg_time += launch_and_time_kernel(stream_config,
                                                kernel_main,
@@ -304,25 +375,34 @@ struct DeviceReduceMultiBlockAtomicAdd
                                                arg.in_elementwise_op_,
                                                arg.acc_elementwise_op_,
                                                arg.blkGroupSize,
-                                               arg.kBlockTileIterations,
+                                               arg.numBlockTileIteration,
                                                arg.alpha_,
                                                arg.in_dev_,
-                                               arg.out_dev_);
+                                               arg.in_index_dev_,
+                                               arg.beta_,
+                                               arg.out_dev_,
+                                               arg.out_index_dev_);

-            return avg_time;
+            return (avg_time);
         };

         float Run(const BaseArgument* p_arg,
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         };
     };

     bool IsSupportedArgument(const BaseArgument* p_arg) override
     {
         const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

+        if constexpr(use_multiblock)
+        {
+            if(static_cast<float>(pArg->beta_) != 0.0f)
+                return (false);
+        };
+
         if constexpr(InSrcVectorDim == 0)
         {
             if constexpr(NumInvariantDim == 0)
@@ -347,36 +427,43 @@ struct DeviceReduceMultiBlockAtomicAdd
                 return (false);
         };

-        if(static_cast<float>(pArg->beta_) != 0.0f)
-            return (false);
-
         // To improve
         if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
             return (false);

-        // cases with small reduce_total_length should be handled by the BlockWise method
-        if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
-            return (false);
-
-        // This is very strong restriction, but needed to avoid some failure
-        if(pArg->invariant_lowest_length % M_BlockTileSize != 0)
-            return (false);
+        if constexpr(use_multiblock)
+        {
+            // blkGroupSize of 1 should be handled by Blockwise path using
+            // InMemoryDataOperationEnum::Set
+            if(pArg->blkGroupSize == 1)
+                return (false);
+
+            // This is very strong restriction, but needed to avoid some failure
+            if(pArg->invariant_lowest_length % M_BlockTileSize != 0)
+                return (false);
+        }
+        else
+        {
+            // cases with very small reduce_total_length should be handled by ThreadWise kernel
+            if(pArg->reduce_total_length / KThreadSliceSize < 2)
+                return (false);
+        };

         return (true);
     };

     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int> inLengths,
-                        const std::vector<int> inStrides,
-                        const std::vector<int> outLengths,
-                        const std::vector<int> outStrides,
+    MakeArgumentPointer(const std::vector<index_t> inLengths,
+                        const std::vector<index_t> inStrides,
+                        const std::vector<index_t> outLengths,
+                        const std::vector<index_t> outStrides,
                         const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
+                        const void* in_index_dev,
                         void* out_dev,
-                        void* out_indices_dev,
-                        void* workspace_dev,
+                        void* out_index_dev,
                         const InElementwiseOperation in_elementwise_op,
                         const AccElementwiseOperation acc_elementwise_op) override
     {
@@ -388,9 +475,9 @@ struct DeviceReduceMultiBlockAtomicAdd
                                           alpha,
                                           beta,
                                           static_cast<const InDataType*>(in_dev),
+                                          static_cast<const IndexDataType*>(in_index_dev),
                                           static_cast<OutDataType*>(out_dev),
-                                          static_cast<IndexDataType*>(out_indices_dev),
-                                          static_cast<AccDataType*>(workspace_dev),
+                                          static_cast<IndexDataType*>(out_index_dev),
                                           in_elementwise_op,
                                           acc_elementwise_op);
     };
...
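Editor's note: the constructor logic in this diff decides how many workgroups cooperate on one reduction row: the multiblock (AtomicAdd) path grows the per-block iteration count until at most 128 workgroups share a row, otherwise a single workgroup covers the whole row. The same selection is written below as a standalone helper for illustration; the struct and function names are assumed, not library API.

#include <cstddef>

struct MultiblockSplit
{
    int blkGroupSize;          // workgroups cooperating on one reduction row
    int numBlockTileIteration; // K tiles each workgroup walks through
};

// Illustrative mirror (not library code) of the split selection in the Argument
// constructor above.
inline MultiblockSplit choose_multiblock_split(std::size_t reduce_total_length,
                                               int k_block_tile_size, bool use_multiblock)
{
    if(!use_multiblock)
        return {1, static_cast<int>((reduce_total_length + k_block_tile_size - 1) /
                                    k_block_tile_size)};

    int iterations = 1;
    for(;;)
    {
        const std::size_t per_block = static_cast<std::size_t>(k_block_tile_size) * iterations;
        const int groups = static_cast<int>((reduce_total_length + per_block - 1) / per_block);

        if(groups <= 128) // keep the number of cooperating workgroups bounded
            return {groups, iterations};

        ++iterations;
    }
}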
include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp deleted 100644 → 0
View file @ 97ac5007
#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP

#include <iostream>
#include <sstream>

#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim,
          typename ReduceOperation, typename InElementwiseOperation,
          typename AccElementwiseOperation,
          bool PropagateNan, bool NeedIndices,
          index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize,
          index_t MThreadSliceSize, index_t KThreadSliceSize,
          index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize>
struct DeviceReduceMultiBlockPartialReduce
    : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");
    static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                      (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
    static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");

    using IndexDataType = int32_t;

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
    static constexpr index_t numSrcDim       = Rank;
    static constexpr index_t numDstDim       = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
    static constexpr bool reduceAllDim       = (NumInvariantDim == 0);

    static constexpr int M_BlockTileSize   = MThreadClusterSize * MThreadSliceSize;
    static constexpr int K_BlockTileSize   = KThreadClusterSize * KThreadSliceSize;
    static constexpr int MaxBlockGroupSize = 256;

    long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
                                         const std::vector<int> reduceDims) override
    {
        size_t invariant_total_length;
        size_t reduce_total_length;

        auto inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);

        std::tie(invariant_total_length, reduce_total_length) =
            get_2d_lengths<Rank, NumReduceDim>(inLengths_);

        int iterations = 1;
        while(true)
        {
            int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                   (K_BlockTileSize * iterations);

            if(testBlkGroupSize <= MaxBlockGroupSize)
                break;

            iterations++;
        };

        int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                           (K_BlockTileSize * iterations);

        long_index_t workspace_size = invariant_total_length * blkGroupSize;

        long_index_t wsSizeInBytes =
            !NeedIndices
                ? workspace_size * sizeof(AccDataType)
                : workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int);

        return (wsSizeInBytes);
    };

    bool HasFurtherCall() override { return (true); };

    static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                    const std::vector<int>& inStrides,
                                    int blkGroupSize, int kBlockTileIterations)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDim)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

                const auto reduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(reduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
        const auto inPad_M =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeWorkspace2dDescriptor(int invariantLength, int blkGroupSize)
    {
        auto ws_desc_m_k =
            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));

        const auto wsPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto ws_desc_m_k_padded = transform_tensor_descriptor(
            ws_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, wsPad),
                       make_pass_through_transform(blkGroupSize)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (ws_desc_m_k_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<int> inLengths, const std::vector<int> inStrides,
                 const std::vector<int> outLengths, const std::vector<int> outStrides,
                 const std::vector<int> reduceDims, float alpha, float beta,
                 const InDataType* in_dev, OutDataType* out_dev,
                 IndexDataType* out_indices_dev, AccDataType* workspace_dev,
                 const InElementwiseOperation in_elementwise_op,
                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
              out_dev_{out_dev},
              out_indices_dev_{out_indices_dev},
              workspace_dev_{workspace_dev},
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(inLengths_);

            if constexpr(NumInvariantDim == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths_[NumInvariantDim - 1];

            reduce_lowest_length = inLengths_[Rank - 1];

            int iterations = 1;
            while(true)
            {
                int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                       (K_BlockTileSize * iterations);

                if(testBlkGroupSize <= MaxBlockGroupSize)
                    break;

                iterations++;
            };

            blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                           (K_BlockTileSize * iterations);

            kBlockTileIterations = iterations;

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize * blkGroupSize;

            size_t ws_buf2_bytes_offset = math::integer_least_multiple(
                invariant_total_length * blkGroupSize * sizeof(AccDataType), 64);

            if constexpr(NeedIndices)
                workspace_indices_dev_ = reinterpret_cast<int*>(
                    reinterpret_cast<char*>(workspace_dev_) + ws_buf2_bytes_offset);
            else
                workspace_indices_dev_ = nullptr;
        }

        std::vector<int> inLengths_;
        std::vector<int> inStrides_;
        std::vector<int> outLengths_;
        std::vector<int> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_indices_dev_;
        AccDataType* workspace_dev_;
        IndexDataType* workspace_indices_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        int invariant_lowest_length;
        int reduce_lowest_length;
        size_t invariant_total_length;
        size_t reduce_total_length;

        index_t blkGroupSize;
        index_t kBlockTileIterations;
        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
            const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor(
                arg.invariant_total_length, arg.blkGroupSize);

            using InGridDesc_M_K    = decltype(in_grid_desc_m_k);
            using WorkspaceDesc_M_K = decltype(ws_desc_m_k);

            using GridwiseReduce = GridwiseReduction_mk_to_mk_multiblock_partial_reduce<
                InDataType, AccDataType, IndexDataType, InGridDesc_M_K, WorkspaceDesc_M_K,
                ReduceOperation, InElementwiseOperation, AccElementwiseOperation, PropagateNan,
                BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
                KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            float avg_time = 0;

            const auto kernel = kernel_partial_reduce_multiblock<
                GridwiseReduce, NeedIndices, InDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, WorkspaceDesc_M_K, InElementwiseOperation,
                AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(stream_config, kernel,
                                              dim3(arg.gridSize), dim3(BlockSize), 0,
                                              in_grid_desc_m_k, ws_desc_m_k,
                                              arg.in_elementwise_op_, arg.acc_elementwise_op_,
                                              arg.blkGroupSize, arg.kBlockTileIterations,
                                              arg.in_dev_, arg.workspace_dev_,
                                              arg.workspace_indices_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(OutDstVectorSize != 1)
            return (false);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return (false);
            }
            else
            {
                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
                    return (false);

                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                    return (false);
            };
        }
        else
        {
            if(pArg->inStrides_[Rank - 1] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // cases with small reduce_total_length should be handled by the BlockWise method
        if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
            return (false);

        return (true);
    };

    std::vector<int> GetWorkspace2dLengths(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        return (std::vector<int>{static_cast<int>(pArg->invariant_total_length),
                                 pArg->blkGroupSize});
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<int> inLengths, const std::vector<int> inStrides,
                        const std::vector<int> outLengths, const std::vector<int> outStrides,
                        const std::vector<int> reduceDims, float alpha, float beta,
                        const void* in_dev, void* out_dev, void* out_indices_dev,
                        void* workspace_dev,
                        const InElementwiseOperation in_elementwise_op,
                        const AccElementwiseOperation acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths, inStrides, outLengths, outStrides, reduceDims,
                                          alpha, beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_indices_dev),
                                          static_cast<AccDataType*>(workspace_dev),
                                          in_elementwise_op, acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
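Editor's note: GetWorkspaceSizeInBytes above sizes the scratch buffer from the same block-group computation: one AccDataType partial value per (invariant element, block group), plus matching int32_t indices and 64 bytes of alignment slack when indices are requested. A standalone mirror of that arithmetic follows; the names are illustrative, not library API.

#include <cstddef>
#include <cstdint>

// Illustrative mirror (not library code) of
// DeviceReduceMultiBlockPartialReduce::GetWorkspaceSizeInBytes above.
inline std::size_t partial_reduce_workspace_bytes(std::size_t invariant_total_length,
                                                  int blkGroupSize, std::size_t acc_type_bytes,
                                                  bool need_indices)
{
    const std::size_t partials = invariant_total_length * static_cast<std::size_t>(blkGroupSize);

    return need_indices ? partials * (acc_type_bytes + sizeof(std::int32_t)) + 64 + sizeof(int)
                        : partials * acc_type_bytes;
}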
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
View file @ bb1f8082
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
#include "device.hpp"
#include "device.hpp"
#include "device_reduce.hpp"
#include "device_reduce.hpp"
#include "device_reduce_common.hpp"
#include "device_reduce_common.hpp"
#include "gridwise_2d_reduction_multiblock.hpp"
#include "gridwise_2d_reduction_threadwise.hpp"
#include "gridwise_2d_reduction_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -19,22 +20,19 @@ template <typename InDataType,
...
@@ -19,22 +20,19 @@ template <typename InDataType,
index_t
NumReduceDim
,
index_t
NumReduceDim
,
typename
ReduceOperation
,
typename
ReduceOperation
,
typename
InElementwiseOperation
,
typename
InElementwiseOperation
,
typename
Out
ElementwiseOperation
,
typename
Acc
ElementwiseOperation
,
bool
PropagateNan
,
bool
PropagateNan
,
bool
NeedIndices
,
bool
OutputIndex
,
bool
HaveIndexInputIfOutputIndex
,
index_t
BlockSize
,
index_t
BlockSize
,
index_t
MThreadClusterSize
,
index_t
KThreadClusterSize
,
index_t
MThreadSliceSize
,
index_t
MThreadSliceSize
,
index_t
KThreadSliceSize
,
index_t
KThreadSliceSize
,
index_t
InSrcVectorDim
,
index_t
InSrcVectorDim
,
index_t
InSrcVectorSize
,
index_t
InSrcVectorSize
,
index_t
OutDstVectorSize
>
index_t
OutDstVectorSize
>
struct
DeviceReduceThreadWise
:
public
DeviceReduce
<
InElementwiseOperation
,
Out
ElementwiseOperation
>
struct
DeviceReduceThreadWise
:
public
DeviceReduce
<
InElementwiseOperation
,
Acc
ElementwiseOperation
>
{
{
static_assert
(
Rank
<=
6
,
"Bigger Rank size is not supported!"
);
static_assert
(
Rank
<=
6
,
"Bigger Rank size is not supported!"
);
static_assert
((
BlockSize
==
MThreadClusterSize
)
&&
(
KThreadClusterSize
==
1
),
"Threadwise can only be called with KThreadClusterSize be 1 !"
);
static_assert
(((
InSrcVectorDim
==
0
&&
MThreadSliceSize
%
InSrcVectorSize
==
0
)
||
static_assert
(((
InSrcVectorDim
==
0
&&
MThreadSliceSize
%
InSrcVectorSize
==
0
)
||
(
InSrcVectorDim
==
1
&&
KThreadSliceSize
%
InSrcVectorSize
==
0
))
&&
(
InSrcVectorDim
==
1
&&
KThreadSliceSize
%
InSrcVectorSize
==
0
))
&&
...
@@ -43,7 +41,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
...
@@ -43,7 +41,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
using
IndexDataType
=
int32_t
;
using
IndexDataType
=
int32_t
;
static
constexpr
bool
BetaIsZero
=
NeedIndices
;
static
constexpr
bool
HaveIndexInput
=
OutputIndex
&&
HaveIndexInputIfOutputIndex
;
static
constexpr
index_t
NumInvariantDim
=
Rank
-
NumReduceDim
;
static
constexpr
index_t
NumInvariantDim
=
Rank
-
NumReduceDim
;
...
@@ -51,11 +49,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
...
@@ -51,11 +49,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
static
constexpr
index_t
numDstDim
=
(
NumInvariantDim
==
0
)
?
1
:
NumInvariantDim
;
static
constexpr
index_t
numDstDim
=
(
NumInvariantDim
==
0
)
?
1
:
NumInvariantDim
;
static
constexpr
bool
reduceAllDim
=
(
NumInvariantDim
==
0
);
static
constexpr
bool
reduceAllDim
=
(
NumInvariantDim
==
0
);
static
constexpr
int
M_BlockTileSize
=
MThreadCluster
Size
*
MThreadSliceSize
;
static
constexpr
in
dex_
t
M_BlockTileSize
=
Block
Size
*
MThreadSliceSize
;
static
constexpr
int
K_BlockTileSize
=
KThreadClusterSize
*
KThreadSliceSize
;
static
constexpr
in
dex_
t
K_BlockTileSize
=
1
*
KThreadSliceSize
;
static
auto
MakeSrc2dDescriptor
(
const
std
::
vector
<
int
>&
inLengths
,
static
auto
MakeSrc2dDescriptor
(
const
std
::
vector
<
in
dex_
t
>&
inLengths
,
const
std
::
vector
<
int
>&
inStrides
)
const
std
::
vector
<
in
dex_
t
>&
inStrides
)
{
{
const
auto
tupleSrcLengths
=
make_tuple_from_array
(
inLengths
,
Number
<
numSrcDim
>
{});
const
auto
tupleSrcLengths
=
make_tuple_from_array
(
inLengths
,
Number
<
numSrcDim
>
{});
const
auto
tupleSrcStrides
=
make_tuple_from_array
(
inStrides
,
Number
<
numSrcDim
>
{});
const
auto
tupleSrcStrides
=
make_tuple_from_array
(
inStrides
,
Number
<
numSrcDim
>
{});
...
@@ -114,8 +112,8 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
...
@@ -114,8 +112,8 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
return
(
in_grid_desc_m_k_padded
);
return
(
in_grid_desc_m_k_padded
);
};
};
static
auto
MakeDst1dDescriptor
(
const
std
::
vector
<
int
>&
outLengths
,
static
auto
MakeDst1dDescriptor
(
const
std
::
vector
<
in
dex_
t
>&
outLengths
,
const
std
::
vector
<
int
>&
outStrides
)
const
std
::
vector
<
in
dex_
t
>&
outStrides
)
{
{
const
auto
tupleDstLengths
=
make_tuple_from_array
(
outLengths
,
Number
<
numDstDim
>
{});
const
auto
tupleDstLengths
=
make_tuple_from_array
(
outLengths
,
Number
<
numDstDim
>
{});
const
auto
tupleDstStrides
=
make_tuple_from_array
(
outStrides
,
Number
<
numDstDim
>
{});
const
auto
tupleDstStrides
=
make_tuple_from_array
(
outStrides
,
Number
<
numDstDim
>
{});
...
@@ -143,30 +141,26 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
...
@@ -143,30 +141,26 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
struct
Argument
:
public
BaseArgument
struct
Argument
:
public
BaseArgument
{
{
Argument
(
const
std
::
vector
<
int
>
inLengths
,
Argument
(
const
std
::
vector
<
in
dex_
t
>
inLengths
,
const
std
::
vector
<
int
>
inStrides
,
const
std
::
vector
<
in
dex_
t
>
inStrides
,
const
std
::
vector
<
int
>
outLengths
,
const
std
::
vector
<
in
dex_
t
>
outLengths
,
const
std
::
vector
<
int
>
outStrides
,
const
std
::
vector
<
in
dex_
t
>
outStrides
,
const
std
::
vector
<
int
>
reduceDims
,
const
std
::
vector
<
int
>
reduceDims
,
float
alpha
,
float
alpha
,
float
beta
,
float
beta
,
const
InDataType
*
in_dev
,
const
InDataType
*
in_dev
,
OutDataType
*
out_dev
,
OutDataType
*
out_dev
,
IndexDataType
*
out_indices_dev
,
IndexDataType
*
out_index_dev
,
AccDataType
*
workspace_dev
,
const
InElementwiseOperation
in_elementwise_op
,
const
InElementwiseOperation
in_elementwise_op
,
const
Out
ElementwiseOperation
acc_elementwise_op
)
const
Acc
ElementwiseOperation
acc_elementwise_op
)
:
outLengths_
{
outLengths
},
:
outLengths_
{
outLengths
},
outStrides_
{
outStrides
},
outStrides_
{
outStrides
},
in_dev_
{
in_dev
},
in_dev_
{
in_dev
},
out_dev_
{
out_dev
},
out_dev_
{
out_dev
},
out_ind
ices
_dev_
{
out_ind
ices
_dev
},
out_ind
ex
_dev_
{
out_ind
ex
_dev
},
in_elementwise_op_
{
in_elementwise_op
},
in_elementwise_op_
{
in_elementwise_op
},
acc_elementwise_op_
{
acc_elementwise_op
}
acc_elementwise_op_
{
acc_elementwise_op
}
{
{
(
void
)
workspace_dev
;
inLengths_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
inLengths
,
reduceDims
);
inLengths_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
inLengths
,
reduceDims
);
inStrides_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
inStrides
,
reduceDims
);
inStrides_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
inStrides
,
reduceDims
);
 ...
 @@ -183,30 +177,33 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
             reduce_lowest_length = inLengths_[Rank - 1];
+            numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;

             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;
         }

-        std::vector<int> inLengths_;
-        std::vector<int> inStrides_;
-        std::vector<int> outLengths_;
-        std::vector<int> outStrides_;
+        std::vector<index_t> inLengths_;
+        std::vector<index_t> inStrides_;
+        std::vector<index_t> outLengths_;
+        std::vector<index_t> outStrides_;

         AccDataType alpha_;
         AccDataType beta_;

         const InDataType* in_dev_;
         OutDataType* out_dev_;
-        IndexDataType* out_indices_dev_;
+        IndexDataType* out_index_dev_;

         InElementwiseOperation in_elementwise_op_;
-        OutElementwiseOperation acc_elementwise_op_;
+        AccElementwiseOperation acc_elementwise_op_;

-        int invariant_lowest_length;
-        int reduce_lowest_length;
-        size_t invariant_total_length;
-        size_t reduce_total_length;
+        index_t invariant_lowest_length;
+        index_t reduce_lowest_length;
+        long_index_t invariant_total_length;
+        long_index_t reduce_total_length;

+        int numBlockTileIteration;
         size_t gridSize;
     };
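The grid-size and tile-count expressions in this hunk are plain ceiling divisions. The following standalone host-side sketch (illustrative only; the free functions and their names are hypothetical, not ck API) reproduces the same arithmetic so it can be checked in isolation:

// Sketch of the ceiling-division arithmetic used by the Argument constructor above.
#include <cassert>
#include <cstdint>
#include <iostream>

using index_t = int32_t;

// Round the invariant length up to a multiple of the M block tile, then count tiles;
// this is what integer_least_multiple(invariant_total_length, M_BlockTileSize) / M_BlockTileSize yields.
index_t num_m_tiles(index_t invariant_total_length, index_t m_block_tile_size)
{
    return (invariant_total_length + m_block_tile_size - 1) / m_block_tile_size;
}

// Same ceiling division over the reduced dimension, matching the numBlockTileIteration line.
index_t num_k_tile_iterations(index_t reduce_total_length, index_t k_block_tile_size)
{
    return (reduce_total_length + k_block_tile_size - 1) / k_block_tile_size;
}

int main()
{
    // e.g. 1000 invariant elements with a 64-wide M tile -> 16 tiles (grid size),
    //      300 reduced elements with a 32-wide K tile -> 10 iterations.
    std::cout << num_m_tiles(1000, 64) << '\n';          // 16
    std::cout << num_k_tile_iterations(300, 32) << '\n'; // 10
    assert(num_m_tiles(1000, 64) == 16);
    return 0;
}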
 ...
 @@ -221,30 +218,30 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
         using InGridDesc_M_K = decltype(in_grid_desc_m_k);
         using OutGridDesc_M  = decltype(out_grid_desc_m);

-        using GridwiseReduce =
-            GridwiseReduction_mk_to_m_threadwise<InDataType, OutDataType, AccDataType,
-                                                 IndexDataType, InGridDesc_M_K, OutGridDesc_M,
-                                                 ReduceOperation, InElementwiseOperation,
-                                                 OutElementwiseOperation, PropagateNan, BetaIsZero,
-                                                 BlockSize, MThreadClusterSize, KThreadClusterSize,
-                                                 MThreadSliceSize, KThreadSliceSize,
-                                                 InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

         float avg_time = 0;

+        using GridwiseReduce =
+            GridwiseReduction_mk_to_m_threadwise<InDataType, OutDataType, AccDataType,
+                                                 IndexDataType, InGridDesc_M_K, OutGridDesc_M,
+                                                 ReduceOperation, InElementwiseOperation,
+                                                 AccElementwiseOperation,
+                                                 InMemoryDataOperationEnum::Set, PropagateNan,
+                                                 BlockSize, MThreadSliceSize, KThreadSliceSize,
+                                                 InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

         const auto kernel = kernel_reduce_threadwise<GridwiseReduce,
-                                                     NeedIndices,
+                                                     OutputIndex,
+                                                     HaveIndexInput,
                                                      InDataType,
                                                      OutDataType,
                                                      AccDataType,
 ...
 @@ -252,7 +249,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                                      InGridDesc_M_K,
                                                      OutGridDesc_M,
                                                      InElementwiseOperation,
-                                                     OutElementwiseOperation>;
+                                                     AccElementwiseOperation>;

         avg_time = launch_and_time_kernel(stream_config,
                                           kernel,
 ...
 @@ -265,9 +262,10 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                           arg.acc_elementwise_op_,
                                           arg.alpha_,
                                           arg.in_dev_,
+                                          nullptr,
                                           arg.beta_,
                                           arg.out_dev_,
-                                          arg.out_indices_dev_);
+                                          arg.out_index_dev_);

             return (avg_time);
         };
 ...
 @@ -276,7 +274,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                   const StreamConfig& stream_config = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
-        ;
     };

     bool IsSupportedArgument(const BaseArgument* p_arg) override
@@ -311,9 +309,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
...
@@ -311,9 +309,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
if
(
pArg
->
invariant_lowest_length
%
OutDstVectorSize
!=
0
)
if
(
pArg
->
invariant_lowest_length
%
OutDstVectorSize
!=
0
)
return
(
false
);
return
(
false
);
// TODO: remove this. Should return true, as long as this DeviceOP instance support this
// cases with big reduce_total_length should be handled by Blockwise kernel
// case for bigger reduce_total_length size, we are supposed to use BlockWise method for
// better performance
if
(
pArg
->
reduce_total_length
/
KThreadSliceSize
>=
32
)
if
(
pArg
->
reduce_total_length
/
KThreadSliceSize
>=
32
)
return
(
false
);
return
(
false
);
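For reference, the two rejection conditions in this hunk amount to a small predicate over quantities computed by the Argument constructor. A standalone sketch, with hypothetical names and hard-coded inputs (not the ck API), purely to show the logic:

// Illustrative restatement of the IsSupportedArgument checks above.
#include <cstdint>
#include <iostream>

using index_t = int32_t;

bool threadwise_reduce_supported(index_t invariant_lowest_length,
                                 int64_t reduce_total_length,
                                 index_t out_dst_vector_size,
                                 index_t k_thread_slice_size)
{
    // the vectorized store along the invariant (M) dimension must divide evenly
    if(invariant_lowest_length % out_dst_vector_size != 0)
        return false;

    // larger reductions are expected to be routed to the blockwise/multiblock kernels
    if(reduce_total_length / k_thread_slice_size >= 32)
        return false;

    return true;
}

int main()
{
    std::cout << std::boolalpha
              << threadwise_reduce_supported(64, 16, 4, 8) << ' '    // true
              << threadwise_reduce_supported(64, 1024, 4, 8) << '\n'; // false
    return 0;
}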
 ...
 @@ -321,20 +317,22 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
     };

     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int> inLengths,
-                        const std::vector<int> inStrides,
-                        const std::vector<int> outLengths,
-                        const std::vector<int> outStrides,
+    MakeArgumentPointer(const std::vector<index_t> inLengths,
+                        const std::vector<index_t> inStrides,
+                        const std::vector<index_t> outLengths,
+                        const std::vector<index_t> outStrides,
                         const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
+                        const void* in_index_dev,
                         void* out_dev,
-                        void* out_indices_dev,
-                        void* workspace_dev,
+                        void* out_index_dev,
                         const InElementwiseOperation in_elementwise_op,
-                        const OutElementwiseOperation acc_elementwise_op) override
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
+        (void)in_index_dev;
+
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
                                           outLengths,
 ...
 @@ -344,8 +342,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                           beta,
                                           static_cast<const InDataType*>(in_dev),
                                           static_cast<OutDataType*>(out_dev),
-                                          static_cast<IndexDataType*>(out_indices_dev),
-                                          static_cast<AccDataType*>(workspace_dev),
+                                          static_cast<IndexDataType*>(out_index_dev),
                                           in_elementwise_op,
                                           acc_elementwise_op);
     };
 ...
 @@ -360,9 +357,9 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
         auto str = std::stringstream();

         // clang-format off
-        str << "DeviceReducceThreadWise<" << BlockSize << ",";
-        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
-        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
+        str << "DeviceReduceThreadWise<" << BlockSize << ",";
+        str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ",";
+        str << "K_C" << 1 << "_S" << KThreadSliceSize << ",";
         str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
         // clang-format on
 ...
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
View file @ bb1f8082
 ...
 @@ -8,6 +8,237 @@
 namespace ck {
+// Rows of column-vectors
+template <index_t MPerBlock,
+          index_t NPerBlock,
+          typename CGridDesc_M_N,
+          bool DeviceCTileIndexCheck = false>
+struct BlockToCTileMap_M00_N0_M01
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    __host__ __device__ BlockToCTileMap_M00_N0_M01() = default;
+
+    __host__ __device__ BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N& c_grid_desc_m_n,
+                                                   index_t M01 = 1)
+        : M01_(M01), underlying_map_(GetBlockToCTileMap(c_grid_desc_m_n, M01))
+    {
+    }
+
+    __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
+    {
+        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
+        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
+
+        const auto M00 = math::integer_divide_ceil(M0, M01_);
+
+        const index_t grid_size = M00 * M01_ * N0;
+
+        return grid_size;
+    }
+
+    template <typename TopIdx>
+    __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
+    {
+        return underlying_map_.CalculateBottomIndex(idx_top);
+    }
+
+    template <typename CTileIdx, typename CTileDim>
+    __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx,
+                                             const CTileDim& c_tile_dim) const
+    {
+        if constexpr(DeviceCTileIndexCheck)
+            return DefaultValidCTileIndex(c_tile_idx, c_tile_dim);
+        else
+            return true;
+    }
+
+    __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const
+    {
+        if constexpr(DeviceCTileIndexCheck)
+            return true; // validity check moved to kernel
+
+        const index_t M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
+        if(M0 % M01_ == 0)
+        {
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+
+    private:
+    __host__ __device__ static constexpr auto
+    GetBlockToCTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01)
+    {
+        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
+        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
+
+        const auto M00 = math::integer_divide_ceil(M0, M01);
+
+        const auto m00_n0_m01_to_m0_n0_block_cluster_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_insert_transform(1),
+                       make_unmerge_transform(make_tuple(M00, M01)),
+                       make_pass_through_transform(make_tuple(N0))),
+            make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}));
+
+        const auto cblockid_to_m00_n0_m01_block_cluster_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(1, M00, N0, M01))),
+            make_tuple(Sequence<0, 1, 2, 3>{}),
+            make_tuple(Sequence<0>{}));
+
+        const auto cblockid_to_m0_n0_block_cluster_adaptor =
+            chain_tensor_adaptors(m00_n0_m01_to_m0_n0_block_cluster_adaptor,
+                                  cblockid_to_m00_n0_m01_block_cluster_adaptor);
+
+        return cblockid_to_m0_n0_block_cluster_adaptor;
+    }
+
+    index_t M01_;
+
+    using UnderlyingMap = decltype(GetBlockToCTileMap(CGridDesc_M_N{}, 1));
+    UnderlyingMap underlying_map_;
+};
+
+// Rows of column-vectors
+// This C-tile map dynamically adjusts M01 when C-tile index is out of range
+template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
+struct BlockToCTileMap_M00_N0_M01Adapt
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt() = default;
+
+    __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
+                                                        index_t M01 = 8)
+        : M01_(M01), c_grid_desc_m_n_(c_grid_desc_m_n)
+    {
+    }
+
+    __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
+    {
+        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
+        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
+
+        const index_t grid_size = M0 * N0;
+
+        return grid_size;
+    }
+
+    template <typename TopIdx>
+    __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
+    {
+        auto block_1d_id = idx_top[I0];
+
+        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I0), MPerBlock);
+        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I1), NPerBlock);
+
+        block_1d_id = block_1d_id % (M0 * N0); // swallow batch index
+
+        index_t idx_N0 = block_1d_id % N0;
+        index_t idx_M0 = block_1d_id / N0;
+
+        const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;
+
+        index_t idx_M00          = idx_M0 / M01_;
+        index_t idx_M01          = idx_M0 % M01_;
+        index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;
+
+        return make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
+                          idx_N0_M01_local / M01_adapt);
+    }
+
+    template <typename CTileIdx, typename CTileDim>
+    __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
+                                             const CTileDim& /* c_tile_dim */) const
+    {
+        return true; // always valid provided that user gets grid size from CalculateGridSize()
+    }
+
+    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
+
+    private:
+    index_t M01_;
+    CGridDesc_M_N c_grid_desc_m_n_;
+};
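The index arithmetic in CalculateBottomIndex above can be exercised on the host. The following standalone sketch reproduces the same remapping for plain integers so the M01 grouping is easy to inspect; the function name and the demo in main are illustrative only, not part of the header:

// Host-side model of BlockToCTileMap_M00_N0_M01Adapt::CalculateBottomIndex:
// maps a 1D block id onto an (m0, n0) C-tile index, grouping up to M01 rows of
// tiles so that consecutive block ids sweep one group before moving on.
#include <cstdint>
#include <iostream>
#include <utility>

using index_t = int32_t;

std::pair<index_t, index_t>
map_block_to_ctile(index_t block_1d_id, index_t M0, index_t N0, index_t M01)
{
    block_1d_id = block_1d_id % (M0 * N0); // swallow batch index

    const index_t idx_N0 = block_1d_id % N0;
    const index_t idx_M0 = block_1d_id / N0;

    // the trailing (M0 % M01) rows form a shorter group, hence the "Adapt" in the name
    const index_t M01_adapt = (idx_M0 < M0 - M0 % M01) ? M01 : M0 % M01;

    const index_t idx_M00          = idx_M0 / M01;
    const index_t idx_M01          = idx_M0 % M01;
    const index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;

    return {idx_N0_M01_local % M01_adapt + idx_M00 * M01,
            idx_N0_M01_local / M01_adapt};
}

int main()
{
    // 10 x 4 grid of C tiles, group height M01 = 8
    for(index_t bid = 0; bid < 10 * 4; ++bid)
    {
        auto [m0, n0] = map_block_to_ctile(bid, 10, 4, 8);
        std::cout << bid << " -> (" << m0 << ", " << n0 << ")\n";
    }
    return 0;
}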
+
+// 2D slices of column-vectors in 3D space
+// This C-tile map dynamically adjusts M01 when C-tile index is out of range
+template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
+struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt() = default;
+
+    __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t M01 = 8, index_t KSplit = 1)
+        : M01_(M01), KSplit_(KSplit), c_grid_desc_m_n_(c_grid_desc_m_n)
+    {
+    }
+
+    __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
+    {
+        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
+        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
+
+        const index_t grid_size = M0 * N0 * KSplit_;
+
+        return grid_size;
+    }
+
+    template <typename TopIdx>
+    __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
+    {
+        auto block_1d_id = idx_top[I0];
+
+        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I0), MPerBlock);
+        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I1), NPerBlock);
+
+        const index_t idx_ksplit = block_1d_id / (M0 * N0);
+        block_1d_id              = block_1d_id % (M0 * N0);
+
+        index_t idx_N0 = block_1d_id % N0;
+        index_t idx_M0 = block_1d_id / N0;
+
+        const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;
+
+        index_t idx_M00          = idx_M0 / M01_;
+        index_t idx_M01          = idx_M0 % M01_;
+        index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;
+
+        return make_tuple(idx_ksplit,
+                          idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
+                          idx_N0_M01_local / M01_adapt);
+    }
+
+    template <typename CTileIdx, typename CTileDim>
+    __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
+                                             const CTileDim& /* c_tile_dim */) const
+    {
+        return true; // always valid provided that user gets grid size from CalculateGridSize()
+    }
+
+    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
+
+    private:
+    index_t M01_;
+    index_t KSplit_;
+    CGridDesc_M_N c_grid_desc_m_n_;
+};
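The only difference from the non-KSplit map is how the 1D block id is decomposed: the leading factor selects the K-split slice and the remainder is fed to the same M01-adaptive (m0, n0) remapping. A short host-side sketch of just that decomposition (names and the demo loop are illustrative, not ck API):

// Host-side model of the KSplit peel-off used above.
#include <cstdint>
#include <iostream>
#include <tuple>

using index_t = int32_t;

std::tuple<index_t, index_t, index_t>
split_block_id(index_t block_1d_id, index_t M0, index_t N0)
{
    const index_t idx_ksplit = block_1d_id / (M0 * N0);
    const index_t rest       = block_1d_id % (M0 * N0);
    // (ksplit, m0, n0) before the M01 shuffle applied by the real map
    return {idx_ksplit, rest / N0, rest % N0};
}

int main()
{
    // grid size = M0 * N0 * KSplit = 4 * 3 * 2 = 24 blocks
    for(index_t bid = 0; bid < 24; ++bid)
    {
        auto [k, m0, n0] = split_block_id(bid, 4, 3);
        std::cout << bid << " -> (ksplit " << k << ", m0 " << m0 << ", n0 " << n0 << ")\n";
    }
    return 0;
}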
+
 // Blocks of row-vectors
 template <index_t MPerBlock,
           index_t NPerBlock,
 ...

include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp
deleted 100644 → 0
View file @ 97ac5007
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_2D_REDUCTION_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "cluster_descriptor.hpp"
#include "element_wise_operation.hpp"
namespace ck {
template <typename GridwiseReduction,
          bool NeedIndices,
          typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename InElementwiseOperation,
          typename OutElementwiseOperation>
__global__ void
kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k,
                        const OutGridDesc_M out_grid_desc_m,
                        const InElementwiseOperation in_elementwise_op,
                        const OutElementwiseOperation acc_elementwise_op,
                        AccDataType alpha,
                        const InDataType* const __restrict__ p_in_global,
                        AccDataType beta,
                        OutDataType* const __restrict__ p_out_global,
                        const IndexDataType* const __restrict__ p_ws_indices_global,
                        IndexDataType* const __restrict__ p_indices_global)
{
    if constexpr(!NeedIndices)
    {
        constexpr bool IsSecondCall = false;

        GridwiseReduction::template Run<IsSecondCall>(in_grid_desc_m_k, out_grid_desc_m,
                                                      in_elementwise_op, acc_elementwise_op, alpha,
                                                      p_in_global, beta, p_out_global,
                                                      p_ws_indices_global, p_indices_global);
    }
    else
    {
        GridwiseReduction::RunWithIndex(in_grid_desc_m_k, out_grid_desc_m, in_elementwise_op,
                                        acc_elementwise_op, alpha, p_in_global, beta, p_out_global,
                                        p_ws_indices_global, p_indices_global);
    };
};
template <typename GridwiseReduction,
          bool NeedIndices,
          typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename InElementwiseOperation,
          typename OutElementwiseOperation>
__global__ void
kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k,
                                    const OutGridDesc_M out_grid_desc_m,
                                    const InElementwiseOperation in_elementwise_op,
                                    const OutElementwiseOperation acc_elementwise_op,
                                    AccDataType alpha,
                                    const InDataType* const __restrict__ p_in_global,
                                    AccDataType beta,
                                    OutDataType* const __restrict__ p_out_global,
                                    const IndexDataType* const __restrict__ p_ws_indices_global,
                                    IndexDataType* const __restrict__ p_indices_global)
{
    if constexpr(!NeedIndices)
    {
        constexpr bool IsSecondCall = true;

        GridwiseReduction::template Run<IsSecondCall>(in_grid_desc_m_k, out_grid_desc_m,
                                                      in_elementwise_op, acc_elementwise_op, alpha,
                                                      p_in_global, beta, p_out_global,
                                                      p_ws_indices_global, p_indices_global);
    }
    else
    {
        GridwiseReduction::RunSecondCallWithIndex(in_grid_desc_m_k, out_grid_desc_m,
                                                  in_elementwise_op, acc_elementwise_op, alpha,
                                                  p_in_global, beta, p_out_global,
                                                  p_ws_indices_global, p_indices_global);
    };
};
template <typename InDataType,
          typename OutDataType,
          typename AccDataType,
          typename IndexDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M,
          typename ReduceOperation,
          typename InElementwiseOperation,
          typename OutElementwiseOperation,
          bool PropagateNan,
          bool BetaIsZero,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct GridwiseReduction_mk_to_m_blockwise
{
    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
    template <bool IsSecondCall>
    __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                               const OutGridDesc_M& out_grid_desc_m,
                               const InElementwiseOperation& in_elementwise_op,
                               const OutElementwiseOperation& acc_elementwise_op,
                               AccDataType alpha,
                               const InDataType* const __restrict__ p_in_global,
                               AccDataType beta,
                               OutDataType* const __restrict__ p_out_global,
                               const IndexDataType* const __restrict__ p_ws_indices_global,
                               IndexDataType* const __restrict__ p_indices_global)
    {
        if constexpr(IsSecondCall)
        {
            static_assert(InSrcVectorDim == 1,
                          "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!");
        };

        using BlockwiseReduce =
            PartitionedBlockwiseReduction<AccDataType, BlockSize, ThreadClusterLengths_M_K,
                                          ThreadClusterArrangeOrder, ReduceOperation, PropagateNan>;

        using ThreadwiseReduce = ThreadwiseReduction<AccDataType, ThreadReduceSrcDesc_M_K,
                                                     ThreadReduceDstDesc_M, ReduceOperation,
                                                     PropagateNan>;

        (void)p_ws_indices_global;
        (void)p_indices_global;

        // LDS
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths         = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc),
            ThreadBufferLengths, ThreadBufferDimAccessOrder, InSrcVectorDim, InSrcVectorSize, 1,
            false>(in_grid_desc_m_k,
                   make_multi_index(block_global_1d_id * M_BlockTileSize +
                                        thread_m_cluster_id * MThreadSliceSize,
                                    thread_k_cluster_id * KThreadSliceSize));

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;

        index_t reducedTiles = 0;
        do
        {
            threadwise_src_load.Run(in_grid_desc_m_k, in_global_buf, thread_buffer_desc,
                                    make_tuple(I0, I0), in_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                // do element-wise pre-reduction operation
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
                    in_elementwise_op(in_thread_buf(Number<offset>{}),
                                      in_thread_buf(Number<offset>{}));
                });
            });

            ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf);

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedTiles++;
        } while(reducedTiles < toReduceTiles);

        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        static_for<0, MThreadSliceSize, 1>{}(
            [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); });

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if(thread_k_cluster_id == 0)
            {
                acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

                accu_value_buf(I) *= alpha;
            }
        });

        if(thread_k_cluster_id == 0)
        {
            if constexpr(!BetaIsZero)
            {
                if(!float_equal_zero{}(beta))
                {
                    StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                        priorDstValueBuf;

                    auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<
                        OutDataType, OutDataType, OutGridDesc_M, decltype(reduced_data_desc),
                        Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize, 1, false>(
                        out_grid_desc_m,
                        make_multi_index(block_global_1d_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize));

                    threadwise_dst_load.Run(out_grid_desc_m, out_global_buf, reduced_data_desc,
                                            make_tuple(I0), priorDstValueBuf);

                    static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                        accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I]) * beta;
                    });
                };
            };

            auto threadwise_dst_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M, PassThroughOp,
                Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
                InMemoryDataOperationEnum::Set, 1, true>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

            threadwise_dst_store.Run(reduced_data_desc, make_tuple(I0), accu_value_buf,
                                     out_grid_desc_m, out_global_buf);
        }
    };
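The core of Run is the do/while tile loop: each pass loads one K-block tile, applies the input elementwise op, and folds the tile into the per-thread accumulators before the final blockwise reduction. A scalar CPU sketch of the same loop structure (illustrative only; no LDS, no vectorization, and the function names are hypothetical):

// CPU sketch of the tile loop in GridwiseReduction_mk_to_m_blockwise::Run,
// here reducing one row with an Add operation.
#include <cstdint>
#include <iostream>
#include <vector>

using index_t = int32_t;

float reduce_row_in_tiles(const std::vector<float>& row, index_t k_block_tile_size)
{
    const index_t toReduceLength = static_cast<index_t>(row.size());
    const index_t toReduceTiles  = (toReduceLength + k_block_tile_size - 1) / k_block_tile_size;

    float acc            = 0.0f; // "zeroVal" of an Add reduction
    index_t reducedTiles = 0;
    index_t offset       = 0;    // plays the role of MoveSrcSliceWindow

    do
    {
        for(index_t k = 0; k < k_block_tile_size; ++k)
        {
            const index_t idx = offset + k;
            // the real kernel pads out-of-range reads with the reduction's zero value
            // through the buffer's invalid-element value; emulate that with a bounds check
            if(idx < toReduceLength)
                acc += row[idx];
        }
        offset += k_block_tile_size;
        reducedTiles++;
    } while(reducedTiles < toReduceTiles);

    return acc;
}

int main()
{
    std::vector<float> row(300, 1.0f);
    std::cout << reduce_row_in_tiles(row, 32) << '\n'; // 300
    return 0;
}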
    __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
                                        const OutGridDesc_M& out_grid_desc_m,
                                        const InElementwiseOperation& in_elementwise_op,
                                        const OutElementwiseOperation& acc_elementwise_op,
                                        AccDataType alpha,
                                        const InDataType* const __restrict__ p_in_global,
                                        AccDataType beta,
                                        OutDataType* const __restrict__ p_out_global,
                                        const IndexDataType* const __restrict__ p_ws_indices_global,
                                        IndexDataType* const __restrict__ p_indices_global)
    {
        using BlockwiseReduceWithIndex = PartitionedBlockwiseReductionWithIndex<
            AccDataType, IndexDataType, BlockSize, ThreadClusterLengths_M_K,
            ThreadClusterArrangeOrder, ReduceOperation, PropagateNan>;

        using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                             ReduceOperation,
                                                                             AccDataType,
                                                                             IndexDataType>;

        (void)p_ws_indices_global;

        // LDS
        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());
        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_indices_global, out_grid_desc_m.GetElementSpaceSize());

        auto reduce_work_val_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_val_buffer, BlockSize);
        auto reduce_work_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_idx_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_idx_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths         = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc),
            ThreadBufferLengths, ThreadBufferDimAccessOrder, InSrcVectorDim, InSrcVectorSize, 1,
            false>(in_grid_desc_m_k,
                   make_multi_index(block_global_1d_id * M_BlockTileSize +
                                        thread_m_cluster_id * MThreadSliceSize,
                                    thread_k_cluster_id * KThreadSliceSize));

        index_t indexOffset = 0;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            accu_value_buf(I) = zeroVal;
            accu_index_buf(I) = 0;
        });

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;

        index_t reducedTiles = 0;
        do
        {
            // load the thread slice
            threadwise_src_load.Run(in_grid_desc_m_k, in_global_buf, thread_buffer_desc,
                                    make_tuple(I0, I0), in_thread_val_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                    // initialize the indices for the per-thread to-reduce values
                    in_thread_idx_buf(Number<offset>{}) =
                        indexOffset + thread_k_cluster_id * KThreadSliceSize + iK();

                    // do element-wise pre-reduction operation
                    in_elementwise_op(in_thread_val_buf(Number<offset>{}),
                                      in_thread_val_buf(Number<offset>{}));
                });

                AccDataType tmpValue   = zeroVal;
                IndexDataType tmpIndex = 0;

                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                    AccumulationWithIndex::Calculate(tmpValue, in_thread_val_buf[Number<offset>{}],
                                                     tmpIndex, in_thread_idx_buf[Number<offset>{}]);
                });

                BlockwiseReduceWithIndex::Reduce(
                    reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex);

                AccumulationWithIndex::Calculate(
                    accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex);
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            indexOffset += K_BlockTileSize;
            reducedTiles++;
        } while(reducedTiles < toReduceTiles);

        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if(thread_k_cluster_id == 0)
            {
                // for indiced operation, acc_elementwise_op shoud do nothing
                acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

                accu_value_buf(I) *= alpha;
            }
        });

        if(thread_k_cluster_id == 0)
        {
            if constexpr(!BetaIsZero)
            {
                if(!float_equal_zero{}(beta))
                {
                    StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                        priorDstValueBuf;

                    auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<
                        OutDataType, OutDataType, OutGridDesc_M, decltype(reduced_data_desc),
                        Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize, 1, false>(
                        out_grid_desc_m,
                        make_multi_index(block_global_1d_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize));

                    threadwise_dst_load.Run(out_grid_desc_m, out_global_val_buf, reduced_data_desc,
                                            make_tuple(I0), priorDstValueBuf);

                    static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                        accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I]) * beta;
                    });
                };
            };

            auto threadwise_dst_val_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M, PassThroughOp,
                Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
                InMemoryDataOperationEnum::Set, 1, false>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

            auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3<
                IndexDataType, IndexDataType, decltype(reduced_data_desc), OutGridDesc_M,
                PassThroughOp, Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
                InMemoryDataOperationEnum::Set, 1, false>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

            threadwise_dst_val_store.Run(reduced_data_desc, make_tuple(I0), accu_value_buf,
                                         out_grid_desc_m, out_global_val_buf);
            threadwise_dst_idx_store.Run(reduced_data_desc, make_tuple(I0), accu_index_buf,
                                         out_grid_desc_m, out_global_idx_buf);
        }
    };
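RunWithIndex pairs every partial value with the K position it came from and merges the pairs through AccumulateWithIndexAndNanCheck, so the winning index travels with the winning value. A scalar sketch of that value-plus-index merge, specialized to an assumed Max reduction (illustrative only, not the ck implementation):

// Scalar model of the value+index accumulation performed by RunWithIndex.
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

using index_t = int32_t;

struct ValueIndex
{
    float value;
    index_t index;
};

// Mirrors the role of AccumulationWithIndex::Calculate for a Max reduction:
// keep the winning value and carry its source index along with it.
void accumulate_with_index(ValueIndex& acc, float candidate, index_t candidate_idx)
{
    if(candidate > acc.value)
    {
        acc.value = candidate;
        acc.index = candidate_idx;
    }
}

int main()
{
    const std::vector<float> row = {0.5f, 2.0f, -1.0f, 2.0f, 1.5f};

    // "zeroVal" of a Max reduction is the lowest representable value
    ValueIndex acc{std::numeric_limits<float>::lowest(), 0};

    for(index_t k = 0; k < static_cast<index_t>(row.size()); ++k)
        accumulate_with_index(acc, row[k], k);

    std::cout << "max " << acc.value << " at k = " << acc.index << '\n'; // max 2 at k = 1
    return 0;
}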
    __device__ static void
    RunSecondCallWithIndex(const InGridDesc_M_K& in_grid_desc_m_k,
                           const OutGridDesc_M& out_grid_desc_m,
                           const InElementwiseOperation in_elementwise_op,
                           const OutElementwiseOperation acc_elementwise_op,
                           AccDataType alpha,
                           const InDataType* const __restrict__ p_ws_values_global,
                           AccDataType beta,
                           OutDataType* const __restrict__ p_out_global,
                           const IndexDataType* const __restrict__ p_ws_indices_global,
                           IndexDataType* const __restrict__ p_indices_global)
    {
        static_assert(InSrcVectorDim == 1,
                      "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!");

        using BlockwiseReduceWithIndex = PartitionedBlockwiseReductionWithIndex<
            AccDataType, IndexDataType, BlockSize,
            Sequence<MThreadClusterSize, KThreadClusterSize>, ThreadClusterArrangeOrder,
            ReduceOperation, PropagateNan>;

        using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                             ReduceOperation,
                                                                             AccDataType,
                                                                             IndexDataType>;

        (void)in_elementwise_op;

        // LDS
        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

        const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_ws_values_global, in_grid_desc_m_k.GetElementSpaceSize(),
            type_convert<InDataType>(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize());
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());
        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_indices_global, out_grid_desc_m.GetElementSpaceSize());

        auto reduce_work_val_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_val_buffer, BlockSize);
        auto reduce_work_idx_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_idx_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_idx_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        const index_t thread_local_id    = get_thread_local_1d_id();
        const index_t block_global_1d_id = get_block_1d_id();

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths         = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<
            InDataType, AccDataType, InGridDesc_M_K, decltype(thread_buffer_desc),
            ThreadBufferLengths, ThreadBufferDimAccessOrder, InSrcVectorDim, InSrcVectorSize, 1,
            false>(in_grid_desc_m_k,
                   make_multi_index(block_global_1d_id * M_BlockTileSize +
                                        thread_m_cluster_id * MThreadSliceSize,
                                    thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<
            IndexDataType, IndexDataType, InGridDesc_M_K, decltype(thread_buffer_desc),
            ThreadBufferLengths, ThreadBufferDimAccessOrder, InSrcVectorDim, InSrcVectorSize, 1,
            false>(in_grid_desc_m_k,
                   make_multi_index(block_global_1d_id * M_BlockTileSize +
                                        thread_m_cluster_id * MThreadSliceSize,
                                    thread_k_cluster_id * KThreadSliceSize));

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            accu_value_buf(I) = zeroVal;
            accu_index_buf(I) = 0;
        });

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        const index_t toReduceTiles = (toReduceLength + K_BlockTileSize - 1) / K_BlockTileSize;

        index_t reducedTiles = 0;
        do
        {
            // load the thread slice
            threadwise_src_val_load.Run(in_grid_desc_m_k, src_global_val_buf, thread_buffer_desc,
                                        make_tuple(I0, I0), in_thread_val_buf);
            threadwise_src_idx_load.Run(in_grid_desc_m_k, src_global_idx_buf, thread_buffer_desc,
                                        make_tuple(I0, I0), in_thread_idx_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                AccDataType tmpValue   = zeroVal;
                IndexDataType tmpIndex = 0;

                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                    AccumulationWithIndex::Calculate(tmpValue, in_thread_val_buf[Number<offset>{}],
                                                     tmpIndex, in_thread_idx_buf[Number<offset>{}]);
                });

                BlockwiseReduceWithIndex::Reduce(
                    reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex);

                AccumulationWithIndex::Calculate(
                    accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex);
            });

            threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
            threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedTiles++;
        } while(reducedTiles < toReduceTiles);

        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if(thread_k_cluster_id == 0)
            {
                // for indiced operation, acc_elementwise_op shoud do nothing
                acc_elementwise_op(accu_value_buf(I), accu_value_buf(I));

                accu_value_buf(I) *= alpha;
            }
        });

        if(thread_k_cluster_id == 0)
        {
            if constexpr(!BetaIsZero)
            {
                if(!float_equal_zero{}(beta))
                {
                    StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                        priorDstValueBuf;

                    auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<
                        OutDataType, OutDataType, OutGridDesc_M, decltype(reduced_data_desc),
                        Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize, 1, true>(
                        out_grid_desc_m,
                        make_multi_index(block_global_1d_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize));

                    threadwise_dst_load.Run(out_grid_desc_m, out_global_val_buf, reduced_data_desc,
                                            make_tuple(I0), priorDstValueBuf);

                    static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                        accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I]) * beta;
                    });
                };
            };

            auto threadwise_dst_val_store = ThreadwiseTensorSliceTransfer_v1r3<
                AccDataType, OutDataType, decltype(reduced_data_desc), OutGridDesc_M, PassThroughOp,
                Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
                InMemoryDataOperationEnum::Set, 1, true>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

            auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3<
                IndexDataType, IndexDataType, decltype(reduced_data_desc), OutGridDesc_M,
                PassThroughOp, Sequence<MThreadSliceSize>, Sequence<0>, 0, OutDstVectorSize,
                InMemoryDataOperationEnum::Set, 1, true>(
                out_grid_desc_m,
                make_multi_index(block_global_1d_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

            threadwise_dst_val_store.Run(reduced_data_desc, make_tuple(I0), accu_value_buf,
                                         out_grid_desc_m, out_global_val_buf);
            threadwise_dst_idx_store.Run(reduced_data_desc, make_tuple(I0), accu_index_buf,
                                         out_grid_desc_m, out_global_idx_buf);
        }
    };
};

} // namespace ck
#endif