gaoqiong / composable_kernel / Commits

Commit 07a673c6
Authored Apr 14, 2022 by carlushuang

    Merge remote-tracking branch 'origin/develop' into cpu_avx2

Parents: c0f698d5, ac0d8066
Changes: 307 files changed in this commit; showing 20 changed files with 500 additions and 512 deletions (+500 -512).
Changed files shown:

  include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp                        +97  -50
  include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp                          +6   -6
  include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp                +6   -6
  include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp                         +5   -5
  include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp                               +2   -2
  include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp                     +2   -2
  include/ck/tensor_operation/gpu/device/gemm_specialization.hpp                             +1   -1
  include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp                      +16  -16
  include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp                   +94  -127
  include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp       +29  -37
  include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp   +61  -81
  include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp                  +57  -55
  include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp                   +11  -11
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp                          +11  -11
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp                          +11  -11
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp                            +8   -8
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp                            +49  -49
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp              +16  -16
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp                     +10  -10
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp                         +8   -8
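Note on the dominant pattern in the diffs below: the merge drops the "_t" suffix from several scoped enums (GemmSpecialization_t -> GemmSpecialization, InMemoryDataOperationEnum_t -> InMemoryDataOperationEnum, ReduceTensorOp_t -> ReduceTensorOp, AddressSpaceEnum_t -> AddressSpaceEnum) and renames the template parameter previously spelled GemmSpecialization to GemmSpec so it no longer shadows the enum type. The minimal sketch below is illustrative only, not a file from this commit (the struct name PaddingPolicyExample is hypothetical; the enumerators and comparisons are the ones that appear in the diffs):

    // Hypothetical, condensed illustration of the rename pattern applied by this merge.
    namespace ck::tensor_operation::device {

    enum struct GemmSpecialization // previously: enum struct GemmSpecialization_t
    {
        Default,
        MPadding,
        NPadding,
        KPadding,
        MNPadding,
        MKPadding,
        NKPadding,
        MNKPadding
    };

    // The non-type template parameter is now spelled GemmSpec instead of GemmSpecialization.
    template <GemmSpecialization GemmSpec> // previously: GemmSpecialization_t GemmSpecialization
    struct PaddingPolicyExample // hypothetical name, for illustration only
    {
        static constexpr bool PadsMAndN()
        {
            // same comparison style used throughout the device GEMM headers below
            return GemmSpec == GemmSpecialization::MNPadding ||
                   GemmSpec == GemmSpecialization::MNKPadding;
        }
    };

    } // namespace ck::tensor_operation::device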
include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp  (+97 -50)

@@ -8,6 +8,7 @@
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdl_cshuffle_v1.hpp"
+#include "tensor_operation/gpu/device/gemm_specialization.hpp"

 namespace ck {
 namespace tensor_operation {

@@ -24,7 +25,7 @@ template <typename ALayout,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          GemmSpecialization_t GemmSpecialization,
+          GemmSpecialization GemmSpec,
           index_t NumGemmKPrefetchStage,
           index_t BlockSize,
           index_t MPerBlock,

@@ -84,8 +85,8 @@ struct DeviceGemm_Xdl_CShuffle
         const auto MPad = M - MRaw;
         const auto KPad = K - KRaw;

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MKPadding ||
-                     GemmSpecialization == GemmSpecialization_t::MNKPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
         {
             // pad both M and K
             assert(K % AK1 == 0);

@@ -108,8 +109,8 @@ struct DeviceGemm_Xdl_CShuffle
             return a_grid_desc_ak0_m_ak1;
         }
-        else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding ||
-                          GemmSpecialization == GemmSpecialization_t::MNPadding)
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
         {
             // pad M, but not K
             assert(KRaw % AK1 == 0);

@@ -125,8 +126,8 @@ struct DeviceGemm_Xdl_CShuffle
             return a_grid_desc_ak0_m_ak1;
         }
-        else if constexpr(GemmSpecialization == GemmSpecialization_t::KPadding ||
-                          GemmSpecialization == GemmSpecialization_t::NKPadding)
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
         {
             // pad K, but not M
             assert(K % AK1 == 0);

@@ -187,8 +188,8 @@ struct DeviceGemm_Xdl_CShuffle
         const auto NPad = N - NRaw;
         const auto KPad = K - KRaw;

-        if constexpr(GemmSpecialization == GemmSpecialization_t::NKPadding ||
-                     GemmSpecialization == GemmSpecialization_t::MNKPadding)
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
         {
             // pad both N and K
             assert(K % BK1 == 0);

@@ -211,8 +212,8 @@ struct DeviceGemm_Xdl_CShuffle
             return b_grid_desc_bk0_n_bk1;
         }
-        else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding ||
-                          GemmSpecialization == GemmSpecialization_t::MNPadding)
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
         {
             // pad N, but not K
             assert(KRaw % BK1 == 0);

@@ -228,8 +229,8 @@ struct DeviceGemm_Xdl_CShuffle
             return b_grid_desc_bk0_n_bk1;
         }
-        else if constexpr(GemmSpecialization == GemmSpecialization_t::KPadding ||
-                          GemmSpecialization == GemmSpecialization_t::MKPadding)
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
         {
             // pad K, but not N
             assert(K % BK1 == 0);

@@ -290,8 +291,8 @@ struct DeviceGemm_Xdl_CShuffle
         const auto MPad = M - MRaw;
         const auto NPad = N - NRaw;

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding ||
-                     GemmSpecialization == GemmSpecialization_t::MNKPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
         {
             // pad M and N
             return transform_tensor_descriptor(
                 c_grid_desc_mraw_nraw,

@@ -300,8 +301,8 @@ struct DeviceGemm_Xdl_CShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));
         }
-        else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding ||
-                          GemmSpecialization == GemmSpecialization_t::MKPadding)
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
         {
             // pad M, but not N
             return transform_tensor_descriptor(

@@ -310,8 +311,8 @@ struct DeviceGemm_Xdl_CShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));
         }
-        else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding ||
-                          GemmSpecialization == GemmSpecialization_t::NKPadding)
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
         {
             // pad N, but not M
             return transform_tensor_descriptor(

@@ -340,7 +341,7 @@ struct DeviceGemm_Xdl_CShuffle
         AElementwiseOperation,
         BElementwiseOperation,
         CElementwiseOperation,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         AGridDesc_AK0_M_AK1,
         BGridDesc_BK0_N_BK1,
         CGridDesc_M_N,

@@ -434,7 +435,7 @@ struct DeviceGemm_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;

-        float Run(const Argument& arg, int /* nrepeat */ = 1)
+        float Run(const Argument& arg, int nrepeat = 1)
         {
 #if 0
             {

@@ -465,6 +466,8 @@ struct DeviceGemm_Xdl_CShuffle
             const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);

+            float ave_time = 0;
+
             if(has_main_k0_block_loop)
             {
                 const auto kernel = kernel_gemm_xdl_cshuffle_v1<

@@ -480,20 +483,42 @@ struct DeviceGemm_Xdl_CShuffle
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;

-                launch_kernel(kernel, dim3(grid_size), dim3(BlockSize), 0,
-                              arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
-                              arg.a_element_op_, arg.b_element_op_, arg.c_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.block_2_ctile_map_);
+                if(nrepeat == 0)
+                {
+                    launch_kernel(kernel, dim3(grid_size), dim3(BlockSize), 0,
+                                  arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
+                                  arg.a_element_op_, arg.b_element_op_, arg.c_element_op_,
+                                  arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_,
+                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                  arg.block_2_ctile_map_);
+                }
+                else
+                {
+                    ave_time = launch_and_time_kernel(kernel, nrepeat,
+                                  dim3(grid_size), dim3(BlockSize), 0,
+                                  arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
+                                  arg.a_element_op_, arg.b_element_op_, arg.c_element_op_,
+                                  arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_,
+                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                  arg.block_2_ctile_map_);
+                }
             }
             else
             {

@@ -510,23 +535,45 @@ struct DeviceGemm_Xdl_CShuffle
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;

-                launch_kernel(kernel, dim3(grid_size), dim3(BlockSize), 0,
-                              arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
-                              arg.a_element_op_, arg.b_element_op_, arg.c_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.block_2_ctile_map_);
+                if(nrepeat == 0)
+                {
+                    launch_kernel(kernel, dim3(grid_size), dim3(BlockSize), 0,
+                                  arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
+                                  arg.a_element_op_, arg.b_element_op_, arg.c_element_op_,
+                                  arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_,
+                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                  arg.block_2_ctile_map_);
+                }
+                else
+                {
+                    ave_time = launch_and_time_kernel(kernel, nrepeat,
+                                  dim3(grid_size), dim3(BlockSize), 0,
+                                  arg.p_a_grid_, arg.p_b_grid_, arg.p_c_grid_,
+                                  arg.a_element_op_, arg.b_element_op_, arg.c_element_op_,
+                                  arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_,
+                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                  arg.block_2_ctile_map_);
+                }
             }

-            return 0;
+            return ave_time;
         }

         // polymorphic
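Note on the Run() change above: nrepeat == 0 now performs a single untimed launch (Run returns 0), while a positive nrepeat launches and times the kernel nrepeat times and returns the average time (ave_time). The snippet below is a hedged usage sketch that only relies on that Run(arg, nrepeat) convention; Invoker and Argument here are generic placeholders, not the library's concrete types.

    // Hedged usage sketch: run_gemm is a hypothetical wrapper, only the
    // Run(arg, nrepeat) convention comes from this diff.
    template <typename Invoker, typename Argument>
    float run_gemm(Invoker& invoker, const Argument& arg, int nrepeat)
    {
        // nrepeat == 0: one untimed launch, Run returns 0.
        // nrepeat  > 0: the kernel is launched and timed nrepeat times,
        //               Run returns the average time.
        return invoker.Run(arg, nrepeat);
    }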
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp  (+6 -6)

@@ -31,7 +31,7 @@ template <typename ADataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          GemmSpecialization_t GemmSpecialization,
+          GemmSpecialization GemmSpec,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,

@@ -91,7 +91,7 @@ struct DeviceGemmXdlSplitK
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;

             return transform_tensor_descriptor(

@@ -136,7 +136,7 @@ struct DeviceGemmXdlSplitK
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

             return transform_tensor_descriptor(

@@ -170,7 +170,7 @@ struct DeviceGemmXdlSplitK
             }
         }();

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
             const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

@@ -209,7 +209,7 @@ struct DeviceGemmXdlSplitK
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         AGridDesc_K0_M_K1,
         BGridDesc_K0_N_K1,
         CGridDesc_M_N,

@@ -250,7 +250,7 @@ struct DeviceGemmXdlSplitK
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum_t::AtomicAdd,
+        InMemoryDataOperationEnum::AtomicAdd,
         AGridDesc_K0_M_K1,
         BGridDesc_K0_N_K1,
         CGridDesc_M_N,
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp  (+6 -6)

@@ -31,7 +31,7 @@ template <typename ADataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          GemmSpecialization_t GemmSpecialization,
+          GemmSpecialization GemmSpec,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,

@@ -93,7 +93,7 @@ struct DeviceGemmXdlSplitKCShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;

             return transform_tensor_descriptor(

@@ -138,7 +138,7 @@ struct DeviceGemmXdlSplitKCShuffle
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

             return transform_tensor_descriptor(

@@ -172,7 +172,7 @@ struct DeviceGemmXdlSplitKCShuffle
             }
         }();

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
             const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

@@ -211,7 +211,7 @@ struct DeviceGemmXdlSplitKCShuffle
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         AGridDesc_K0_M_K1,
         BGridDesc_K0_N_K1,
         CGridDesc_M_N,

@@ -253,7 +253,7 @@ struct DeviceGemmXdlSplitKCShuffle
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum_t::AtomicAdd,
+        InMemoryDataOperationEnum::AtomicAdd,
         AGridDesc_K0_M_K1,
         BGridDesc_K0_N_K1,
         CGridDesc_M_N,
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp  (+5 -5)

@@ -27,7 +27,7 @@ template <typename ADataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          GemmSpecialization_t GemmSpecialization,
+          GemmSpecialization GemmSpec,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,

@@ -81,7 +81,7 @@ struct DeviceGroupedGemmXdl
             }
         }();

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;

@@ -120,7 +120,7 @@ struct DeviceGroupedGemmXdl
             }
         }();

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

@@ -155,7 +155,7 @@ struct DeviceGroupedGemmXdl
             }
         }();

-        if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding)
         {
             const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
             const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;

@@ -187,7 +187,7 @@ struct DeviceGroupedGemmXdl
         ADataType, // TODO: distinguish A/B datatype
         AccDataType,
         CDataType,
-        InMemoryDataOperationEnum_t::Set,
+        InMemoryDataOperationEnum::Set,
         AGridDesc_K0_M_K1,
         BGridDesc_K0_N_K1,
         CGridDesc_M_N,
include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp  (+2 -2)

@@ -10,7 +10,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-template <ck::ReduceTensorOp_t ReduceOpId>
+template <ck::ReduceTensorOp ReduceOpId>
 struct DevicePool2dFwd : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument>

@@ -29,7 +29,7 @@ struct DevicePool2dFwd : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

-template <ck::ReduceTensorOp_t ReduceOpId>
+template <ck::ReduceTensorOp ReduceOpId>
 using DevicePool2dFwdPtr = std::unique_ptr<DevicePool2dFwd<ReduceOpId>>;

 } // namespace device
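After this rename the pooling interface is keyed on ck::ReduceTensorOp rather than ck::ReduceTensorOp_t. A hedged sketch of how the alias from this header might be spelled at a call site (only DevicePool2dFwdPtr, DevicePool2dFwd, the enclosing namespaces, and the enum name come from this diff; the alias names MaxPool2dFwdPtr/AvgPool2dFwdPtr are illustrative):

    // Illustrative only: pooling device-op handles spelled with the renamed enum.
    // ReduceTensorOp::MAX and ReduceTensorOp::AVG both appear elsewhere in this commit.
    using MaxPool2dFwdPtr =
        ck::tensor_operation::device::DevicePool2dFwdPtr<ck::ReduceTensorOp::MAX>;
    using AvgPool2dFwdPtr =
        ck::tensor_operation::device::DevicePool2dFwdPtr<ck::ReduceTensorOp::AVG>;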
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp  (+2 -2)

@@ -16,7 +16,7 @@ namespace device {
 template <typename InDataType,
           typename OutDataType,
           typename AccDataType,
-          ck::ReduceTensorOp_t ReduceOpId,
+          ck::ReduceTensorOp ReduceOpId,
           bool NeedIndices,
           ck::index_t BlockSize,
           ck::index_t ReduceMThreadClusterSize,

@@ -181,7 +181,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
         reduce_lowest_length_ = window_spatial_lengths[1];

         // TODO: is this correct?
-        if constexpr(ReduceOpId == ck::ReduceTensorOp_t::AVG)
+        if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
         {
             ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];

             in_element_op_ = InElementwiseOperation{divider};
include/ck/tensor_operation/gpu/device/gemm_specialization.hpp  (+1 -1)

@@ -5,7 +5,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-enum struct GemmSpecialization_t
+enum struct GemmSpecialization
 {
     Default,
     MPadding,
include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp  (+16 -16)

@@ -37,11 +37,11 @@ namespace ck {
 // The boolean member "indexable" are also provided in reduce_binary_operactor for
 // easier checking by the upper-layer codes in the kernels.

-template <typename T, ReduceTensorOp_t Op>
+template <typename T, ReduceTensorOp Op>
 struct reduce_binary_operator;

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
+struct reduce_binary_operator<T, ReduceTensorOp::ADD>
 {
     using opType   = reduce::Add<T>;
     using dataType = T;

@@ -50,7 +50,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
 };

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
+struct reduce_binary_operator<T, ReduceTensorOp::MUL>
 {
     using opType   = reduce::Mul<T>;
     using dataType = T;

@@ -59,7 +59,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
 };

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
+struct reduce_binary_operator<T, ReduceTensorOp::MIN>
 {
     using opType   = reduce::Min<T>;
     using dataType = T;

@@ -68,7 +68,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
 };

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
+struct reduce_binary_operator<T, ReduceTensorOp::MAX>
 {
     using opType   = reduce::Max<T>;
     using dataType = T;

@@ -77,7 +77,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
 };

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
+struct reduce_binary_operator<T, ReduceTensorOp::AMAX>
 {
     using opType   = reduce::AMax<T>;
     using dataType = T;

@@ -86,7 +86,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
 };

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
+struct reduce_binary_operator<T, ReduceTensorOp::AVG>
 {
     using opType   = reduce::Add<T>;
     using dataType = T;

@@ -95,7 +95,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
 };

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
+struct reduce_binary_operator<T, ReduceTensorOp::NORM1>
 {
     using opType   = reduce::Add<T>;
     using dataType = T;

@@ -104,7 +104,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
 };

 template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
+struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
 {
     using opType   = reduce::Add<T>;
     using dataType = T;

@@ -115,7 +115,7 @@ struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
 // The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
 // functor classes.
 // The two unary functors are called before and afer the Reduction is executed respectively

-template <typename T, ReduceTensorOp_t Op, bool IsFirstReduce, bool IsLastReduce>
+template <typename T, ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
 struct reduce_unary_operator
 {
     using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;

@@ -123,42 +123,42 @@ struct reduce_unary_operator
 };

 template <typename T, bool IsFirstReduce>
-struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, IsFirstReduce, true>
+struct reduce_unary_operator<T, ReduceTensorOp::AVG, IsFirstReduce, true>
 {
     using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
     using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T, true>;
 };

 template <typename T, bool IsLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, IsLastReduce>
+struct reduce_unary_operator<T, ReduceTensorOp::NORM1, true, IsLastReduce>
 {
     using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
     using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
 };

 template <typename T, bool IsLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, IsLastReduce>
+struct reduce_unary_operator<T, ReduceTensorOp::AMAX, true, IsLastReduce>
 {
     using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
     using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
 };

 template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
+struct reduce_unary_operator<T, ReduceTensorOp::NORM2, true, false>
 {
     using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
     using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
 };

 template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
+struct reduce_unary_operator<T, ReduceTensorOp::NORM2, true, true>
 {
     using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
     using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
 };

 template <typename T>
-struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
+struct reduce_unary_operator<T, ReduceTensorOp::NORM2, false, true>
 {
     using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
     using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
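These trait specializations map a reduction-operation enum value to its binary accumulation functor and its pre/post elementwise functors, now keyed on ReduceTensorOp instead of ReduceTensorOp_t. A hedged sketch of how such a mapping is typically consumed (the alias names BinaryOp/UnaryOps/PreOp/PostOp are illustrative; the trait names and the members opType, InElementwiseOperation, and AccElementwiseOperation come from this diff):

    // Illustrative only. For a NORM2 reduction that is both the first and last pass,
    // the specializations above select UnarySquare on the input side, UnarySqrt on the
    // accumulation side, and reduce::Add as the binary operation.
    using BinaryOp =
        typename ck::reduce_binary_operator<float, ck::ReduceTensorOp::NORM2>::opType;

    using UnaryOps = ck::reduce_unary_operator<float,
                                               ck::ReduceTensorOp::NORM2,
                                               /*IsFirstReduce=*/true,
                                               /*IsLastReduce=*/true>;
    using PreOp  = typename UnaryOps::InElementwiseOperation;  // UnarySquare<float, float>
    using PostOp = typename UnaryOps::AccElementwiseOperation; // UnarySqrt<float, float>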
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp  (+94 -127)

@@ -31,6 +31,7 @@
 #include "reduction_operator.hpp"
 #include "reduction_functions_accumulate.hpp"
 #include "reduction_functions_blockwise.hpp"
+#include "reduction_functions_threadwise.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "cluster_descriptor.hpp"
 #include "element_wise_operation.hpp"

@@ -179,10 +180,10 @@ struct GridwiseReduction_mk_to_m_blockwise
     static constexpr auto thread_cluster_desc =
         make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

-    // For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the
-    // Dim_K as the fastest one
-    static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
-        make_tuple(Number<MThreadClusterSize>{}, Number<KThreadClusterSize>{}));
+    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
+    using ThreadReduceDstDesc_M =
+        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

     using PassThroughOp = tensor_operation::element_wise::PassThrough;

@@ -216,32 +217,33 @@ struct GridwiseReduction_mk_to_m_blockwise
                                                       ThreadClusterArrangeOrder,
                                                       ReduceOperation,
                                                       PropagateNan>;

-        using Accumulation =
-            detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
+        using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
+                                                     ThreadReduceSrcDesc_M_K,
+                                                     ThreadReduceDstDesc_M,
+                                                     ReduceOperation,
+                                                     PropagateNan>;

         (void)p_ws_indices_global;
         (void)p_indices_global;

         // LDS
-        __shared__ AccDataType p_block_reduce_buffer[BlockSize];
+        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

         const auto zeroVal = ReduceOperation::GetReductionZeroVal();

-        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
-        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_global, out_grid_desc_m.GetElementSpaceSize());

-        auto block_reduce_buf =
-            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);
+        auto reduce_work_buf =
+            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
             in_thread_buf;

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });

@@ -288,38 +290,26 @@ struct GridwiseReduction_mk_to_m_blockwise
                 make_tuple(I0, I0),
                 in_thread_buf);

-            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                 // do element-wise pre-reduction operation
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
-                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
-                });
-
-                // reduce on each thread-local slice
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
-                    Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
+                    in_elementwise_op(in_thread_buf(Number<offset>{}), in_thread_buf(Number<offset>{}));
                 });
             });

+            ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf);
+
             threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

             reducedTiles++;
         } while(reducedTiles < toReduceTiles);

-        constexpr auto reduced_data_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
-                accu_value_buf[I];
-
-            accu_value_buf(I) = zeroVal;
-
-            __syncthreads();
-
-            BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I));
-        });
+        static_for<0, MThreadSliceSize, 1>{}(
+            [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); });

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
             if(thread_k_cluster_id == 0)

@@ -336,7 +326,7 @@ struct GridwiseReduction_mk_to_m_blockwise
         {
             if(!float_equal_zero{}(beta))
             {
-                StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
+                StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                     priorDstValueBuf;

                 auto threadwise_dst_load =

@@ -376,7 +366,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                                            Sequence<0>,
                                            0,
                                            OutDstVectorSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            1,
                                            true>(out_grid_desc_m,

@@ -417,35 +407,34 @@ struct GridwiseReduction_mk_to_m_blockwise
         (void)p_ws_indices_global;

         // LDS
-        __shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
-        __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize];
+        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
+        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

         const auto zeroVal = ReduceOperation::GetReductionZeroVal();

-        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
-        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_global, out_grid_desc_m.GetElementSpaceSize());
-        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_indices_global, out_grid_desc_m.GetElementSpaceSize());

-        auto block_reduce_val_buf =
-            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
-        auto block_reduce_idx_buf =
-            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_idx_buffer, BlockSize);
+        auto reduce_work_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_val_buffer, BlockSize);
+        auto reduce_work_idx_buf =
+            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_idx_buffer, BlockSize);

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
             in_thread_val_buf;
-        StaticBuffer<AddressSpaceEnum_t::Vgpr, index_t, MThreadSliceSize * KThreadSliceSize, true>
+        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize * KThreadSliceSize, true>
             in_thread_idx_buf;

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
-        StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

         const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

@@ -498,42 +487,36 @@ struct GridwiseReduction_mk_to_m_blockwise
                 make_tuple(I0, I0),
                 in_thread_val_buf);

-            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                     // initialize the indices for the per-thread to-reduce values
-                    in_thread_idx_buf(offset) =
-                        indexOffset + thread_k_cluster_id * KThreadSliceSize + J();
+                    in_thread_idx_buf(Number<offset>{}) =
+                        indexOffset + thread_k_cluster_id * KThreadSliceSize + iK();

                     // do element-wise pre-reduction operation
-                    in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset));
+                    in_elementwise_op(in_thread_val_buf(Number<offset>{}),
+                                      in_thread_val_buf(Number<offset>{}));
                 });

                 AccDataType tmpValue = zeroVal;
                 IndexDataType tmpIndex = 0;

-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                     // reduce on the dim1 thread slice
-                    AccumulationWithIndex::Calculate(
-                        tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
+                    AccumulationWithIndex::Calculate(tmpValue,
+                                                     in_thread_val_buf[Number<offset>{}],
+                                                     tmpIndex,
+                                                     in_thread_idx_buf[Number<offset>{}]);
                 });

-                // store thread local value to LDS for parallel reduction
-                block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpValue;
-                block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpIndex;
-
-                __syncthreads();
-
                 BlockwiseReduceWithIndex::Reduce(
-                    block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex);
+                    reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex);

                 AccumulationWithIndex::Calculate(
-                    accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
+                    accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex);
             });

             threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

@@ -542,8 +525,7 @@ struct GridwiseReduction_mk_to_m_blockwise
             reducedTiles++;
         } while(reducedTiles < toReduceTiles);

-        constexpr auto reduced_data_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
             if(thread_k_cluster_id == 0)

@@ -561,7 +543,7 @@ struct GridwiseReduction_mk_to_m_blockwise
         {
             if(!float_equal_zero{}(beta))
             {
-                StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
+                StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                     priorDstValueBuf;

                 auto threadwise_dst_load =

@@ -601,7 +583,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                                            Sequence<0>,
                                            0,
                                            OutDstVectorSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            1,
                                            false>(out_grid_desc_m,

@@ -619,7 +601,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                                            Sequence<0>,
                                            0,
                                            OutDstVectorSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            1,
                                            false>(out_grid_desc_m,

@@ -672,42 +654,38 @@ struct GridwiseReduction_mk_to_m_blockwise
         (void)in_elementwise_op;

         // LDS
-        __shared__ AccDataType p_block_reduce_val_buffer[BlockSize];
-        __shared__ IndexDataType p_block_reduce_idx_buffer[BlockSize];
+        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
+        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

         const auto zeroVal = ReduceOperation::GetReductionZeroVal();

-        const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
-            p_ws_values_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
-        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_ws_values_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
+        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_ws_indices_global, in_grid_desc_m_k.GetElementSpaceSize());
-        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_global, out_grid_desc_m.GetElementSpaceSize());
-        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_indices_global, out_grid_desc_m.GetElementSpaceSize());

-        auto block_reduce_val_buf =
-            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_val_buffer, BlockSize);
-        auto block_reduce_idx_buf =
-            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_idx_buffer, BlockSize);
+        auto reduce_work_val_buf =
+            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_val_buffer, BlockSize);
+        auto reduce_work_idx_buf =
+            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_idx_buffer, BlockSize);

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
             in_thread_val_buf;
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                      IndexDataType,
                      MThreadSliceSize * KThreadSliceSize,
                      true>
             in_thread_idx_buf;

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
-        StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

         const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

@@ -756,8 +734,6 @@ struct GridwiseReduction_mk_to_m_blockwise
                                                       thread_m_cluster_id * MThreadSliceSize,
                                                       thread_k_cluster_id * KThreadSliceSize));

-        // index_t indexOffset = 0;
-
         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
             accu_value_buf(I) = zeroVal;
             accu_index_buf(I) = 0;

@@ -782,42 +758,33 @@ struct GridwiseReduction_mk_to_m_blockwise
                 make_tuple(I0, I0),
                 in_thread_idx_buf);

-            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                 AccDataType tmpValue = zeroVal;
                 IndexDataType tmpIndex = 0;

-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                     // reduce on the dim1 thread slice
-                    AccumulationWithIndex::Calculate(
-                        tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
+                    AccumulationWithIndex::Calculate(tmpValue,
                                                     in_thread_val_buf[Number<offset>{}],
+                                                     tmpIndex,
+                                                     in_thread_idx_buf[Number<offset>{}]);
                 });

-                // store thread local value to LDS for parallel reduction
-                block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpValue;
-                block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpIndex;
-
-                __syncthreads();
-
                 BlockwiseReduceWithIndex::Reduce(
-                    block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex);
+                    reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex);

                 AccumulationWithIndex::Calculate(
-                    accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
+                    accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex);
             });

             threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
             threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

-            // indexOffset += K_BlockTileSize;
-
             reducedTiles++;
         } while(reducedTiles < toReduceTiles);

-        constexpr auto reduced_data_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
             if(thread_k_cluster_id == 0)

@@ -835,7 +802,7 @@ struct GridwiseReduction_mk_to_m_blockwise
         {
             if(!float_equal_zero{}(beta))
             {
-                StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
+                StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                     priorDstValueBuf;

                 auto threadwise_dst_load =

@@ -875,7 +842,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                                            Sequence<0>,
                                            0,
                                            OutDstVectorSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            1,
                                            true>(out_grid_desc_m,

@@ -893,7 +860,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                                            Sequence<0>,
                                            0,
                                            OutDstVectorSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            1,
                                            true>(out_grid_desc_m,
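A recurring edit in this file (and the two gridwise reduction files that follow) replaces the manually computed flat offset I * Number<KThreadSliceSize>{} + J with thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)), wrapped in Number<>{} so it remains a compile-time constant. For a packed MThreadSliceSize x KThreadSliceSize thread-local buffer the two spellings address the same element; the stand-alone sketch below (plain C++, no CK dependencies, PackedDesc2D is a made-up model and not the library's descriptor type) just spells out that equivalence.

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Minimal model of a packed 2-D descriptor: lengths given, strides derived as
    // row-major, mirroring what CalculateOffset on a packed descriptor boils down to.
    struct PackedDesc2D
    {
        std::size_t lengths[2];

        constexpr std::size_t CalculateOffset(std::array<std::size_t, 2> idx) const
        {
            const std::size_t strides[2] = {lengths[1], 1}; // packed row-major strides
            return idx[0] * strides[0] + idx[1] * strides[1];
        }
    };

    int main()
    {
        constexpr std::size_t MThreadSliceSize = 4;
        constexpr std::size_t KThreadSliceSize = 8;
        constexpr PackedDesc2D thread_buffer_desc{{MThreadSliceSize, KThreadSliceSize}};

        for(std::size_t iM = 0; iM < MThreadSliceSize; ++iM)
            for(std::size_t iK = 0; iK < KThreadSliceSize; ++iK)
                // new spelling (descriptor offset) equals the old iM * KThreadSliceSize + iK
                assert(thread_buffer_desc.CalculateOffset({iM, iK}) ==
                       iM * KThreadSliceSize + iK);

        return 0;
    }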
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp  (+29 -37)

@@ -30,6 +30,7 @@
 #include "reduction_operator.hpp"
 #include "reduction_functions_accumulate.hpp"
 #include "reduction_functions_blockwise.hpp"
+#include "reduction_functions_threadwise.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "element_wise_operation.hpp"

@@ -103,10 +104,10 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
     static constexpr auto thread_cluster_desc =
         make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

-    // For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the
-    // Dim_K as the fastest one
-    static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
-        make_tuple(Number<MThreadClusterSize>{}, Number<KThreadClusterSize>{}));
+    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
+    using ThreadReduceDstDesc_M =
+        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

     using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
                                                           BlockSize,

@@ -115,6 +116,12 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
                                                           ReduceOperation,
                                                           PropagateNan>;

+    using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
+                                                 ThreadReduceSrcDesc_M_K,
+                                                 ThreadReduceDstDesc_M,
+                                                 ReduceOperation,
+                                                 PropagateNan>;
+
     using PassThroughOp = tensor_operation::element_wise::PassThrough;

     static constexpr auto I0 = Number<0>{};

@@ -138,23 +145,20 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
         const auto zeroVal = ReduceOperation::GetReductionZeroVal();

         // LDS
-        __shared__ AccDataType p_block_reduce_buffer[BlockSize];
+        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

-        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
-        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_global, out_grid_desc_m.GetElementSpaceSize());

-        auto block_reduce_buf =
-            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_block_reduce_buffer, BlockSize);
+        auto reduce_work_buf =
+            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
             in_thread_buf;

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });

@@ -201,42 +205,30 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
                 make_tuple(I0, I0),
                 in_thread_buf);

-            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                 // do element-wise pre-reduction operation
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
-                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
-                });
-
-                // reduce on each thread-local slice
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
-                    Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
+                    in_elementwise_op(in_thread_buf(Number<offset>{}), in_thread_buf(Number<offset>{}));
                 });
             });

+            ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf);
+
             threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

             reducedTiles++;
         } while(reducedTiles < num_k_block_tile_iteration);

-        constexpr auto reduced_data_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

         // Each block executes multiple parallel reductions on the LDS, and by atomic-adding its
         // reduced output to the global location corresponding to each invariant dimension to get a
         // consistent reduced result for that invariant dimension. due to the using of vector_load,
         // each block/thread is involved into multiple invarirant dimensions.
-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            block_reduce_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) =
-                accu_value_buf[I];
-
-            accu_value_buf(I) = zeroVal;
-
-            __syncthreads();
-
-            BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I));
-        });
+        static_for<0, MThreadSliceSize, 1>{}(
+            [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); });

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
             if(thread_k_cluster_id == 0)

@@ -259,7 +251,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add
                                            Sequence<0>,
                                            0,
                                            OutDstVectorSize,
-                                           InMemoryDataOperationEnum_t::AtomicAdd,
+                                           InMemoryDataOperationEnum::AtomicAdd,
                                            1,
                                            true>(out_grid_desc_m,
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp
View file @
07a673c6
...
...
@@ -30,6 +30,7 @@
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "cluster_descriptor.hpp"
#include "element_wise_operation.hpp"
...
...
@@ -121,10 +122,10 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
static
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths_M_K
{},
ThreadClusterArrangeOrder
{});
// For laying out the threads to do reducing on LDS buffer, for LDS buffer, we always use the
// Dim_K as the fastest one
static
constexpr
auto
block_buf_desc_m_k
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadClusterSize
>
{},
Number
<
K
Thread
Cluster
Size
>
{}));
using
ThreadReduceSrcDesc_M_K
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{},
Number
<
KThreadSliceSize
>
{})));
using
ThreadReduceDstDesc_M
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
M
Thread
Slice
Size
>
{}))
)
;
using
PassThroughOp
=
tensor_operation
::
element_wise
::
PassThrough
;
...
...
@@ -151,8 +152,11 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
ReduceOperation
,
PropagateNan
>
;
using
Accumulation
=
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
>
;
using
ThreadwiseReduce
=
ThreadwiseReduction
<
AccDataType
,
ThreadReduceSrcDesc_M_K
,
ThreadReduceDstDesc_M
,
ReduceOperation
,
PropagateNan
>
;
(
void
)
p_ws_indices_global
;
(
void
)
acc_elementwise_op
;
...
...
@@ -160,25 +164,22 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
const
auto
zeroVal
=
ReduceOperation
::
GetReductionZeroVal
();
// LDS
__shared__
AccDataType
p_
block_
reduce_buffer
[
BlockSize
];
__shared__
AccDataType
p_reduce_
work_
buffer
[
BlockSize
];
const
auto
in_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Global
>
(
p_src_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
auto
workspace_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Global
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_src_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
auto
workspace_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_ws_values_global
,
workspace_desc_m_k
.
GetElementSpaceSize
());
auto
block_
reduce_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Lds
>
(
p_
block_
reduce_buffer
,
BlockSize
);
auto
reduce_
work_
buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
p_reduce_
work_
buffer
,
BlockSize
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
in_thread_buf
;
StaticBuffer
<
AddressSpaceEnum
_t
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
zeroVal
;
});
...
...
@@ -225,20 +226,17 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
make_tuple
(
I0
,
I0
),
in_thread_buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
// do element-wise pre-reduction operation
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
J
)
{
constexpr
auto
offset
=
I
*
Number
<
KThreadSliceSize
>
{}
+
J
;
in_elementwise_op
(
in_thread_buf
(
offset
),
in_thread_buf
(
offset
));
});
// reduce on each thread-local slice
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
J
)
{
constexpr
auto
offset
=
I
*
Number
<
KThreadSliceSize
>
{}
+
J
;
Accumulation
::
Calculate
(
accu_value_buf
(
I
),
in_thread_buf
[
offset
]);
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
constexpr
auto
offset
=
thread_buffer_desc
.
CalculateOffset
(
make_tuple
(
iM
,
iK
));
in_elementwise_op
(
in_thread_buf
(
Number
<
offset
>
{}),
in_thread_buf
(
Number
<
offset
>
{}));
});
});
ThreadwiseReduce
::
Reduce
(
in_thread_buf
,
accu_value_buf
);
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_step
);
reducedTiles
++
;
...
...
@@ -246,16 +244,8 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
// Each block executes multiple parallel reductions on the LDS, and due to the using of
// vector_load, each block/thread is involved into multiple invarirant dimensions.
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
block_reduce_buf
(
block_buf_desc_m_k
.
CalculateOffset
(
thread_cluster_idx
))
=
accu_value_buf
[
I
];
accu_value_buf
(
I
)
=
zeroVal
;
__syncthreads
();
BlockwiseReduce
::
Reduce
(
block_reduce_buf
,
accu_value_buf
(
I
));
});
static_for
<
0
,
MThreadSliceSize
,
1
>
{}(
[
&
](
auto
I
)
{
BlockwiseReduce
::
Reduce
(
reduce_work_buf
,
accu_value_buf
(
I
));
});
constexpr
auto
reduced_data_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{},
Number
<
1
>
{}));
...
...
@@ -272,7 +262,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
Sequence
<
0
,
1
>
,
1
,
1
,
InMemoryDataOperationEnum
_t
::
Set
,
InMemoryDataOperationEnum
::
Set
,
1
,
true
>
(
workspace_desc_m_k
,
...
...
@@ -318,37 +308,33 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
const
auto
zeroVal
=
ReduceOperation
::
GetReductionZeroVal
();
// LDS
__shared__
AccDataType
p_
block_
reduce_val_buffer
[
BlockSize
];
__shared__
index_t
p_
block_
reduce_idx_buffer
[
BlockSize
];
__shared__
AccDataType
p_reduce_
work_
val_buffer
[
BlockSize
];
__shared__
index_t
p_reduce_
work_
idx_buffer
[
BlockSize
];
const
auto
in_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Global
>
(
p_src_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
auto
workspace_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Global
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_src_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
auto
workspace_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_ws_values_global
,
workspace_desc_m_k
.
GetElementSpaceSize
());
auto
workspace_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Global
>
(
auto
workspace_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_ws_indices_global
,
workspace_desc_m_k
.
GetElementSpaceSize
());
auto
block_
reduce_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Lds
>
(
p_
block_
reduce_val_buffer
,
BlockSize
);
auto
block_
reduce_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
_t
::
Lds
>
(
p_
block_
reduce_idx_buffer
,
BlockSize
);
auto
reduce_
work_
val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
p_reduce_
work_
val_buffer
,
BlockSize
);
auto
reduce_
work_
idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
p_reduce_
work_
idx_buffer
,
BlockSize
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
in_thread_val_buf
;
StaticBuffer
<
AddressSpaceEnum
_t
::
Vgpr
,
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
IndexDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
in_thread_idx_buf
;
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
IndexDataType
,
MThreadSliceSize
,
true
>
accu_index_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
IndexDataType
,
MThreadSliceSize
,
true
>
accu_index_buf
;
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_id
=
get_block_1d_id
();
...
...
@@ -401,42 +387,36 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                make_tuple(I0, I0),
                in_thread_val_buf);

-            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                    // initialize the indices for the per-thread to-reduce values
-                    in_thread_idx_buf(offset) = indexOffset + thread_k_cluster_id * KThreadSliceSize + J();
+                    in_thread_idx_buf(Number<offset>{}) = indexOffset + thread_k_cluster_id * KThreadSliceSize + iK();

                    // do element-wise pre-reduction operation
-                    in_elementwise_op(in_thread_val_buf(offset), in_thread_val_buf(offset));
+                    in_elementwise_op(in_thread_val_buf(Number<offset>{}), in_thread_val_buf(Number<offset>{}));
                });

                AccDataType tmpValue = zeroVal;
                IndexDataType tmpIndex = 0;

-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));

                    // reduce on the dim1 thread slice
-                    AccumulationWithIndex::Calculate(tmpValue, in_thread_val_buf[offset], tmpIndex, in_thread_idx_buf[offset]);
+                    AccumulationWithIndex::Calculate(tmpValue, in_thread_val_buf[Number<offset>{}], tmpIndex, in_thread_idx_buf[Number<offset>{}]);
                });

-                // store thread local value to LDS for parallel reduction
-                block_reduce_val_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpValue;
-                block_reduce_idx_buf(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = tmpIndex;
-
-                __syncthreads();
-
                BlockwiseReduceWithIndex::Reduce(
-                    block_reduce_val_buf, block_reduce_idx_buf, tmpValue, tmpIndex);
+                    reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex);

                AccumulationWithIndex::Calculate(
-                    accu_value_buf(I), tmpValue, accu_index_buf(I), tmpIndex);
+                    accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex);
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);
...
...
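In the hunk above, each thread first folds its K-slice into a running (value, index) pair and only then takes part in the blockwise combine. A rough plain-C++ sketch of that accumulate-with-index step, assuming a max reduction for concreteness (the function and variable names here are illustrative, not CK's implementation):

    #include <cstdio>
    #include <limits>
    #include <vector>

    // Fold one (value, index) candidate into a running max-with-index accumulator.
    // This mirrors the role of an "AccumulationWithIndex::Calculate" step; the real
    // kernel also handles NaN propagation and other reduce operations.
    static void accumulate_with_index(float& accuVal, int& accuIdx, float val, int idx)
    {
        if(val > accuVal)
        {
            accuVal = val;
            accuIdx = idx;
        }
    }

    int main()
    {
        // One "thread slice": MThreadSliceSize rows, each with KThreadSliceSize values.
        constexpr int MThreadSliceSize = 2;
        constexpr int KThreadSliceSize = 4;
        const std::vector<float> in_thread_val = {0.5f, 2.0f, -1.0f, 1.5f,
                                                  3.0f, 0.0f,  4.0f, 2.5f};

        for(int iM = 0; iM < MThreadSliceSize; ++iM)
        {
            float accuVal = -std::numeric_limits<float>::infinity(); // reduction "zero" for max
            int   accuIdx = 0;

            for(int iK = 0; iK < KThreadSliceSize; ++iK)
            {
                const int offset = iM * KThreadSliceSize + iK; // CalculateOffset(make_tuple(iM, iK))
                accumulate_with_index(accuVal, accuIdx, in_thread_val[offset], iK);
            }
            std::printf("row %d: max=%g at k=%d\n", iM, accuVal, accuIdx);
        }
        return 0;
    }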
@@ -461,7 +441,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                                               Sequence<0, 1>,
                                               1,
                                               1,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                workspace_desc_m_k,
...
...
@@ -480,7 +460,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce
                                               Sequence<0, 1>,
                                               1,
                                               1,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                workspace_desc_m_k,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
View file @ 07a673c6
...
...
@@ -30,6 +30,7 @@
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "element_wise_operation.hpp"
...
...
@@ -110,6 +111,11 @@ struct GridwiseReduction_mk_to_m_threadwise
    using ThreadBufferDimAccessOrder =
        typename conditional<InSrcVectorDim == 0, Sequence<1, 0>, Sequence<0, 1>>::type;

+    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
+    using ThreadReduceDstDesc_M =
+        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
+
    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
...
...
@@ -124,26 +130,25 @@ struct GridwiseReduction_mk_to_m_threadwise
                               OutDataType* const __restrict__ p_out_global,
                               IndexDataType* const __restrict__ p_indices_global)
    {
        using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
+        using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
+                                                     ThreadReduceSrcDesc_M_K,
+                                                     ThreadReduceDstDesc_M,
+                                                     ReduceOperation,
+                                                     PropagateNan>;

        (void)p_indices_global;

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

-        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
-        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_buf;

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
...
...
@@ -178,20 +183,17 @@ struct GridwiseReduction_mk_to_m_threadwise
                                   make_tuple(I0, I0),
                                   in_thread_buf);

-            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                // do element-wise pre-reduction operation
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
-                    in_elementwise_op(in_thread_buf(offset), in_thread_buf(offset));
-                });
-
-                // reduce on each thread-local slice
-                static_for<0, KThreadSliceSize, 1>{}([&](auto J) {
-                    constexpr auto offset = I * Number<KThreadSliceSize>{} + J;
-                    Accumulation::Calculate(accu_value_buf(I), in_thread_buf[offset]);
+                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
+                    constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
+                    in_elementwise_op(in_thread_buf(Number<offset>{}), in_thread_buf(Number<offset>{}));
                });
            });

+            ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf);
+
            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedLength += KThreadSliceSize;
...
...
@@ -203,8 +205,7 @@ struct GridwiseReduction_mk_to_m_threadwise
            accu_value_buf(I) *= alpha;
        });

-        constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        if constexpr(!BetaIsZero)
        {
...
...
@@ -223,7 +224,7 @@ struct GridwiseReduction_mk_to_m_threadwise
                                                     true>(
                    out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize));

-            StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
+            StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                priorDstValue_buf;

            threadwise_dst_load.Run(out_grid_desc_m,
...
@@ -248,7 +249,7 @@ struct GridwiseReduction_mk_to_m_threadwise
Sequence
<
0
>
,
0
,
OutDstVectorSize
,
InMemoryDataOperationEnum
_t
::
Set
,
InMemoryDataOperationEnum
::
Set
,
1
,
false
>
(
out_grid_desc_m
,
...
...
@@ -269,30 +270,35 @@ struct GridwiseReduction_mk_to_m_threadwise
                               OutDataType* const __restrict__ p_out_global,
                               IndexDataType* const __restrict__ p_indices_global)
    {
        using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                             ReduceOperation,
                                                                             AccDataType,
                                                                             IndexDataType>;
+        using ThreadwiseReduceWithIndex = ThreadwiseReductionWithIndex<AccDataType,
+                                                                       IndexDataType,
+                                                                       ThreadReduceSrcDesc_M_K,
+                                                                       ThreadReduceDstDesc_M,
+                                                                       ReduceOperation,
+                                                                       PropagateNan>;

        (void)acc_elementwise_op;

        const auto zeroVal = ReduceOperation::GetReductionZeroVal();

-        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_desc_m_k.GetElementSpaceSize(), type_convert<InDataType>(zeroVal));
-        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_desc_m.GetElementSpaceSize());
-        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto out_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_indices_global, out_grid_desc_m.GetElementSpaceSize());

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
-            in_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
+            in_thread_val_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize * KThreadSliceSize, true>
+            in_thread_idx_buf;

-        StaticBuffer<AddressSpaceEnum_t::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
-        StaticBuffer<AddressSpaceEnum_t::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            accu_value_buf(I) = zeroVal;
...
@@ -329,26 +335,23 @@ struct GridwiseReduction_mk_to_m_threadwise
in_global_buf
,
thread_buffer_desc
,
make_tuple
(
I0
,
I0
),
in_thread_buf
);
in_thread_
val_
buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
// do element-wise pre-reduction operation
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
J
)
{
constexpr
auto
offset
=
I
*
Number
<
KThreadSliceSize
>
{}
+
J
;
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
constexpr
auto
offset
=
thread_buffer_desc
.
CalculateOffset
(
make_tuple
(
iM
,
iK
))
;
in_elementwise_op
(
in_thread_buf
(
offset
),
in_thread_buf
(
offset
));
});
in_thread_idx_buf
(
Number
<
offset
>
{})
=
indexStart
+
iK
();
// reduce on each thread-local slice
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
J
)
{
constexpr
auto
offset
=
I
*
Number
<
KThreadSliceSize
>
{}
+
J
;
AccumulationWithIndex
::
Calculate
(
accu_value_buf
(
I
),
in_thread_buf
[
offset
],
accu_index_buf
(
I
),
indexStart
+
J
);
in_elementwise_op
(
in_thread_val_buf
(
Number
<
offset
>
{}),
in_thread_val_buf
(
Number
<
offset
>
{}));
});
});
ThreadwiseReduceWithIndex
::
Reduce
(
in_thread_val_buf
,
in_thread_idx_buf
,
accu_value_buf
,
accu_index_buf
);
threadwise_src_load
.
MoveSrcSliceWindow
(
in_grid_desc_m_k
,
in_thread_copy_step
);
indexStart
+=
KThreadSliceSize
;
...
...
@@ -362,8 +365,7 @@ struct GridwiseReduction_mk_to_m_threadwise
            accu_value_buf(I) *= alpha;
        });

-        constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
+        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        if constexpr(!BetaIsZero)
        {
...
...
@@ -382,7 +384,7 @@ struct GridwiseReduction_mk_to_m_threadwise
                                                     false>(
                    out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize));

-            StaticBuffer<AddressSpaceEnum_t::Vgpr, OutDataType, MThreadSliceSize, true>
+            StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                priorDstValue_buf;

            threadwise_dst_load.Run(out_grid_desc_m,
...
...
@@ -407,7 +409,7 @@ struct GridwiseReduction_mk_to_m_threadwise
                                               Sequence<0>,
                                               0,
                                               OutDstVectorSize,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               1,
                                               false>(
                out_grid_desc_m,
...
...
@@ -424,7 +426,7 @@ struct GridwiseReduction_mk_to_m_threadwise
                                               Sequence<0>,
                                               0,
                                               OutDstVectorSize,
-                                              InMemoryDataOperationEnum_t::Set,
+                                              InMemoryDataOperationEnum::Set,
                                               1,
                                               false>(
                out_grid_desc_m,
...
...
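The `ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf)` call introduced above reduces an MThreadSliceSize x KThreadSliceSize register tile along K into one accumulator per M row. A standalone sketch of that contract in plain C++, with an Add reduction assumed for concreteness (names are illustrative only, not the CK API):

    #include <array>
    #include <cstdio>

    // Reduce an M x K tile (stored row-major in a flat "register" array) along K,
    // folding each element into one accumulator per row with a generic reduce op.
    template <int M, int K, typename T, typename ReduceOp>
    void threadwise_reduce(const std::array<T, M * K>& in_thread_buf,
                           std::array<T, M>& accu_value_buf,
                           ReduceOp reduce_op)
    {
        for(int iM = 0; iM < M; ++iM)
            for(int iK = 0; iK < K; ++iK)
                accu_value_buf[iM] = reduce_op(accu_value_buf[iM], in_thread_buf[iM * K + iK]);
    }

    int main()
    {
        constexpr int MThreadSliceSize = 2;
        constexpr int KThreadSliceSize = 3;

        std::array<float, MThreadSliceSize * KThreadSliceSize> in_thread_buf = {1, 2, 3, 4, 5, 6};
        std::array<float, MThreadSliceSize> accu_value_buf = {0.0f, 0.0f}; // "zeroVal" for Add

        threadwise_reduce<MThreadSliceSize, KThreadSliceSize>(
            in_thread_buf, accu_value_buf, [](float a, float b) { return a + b; });

        std::printf("%g %g\n", accu_value_buf[0], accu_value_buf[1]); // prints 6 15
        return 0;
    }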
include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp
View file @ 07a673c6
...
...
@@ -55,7 +55,7 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_GK0_GM0_GM1_GK1,
          typename BGridDesc_GK0_GN0_GN1_GK1,
          typename CGridDesc_GM0_GM1_GN0_GN1,
...
...
@@ -329,11 +329,11 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
                               integral_constant<bool, HasMainKBlockLoop>,
                               integral_constant<bool, HasDoubleTailKBlockLoop>)
    {
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetElementSpaceSize());

        const auto GK0 = a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0);
...
...
@@ -383,7 +383,7 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
        // A matrix blockwise copy
        auto a_blockwise_copy =
            BlockwiseTensorSliceTransfer_v5r1<BlockSize,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<GK0PerBlock, GM0, 1, GM1PerBlockGM11, GK1.value>,
                                              ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
                                              ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
...
...
@@ -407,7 +407,7 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
        // B matrix blockwise copy
        auto b_blockwise_copy =
            BlockwiseTensorSliceTransfer_v5r1<BlockSize,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<GK0PerBlock, GN0, 1, GN1PerBlockGN11, GK1.value>,
                                              BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
                                              BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
...
...
@@ -467,7 +467,7 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
        FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;

        // register allocation for output
-        auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
+        auto c_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAcc>(
            c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize());

        ThreadwiseTensorSliceSet_v1<FloatAcc,
...
...
@@ -481,15 +481,15 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
        constexpr auto a_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0);
        constexpr auto b_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0);

-        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block_double, a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
-        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block_double, b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());

-        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block_double + a_block_aligned_space_size,
            a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize());
-        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block_double + b_block_aligned_space_size,
            b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize());
...
...
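The `*_even_buf` / `*_odd_buf` pairs above implement double buffering: while the math consumes the tile in one LDS buffer, the copy for the next K block fills the other. A simplified host-side sketch of the buffer alternation (the overlap itself is not modeled here; names and sizes are illustrative):

    #include <cstdio>
    #include <vector>

    int main()
    {
        constexpr int KPerBlock = 4;
        constexpr int KBlocks   = 6;

        // One flat source plus a "double buffer" made of an even and an odd half,
        // standing in for the two LDS regions carved out of p_shared_block.
        std::vector<float> a_global(KPerBlock * KBlocks, 1.0f);
        std::vector<float> block_buf_even(KPerBlock), block_buf_odd(KPerBlock);

        auto copy_block = [&](int kb, std::vector<float>& dst) {
            for(int i = 0; i < KPerBlock; ++i)
                dst[i] = a_global[kb * KPerBlock + i];
        };
        auto consume_block = [&](const std::vector<float>& src) {
            float s = 0.0f;
            for(float v : src) s += v;
            return s;
        };

        float acc = 0.0f;
        copy_block(0, block_buf_even); // prologue: fill the even buffer

        for(int kb = 0; kb < KBlocks; ++kb)
        {
            const bool even_iter = (kb % 2 == 0);
            std::vector<float>& cur  = even_iter ? block_buf_even : block_buf_odd;
            std::vector<float>& next = even_iter ? block_buf_odd  : block_buf_even;

            if(kb + 1 < KBlocks)
                copy_block(kb + 1, next); // on a GPU this copy would overlap the math below

            acc += consume_block(cur);
        }

        std::printf("acc = %g\n", acc); // 24 = KPerBlock * KBlocks
        return 0;
    }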
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp
View file @ 07a673c6
...
...
@@ -55,7 +55,7 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AKMGridDesc,
          typename BKNGridDesc,
          typename CMNGridDesc,
...
...
@@ -268,11 +268,11 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
                               integral_constant<bool, HasMainKBlockLoop>,
                               integral_constant<bool, HasDoubleTailKBlockLoop>)
    {
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_k_m0_m1_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_k_n0_n1_grid_desc.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize());

        const auto K = a_k_m0_m1_grid_desc.GetLength(I0);
...
...
@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
        // A matrix blockwise copy
        auto a_blockwise_copy =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            Sequence<KPerBlock, 1, MPerBlockM1>,
                                            ABlockTransferThreadSliceLengths_K_M0_M1,
                                            ABlockTransferThreadClusterLengths_K_M0_M1,
...
...
@@ -341,7 +341,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
        // B matrix blockwise copy
        auto b_blockwise_copy =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            Sequence<KPerBlock, 1, NPerBlockN1>,
                                            BBlockTransferThreadSliceLengths_K_N0_N1,
                                            BBlockTransferThreadClusterLengths_K_N0_N1,
...
...
@@ -403,7 +403,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
        FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;

        // register allocation for output
-        auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
+        auto c_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAcc>(
            c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize());

        ThreadwiseTensorSliceSet_v1<FloatAcc,
...
...
@@ -428,15 +428,15 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
        constexpr auto b_k_n0_n1_global_move_slice_window_step_hack =
            BGridMoveSliceWindowStepHacks{};

-        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block_double, a_k_m0_m1_block_desc.GetElementSpaceSize());
-        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block_double, b_k_n0_n1_block_desc.GetElementSpaceSize());

-        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block_double + a_block_aligned_space_size, a_k_m0_m1_block_desc.GetElementSpaceSize());
-        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block_double + b_block_aligned_space_size, b_k_n0_n1_block_desc.GetElementSpaceSize());
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r3.hpp
View file @ 07a673c6
...
...
@@ -55,7 +55,7 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AK0MK1GridDesc,
          typename BK0NK1GridDesc,
          typename CMNGridDesc,
...
...
@@ -275,11 +275,11 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
                               integral_constant<bool, HasMainKBlockLoop>,
                               integral_constant<bool, HasDoubleTailKBlockLoop>)
    {
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_k0_m0_m1_k1_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_k0_n0_n1_k1_grid_desc.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize());

        // divide block work by [M, N]
...
...
@@ -325,7 +325,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
        // A matrix blockwise copy
        auto a_blockwise_copy =
            BlockwiseTensorSliceTransfer_v5r1<BlockSize,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<KPerBlock, 1, MPerBlockM1, K1.value>,
                                              ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
                                              ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
...
...
@@ -349,7 +349,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
        // B matrix blockwise copy
        auto b_blockwise_copy =
            BlockwiseTensorSliceTransfer_v5r1<BlockSize,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<KPerBlock, 1, NPerBlockN1, K1.value>,
                                              BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
                                              BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
...
...
@@ -409,7 +409,7 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
        FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size;

        // register allocation for output
-        auto c_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatAcc>(
+        auto c_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAcc>(
            c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize());

        ThreadwiseTensorSliceSet_v1<FloatAcc,
...
...
@@ -423,15 +423,15 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0);
        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0);

-        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block_double, a_k0_m0_m1_k1_block_desc.GetElementSpaceSize());
-        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_even_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block_double, b_k0_n0_n1_k1_block_desc.GetElementSpaceSize());

-        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_a_block_double + a_block_aligned_space_size,
            a_k0_m0_m1_k1_block_desc.GetElementSpaceSize());
-        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_odd_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_b_block_double + b_block_aligned_space_size,
            b_k0_n0_n1_k1_block_desc.GetElementSpaceSize());
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp
View file @ 07a673c6
...
...
@@ -15,7 +15,7 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGlobalDesc,
          typename BGlobalDesc,
          typename CGlobalDesc,
...
...
@@ -84,11 +84,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_global, a_e_k_global_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_global, b_e_n_ho_wo_global_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());

        constexpr auto E = EPerBlock * 3 * 3;
...
...
@@ -181,7 +181,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        // A matrix blockwise copy
        auto a_blockwise_copy =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            Sequence<E, KPerBlock>,
                                            ABlockTransferThreadSliceLengths_E_K,
                                            ABlockTransferThreadClusterLengths_E_K,
...
...
@@ -221,11 +221,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
            b_e_n_ho_wo_global_desc,
            make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_shared_block, a_e_k_desc.GetElementSpaceSize());

        // register allocation for output
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatAcc,
                     c_k_n_ho_wo_thread_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -250,7 +250,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
            BGlobalMoveSliceWindowStepHacks{};

        // double regsiter buffer for b
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatAB,
                     b_e_n_ho_wo_thread_desc.GetElementSpaceSize(),
                     true>
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp
View file @ 07a673c6
...
...
@@ -20,7 +20,7 @@ template <typename GridwiseGemm,
          typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
          bool HasMainE0BlockLoop,
-         ActivTypeEnum_t ActivType>
+         ActivTypeEnum ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...
...
@@ -50,7 +50,7 @@ __global__ void
        c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
        cblockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>{},
-        integral_constant<ActivTypeEnum_t, ActivType>{});
+        integral_constant<ActivTypeEnum, ActivType>{});
}

template <typename GridwiseGemm,
...
...
@@ -62,7 +62,7 @@ template <typename GridwiseGemm,
          typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
          bool HasMainE0BlockLoop,
-         ActivTypeEnum_t ActivType>
+         ActivTypeEnum ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...
...
@@ -94,7 +94,7 @@ __global__ void
        d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
        cblockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>{},
-        integral_constant<ActivTypeEnum_t, ActivType>{});
+        integral_constant<ActivTypeEnum, ActivType>{});
}

template <typename GridwiseGemm,
...
...
@@ -106,7 +106,7 @@ template <typename GridwiseGemm,
          typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
          typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
          bool HasMainE0BlockLoop,
-         ActivTypeEnum_t ActivType>
+         ActivTypeEnum ActivType>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...
...
@@ -140,14 +140,14 @@ __global__ void
        d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
        cblockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>{},
-        integral_constant<ActivTypeEnum_t, ActivType>{});
+        integral_constant<ActivTypeEnum, ActivType>{});
}

template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_E0_E1_K_E2,
          typename BGridDesc_E0_E1_N_Ho_Wo_E2,
          typename CGridDesc_K_N_Ho_Wo,
...
...
@@ -559,7 +559,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        constexpr auto bias_k0_k1_thread_desc =
            make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));

-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatC,
                     bias_k0_k1_thread_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -602,10 +602,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        });
    }

-    template <typename CThreadBuff, typename CThreadDesc_K1_N_H2_W2, ActivTypeEnum_t activ_type_>
+    template <typename CThreadBuff, typename CThreadDesc_K1_N_H2_W2, ActivTypeEnum activ_type_>
    __device__ static void Activation(CThreadBuff& c_thread_buf,
                                      const CThreadDesc_K1_N_H2_W2&,
-                                     integral_constant<ActivTypeEnum_t, activ_type_>)
+                                     integral_constant<ActivTypeEnum, activ_type_>)
    {
        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
...
...
@@ -737,7 +737,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                                I1,
                                                Number<WoPerThread_2>{}));

-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatC,
                     d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -783,7 +783,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                           CThreadTransferSrcDstAccessOrder,
                                           CThreadTransferSrcDstVectorDim,
                                           CThreadTransferDstScalarPerVector,
-                                          InMemoryDataOperationEnum_t::Set,
+                                          InMemoryDataOperationEnum::Set,
                                           1,
                                           true>(
                d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
                make_multi_index(k_block_work_id,
...
...
@@ -843,7 +843,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                                I1,
                                                Number<WoPerThreadx2>{}));

-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatC,
                     d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -874,7 +874,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                           CThreadTransferSrcDstAccessOrder,
                                           CThreadTransferSrcDstVectorDim,
                                           CThreadTransferDstScalarPerVector,
-                                          InMemoryDataOperationEnum_t::Add,
+                                          InMemoryDataOperationEnum::Add,
                                           1,
                                           true>(
                d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
                make_multi_index(k_block_work_id,
...
...
@@ -964,7 +964,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        // A matrix blockwise copy
        auto a_blockwise_copy =
            BlockwiseTensorSliceTransfer_v4<BlockSize,
-                                           InMemoryDataOperationEnum_t::Set,
+                                           InMemoryDataOperationEnum::Set,
                                            Sequence<I1, E1, I1, KPerBlock, E2>,
                                            ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
                                            ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
...
...
@@ -1023,11 +1023,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                             0,
                             0));

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_shared_block, a_e0_e1_k0_k1_e2_block_copy_desc.GetElementSpaceSize());

        //// register allocation for output
-        // StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        // StaticBuffer<AddressSpaceEnum::Vgpr,
        //              FloatAcc,
        //              c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
        //              true>
...
...
@@ -1050,7 +1050,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = BGlobalStepHacks{};

        // double regsiter buffer for b
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatAB,
                     b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -1294,21 +1294,21 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        const auto bias_k0_k1_grid_desc =
            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);

-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
-        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());

        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();

        // register allocation for output
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatAcc,
                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -1344,7 +1344,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
              typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
              typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
              bool HasMainE0BlockLoop,
-             ActivTypeEnum_t ActivType>
+             ActivTypeEnum ActivType>
    __device__ static void ConvBiasActiv(const FloatAB* __restrict__ p_a_global,
                                         const FloatAB* __restrict__ p_b_global,
...
@@ -1356,26 +1356,26 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
        const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>,
-        integral_constant<ActivTypeEnum_t, ActivType>)
+        integral_constant<ActivTypeEnum, ActivType>)
    {
-        static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
+        static constexpr auto activ_type = integral_constant<ActivTypeEnum, ActivType>{};

        const auto bias_k0_k1_grid_desc =
            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);

-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());

        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();

        // register allocation for output
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatAcc,
                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -1423,7 +1423,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
              typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
              typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
              bool HasMainE0BlockLoop,
-             ActivTypeEnum_t ActivType>
+             ActivTypeEnum ActivType>
    __device__ static void ConvBiasActivMaxpool(const FloatAB* __restrict__ p_a_global,
                                                const FloatAB* __restrict__ p_b_global,
...
...
@@ -1437,28 +1437,28 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
        const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>,
-        integral_constant<ActivTypeEnum_t, ActivType>)
+        integral_constant<ActivTypeEnum, ActivType>)
    {
-        static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
+        static constexpr auto activ_type = integral_constant<ActivTypeEnum, ActivType>{};

        const auto bias_k0_k1_grid_desc =
            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);

-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
-        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());

        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();

        // register allocation for output
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatAcc,
                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
                     true>
...
...
@@ -1514,7 +1514,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
              typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
              typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
              bool HasMainE0BlockLoop,
-             ActivTypeEnum_t ActivType>
+             ActivTypeEnum ActivType>
    __device__ static void ConvBiasActivResizeAdd(const FloatAB* __restrict__ p_a_global,
                                                  const FloatAB* __restrict__ p_b_global,
...
...
@@ -1527,26 +1527,26 @@ struct GridwiseGemmDlops_km_kn_mn_v3
        const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
        const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor,
        integral_constant<bool, HasMainE0BlockLoop>,
-        integral_constant<ActivTypeEnum_t, ActivType>)
+        integral_constant<ActivTypeEnum, ActivType>)
    {
-        static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
+        static constexpr auto activ_type = integral_constant<ActivTypeEnum, ActivType>{};

        const auto bias_k0_k1_grid_desc =
            MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);

-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
-        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto d_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize());
-        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());

        constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor();

        // register allocation for output
-        StaticBuffer<AddressSpaceEnum_t::Vgpr,
+        StaticBuffer<AddressSpaceEnum::Vgpr,
                     FloatAcc,
                     c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(),
                     true>
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
View file @ 07a673c6
...
...
@@ -79,8 +79,8 @@ template <typename FloatAB,
          typename CElementwiseOperation,
          typename D0ReduceOperation,
          typename D1ReduceOperation,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
-         InMemoryDataOperationEnum_t DGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum DGlobalMemoryDataOperation,
          typename AGridDesc_AK0_M_AK1,
          typename BGridDesc_BK0_N_BK1,
          typename CGridDesc_M_N,
...
...
@@ -363,15 +363,15 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        const DGridDescriptor_MBlock_MPerBlock& d_grid_desc_mblock_mperblock,
        const Block2CTileMap& block_2_ctile_map)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
-        auto d0_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto d0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_d0_grid, d_grid_desc_mblock_mperblock.GetElementSpaceSize());
-        auto d1_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto d1_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_d1_grid, d_grid_desc_mblock_mperblock.GetElementSpaceSize());

        // divide block work by [M, N]
...
...
@@ -399,7 +399,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<AK0, MPerBlock, AK1>,
                                              ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                              ABlockTransferThreadClusterArrangeOrder,
...
...
@@ -430,7 +430,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<BK0, NPerBlock, BK1>,
                                              BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                              BBlockTransferThreadClusterArrangeOrder,
...
...
@@ -484,10 +484,10 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        constexpr auto a_block_space_size_aligned =
            math::integer_least_multiple(a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
...
...
@@ -563,7 +563,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();

-        auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatCShuffle*>(p_shared),
            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
...
...
@@ -632,7 +632,7 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                                   7,
                                                   1,
-                                                  InMemoryDataOperationEnum_t::Set,
+                                                  InMemoryDataOperationEnum::Set,
                                                   1,
                                                   true>{
                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
...
...
@@ -723,13 +723,13 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                make_naive_tensor_descriptor_packed(make_tuple(I1, Number<mreduce_per_thread>{}));

            // TODO: this should be implemented as a blockwise reduction
-            auto c_reduce_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatCShuffle>(
+            auto c_reduce_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatCShuffle>(
                c_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize());
-            auto d0_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatCShuffle>(
+            auto d0_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatCShuffle>(
                d_reduce_thread_desc_mperblock.GetElementSpaceSize());
-            auto d1_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatCShuffle>(
+            auto d1_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatCShuffle>(
                d_reduce_thread_desc_mperblock.GetElementSpaceSize());

            // reduce: threadwise copy from LDS to VGPR
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
View file @ 07a673c6
...
...
@@ -60,7 +60,7 @@ template <typename FloatAB,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_AK0_M_AK1,
          typename BGridDesc_BK0_N_BK1,
          typename CGridDesc_M_N,
...
...
@@ -316,11 +316,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            c_grid_desc_mblock_mperblock_nblock_nperblock,
        const Block2CTileMap& block_2_ctile_map)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());

        // divide block work by [M, N]
...
...
@@ -348,7 +348,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<AK0, MPerBlock, AK1>,
                                              ABlockTransferThreadClusterLengths_AK0_M_AK1,
                                              ABlockTransferThreadClusterArrangeOrder,
...
...
@@ -379,7 +379,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<BK0, NPerBlock, BK1>,
                                              BBlockTransferThreadClusterLengths_BK0_N_BK1,
                                              BBlockTransferThreadClusterArrangeOrder,
...
...
@@ -433,10 +433,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        constexpr auto a_block_space_size_aligned =
            math::integer_least_multiple(a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
...
...
@@ -512,7 +512,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();

-        auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatCShuffle*>(p_shared),
            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
...
...
@@ -581,7 +581,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
                                                   7,
                                                   1,
-                                                  InMemoryDataOperationEnum_t::Set,
+                                                  InMemoryDataOperationEnum::Set,
                                                   1,
                                                   true>{
                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
View file @ 07a673c6
...
...
@@ -132,7 +132,7 @@ template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename FloatC,
-         InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+         InMemoryDataOperationEnum CGlobalMemoryDataOperation,
          typename AGridDesc_K0_M_K1,
          typename BGridDesc_K0_N_K1,
          typename CGridDesc_M_N,
...
...
@@ -426,11 +426,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        const CElementwiseOperation& c_element_op,
        const Block2CTileMap& block_2_ctile_map)
    {
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
-        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize());

        const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
...
...
@@ -460,7 +460,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              AElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, MPerBlock, K1>,
                                              ABlockTransferThreadClusterLengths_K0_M_K1,
                                              ABlockTransferThreadClusterArrangeOrder,
...
...
@@ -491,7 +491,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
            BlockwiseTensorSliceTransfer_v4r1<BlockSize,
                                              BElementwiseOperation,
                                              ck::tensor_operation::element_wise::PassThrough,
-                                             InMemoryDataOperationEnum_t::Set,
+                                             InMemoryDataOperationEnum::Set,
                                              Sequence<K0PerBlock, NPerBlock, K1>,
                                              BBlockTransferThreadClusterLengths_K0_N_K1,
                                              BBlockTransferThreadClusterArrangeOrder,
...
...
@@ -543,10 +543,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        constexpr auto a_block_space_size_aligned =
            math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);

-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
            b_block_desc_k0_n_k1.GetElementSpaceSize());
...
...
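In the XDL GEMM kernels above, a single shared allocation `p_shared` is split into an A tile and a B tile, with the A region rounded up to an alignment boundary via `math::integer_least_multiple`. A small sketch of that carve-up using ordinary heap memory in place of LDS (the sizes and alignment below are illustrative; this is not the CK helper itself):

    #include <cstdio>
    #include <vector>

    // Round n up to the nearest multiple of align (what integer_least_multiple does).
    static std::size_t integer_least_multiple(std::size_t n, std::size_t align)
    {
        return ((n + align - 1) / align) * align;
    }

    int main()
    {
        using FloatAB = float;

        const std::size_t a_block_elems = 257; // stand-in for a_block_desc.GetElementSpaceSize()
        const std::size_t b_block_elems = 512; // stand-in for b_block_desc.GetElementSpaceSize()
        const std::size_t max_lds_align = 8;   // in elements, illustrative

        const std::size_t a_block_space_size_aligned =
            integer_least_multiple(a_block_elems, max_lds_align); // 264

        // One backing allocation, standing in for the kernel's p_shared LDS pointer.
        std::vector<FloatAB> shared(a_block_space_size_aligned + b_block_elems);

        FloatAB* a_block_buf = shared.data();
        FloatAB* b_block_buf = shared.data() + a_block_space_size_aligned;

        std::printf("A tile: %zu elems at offset 0; B tile: %zu elems at offset %zu\n",
                    a_block_elems, b_block_elems, a_block_space_size_aligned);

        (void)a_block_buf; // silence unused warnings in this sketch
        (void)b_block_buf;
        return 0;
    }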