Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
0eb75e21
Commit
0eb75e21
authored
Aug 17, 2024
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into ck_tile/moe
parents
1b4b640b
c8b6b642
Changes
200
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2330 additions
and
421 deletions
+2330
-421
example/ck_tile/01_fmha/script/smoke_test_bwd.sh
example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+12
-10
include/ck/host_utility/device_prop.hpp
include/ck/host_utility/device_prop.hpp
+6
-0
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
...ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+43
-0
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp
...ation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp
+23
-0
include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
...gen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+79
-8
include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
...operation/gpu/device/impl/device_column_to_image_impl.hpp
+2
-2
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
...pu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
+282
-99
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
...operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+142
-162
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
.../device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+154
-8
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
...device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+6
-6
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
...mpl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+194
-95
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
.../device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+158
-6
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
...e_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+7
-7
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
...impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+158
-8
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
...grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+1054
-0
include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
...operation/gpu/device/impl/device_image_to_column_impl.hpp
+2
-2
include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
...tensor_operation/gpu/device/impl/device_reduce_common.hpp
+3
-3
include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
...or_operation/gpu/device/impl/device_reduce_multiblock.hpp
+2
-2
include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
...or_operation/gpu/device/impl/device_reduce_threadwise.hpp
+2
-2
include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp
...tion/gpu/device/impl/device_reduce_threadwise_multi_d.hpp
+1
-1
No files found.
example/ck_tile/01_fmha/script/smoke_test_bwd.sh
View file @
0eb75e21
...
...
@@ -11,18 +11,19 @@ COMMON_ARGS='-v=1'
set
-x
for
prec
in
"fp16"
"bf16"
;
do
for
perm
in
0 1
;
do
for
hdim
in
32 64 128
;
do
for
hdim
in
32 64 128
256
;
do
for
mode
in
0 1
;
do
for
bias
in
"n"
"e"
"a"
;
do
for
dbias
in
0 1
;
do
for
p_drop
in
0.0 0.2
;
do
for
bias
in
"n"
"a"
;
do
for
dbias
in
0
;
do
for
p_drop
in
0.0 0.2
;
do
for
deterministic
in
0
;
do
$EXE
-prec
=
$prec
-b
=
1
-h
=
4
-h_k
=
2
-d
=
$hdim
-s
=
259
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
2
-h
=
2
-d
=
$hdim
-s
=
516
-s_k
=
253
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
1
-h
=
4
-h_k
=
1
-d
=
$hdim
-s
=
500
-s_k
=
251
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
1
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
1
-h
=
2
-d
=
$hdim
-s
=
900
-s_k
=
258
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
2
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
2
-h
=
1
-d
=
$hdim
-s
=
987
-s_k
=
219
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
t:128,30
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
2
-h
=
3
-h_k
=
1
-d
=
$hdim
-s
=
244
-s_k
=
499
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
b:4,35
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
1
-h
=
4
-h_k
=
2
-d
=
$hdim
-s
=
259
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-deterministic
=
$deterministic
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
2
-h
=
2
-d
=
$hdim
-s
=
516
-s_k
=
253
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-deterministic
=
$deterministic
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
1
-h
=
4
-h_k
=
1
-d
=
$hdim
-s
=
500
-s_k
=
251
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
1
-deterministic
=
$deterministic
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
1
-h
=
2
-d
=
$hdim
-s
=
900
-s_k
=
258
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
2
-v
=
1
-deterministic
=
$deterministic
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
2
-h
=
1
-d
=
$hdim
-s
=
987
-s_k
=
219
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
t:128,30
-deterministic
=
$deterministic
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
$EXE
-prec
=
$prec
-b
=
2
-h
=
3
-h_k
=
1
-d
=
$hdim
-s
=
244
-s_k
=
499
-bias
=
$bias
-dbias
=
$dbias
-p_drop
=
$p_drop
-iperm
=
$perm
-operm
=
$perm
-mask
=
b:4,35
-deterministic
=
$deterministic
-v
=
1
-mode
=
$mode
-kname
=
$KNAME
$COMMON_ARGS
done
done
...
...
@@ -31,4 +32,5 @@ done
done
done
done
done
set
+x
include/ck/host_utility/device_prop.hpp
View file @
0eb75e21
...
...
@@ -65,6 +65,12 @@ inline bool is_lds_direct_load_supported()
ck
::
get_device_name
()
==
"gfx941"
||
ck
::
get_device_name
()
==
"gfx942"
;
}
inline
bool
is_bf16_atomic_supported
()
{
return
ck
::
get_device_name
()
==
"gfx940"
||
ck
::
get_device_name
()
==
"gfx941"
||
ck
::
get_device_name
()
==
"gfx942"
;
}
inline
bool
is_gfx101_supported
()
{
return
ck
::
get_device_name
()
==
"gfx1010"
||
ck
::
get_device_name
()
==
"gfx1011"
||
...
...
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
View file @
0eb75e21
...
...
@@ -53,6 +53,49 @@ struct DeviceGemmMultipleD : public BaseOperator
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
// GEMM:
// input : A[M, K], B[K, N],
// input : D0[M, N], D1[M, N], ...
// output : E[M, N]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Assume:
// D0, D1, ... and E have the same layout
template
<
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
ADataType
,
typename
BDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
>
struct
DeviceGemmMultipleDSplitK
:
public
BaseOperator
{
static
constexpr
index_t
NumDTensor
=
DsDataType
::
Size
();
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
std
::
array
<
const
void
*
,
NumDTensor
>
p_ds
,
void
*
p_e
,
ck
::
index_t
M
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
StrideA
,
ck
::
index_t
StrideB
,
std
::
array
<
ck
::
index_t
,
NumDTensor
>
StrideDs
,
ck
::
index_t
StrideE
,
ck
::
index_t
KBatch
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp
View file @
0eb75e21
...
...
@@ -126,6 +126,29 @@ struct DeviceGroupedConvFwdMultipleABD : public BaseOperator
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
APointers
p_a
,
BPointers
p_b
,
const
std
::
array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
...
...
include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
View file @
0eb75e21
...
...
@@ -359,14 +359,14 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
using
GemmToConv
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
static
constexpr
auto
matrix_padder
=
MatrixPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
>
{
MPerBlock
,
NPerBlock
,
KPerBlock
};
template
<
typename
ALay
>
__host__
__device__
static
auto
MakeAGridDescriptor_M_K
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeAGridDescriptor_M_K
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
in_gemmmraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
ALay
>();
...
...
@@ -379,7 +379,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
template
<
typename
BLay
>
__host__
__device__
static
auto
MakeBGridDescriptor_N_K
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeBGridDescriptor_N_K
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeBDescriptor_N_K
<
BLay
>();
...
...
@@ -392,7 +392,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
template
<
typename
ELay
>
__host__
__device__
static
auto
MakeEGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeEGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
out_gemmmraw_gemmnraw_desc
=
conv_to_gemm_transformer
.
template
MakeCDescriptor_M_N
<
ELay
>();
...
...
@@ -405,7 +405,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
// Shape of Ds and E must be aligned. Strides can be different.
// Pass e_g_n_k_wos_lengths for logical broadcast.
static
auto
MakeDsGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeDsGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
return
generate_tuple
(
[
&
](
auto
i
)
{
...
...
@@ -417,7 +417,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
}
// desc for problem definition
constexpr
static
GemmToConv
FwdTransformer
dummy_conv_to_gemm_transformer
;
constexpr
static
ConvToGemm
FwdTransformer
dummy_conv_to_gemm_transformer
;
using
AGridDesc_M_K
=
remove_cvref_t
<
decltype
(
MakeAGridDescriptor_M_K
<
ALayout
>
(
dummy_conv_to_gemm_transformer
))
>
;
using
BGridDesc_N_K
=
...
...
@@ -617,7 +617,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
// D batch stride
compute_ptr_offset_of_batch_
.
BatchStrideDs_
(
i
)
=
ds_g_n_k_wos_strides
[
i
][
0
];
GemmToConv
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
...
...
@@ -686,7 +686,7 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
// tensor descriptors for problem definiton
index_t
num_group_
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_
;
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_
;
AGridDesc_M_K
a_grid_desc_m_k_
;
BGridDesc_N_K
b_grid_desc_n_k_
;
...
...
@@ -943,6 +943,77 @@ struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
b_element_op
,
cde_element_op
};
}
static
__device__
__host__
auto
MakeArgument
(
APointers
p_as
,
BPointers
p_bs
,
const
ck
::
Array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
ck
::
Array
<
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
ck
::
Array
<
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
ck
::
Array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
{
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_lengths_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_i32
;
array_convert
(
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_lengths
);
array_convert
(
a_g_n_c_wis_strides_i32
,
a_g_n_c_wis_strides
);
array_convert
(
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_lengths
);
array_convert
(
b_g_k_c_xs_strides_i32
,
b_g_k_c_xs_strides
);
for
(
index_t
d
=
0
;
d
<
NumDTensor
;
d
++
)
{
array_convert
(
ds_g_n_k_wos_lengths_i32
[
d
],
ds_g_n_k_wos_lengths
[
d
]);
array_convert
(
ds_g_n_k_wos_strides_i32
[
d
],
ds_g_n_k_wos_strides
[
d
]);
}
array_convert
(
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_lengths
);
array_convert
(
e_g_n_k_wos_strides_i32
,
e_g_n_k_wos_strides
);
array_convert
(
conv_filter_strides_i32
,
conv_filter_strides
);
array_convert
(
conv_filter_dilations_i32
,
conv_filter_dilations
);
array_convert
(
input_left_pads_i32
,
input_left_pads
);
array_convert
(
input_right_pads_i32
,
input_right_pads
);
return
Argument
{
p_as
,
p_bs
,
p_ds
,
p_e
,
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_strides_i32
,
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_strides_i32
,
ds_g_n_k_wos_lengths_i32
,
ds_g_n_k_wos_strides_i32
,
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_strides_i32
,
conv_filter_strides_i32
,
conv_filter_dilations_i32
,
input_left_pads_i32
,
input_right_pads_i32
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
};
}
// namespace device
...
...
include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
View file @
0eb75e21
...
...
@@ -64,7 +64,7 @@ struct DeviceColumnToImageImpl
static
constexpr
auto
spatial_offset
=
Number
<
3
>
{};
using
GemmToConv
FwdTransformer
=
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvolutionForwardSpecialization
::
Default
>
;
static
constexpr
auto
matrix_padder
=
MatrixPadder
<
GemmSpecialization
::
MKPadding
,
index_t
,
index_t
,
index_t
>
{
...
...
@@ -233,7 +233,7 @@ struct DeviceColumnToImageImpl
:
independent_filter_stride
;
}
GemmToConv
FwdTransformer
conv_to_gemm_transformer
{
a_g_n_c_wis_lengths
,
ConvToGemm
FwdTransformer
conv_to_gemm_transformer
{
a_g_n_c_wis_lengths
,
image_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
{},
// not needed for A Descriptor
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
View file @
0eb75e21
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
View file @
0eb75e21
...
...
@@ -168,15 +168,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
// rotating mem
rotating_mem
.
Next
();
// clear c mem
if
constexpr
(
!
is_same
<
remove_cvref_t
<
CDataType
>
,
bhalf_t
>::
value
)
{
if
(
arg_
.
KBatch
>
1
)
hipGetErrorString
(
hipMemsetAsync
(
arg_
.
p_c_grid
,
hipGetErrorString
(
hipMemsetAsync
(
arg_
.
p_c_grid
,
0
,
arg_
.
M
*
arg_
.
N
*
sizeof
(
CDataType
),
stream_config
.
stream_id_
));
}
};
ave_time
=
ck
::
utility
::
launch_and_time_kernel_with_preprocess
<
false
>
(
...
...
@@ -189,15 +185,12 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
arg_
);
}
else
{
if
constexpr
(
!
is_same
<
remove_cvref_t
<
CDataType
>
,
bhalf_t
>::
value
)
{
if
(
arg
.
KBatch
>
1
)
hipGetErrorString
(
hipMemsetAsync
(
arg
.
p_c_grid
,
0
,
arg
.
M
*
arg
.
N
*
sizeof
(
CDataType
),
stream_config
.
stream_id_
));
}
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
gdx
,
gdy
,
gdz
),
dim3
(
BlockSize
),
0
,
arg
);
...
...
@@ -214,8 +207,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v3
)
{
if
(
arg
.
KBatch
>
1
)
{
if
constexpr
(
!
is_same
<
remove_cvref_t
<
CDataType
>
,
bhalf_t
>::
value
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
...
...
@@ -224,7 +215,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
minimum_occupancy
>
;
Run
(
kernel
);
}
}
else
{
const
auto
kernel
=
...
...
@@ -239,13 +229,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v2
)
{
if
(
arg
.
KBatch
>
1
)
{
if
constexpr
(
!
is_same
<
remove_cvref_t
<
CDataType
>
,
bhalf_t
>::
value
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
One
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
AtomicAdd
,
minimum_occupancy
,
...
...
@@ -255,8 +243,8 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
else
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Full
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
AtomicAdd
,
minimum_occupancy
,
...
...
@@ -266,8 +254,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
2
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Two
)
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Two
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
...
...
@@ -326,8 +313,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
if
constexpr
(
GridwiseGemm
::
BlockwiseGemmPipe
::
PrefetchStages
>
6
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Six
)
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Six
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
...
...
@@ -354,7 +340,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
}
}
}
}
else
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
One
)
...
...
@@ -472,8 +457,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
else
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v4
)
{
if
(
arg
.
KBatch
>
1
)
{
if
constexpr
(
!
is_same
<
remove_cvref_t
<
CDataType
>
,
bhalf_t
>::
value
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
{
...
...
@@ -496,7 +479,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
Run
(
kernel
);
}
}
}
else
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
...
...
@@ -524,13 +506,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
else
{
if
(
arg
.
KBatch
>
1
)
{
if
constexpr
(
!
is_same
<
remove_cvref_t
<
CDataType
>
,
bhalf_t
>::
value
)
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
AtomicAdd
,
minimum_occupancy
,
...
...
@@ -539,8 +519,8 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
}
else
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
true
,
InMemoryDataOperationEnum
::
AtomicAdd
,
minimum_occupancy
,
...
...
@@ -548,7 +528,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
Run
(
kernel
);
}
}
}
else
{
if
(
GridwiseGemm
::
CalculateKBlockLoopTailNum
(
K_split
)
==
TailNumber
::
Odd
)
...
...
@@ -579,10 +558,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
// Tail number always 1
if
constexpr
(
BlkGemmPipelineVer
==
BlockGemmPipelineVersion
::
v1
)
{
if
(
arg
.
KBatch
>
1
)
{
if
constexpr
(
!
is_same
<
remove_cvref_t
<
CDataType
>
,
bhalf_t
>::
value
)
{
const
auto
kernel
=
kernel_gemm_xdl_cshuffle_v3
<
GridwiseGemm
,
...
...
@@ -591,7 +567,6 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
minimum_occupancy
>
;
Run
(
kernel
);
}
}
else
{
const
auto
kernel
=
...
...
@@ -628,6 +603,11 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
return
false
;
}
if
(
!
is_bf16_atomic_supported
()
&&
std
::
is_same_v
<
CDataType
,
ck
::
bhalf_t
>
&&
arg
.
KBatch
>
1
)
{
return
false
;
}
if
((
arg
.
K
%
AK1
!=
0
||
arg
.
K
%
BK1
!=
0
)
&&
!
(
GemmSpec
==
GemmSpecialization
::
MKPadding
||
GemmSpec
==
GemmSpecialization
::
NKPadding
||
GemmSpec
==
GemmSpecialization
::
MNKPadding
||
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
View file @
0eb75e21
...
...
@@ -238,14 +238,14 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
using
GemmToConv
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
static
constexpr
auto
matrix_padder
=
MatrixPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
>
{
MPerBlock
,
NPerBlock
,
K0PerBlock
};
template
<
typename
ALay
>
static
auto
MakeAGridDescriptor_AK0_M_AK1
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeAGridDescriptor_AK0_M_AK1
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
in_gemmmraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
ALay
>();
...
...
@@ -266,7 +266,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
template
<
typename
BLay
>
static
auto
MakeBGridDescriptor_BK0_N_BK1
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeBGridDescriptor_BK0_N_BK1
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeBDescriptor_N_K
<
BLay
>();
...
...
@@ -287,7 +287,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
}
template
<
typename
ELay
>
static
auto
MakeEGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeEGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
out_gemmmraw_gemmnraw_desc
=
conv_to_gemm_transformer
.
template
MakeCDescriptor_M_N
<
ELay
>();
...
...
@@ -298,7 +298,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
return
out_gemmm_gemmn_desc
;
}
static
auto
MakeDsGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeDsGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
return
generate_tuple
(
[
&
](
auto
i
)
{
...
...
@@ -310,7 +310,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
}
// desc for problem definition
constexpr
static
GemmToConv
FwdTransformer
dummy_conv_to_gemm_transformer
;
constexpr
static
ConvToGemm
FwdTransformer
dummy_conv_to_gemm_transformer
;
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
decltype
(
MakeAGridDescriptor_AK0_M_AK1
<
ALayout
>
(
dummy_conv_to_gemm_transformer
))
>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
decltype
(
MakeBGridDescriptor_BK0_N_BK1
<
BLayout
>
(
...
...
@@ -447,7 +447,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
using
DDataType
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsDataType
>>
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
...
...
@@ -511,7 +511,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
// tensor descriptors for problem definiton
index_t
num_group_
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_
;
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_
;
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1_
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1_
;
...
...
@@ -836,6 +836,79 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
cde_element_op
};
}
static
auto
MakeArgument
(
const
void
*
p_a
,
const
void
*
p_b
,
const
std
::
array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
{
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_lengths_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_i32
;
array_convert
(
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_lengths
);
array_convert
(
a_g_n_c_wis_strides_i32
,
a_g_n_c_wis_strides
);
array_convert
(
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_lengths
);
array_convert
(
b_g_k_c_xs_strides_i32
,
b_g_k_c_xs_strides
);
for
(
index_t
d
=
0
;
d
<
NumDTensor
;
d
++
)
{
array_convert
(
ds_g_n_k_wos_lengths_i32
[
d
],
ds_g_n_k_wos_lengths
[
d
]);
array_convert
(
ds_g_n_k_wos_strides_i32
[
d
],
ds_g_n_k_wos_strides
[
d
]);
}
array_convert
(
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_lengths
);
array_convert
(
e_g_n_k_wos_strides_i32
,
e_g_n_k_wos_strides
);
array_convert
(
conv_filter_strides_i32
,
conv_filter_strides
);
array_convert
(
conv_filter_dilations_i32
,
conv_filter_dilations
);
array_convert
(
input_left_pads_i32
,
input_left_pads
);
array_convert
(
input_right_pads_i32
,
input_right_pads
);
return
Argument
{
p_a
,
p_b
,
p_ds
,
p_e
,
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_strides_i32
,
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_strides_i32
,
ds_g_n_k_wos_lengths_i32
,
ds_g_n_k_wos_strides_i32
,
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_strides_i32
,
conv_filter_strides_i32
,
conv_filter_dilations_i32
,
input_left_pads_i32
,
input_right_pads_i32
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
...
...
@@ -880,6 +953,79 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
cde_element_op
);
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
const
std
::
array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
override
{
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_lengths_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_i32
;
array_convert
(
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_lengths
);
array_convert
(
a_g_n_c_wis_strides_i32
,
a_g_n_c_wis_strides
);
array_convert
(
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_lengths
);
array_convert
(
b_g_k_c_xs_strides_i32
,
b_g_k_c_xs_strides
);
for
(
index_t
d
=
0
;
d
<
NumDTensor
;
d
++
)
{
array_convert
(
ds_g_n_k_wos_lengths_i32
[
d
],
ds_g_n_k_wos_lengths
[
d
]);
array_convert
(
ds_g_n_k_wos_strides_i32
[
d
],
ds_g_n_k_wos_strides
[
d
]);
}
array_convert
(
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_lengths
);
array_convert
(
e_g_n_k_wos_strides_i32
,
e_g_n_k_wos_strides
);
array_convert
(
conv_filter_strides_i32
,
conv_filter_strides
);
array_convert
(
conv_filter_dilations_i32
,
conv_filter_dilations
);
array_convert
(
input_left_pads_i32
,
input_left_pads
);
array_convert
(
input_right_pads_i32
,
input_right_pads
);
return
std
::
make_unique
<
Argument
>
(
p_a
,
p_b
,
p_ds
,
p_e
,
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_strides_i32
,
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_strides_i32
,
ds_g_n_k_wos_lengths_i32
,
ds_g_n_k_wos_strides_i32
,
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_strides_i32
,
conv_filter_strides_i32
,
conv_filter_dilations_i32
,
input_left_pads_i32
,
input_right_pads_i32
,
a_element_op
,
b_element_op
,
cde_element_op
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
View file @
0eb75e21
...
...
@@ -234,14 +234,14 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
using
GemmToConv
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
static
constexpr
auto
matrix_padder
=
MatrixPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
>
{
MPerBlock
,
NPerBlock
,
K0PerBlock
};
template
<
typename
ALay
>
static
auto
MakeAGridDescriptor_AK0_M_AK1
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeAGridDescriptor_AK0_M_AK1
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
in_gemmmraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
ALay
>();
...
...
@@ -263,7 +263,7 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
template
<
typename
BLay
>
static
auto
MakeBGridDescriptor_BK0_N_BK1
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeBGridDescriptor_BK0_N_BK1
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeBDescriptor_N_K
<
BLay
>();
...
...
@@ -284,7 +284,7 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
}
template
<
typename
CLay
>
static
auto
MakeCGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeCGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
out_gemmmraw_gemmnraw_desc
=
conv_to_gemm_transformer
.
template
MakeCDescriptor_M_N
<
CLay
>();
...
...
@@ -296,7 +296,7 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
}
// desc for problem definition
constexpr
static
GemmToConv
FwdTransformer
dummy_conv_to_gemm_transformer
;
constexpr
static
ConvToGemm
FwdTransformer
dummy_conv_to_gemm_transformer
;
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
decltype
(
MakeAGridDescriptor_AK0_M_AK1
<
ALayout
>
(
dummy_conv_to_gemm_transformer
))
>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
decltype
(
MakeBGridDescriptor_BK0_N_BK1
<
BLayout
>
(
...
...
@@ -452,7 +452,7 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
// tensor descriptors for problem definiton
index_t
num_group_
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_
;
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_
;
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1_
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1_
;
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
View file @
0eb75e21
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
View file @
0eb75e21
...
...
@@ -293,7 +293,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
using
GemmToConv
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
,
true
/*SplitN*/
,
ADataType
,
...
...
@@ -304,7 +304,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
template
<
typename
ALay
>
static
auto
MakeAGridDescriptor_AK0_M_AK1
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeAGridDescriptor_AK0_M_AK1
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
in_gemmmraw_gemmkraw_desc
=
...
...
@@ -327,7 +327,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
template
<
typename
BLay
>
static
auto
MakeBGridDescriptor_BK0_N_BK1
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
MakeBGridDescriptor_BK0_N_BK1
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeBDescriptor_N_K
<
BLay
>();
...
...
@@ -348,7 +348,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
}
template
<
typename
ELay
>
static
auto
MakeEGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeEGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
out_gemmmraw_gemmnraw_desc
=
...
...
@@ -361,7 +361,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
}
// desc for problem definition
constexpr
static
GemmToConv
FwdTransformer
dummy_conv_to_gemm_transformer
;
constexpr
static
ConvToGemm
FwdTransformer
dummy_conv_to_gemm_transformer
;
using
EGridDesc_M_N
=
remove_cvref_t
<
decltype
(
MakeEGridDescriptor_M_N
<
ELayout
>
(
dummy_conv_to_gemm_transformer
))
>
;
...
...
@@ -495,7 +495,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
// tensor descriptors for problem definiton
index_t
num_group_
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_
;
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_
;
index_t
conv_N_per_block_
;
...
...
@@ -978,6 +978,12 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
return
false
;
}
// Gridwise gemm v3 doesn't verify descriptors size
if
(
!
arg
.
conv_to_gemm_transformer_
.
AreDescriptorsSmallerThan2GB
())
{
return
false
;
}
// check Gridwise GEMM
const
index_t
GemmM
=
arg
.
a_grid_desc_ak0_m_ak1_
.
GetLength
(
I1
);
const
index_t
GemmN
=
arg
.
b_grid_desc_bk0_n_bk1_
.
GetLength
(
I1
);
...
...
@@ -1037,6 +1043,79 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
cde_element_op
};
}
static
auto
MakeArgument
(
const
void
*
p_as
,
const
void
*
p_bs
,
const
std
::
array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
{
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_lengths_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_i32
;
array_convert
(
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_lengths
);
array_convert
(
a_g_n_c_wis_strides_i32
,
a_g_n_c_wis_strides
);
array_convert
(
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_lengths
);
array_convert
(
b_g_k_c_xs_strides_i32
,
b_g_k_c_xs_strides
);
for
(
index_t
d
=
0
;
d
<
NumDTensor
;
d
++
)
{
array_convert
(
ds_g_n_k_wos_lengths_i32
[
d
],
ds_g_n_k_wos_lengths
[
d
]);
array_convert
(
ds_g_n_k_wos_strides_i32
[
d
],
ds_g_n_k_wos_strides
[
d
]);
}
array_convert
(
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_lengths
);
array_convert
(
e_g_n_k_wos_strides_i32
,
e_g_n_k_wos_strides
);
array_convert
(
conv_filter_strides_i32
,
conv_filter_strides
);
array_convert
(
conv_filter_dilations_i32
,
conv_filter_dilations
);
array_convert
(
input_left_pads_i32
,
input_left_pads
);
array_convert
(
input_right_pads_i32
,
input_right_pads
);
return
Argument
{
p_as
,
p_bs
,
p_ds
,
p_e
,
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_strides_i32
,
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_strides_i32
,
ds_g_n_k_wos_lengths_i32
,
ds_g_n_k_wos_strides_i32
,
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_strides_i32
,
conv_filter_strides_i32
,
conv_filter_dilations_i32
,
input_left_pads_i32
,
input_right_pads_i32
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
...
...
@@ -1081,6 +1160,79 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
cde_element_op
);
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
const
std
::
array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
override
{
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_lengths_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_i32
;
array_convert
(
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_lengths
);
array_convert
(
a_g_n_c_wis_strides_i32
,
a_g_n_c_wis_strides
);
array_convert
(
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_lengths
);
array_convert
(
b_g_k_c_xs_strides_i32
,
b_g_k_c_xs_strides
);
for
(
index_t
d
=
0
;
d
<
NumDTensor
;
d
++
)
{
array_convert
(
ds_g_n_k_wos_lengths_i32
[
d
],
ds_g_n_k_wos_lengths
[
d
]);
array_convert
(
ds_g_n_k_wos_strides_i32
[
d
],
ds_g_n_k_wos_strides
[
d
]);
}
array_convert
(
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_lengths
);
array_convert
(
e_g_n_k_wos_strides_i32
,
e_g_n_k_wos_strides
);
array_convert
(
conv_filter_strides_i32
,
conv_filter_strides
);
array_convert
(
conv_filter_dilations_i32
,
conv_filter_dilations
);
array_convert
(
input_left_pads_i32
,
input_left_pads
);
array_convert
(
input_right_pads_i32
,
input_right_pads
);
return
std
::
make_unique
<
Argument
>
(
p_a
,
p_b
,
p_ds
,
p_e
,
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_strides_i32
,
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_strides_i32
,
ds_g_n_k_wos_lengths_i32
,
ds_g_n_k_wos_strides_i32
,
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_strides_i32
,
conv_filter_strides_i32
,
conv_filter_dilations_i32
,
input_left_pads_i32
,
input_right_pads_i32
,
a_element_op
,
b_element_op
,
cde_element_op
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
View file @
0eb75e21
...
...
@@ -309,13 +309,13 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
using
GemmToConv
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
static
constexpr
auto
matrix_padder
=
MatrixPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
>
{
MPerBlock
,
NPerBlock
,
KPerBlock
};
template
<
typename
ALay
>
static
auto
MakeAGridDescriptor_M_K
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeAGridDescriptor_M_K
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
in_gemmmraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
ALay
>();
...
...
@@ -327,7 +327,7 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
}
template
<
typename
BLay
>
static
auto
MakeBGridDescriptor_N_K
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeBGridDescriptor_N_K
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeBDescriptor_N_K
<
BLay
>();
...
...
@@ -339,7 +339,7 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
}
template
<
typename
ELay
>
static
auto
MakeEGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeEGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
out_gemmmraw_gemmnraw_desc
=
conv_to_gemm_transformer
.
template
MakeCDescriptor_M_N
<
ELay
>();
...
...
@@ -420,7 +420,7 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
return
GetPaddedRGridDescriptor
(
r_grid_desc_mraw
,
NHoWo
);
}
constexpr
static
GemmToConv
FwdTransformer
dummy_conv_to_gemm_transformer
;
constexpr
static
ConvToGemm
FwdTransformer
dummy_conv_to_gemm_transformer
;
using
AGridDesc_M_K
=
remove_cvref_t
<
decltype
(
MakeAGridDescriptor_M_K
<
ALayout
>
(
dummy_conv_to_gemm_transformer
))
>
;
using
BGridDesc_N_K
=
...
...
@@ -599,7 +599,7 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
// D batch stride
compute_ptr_offset_of_batch_
.
BatchStrideDs_
(
i
)
=
ds_g_n_k_wos_strides
[
i
][
0
];
GemmToConv
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
...
...
@@ -649,7 +649,7 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
EDataType
*
p_e_grid_
;
typename
GridwiseGemm
::
RsGridPointer
p_rs_grid_
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_
;
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_
;
// tensor descriptors for problem definiton
AGridDesc_M_K
a_grid_desc_m_k_
;
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
View file @
0eb75e21
...
...
@@ -135,13 +135,13 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
static
constexpr
auto
BEnableLds
=
BEnableLds_auto
||
BEnableLds_manu
||
(
NumGemmKPrefetchStage
>
1
);
using
GemmToConv
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvForwardSpecialization
>
;
static
constexpr
auto
matrix_padder
=
MatrixPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
>
{
MPerBlock
,
NPerBlock
,
KPerBlock
};
template
<
typename
ALay
>
static
auto
MakeAGridDescriptor
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeAGridDescriptor
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
in_gemmmraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeADescriptor_M_K
<
ALay
>();
...
...
@@ -185,7 +185,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
}
template
<
typename
BLay
>
static
auto
MakeBGridDescriptor
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeBGridDescriptor
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
conv_to_gemm_transformer
.
template
MakeBDescriptor_N_K
<
BLay
>();
...
...
@@ -229,7 +229,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
}
template
<
typename
ELay
>
static
auto
MakeEGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeEGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
const
auto
out_gemmmraw_gemmnraw_desc
=
conv_to_gemm_transformer
.
template
MakeCDescriptor_M_N
<
ELay
>();
...
...
@@ -240,7 +240,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
return
out_gemmm_gemmn_desc
;
}
static
auto
MakeDsGridDescriptor_M_N
(
const
GemmToConv
FwdTransformer
&
conv_to_gemm_transformer
)
static
auto
MakeDsGridDescriptor_M_N
(
const
ConvToGemm
FwdTransformer
&
conv_to_gemm_transformer
)
{
return
generate_tuple
(
[
&
](
auto
i
)
{
...
...
@@ -252,7 +252,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
}
// desc for problem definition
constexpr
static
GemmToConv
FwdTransformer
dummy_conv_to_gemm_transformer
;
constexpr
static
ConvToGemm
FwdTransformer
dummy_conv_to_gemm_transformer
;
using
AGridDesc
=
decltype
(
DeviceOp
::
MakeAGridDescriptor
<
ALayout
>
(
dummy_conv_to_gemm_transformer
));
using
BGridDesc
=
...
...
@@ -406,7 +406,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
[
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_d
{
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
...
...
@@ -448,7 +448,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
// tensor descriptors for problem definiton
index_t
num_group_
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer_
;
ConvToGemm
FwdTransformer
conv_to_gemm_transformer_
;
DsGridDesc_M_N
ds_grid_desc_m_n_
;
EGridDesc_M_N
e_grid_desc_m_n_
;
...
...
@@ -772,6 +772,81 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
cde_element_op
};
}
static
auto
MakeArgument
(
const
void
*
p_a
,
const
void
*
p_b
,
const
std
::
array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
{
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_lengths_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_i32
;
array_convert
(
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_lengths
);
array_convert
(
a_g_n_c_wis_strides_i32
,
a_g_n_c_wis_strides
);
array_convert
(
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_lengths
);
array_convert
(
b_g_k_c_xs_strides_i32
,
b_g_k_c_xs_strides
);
for
(
index_t
d
=
0
;
d
<
NumDTensor
;
d
++
)
{
array_convert
(
ds_g_n_k_wos_lengths_i32
[
d
],
ds_g_n_k_wos_lengths
[
d
]);
array_convert
(
ds_g_n_k_wos_strides_i32
[
d
],
ds_g_n_k_wos_strides
[
d
]);
}
array_convert
(
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_lengths
);
array_convert
(
e_g_n_k_wos_strides_i32
,
e_g_n_k_wos_strides
);
array_convert
(
conv_filter_strides_i32
,
conv_filter_strides
);
array_convert
(
conv_filter_dilations_i32
,
conv_filter_dilations
);
array_convert
(
input_left_pads_i32
,
input_left_pads
);
array_convert
(
input_right_pads_i32
,
input_right_pads
);
return
Argument
{
p_a
,
p_b
,
p_ds
,
p_e
,
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_strides_i32
,
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_strides_i32
,
ds_g_n_k_wos_lengths_i32
,
ds_g_n_k_wos_strides_i32
,
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_strides_i32
,
conv_filter_strides_i32
,
conv_filter_dilations_i32
,
input_left_pads_i32
,
input_right_pads_i32
,
1
,
1
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
...
...
@@ -818,6 +893,81 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
cde_element_op
);
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
const
std
::
array
<
const
void
*
,
NumDTensor
>&
p_ds
,
void
*
p_e
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_lengths
,
const
std
::
array
<
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>&
ds_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_lengths
,
const
std
::
array
<
long_index_t
,
NDimSpatial
+
3
>&
e_g_n_k_wos_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_strides
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
conv_filter_dilations
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
long_index_t
,
NDimSpatial
>&
input_right_pads
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CDEElementwiseOperation
&
cde_element_op
)
override
{
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_lengths_i32
;
std
::
array
<
std
::
array
<
index_t
,
NDimSpatial
+
3
>
,
NumDTensor
>
ds_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths_i32
;
std
::
array
<
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_strides_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
conv_filter_dilations_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_left_pads_i32
;
std
::
array
<
index_t
,
NDimSpatial
>
input_right_pads_i32
;
array_convert
(
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_lengths
);
array_convert
(
a_g_n_c_wis_strides_i32
,
a_g_n_c_wis_strides
);
array_convert
(
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_lengths
);
array_convert
(
b_g_k_c_xs_strides_i32
,
b_g_k_c_xs_strides
);
for
(
index_t
d
=
0
;
d
<
NumDTensor
;
d
++
)
{
array_convert
(
ds_g_n_k_wos_lengths_i32
[
d
],
ds_g_n_k_wos_lengths
[
d
]);
array_convert
(
ds_g_n_k_wos_strides_i32
[
d
],
ds_g_n_k_wos_strides
[
d
]);
}
array_convert
(
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_lengths
);
array_convert
(
e_g_n_k_wos_strides_i32
,
e_g_n_k_wos_strides
);
array_convert
(
conv_filter_strides_i32
,
conv_filter_strides
);
array_convert
(
conv_filter_dilations_i32
,
conv_filter_dilations
);
array_convert
(
input_left_pads_i32
,
input_left_pads
);
array_convert
(
input_right_pads_i32
,
input_right_pads
);
return
std
::
make_unique
<
Argument
>
(
p_a
,
p_b
,
p_ds
,
p_e
,
a_g_n_c_wis_lengths_i32
,
a_g_n_c_wis_strides_i32
,
b_g_k_c_xs_lengths_i32
,
b_g_k_c_xs_strides_i32
,
ds_g_n_k_wos_lengths_i32
,
ds_g_n_k_wos_strides_i32
,
e_g_n_k_wos_lengths_i32
,
e_g_n_k_wos_strides_i32
,
conv_filter_strides_i32
,
conv_filter_dilations_i32
,
input_left_pads_i32
,
input_right_pads_i32
,
1
,
1
,
a_element_op
,
b_element_op
,
cde_element_op
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
0 → 100644
View file @
0eb75e21
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
View file @
0eb75e21
...
...
@@ -57,7 +57,7 @@ struct DeviceImageToColumnImpl
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
using
GemmToConv
FwdTransformer
=
using
ConvToGemm
FwdTransformer
=
TransformConvFwdToGemm
<
NDimSpatial
,
ConvolutionForwardSpecialization
::
Default
>
;
static
constexpr
auto
matrix_padder
=
...
...
@@ -97,7 +97,7 @@ struct DeviceImageToColumnImpl
b_g_k_c_xs_lengths
[
I2
]
=
C
;
c_g_n_k_wos_lengths
[
I1
]
=
N
;
GemmToConv
FwdTransformer
conv_to_gemm_transformer
{
a_g_n_c_wis_lengths
,
ConvToGemm
FwdTransformer
conv_to_gemm_transformer
{
a_g_n_c_wis_lengths
,
image_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
{},
// not needed for A Descriptor
...
...
include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp
View file @
0eb75e21
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -19,7 +19,7 @@ namespace device {
template
<
index_t
Rank
,
int
NumReduceDim
>
std
::
pair
<
long_index_t
,
long_index_t
>
get_2d_lengths
(
const
std
::
vector
<
index_t
>&
inLengths
)
{
static_assert
(
Rank
<=
6
,
"bigger Rank size not supported!"
);
static_assert
(
Rank
<=
12
,
"bigger Rank size not supported!"
);
long_index_t
invariant_total_length
=
1
;
long_index_t
reduce_total_length
=
1
;
...
...
@@ -38,7 +38,7 @@ std::pair<long_index_t, long_index_t> get_2d_lengths(const std::vector<index_t>&
template
<
index_t
Rank
,
int
NumReduceDim
>
std
::
pair
<
long_index_t
,
long_index_t
>
get_2d_lengths
(
const
std
::
array
<
index_t
,
Rank
>&
inLengths
)
{
static_assert
(
Rank
<=
6
,
"bigger Rank size not supported!"
);
static_assert
(
Rank
<=
12
,
"bigger Rank size not supported!"
);
long_index_t
invariant_total_length
=
1
;
long_index_t
reduce_total_length
=
1
;
...
...
include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp
View file @
0eb75e21
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -51,7 +51,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InDataType,
PropagateNan
,
OutputIndex
>
{
static_assert
(
Rank
<=
6
,
"Bigger Rank size is not supported!"
);
static_assert
(
Rank
<=
12
,
"Bigger Rank size is not supported!"
);
static_assert
(
BlockSize
==
MThreadClusterSize
*
KThreadClusterSize
,
"Invalid thread cluster size assignments!"
);
...
...
include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp
View file @
0eb75e21
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -47,7 +47,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InDataType,
OutputIndex
>
{
static_assert
(
Rank
<=
6
,
"Bigger Rank size is not supported!"
);
static_assert
(
Rank
<=
12
,
"Bigger Rank size is not supported!"
);
static_assert
(((
InSrcVectorDim
==
0
&&
MThreadSliceSize
%
InSrcVectorSize
==
0
)
||
(
InSrcVectorDim
==
1
&&
KThreadSliceSize
%
InSrcVectorSize
==
0
))
&&
...
...
include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise_multi_d.hpp
View file @
0eb75e21
...
...
@@ -45,7 +45,7 @@ struct DeviceReduceThreadWiseMultiD : public DeviceReduceMultiD<InDataType,
OutElementwiseOperation
>
{
static_assert
(
Rank
<=
6
,
"Bigger Rank size is not supported!"
);
static_assert
(
Rank
<=
12
,
"Bigger Rank size is not supported!"
);
static_assert
(((
InSrcVectorDim
==
0
&&
MThreadSliceSize
%
InSrcVectorSize
==
0
)
||
(
InSrcVectorDim
==
1
&&
KThreadSliceSize
%
InSrcVectorSize
==
0
))
&&
...
...
Prev
1
2
3
4
5
6
7
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment