Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
8346af9c
Unverified
Commit
8346af9c
authored
May 10, 2024
by
Bartłomiej Kocot
Committed by
GitHub
May 10, 2024
Browse files
Change output gemm type to AccDataType in two stage conv bwd wei (#1283)
parent
a0ae1c61
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
19 additions
and
9 deletions
+19
-9
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
...evice_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
+16
-9
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
.../device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
+1
-0
test/grouped_convnd_bwd_weight/test_grouped_conv_bwd_weight_xdl_bilinear.cpp
..._bwd_weight/test_grouped_conv_bwd_weight_xdl_bilinear.cpp
+2
-0
No files found.
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
View file @
8346af9c
...
...
@@ -197,6 +197,12 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
K0PerBlock
,
ConvBackwardWeightSpecialization
>
{};
static
constexpr
index_t
MaxScalarPerVectorFP32
=
4
;
static
constexpr
index_t
WorkspaceInOutScalarPerVector
=
is_same_v
<
AccDataType
,
float
>
?
math
::
min
(
CBlockTransferScalarPerVector_NWaveNPerXdl
,
MaxScalarPerVectorFP32
)
:
CBlockTransferScalarPerVector_NWaveNPerXdl
;
// Bytes per 32 lds bank: 32 * 4 bytes
static
constexpr
auto
BankLength
=
128
;
static
constexpr
auto
ElePerBank
=
BankLength
/
sizeof
(
ADataType
);
...
...
@@ -297,7 +303,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
ADataType
,
BDataType
,
AccDataType
,
E
DataType
,
Acc
DataType
,
InMemoryDataOperationEnum
::
AtomicAdd
,
AGridDesc_K0_M_K1
,
BGridDesc_K0_N_K1
,
...
...
@@ -337,7 +343,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
BBlockLdsN1Padding
,
CShuffleMXdlPerWavePerShuffle
,
CShuffleNXdlPerWavePerShuffle
,
CBlockTransfer
ScalarPerVector
_NWaveNPerXdl
,
WorkspaceInOut
ScalarPerVector
,
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
,
true
,
true
,
...
...
@@ -349,7 +355,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
static
constexpr
auto
MakeElementwiseInputSequence
()
{
return
generate_sequence_v2
(
[
&
](
auto
)
constexpr
{
return
Number
<
CBlockTransfer
ScalarPerVector
_NWaveNPerXdl
>
{};
},
[
&
](
auto
)
constexpr
{
return
Number
<
WorkspaceInOut
ScalarPerVector
>
{};
},
Number
<
NumDTensor
+
1
>
{});
}
...
...
@@ -499,7 +505,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
using
DsGridDesc_M_N
=
decltype
(
MakeDsGridDescriptor_M_N
<
NDimSpatial
>
({},
{}));
using
CDGridDesc_M_N
=
decltype
(
concat_tuple
(
Tuple
<
CGridDesc_M_N
>
{},
DsGridDesc_M_N
{}));
using
DsGridPointerTuple
=
decltype
(
GetDsGridPointerTuple
());
using
CDDataTypes
=
decltype
(
concat_tuple
(
Tuple
<
const
E
DataType
*>
{},
DsGridPointerTuple
{}));
using
CDDataTypes
=
decltype
(
concat_tuple
(
Tuple
<
const
Acc
DataType
*>
{},
DsGridPointerTuple
{}));
using
EGridDesc_M_N
=
CGridDesc_M_N
;
static
constexpr
index_t
ClusterLengthMPerBlock
=
CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
::
At
(
1
);
...
...
@@ -659,7 +665,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
std
::
size_t
GetWorkspaceSizeBytes
()
const
{
return
sizeof
(
E
DataType
)
*
ce_grid_desc_m_n_
.
GetElementSpaceSize
()
*
Conv_G_
;
return
sizeof
(
Acc
DataType
)
*
ce_grid_desc_m_n_
.
GetElementSpaceSize
()
*
Conv_G_
;
}
const
ADataType
*
p_a_grid_
;
...
...
@@ -738,7 +744,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
const
bool
has_main_k0_block_loop
=
GridwiseGemm
::
CalculateHasMainK0BlockLoop
(
K0
);
auto
launch_gemm_kernel
=
[
&
](
auto
has_main_k_block_loop
)
{
E
DataType
*
p_c_grid
=
type_convert
<
E
DataType
*>
(
arg
.
p_workspace_
);
Acc
DataType
*
p_c_grid
=
type_convert
<
Acc
DataType
*>
(
arg
.
p_workspace_
);
const
index_t
grid_size
=
arg
.
block_2_ctile_map_
.
CalculateGridSize
(
arg
.
ce_grid_desc_m_n_
)
*
arg
.
Conv_G_
;
...
...
@@ -753,7 +759,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
GridwiseGemm
,
ADataType
,
BDataType
,
E
DataType
,
Acc
DataType
,
OutElementwiseOperation
,
InElementwiseOperation
,
element_wise
::
PassThrough
,
...
...
@@ -786,7 +792,7 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
};
auto
launch_elementwise_kernel
=
[
&
]()
{
const
E
DataType
*
p_c_grid
=
type_convert
<
const
E
DataType
*>
(
arg
.
p_workspace_
);
const
Acc
DataType
*
p_c_grid
=
type_convert
<
const
Acc
DataType
*>
(
arg
.
p_workspace_
);
const
index_t
grid_size
=
arg
.
elementwise_block_2_ctile_map_
.
CalculateGridSize
(
arg
.
ce_grid_desc_m_n_
)
*
arg
.
Conv_G_
;
...
...
@@ -907,7 +913,8 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
}
// vector store C matrix into global memory
if
(
!
(
arg
.
Conv_C_
%
CBlockTransferScalarPerVector_NWaveNPerXdl
==
0
))
if
(
!
(
arg
.
Conv_C_
%
CBlockTransferScalarPerVector_NWaveNPerXdl
==
0
&&
arg
.
Conv_C_
%
WorkspaceInOutScalarPerVector
==
0
))
{
return
false
;
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
View file @
8346af9c
...
...
@@ -86,6 +86,7 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_bilinear_instances = std:
//#########################################| Spatial| | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl|
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| |
// generic instance
DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
Tuple
<
BLayout
>
,
F16
,
F16
,
F16
,
F32
,
Tuple
<
F16
>
,
PassThrough
,
Bilinear
,
PassThrough
,
ConvSpec
,
64
,
64
,
64
,
4
,
8
,
32
,
32
,
2
,
2
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
1
,
4
,
true
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
1
,
4
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
>
,
DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
Tuple
<
BLayout
>
,
F16
,
F16
,
F16
,
F32
,
Tuple
<
F16
>
,
PassThrough
,
Bilinear
,
PassThrough
,
ConvSpec
,
64
,
64
,
64
,
4
,
8
,
32
,
32
,
2
,
2
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
2
,
4
,
true
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
2
,
4
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
2
>
,
// instance for small conv.K
// for fp16 conv.K and conv.C must be divisible by 2
...
...
test/grouped_convnd_bwd_weight/test_grouped_conv_bwd_weight_xdl_bilinear.cpp
View file @
8346af9c
...
...
@@ -264,5 +264,7 @@ TYPED_TEST(TestGroupedConvndBwdWeight3d, Test3D)
{
3
,
1
,
1
,
64
,
3
,
{
3
,
3
,
3
},
{
32
,
32
,
32
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
}});
this
->
conv_params
.
push_back
(
{
3
,
1
,
1
,
1
,
1
,
{
3
,
3
,
3
},
{
32
,
32
,
32
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
}});
this
->
conv_params
.
push_back
(
{
3
,
1
,
1
,
4
,
4
,
{
3
,
3
,
3
},
{
14
,
28
,
28
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
},
{
1
,
1
,
1
}});
this
->
Run
();
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment