Commit df7a58bc authored by aska-0096's avatar aska-0096
Browse files

Add All instances

parent a36ceb6d
...@@ -32,53 +32,1417 @@ using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;
template <ck::index_t NDimSpatial>
using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;
// clang-format off
template <ck::index_t NDimSpatial> template <ck::index_t NDimSpatial>
using DeviceConvFwdInstance = using DeviceConvFwdInstances = std::tuple<
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle< // GEMM_N = 16
NDimSpatial, // K0 = 2
InputLayout<NDimSpatial>, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
WeightLayout<NDimSpatial>, NDimSpatial,
ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
OutputLayout<NDimSpatial>, InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InKernelDataType, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
WeiKernelDataType, 256, 16, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, 16, 16, // MPerWMMA x NPerWMMA
OutKernelDataType, 2, 1, // MRepeat x NRepeat
AccDataType, S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
CShuffleDataType, S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
InElementOp, 1, 1, S<1, 256, 1, 1>, 16>,
WeiElementOp, ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
OutElementOp, NDimSpatial,
ConvSpec, // ConvForwardSpecialization InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
GemmSpec, // GemmSpecialization InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
128, // BlockSize InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, // MPerBlock 128, 16, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, // NPerBlock 16, 16, // MPerWMMA x NPerWMMA
2, // K0PerBlock 2, 1, // MRepeat x NRepeat
16, // K1 S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
16, // MPerWMMA S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
16, // NPerWMMA 1, 1, S<1, 128, 1, 1>, 16>,
2, // MRepeat ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
1, // NRepeat NDimSpatial,
S<1, 128, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
S<1, 0, 2>, // ABlockTransferSrcAccessOrder InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
2, // ABlockTransferSrcVectorDim 64, 16, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, // ABlockTransferSrcScalarPerVector 16, 16, // MPerWMMA x NPerWMMA
16, // ABlockTransferDstScalarPerVector_AK1 2, 1, // MRepeat x NRepeat
true, // ABlockLdsExtraM S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 S<1, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder 1, 1, S<1, 64, 1, 1>, 16>,
S<1, 0, 2>, // BBlockTransferSrcAccessOrder // K0 = 3
2, // BBlockTransferSrcVectorDim ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
2, // BBlockTransferSrcScalarPerVector NDimSpatial,
2, // BBlockTransferDstScalarPerVector_BK1 InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
true, // BBlockLdsExtraN InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
1, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
1, 256, 16, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
S<1, 32, 1, 4>, 16, 16, // MPerWMMA x NPerWMMA
4>; 2, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 256, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 16, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<3, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 16, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 16, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 1>, 16>,
// K0 = 4
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 16, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 256, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 16, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 16, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 1>, 16>,
// K0 = 5
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 16, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 256, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 16, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 16, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 1>, 16>,
// K0 = 6
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 16, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 256, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 16, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 16, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 1>, 16>,
// K0 = 7
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 16, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 256, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 16, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 16, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 1>, 16>,
// K0 = 8
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 16, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 256, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 16, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 1>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 16, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 1>, 16>,
// GEMM_N=32, Wave N = 1
// K0 = 2
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 3
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<3, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 96, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 4
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 16, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 5
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 6
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 7
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 8
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 1, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// GEMM_N=32, Wave N = 1
// K0 = 2
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 3
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<3, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 4
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 5
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 6
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 7
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// K0 = 8
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 32, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true,
1, 1, S<1, 128, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 32, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 2>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 32, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 32, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 2>, 16>,
// GEMM_N=64, Wave N = 1
// K0 = 2
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 3
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<3, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 4
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 5
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 6
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 7
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 8
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
2, 4, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// GEMM_N = 64, WaveN = 2
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 3
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<3, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 4
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 5
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 6
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 7
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
// K0 = 8
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256, // BlockSize
256, 64, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128, // BlockSize
128, 64, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8 , 8, true,
1, 1, S<1, 32, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64, // BlockSize
64, 64, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
4, 2, // MRepeat x NRepeat
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true,
1, 1, S<1, 16, 1, 4>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 48, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96, // BlockSize
96, 48, 2, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 48, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96, // BlockSize
96, 48, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 48, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96, // BlockSize
96, 48, 4, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 48, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96, // BlockSize
96, 48, 5, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 48, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96, // BlockSize
96, 48, 6, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 48, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96, // BlockSize
96, 48, 7, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192, // BlockSize
192, 48, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true,
1, 1, S<1, 64, 1, 3>, 16>,
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
NDimSpatial,
InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>,
InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96, // BlockSize
96, 48, 8, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
16, 16, // MPerWMMA x NPerWMMA
6, 1, // MRepeat x NRepeat
S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
1, 1, S<1, 32, 1, 3>, 16>
>;
// clang-format on
template <ck::index_t NDimSpatial> template <ck::index_t NDimSpatial>
using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial, using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InUserDataType, InUserDataType,
...@@ -182,7 +1546,14 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config, ...@@ -182,7 +1546,14 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
copy(conv_param.input_right_pads_, input_right_pads); copy(conv_param.input_right_pads_, input_right_pads);
// do Conv // do Conv
auto conv = DeviceConvFwdInstance<NDimSpatial>{}; float best_perf = .0;
float best_time = .0;
std::string best_kernel= "";
ck::static_for<0, std::tuple_size_v<DeviceConvFwdInstances<NDimSpatial>>, 1>{}([&](auto i) {
const auto device_conv_fwd_instance = std::get<i>(DeviceConvFwdInstances<NDimSpatial>{});
using DeviceConvFwdInstance = ck::remove_cvref_t<decltype(device_conv_fwd_instance)>;
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker(); auto invoker = conv.MakeInvoker();
auto argument = auto argument =
conv.MakeArgument(in_device_buf.GetDeviceBuffer(), conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
...@@ -225,8 +1596,15 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config, ...@@ -225,8 +1596,15 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl; << conv.GetTypeString() << std::endl;
if(tflops > best_perf){
best_perf = tflops;
best_time = avg_time * 1000;
best_kernel = conv.GetTypeString();
}
if(config.do_verification) if(config.do_verification)
{ {
#if 0
Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc); Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
auto ref_conv = HostConvFwdInstance<NDimSpatial>{}; auto ref_conv = HostConvFwdInstance<NDimSpatial>{};
...@@ -251,16 +1629,14 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config, ...@@ -251,16 +1629,14 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
out_device_buf.FromDevice(out_device.mData.data()); out_device_buf.FromDevice(out_device.mData.data());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<OutUserDataType> out_device_converted(out_device);
return ck::utils::check_err(
out_device_converted, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
#else
return ck::utils::check_err( return ck::utils::check_err(
out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
#endif #endif
} }
});
std::cout << "--------------------------------------------------------------------------------------------"<<std::endl;
std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time<<" us"<<std::endl;
std::cout << "--------------------------------------------------------------------------------------------"<<std::endl;
return true; return true;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment