temp save

553cfa68 · aska-0096 · a8687138 · 553cfa68
Commit 553cfa68 authored Feb 22, 2023 by aska-0096
Hide whitespace changes
Inline Side-by-side

Showing with 128 additions and 6 deletions

example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +128 -6

No files found.
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
@@ -35,6 +35,129 @@ using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayo
 // clang-format off
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstances = std::tuple<
+// Instances provided to AIT (Fp16)
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
+        512, 16, 4, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        4, 1,           // MRepeat x NRepeat
+        S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<4, 16, 4>,  S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 
+        2, 1, S<1, 256, 1, 1>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
+        256, 64, 8, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        2, 4,           // MRepeat x NRepeat
+        S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<4, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        2, 2, S<1, 256, 1, 1>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
+        256, 64, 4, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        2, 4,           // MRepeat x NRepeat
+        S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<4, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        2, 2, S<1, 256, 1, 1>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
+        256, 128, 4, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        4, 4,           // MRepeat x NRepeat
+        S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<4, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        4, 2, S<1, 256, 1, 1>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
+        256, 128, 8, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        4, 4,           // MRepeat x NRepeat
+        S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<4, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        4, 2, S<1, 256, 1, 1>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
+        128, 256, 4, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        4, 4,           // MRepeat x NRepeat
+        S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<4, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        4, 2, S<1, 128, 1, 2>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
+        128, 256, 8, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        4, 4,           // MRepeat x NRepeat
+        S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<4, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        4, 2, S<1, 128, 1, 2>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 192,         // BlockSize
+        192, 48, 6, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        6, 1,           // MRepeat x NRepeat
+        S<1, 192, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<2, 48, 2>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 4, true, 
+        3, 1, S<1, 96, 1, 2>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 128,         // BlockSize
+        128, 64, 8, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        2, 4,           // MRepeat x NRepeat
+        S<1, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<2, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        2, 2, S<1, 128, 1, 1>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 96,         // BlockSize
+        96, 48, 8, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        6, 1,           // MRepeat x NRepeat
+        S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<2, 48, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        3, 1, S<1, 48, 1, 2>, 8>,
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
+        NDimSpatial, 
+        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
+        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
+        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 64,         // BlockSize
+        64, 64, 8, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
+        16, 16,         // MPerWMMA x NPerWMMA
+        4, 2,           // MRepeat x NRepeat
+        S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+        S<1, 64, 1>,  S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 
+        4, 1, S<1, 64, 1, 1>, 8>
+#if 0
    // GEMM_N = 16
    // K0 = 8
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
@@ -53,12 +176,12 @@ using DeviceConvFwdInstances = std::tuple<
        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
        InKernelDataType, WeiKernelDataType, ck::Tuple<BiasKernelDataType, ResidualKernelDataType>, OutKernelDataType, AccDataType, CShuffleDataType,
        InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmSpec, 256,         // BlockSize
-        1024, 16, 3, 16, // MPerBlock x NPerBlock x K0PerBlock x K1
+        1024, 16, 2, 8, // MPerBlock x NPerBlock x K0PerBlock x K1
        16, 16,         // MPerWMMA x NPerWMMA
        8, 1,           // MRepeat x NRepeat
-        S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
+        S<1, 256, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
-        S<1, 16, 8>,  S<1, 0, 2>, S<1, 0, 2>, 2,  2,  2, true, 
+        S<2, 16, 4>,  S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 
-        8, 1, S<1, 256, 1, 1>, 16>,
+        4, 1, S<1, 256, 1, 1>, 8>,
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
        NDimSpatial, 
        InputLayout<NDimSpatial>, WeightLayout<NDimSpatial>, ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>, OutputLayout<NDimSpatial>, 
@@ -103,7 +226,6 @@ using DeviceConvFwdInstances = std::tuple<
        S<1, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
        S<1, 16, 4>,  S<1, 0, 2>, S<1, 0, 2>, 2,  4,  4, true, 
        2, 1, S<1, 64, 1, 1>, 16>
-        /*
    // K0 = 2
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<
        NDimSpatial, 
@@ -1509,7 +1631,7 @@ using DeviceConvFwdInstances = std::tuple<
        S<1, 96, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, // ABlockTransferThreadClusterLengths_AK0_M_AK1, ArrangeOrder, SrcAccessOrder, VectorDim, SrcScalarPerVector, DstScalarPerVector_AK1, LdsExtraM
        S<1, 48, 2>, S<1, 0, 2>, S<1, 0, 2>, 2,  8,  8, true, 
        6, 1, S<1, 32, 1, 3>, 16>
-        */
+#endif
        >;
 // clang-format on