merge

8fb97941 · root · 666bdad1 · 6c2a3a95 · 8fb97941 · 8fb97941
Commit 8fb97941 authored Mar 18, 2021 by root
5 changed files
--- a/composable_kernel/include/gridwise_operation_wrapper.hpp
+++ b/composable_kernel/include/gridwise_operation_wrapper.hpp
@@ -3,10 +3,10 @@
 template <typename GridwiseOp, typename... Xs>
 __global__ void
-#if 1
+#if 0
    __launch_bounds__(256, 2)
 #endif
-        run_gridwise_operation(Xs... xs)
+run_gridwise_operation(Xs... xs)
 {
    GridwiseOp{}.Run(xs...);
 }

--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -37,7 +37,7 @@
 #endif
 #ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
-#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0
+#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
 #endif
 #ifndef CK_USE_AMD_V_FMAC_F32
@@ -74,7 +74,7 @@
 // experimental implementation
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
-#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
 #endif
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -68,7 +68,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
    const auto in_right_pads  = sequence_to_tuple_of_number(InRightPads{});
 #endif
-#if 1
+#if 0
    // cdata = 16, BlockSize = 64, 16x64x4
    constexpr index_t BlockSize = 64;
@@ -101,6 +101,72 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 2;
+#elif 0
+    // cdata = 32, BlockSize 64, 16x128x4
+    constexpr index_t BlockSize = 64;
+    constexpr index_t GemmMPerBlock = 16;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+    constexpr index_t GemmMPerThread = 2;
+    constexpr index_t GemmNPerThread = 4;
+    constexpr index_t GemmKPerThread = 1;
+    constexpr index_t GemmMLevel0Cluster = 2;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 8;
+    constexpr index_t ThreadGemmDataPerReadM = 2;
+    constexpr index_t ThreadGemmDataPerReadN = 4;
+    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<1, 1>;
+    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
+    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<4, 2>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 4;
+#elif 1
+    // cdata = 64, BlockSize 64, 16x256x4
+    constexpr index_t BlockSize = 64;
+    constexpr index_t GemmMPerBlock = 16;
+    constexpr index_t GemmNPerBlock = 256;
+    constexpr index_t GemmKPerBlock = 4;
+    constexpr index_t GemmMPerThread = 4;
+    constexpr index_t GemmNPerThread = 4;
+    constexpr index_t GemmKPerThread = 1;
+    constexpr index_t GemmMLevel0Cluster = 2;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 1;
+    constexpr index_t GemmNLevel1Cluster = 16;
+    constexpr index_t ThreadGemmDataPerReadM = 4;
+    constexpr index_t ThreadGemmDataPerReadN = 4;
+    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<1, 1>;
+    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
+    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<4, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 4;
 #elif 0
    // cdata = 16, BlockSize = 64, 16x64x4
    // GemmBBlockCopySrcDataPerRead_GemmN = 4

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
@@ -112,7 +112,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
-#if 1
+#if 0
    // cdata = 16, BlockSize = 64, 16x64x4
    constexpr index_t BlockSize = 64;
@@ -211,7 +211,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmM1 = 4;
-#elif 0
+#elif 1
    // cdata = 64, BlockSize = 64, 16x256x4
    constexpr index_t BlockSize = 64;

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -16,7 +16,6 @@
 #include "device_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
 int main(int argc, char* argv[])
@@ -65,6 +64,20 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 4;
+    constexpr index_t HI = 1080;
+    constexpr index_t WI = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
 #elif 1
    constexpr index_t N  = 1;
    constexpr index_t C  = 4;
@@ -724,6 +737,23 @@ int main(int argc, char* argv[])
         RightPads{},
         nrepeat);
 #elif 1
+    device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk<in_data_t,
+                                                                         in_vector_size,
+                                                                         acc_data_t,
+                                                                         out_data_t>
+        (in_nchw_desc,
+         in_nchw,
+         wei_kcyx_desc,
+         wei_kcyx,
+         out_nkhw_desc,
+         out_nkhw_device,
+         ConvStrides{},
+         ConvDilations{},
+         LeftPads{},
+         RightPads{},
+         nrepeat);
+#elif 1
    device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                                         in_nchw,
                                                                         wei_kcyx_desc,