using 256x256x32 tile size

438138c0 · wangshaojie6 · ebf3d70b · 438138c0 · 438138c0
Commit 438138c0 authored May 04, 2022 by wangshaojie6
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 6 additions and 6 deletions

example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp +4 -4

include/ck/config.hpp +2 -2

No files found.
--- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
@@ -45,13 +45,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
        OutElementOp,                     // OutElementwiseOperation
        256,                              // BlockSize
        256,                              // MPerBlock
-        128,                              // NPerBlock
+        256,                              // NPerBlock
        4,                                // K0PerBlock
        8,                                // K1
        32,                               // MPerXdl
        32,                               // NPerXdl
        4,                                // MXdlPerWave
-        2,                                // NXdlPerWave
+        4,                                // NXdlPerWave
        S<1, 4, 32, 2>,                   // ABlockTransferThreadClusterLengths_K0_M_K1
        S<0, 3, 1, 2>,                    // ABlockTransferThreadClusterArrangeOrder
        S<0, 2, 1, 3>,                    // ABlockTransferSrcAccessOrder
@@ -59,12 +59,12 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
        8,                                // ABlockTransferSrcScalarPerVector
        4,                                // ABlockTransferDstScalarPerVector_K1
        true,                             // ABlockLdsAddExtraM
-        S<1, 4, 16, 4>,                   // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 4, 32, 2>,                   // BBlockTransferThreadClusterLengths_K0_N_K1
        S<0, 3, 1, 2>,                    // BBlockTransferThreadClusterArrangeOrder
        S<0, 2, 1, 3>,                    // BBlockTransferSrcAccessOrder
        2,                                // BBlockTransferSrcVectorDim
        8,                                // BBlockTransferSrcScalarPerVector
-        2,                                // BBlockTransferDstScalarPerVector_K1
+        4,                                // BBlockTransferDstScalarPerVector_K1
        true,                             // BBlockLdsAddExtraN
        1,                                // CShuffleMXdlPerWavePerShuffle
        1,                                // CShuffleNXdlPerWavePerShuffle

--- a/include/ck/config.hpp
+++ b/include/ck/config.hpp
@@ -15,7 +15,7 @@
 #ifdef CK_USE_LAUNCH_BOUNDS
 #define CK_MAX_THREAD_PER_BLOCK 256
-#define CK_MIN_BLOCK_PER_CU 2
+#define CK_MIN_BLOCK_PER_CU 1
 #endif
 // check GPU target
@@ -107,7 +107,7 @@
 // experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
 // pointer of scalar
-#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 1
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
 // experimental feature: use __builtin_memcpy instead of union to do bit_cast
 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1