Commit 438138c0 authored by wangshaojie6
Browse files

Use a 256x256x32 tile size (MPerBlock = 256, NPerBlock = 256, K0PerBlock * K1 = 4 * 8 = 32)

parent ebf3d70b
...@@ -45,13 +45,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device:: ...@@ -45,13 +45,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
OutElementOp, // OutElementwiseOperation OutElementOp, // OutElementwiseOperation
256, // BlockSize 256, // BlockSize
256, // MPerBlock 256, // MPerBlock
128, // NPerBlock 256, // NPerBlock
4, // K0PerBlock 4, // K0PerBlock
8, // K1 8, // K1
32, // MPerXdl 32, // MPerXdl
32, // NPerXdl 32, // NPerXdl
4, // MXdlPerWave 4, // MXdlPerWave
2, // NXdlPerWave 4, // NXdlPerWave
S<1, 4, 32, 2>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<1, 4, 32, 2>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<0, 3, 1, 2>, // ABlockTransferThreadClusterArrangeOrder S<0, 3, 1, 2>, // ABlockTransferThreadClusterArrangeOrder
S<0, 2, 1, 3>, // ABlockTransferSrcAccessOrder S<0, 2, 1, 3>, // ABlockTransferSrcAccessOrder
...@@ -59,12 +59,12 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device:: ...@@ -59,12 +59,12 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
8, // ABlockTransferSrcScalarPerVector 8, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_K1 4, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM true, // ABlockLdsAddExtraM
S<1, 4, 16, 4>, // BBlockTransferThreadClusterLengths_K0_N_K1 S<1, 4, 32, 2>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<0, 3, 1, 2>, // BBlockTransferThreadClusterArrangeOrder S<0, 3, 1, 2>, // BBlockTransferThreadClusterArrangeOrder
S<0, 2, 1, 3>, // BBlockTransferSrcAccessOrder S<0, 2, 1, 3>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim 2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector 8, // BBlockTransferSrcScalarPerVector
2, // BBlockTransferDstScalarPerVector_K1 4, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN true, // BBlockLdsAddExtraN
1, // CShuffleMXdlPerWavePerShuffle 1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle 1, // CShuffleNXdlPerWavePerShuffle
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#ifdef CK_USE_LAUNCH_BOUNDS #ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256 #define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 2 #define CK_MIN_BLOCK_PER_CU 1
#endif #endif
// check GPU target // check GPU target
...@@ -107,7 +107,7 @@ ...@@ -107,7 +107,7 @@
// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from // experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
// pointer of scalar // pointer of scalar
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 1 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
// experimental feature: use __builtin_memcpy instead of union to do bit_cast // experimental feature: use __builtin_memcpy instead of union to do bit_cast
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment