Commit e4b77dcf authored by Jing Zhang's avatar Jing Zhang
Browse files

testing

parent 58ee3f13
...@@ -158,7 +158,7 @@ transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad( ...@@ -158,7 +158,7 @@ transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad(
constexpr auto wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks = constexpr auto wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks =
Sequence<0, 0, 0, 0, 0>{}; Sequence<0, 0, 0, 0, 0>{};
#if 0 #if 1
// hack to control index calculation when iterating over in_gemmk0_gemmn_gemmk1_global tensor // hack to control index calculation when iterating over in_gemmk0_gemmn_gemmk1_global tensor
constexpr auto in_gemmk0_gemmn_gemmk1_global_iterator_hacks = constexpr auto in_gemmk0_gemmn_gemmk1_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
......
...@@ -211,7 +211,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1 ...@@ -211,7 +211,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
Sequence<1, MRepeat, 1, KPack>, Sequence<1, MRepeat, 1, KPack>,
Sequence<0, 1, 2, 3>, Sequence<0, 1, 2, 3>,
3, 3,
1, // KPack, KPack,
1>; 1>;
using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB, using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB,
...@@ -221,7 +221,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1 ...@@ -221,7 +221,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
Sequence<1, NRepeat, 1, KPack>, Sequence<1, NRepeat, 1, KPack>,
Sequence<0, 1, 2, 3>, Sequence<0, 1, 2, 3>,
3, 3,
1, // KPack, KPack,
1>; 1>;
AThreadCopy a_thread_copy_; AThreadCopy a_thread_copy_;
......
...@@ -141,7 +141,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1 ...@@ -141,7 +141,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
{ {
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{ {
constexpr auto max_lds_align = KPack; constexpr auto max_lds_align = Number<KPack>{};
// A matrix in LDS memory, dst of blockwise copy // A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
...@@ -204,7 +204,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1 ...@@ -204,7 +204,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
// lds max alignment // lds max alignment
constexpr auto max_lds_align = KPack; constexpr auto max_lds_align = Number<KPack>{};
// A matrix in LDS memory, dst of blockwise copy // A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
...@@ -229,11 +229,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1 ...@@ -229,11 +229,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
decltype(a_k0_m_k1_global_desc), decltype(a_k0_m_k1_global_desc),
decltype(a_k0_m_k1_block_desc), decltype(a_k0_m_k1_block_desc),
ABlockTransferSrcAccessOrder, ABlockTransferSrcAccessOrder,
Sequence<2, 0, 1>, Sequence<1, 0, 2>,
2, // ABlockTransferSrcVectorDim, ABlockTransferSrcVectorDim,
2, 2,
1, // ABlockTransferSrcScalarPerVector, ABlockTransferSrcScalarPerVector,
1, // ABlockTransferDstScalarPerVector_KPack, ABlockTransferDstScalarPerVector_KPack,
1, 1,
1, 1,
AThreadTransferSrcResetCoordinateAfterRun, AThreadTransferSrcResetCoordinateAfterRun,
...@@ -256,11 +256,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1 ...@@ -256,11 +256,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
decltype(b_k0_n_k1_global_desc), decltype(b_k0_n_k1_global_desc),
decltype(b_k0_n_k1_block_desc), decltype(b_k0_n_k1_block_desc),
BBlockTransferSrcAccessOrder, BBlockTransferSrcAccessOrder,
Sequence<2, 0, 1>, Sequence<1, 0, 2>,
1, // BBlockTransferSrcVectorDim, BBlockTransferSrcVectorDim,
2, 2,
1, // BBlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector,
1, // BBlockTransferDstScalarPerVector_KPack, BBlockTransferDstScalarPerVector_KPack,
1, 1,
1, 1,
BThreadTransferSrcResetCoordinateAfterRun, BThreadTransferSrcResetCoordinateAfterRun,
...@@ -282,8 +282,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1 ...@@ -282,8 +282,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
NPerBlock % (NPerWave * NRepeat) == 0, NPerBlock % (NPerWave * NRepeat) == 0,
"wrong!"); "wrong!");
static_assert(KPack == 1, "");
constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor( constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
a_k0_m_k1_block_desc, a_k0_m_k1_block_desc,
make_tuple(make_pass_through_transform(Number<KPerBlock>{}), make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
......
...@@ -61,7 +61,7 @@ struct integer_divide_ceiler ...@@ -61,7 +61,7 @@ struct integer_divide_ceiler
{ {
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type"); static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
return (a + b - 1) / b; return (a + b - Number<1>{}) / b;
} }
}; };
...@@ -74,7 +74,7 @@ __host__ __device__ constexpr auto integer_divide_floor(X x, Y y) ...@@ -74,7 +74,7 @@ __host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
template <class X, class Y> template <class X, class Y>
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y) __host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
{ {
return (x + y - 1) / y; return (x + y - Number<1>{}) / y;
} }
template <class X, class Y> template <class X, class Y>
......
...@@ -79,10 +79,38 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw ...@@ -79,10 +79,38 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{}); const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif #endif
#if 1
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128; constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128; constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmKPack = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#else
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 16; constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerWave = 64; constexpr index_t GemmMPerWave = 64;
...@@ -90,21 +118,22 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw ...@@ -90,21 +118,22 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
constexpr index_t GemmKPack = 1; constexpr index_t GemmKPack = 1;
constexpr index_t MRepeat = 1; constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 1; constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 2, GemmKPack>; using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 2, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>; using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1; constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 1; constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<2, 4, GemmKPack>; using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 4, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<8, 32, 1>; using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1; constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 1; constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1; constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#endif
const auto descs = const auto descs =
transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad<TInWei, transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad<TInWei,
...@@ -152,7 +181,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw ...@@ -152,7 +181,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<0, 2, 1>, Sequence<0, 2, 1>,
Sequence<0, 2, 1>, Sequence<1, 0, 2>,
1, 1,
GemmBBlockTransferSrcScalarPerVector_GemmN, GemmBBlockTransferSrcScalarPerVector_GemmN,
GemmBBlockTransferDstScalarPerVector_KPack, GemmBBlockTransferDstScalarPerVector_KPack,
......
...@@ -24,7 +24,7 @@ int main(int argc, char* argv[]) ...@@ -24,7 +24,7 @@ int main(int argc, char* argv[])
{ {
using namespace ck; using namespace ck;
#if 1 #if 0
constexpr index_t N = 256; constexpr index_t N = 256;
constexpr index_t C = 256; constexpr index_t C = 256;
constexpr index_t HI = 16; constexpr index_t HI = 16;
......
...@@ -10,7 +10,7 @@ cmake ...@@ -10,7 +10,7 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="AMD" \ -D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \ -D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \ -D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment