"dist/vscode:/vscode.git/clone" did not exist on "7a79845151c044ff3982bc45e2251b7aaf7343d5"
Commit e4b77dcf authored by Jing Zhang's avatar Jing Zhang
Browse files

testing

parent 58ee3f13
......@@ -158,7 +158,7 @@ transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad(
constexpr auto wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks =
Sequence<0, 0, 0, 0, 0>{};
#if 0
#if 1
// hack to control index calculation when iterating over in_gemmk0_gemmn_gemmk1_global tensor
constexpr auto in_gemmk0_gemmn_gemmk1_global_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
......
......@@ -211,7 +211,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
Sequence<1, MRepeat, 1, KPack>,
Sequence<0, 1, 2, 3>,
3,
1, // KPack,
KPack,
1>;
using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB,
......@@ -221,7 +221,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
Sequence<1, NRepeat, 1, KPack>,
Sequence<0, 1, 2, 3>,
3,
1, // KPack,
KPack,
1>;
AThreadCopy a_thread_copy_;
......
......@@ -141,7 +141,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
{
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
constexpr auto max_lds_align = KPack;
constexpr auto max_lds_align = Number<KPack>{};
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
......@@ -204,7 +204,7 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
// lds max alignment
constexpr auto max_lds_align = KPack;
constexpr auto max_lds_align = Number<KPack>{};
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
......@@ -229,11 +229,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
decltype(a_k0_m_k1_global_desc),
decltype(a_k0_m_k1_block_desc),
ABlockTransferSrcAccessOrder,
Sequence<2, 0, 1>,
2, // ABlockTransferSrcVectorDim,
Sequence<1, 0, 2>,
ABlockTransferSrcVectorDim,
2,
1, // ABlockTransferSrcScalarPerVector,
1, // ABlockTransferDstScalarPerVector_KPack,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_KPack,
1,
1,
AThreadTransferSrcResetCoordinateAfterRun,
......@@ -256,11 +256,11 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
decltype(b_k0_n_k1_global_desc),
decltype(b_k0_n_k1_block_desc),
BBlockTransferSrcAccessOrder,
Sequence<2, 0, 1>,
1, // BBlockTransferSrcVectorDim,
Sequence<1, 0, 2>,
BBlockTransferSrcVectorDim,
2,
1, // BBlockTransferSrcScalarPerVector,
1, // BBlockTransferDstScalarPerVector_KPack,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_KPack,
1,
1,
BThreadTransferSrcResetCoordinateAfterRun,
......@@ -282,8 +282,6 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
NPerBlock % (NPerWave * NRepeat) == 0,
"wrong!");
static_assert(KPack == 1, "");
constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
a_k0_m_k1_block_desc,
make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
......
......@@ -61,7 +61,7 @@ struct integer_divide_ceiler
{
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
return (a + b - 1) / b;
return (a + b - Number<1>{}) / b;
}
};
......@@ -74,7 +74,7 @@ __host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
template <class X, class Y>
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
{
return (x + y - 1) / y;
return (x + y - Number<1>{}) / y;
}
template <class X, class Y>
......
......@@ -79,10 +79,38 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 128;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmMPerWave = 64;
constexpr index_t GemmNPerWave = 64;
constexpr index_t GemmKPack = 4;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 1;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<1, 2, 4>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<1, 2, 4>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#else
constexpr index_t BlockSize = 256;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 256;
constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerWave = 64;
......@@ -90,21 +118,22 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
constexpr index_t GemmKPack = 1;
constexpr index_t MRepeat = 1;
constexpr index_t NRepeat = 1;
constexpr index_t NRepeat = 2;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 2, GemmKPack>;
using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 2, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<2, 4, GemmKPack>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<8, 32, 1>;
using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 4, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#endif
const auto descs =
transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad<TInWei,
......@@ -152,7 +181,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
Sequence<0, 2, 1>,
Sequence<0, 2, 1>,
Sequence<1, 0, 2>,
1,
GemmBBlockTransferSrcScalarPerVector_GemmN,
GemmBBlockTransferDstScalarPerVector_KPack,
......
......@@ -24,7 +24,7 @@ int main(int argc, char* argv[])
{
using namespace ck;
#if 1
#if 0
constexpr index_t N = 256;
constexpr index_t C = 256;
constexpr index_t HI = 16;
......
......@@ -10,7 +10,7 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment