"tests/vscode:/vscode.git/clone" did not exist on "0cc3a7a1232cc8725104b458ded08b8af6130d10"
Commit 7db237ba authored by Jing Zhang's avatar Jing Zhang
Browse files

clean

parent 9d6938ff
...@@ -158,8 +158,8 @@ struct GridwiseConvolutionForwardImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw ...@@ -158,8 +158,8 @@ struct GridwiseConvolutionForwardImplicitGemm_v4r4_xdlops_nchw_kcyx_nkhw
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
// gridwise batch-GEMM // gridwise batch-GEMM
// constexpr auto gridwise_gemm = GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2< // constexpr auto gridwise_gemm = GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org<
constexpr auto gridwise_gemm = GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org< constexpr auto gridwise_gemm = GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2<
GridSize, GridSize,
BlockSize, BlockSize,
ABFloat, ABFloat,
......
...@@ -130,7 +130,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org ...@@ -130,7 +130,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
constexpr auto a_g_k_m_kpack_block_desc = make_native_tensor_descriptor_aligned( constexpr auto a_g_k_m_kpack_block_desc = make_native_tensor_descriptor_aligned(
Sequence<1, KPerBlock, MPerBlock, KPack>{}, Number<max_align>{}); Sequence<1, KPerBlock, MPerBlock, KPack>{}, Number<max_align>{});
auto a_blockwise_copy = BlockwiseGenericTensorSliceCopy_v5< auto a_blockwise_copy = BlockwiseGenericTensorSliceCopy_v4<
BlockSize, BlockSize,
decltype(a_g_k_m_kpack_global_desc), decltype(a_g_k_m_kpack_global_desc),
decltype(a_g_k_m_kpack_block_desc), decltype(a_g_k_m_kpack_block_desc),
...@@ -225,14 +225,14 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org ...@@ -225,14 +225,14 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
for(index_t k_block_data_begin = 0; k_block_data_begin < K - KPerBlock; for(index_t k_block_data_begin = 0; k_block_data_begin < K - KPerBlock;
k_block_data_begin += KPerBlock) k_block_data_begin += KPerBlock)
{ {
// ABFloat p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; ABFloat p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
ABFloat p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; ABFloat p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];
// load next data from device mem // load next data from device mem
a_blockwise_copy.MoveSrcSliceWindow(blockwise_a_copy_src_step, True); a_blockwise_copy.MoveSrcSliceWindow(blockwise_a_copy_src_step, True);
b_blockwise_copy.MoveSrcSliceWindow(blockwise_b_copy_src_step, True); b_blockwise_copy.MoveSrcSliceWindow(blockwise_b_copy_src_step, True);
a_blockwise_copy.RunLoadThreadBuffer(p_a_global); a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer);
b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
block_sync_lds(); block_sync_lds();
...@@ -250,7 +250,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org ...@@ -250,7 +250,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
block_sync_lds(); block_sync_lds();
// store next data to LDS // store next data to LDS
a_blockwise_copy.RunStoreThreadBuffer(p_a_block); a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block);
b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block); b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block);
} }
......
...@@ -95,40 +95,6 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -95,40 +95,6 @@ struct ThreadwiseGenericTensorSliceCopy_v5
*reinterpret_cast<SrcData*>(&p_dst[dst_offset]) = src_data; *reinterpret_cast<SrcData*>(&p_dst[dst_offset]) = src_data;
} }
#if 0
template <typename SrcData, index_t SrcDataPerAccess>
struct vector_data_load;
template <>
struct vector_data_load<float, 1>
{
template <typename SrcCoord>
__device__ static auto run(const float* p_src, const SrcCoord src_coord_begin)
{
return load_data<float>(p_src, src_coord_begin.GetOffset());
}
};
template <>
struct vector_data_load<float, 2>
{
template <typename SrcCoord>
__device__ static auto run(const float* p_src, const SrcCoord src_coord_begin)
{
return load_data<float2_t>(p_src, src_coord_begin.GetOffset());
}
};
template <>
struct vector_data_load<float, 4>
{
template <typename SrcCoord>
__device__ static auto run(const float* p_src, const SrcCoord src_coord_begin)
{
return load_data<float4_t>(p_src, src_coord_begin.GetOffset());
}
};
#else
template <index_t SrcDataPerAccess, index_t SrcDataRange, typename SrcData, typename SrcCoord> template <index_t SrcDataPerAccess, index_t SrcDataRange, typename SrcData, typename SrcCoord>
__device__ static auto vector_data_load(const SrcData* p_src, const SrcCoord src_coord_begin) __device__ static auto vector_data_load(const SrcData* p_src, const SrcCoord src_coord_begin)
{ {
...@@ -136,8 +102,6 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -136,8 +102,6 @@ struct ThreadwiseGenericTensorSliceCopy_v5
return amd_buffer_load<SrcData, SrcDataPerAccess>(p_src, src_offset, true, SrcDataRange); return amd_buffer_load<SrcData, SrcDataPerAccess>(p_src, src_offset, true, SrcDataRange);
} }
#endif
template <typename DstData, index_t DstDataPerAccess> template <typename DstData, index_t DstDataPerAccess>
struct vector_data_store; struct vector_data_store;
......
...@@ -81,8 +81,6 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( ...@@ -81,8 +81,6 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) * constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
math::integer_divide_ceil(GemmN, GemmNPerBlock); math::integer_divide_ceil(GemmN, GemmNPerBlock);
static_assert(GridSize == 1568, "");
// A matrix copy // A matrix copy
constexpr index_t GemmABlockCopyClusterLengths_GemmK = 4; constexpr index_t GemmABlockCopyClusterLengths_GemmK = 4;
constexpr index_t GemmABlockCopyClusterLengths_GemmM = 64; constexpr index_t GemmABlockCopyClusterLengths_GemmM = 64;
...@@ -111,8 +109,8 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( ...@@ -111,8 +109,8 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
using GemmABlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [GemmG, GemmM, GemmK, GemmKPack] using GemmABlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [GemmG, GemmM, GemmK, GemmKPack]
using GemmABlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [GemmG, GemmK, GemmM, GemmKPack] using GemmABlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [GemmG, GemmK, GemmM, GemmKPack]
constexpr index_t GemmABlockCopySrcDataPerRead_GemmKPack = 4; constexpr index_t GemmABlockCopySrcDataPerRead_GemmKPack = 1;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmKPack = 4; constexpr index_t GemmABlockCopyDstDataPerWrite_GemmKPack = 1;
// B matrix Copy // B matrix Copy
constexpr index_t GemmBBlockCopyClusterLengths_GemmK = 4; constexpr index_t GemmBBlockCopyClusterLengths_GemmK = 4;
...@@ -142,8 +140,8 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( ...@@ -142,8 +140,8 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
using GemmBBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [GemmG, GemmK, GemmKPack, GemmN] using GemmBBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [GemmG, GemmK, GemmKPack, GemmN]
using GemmBBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [GemmG, GemmK, GemmN, GemmKPack] using GemmBBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [GemmG, GemmK, GemmN, GemmKPack]
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 4; constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmKPack = 4; constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmKPack = 1;
// gridwise GEMM // gridwise GEMM
constexpr auto wkgrp_schd_order = NBlock1MBlock0; constexpr auto wkgrp_schd_order = NBlock1MBlock0;
......
...@@ -29,8 +29,8 @@ int main(int argc, char* argv[]) ...@@ -29,8 +29,8 @@ int main(int argc, char* argv[])
constexpr index_t HI = 56; constexpr index_t HI = 56;
constexpr index_t WI = 56; constexpr index_t WI = 56;
constexpr index_t K = 128; constexpr index_t K = 128;
constexpr index_t Y = 1; constexpr index_t Y = 3;
constexpr index_t X = 1; constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
...@@ -99,19 +99,6 @@ int main(int argc, char* argv[]) ...@@ -99,19 +99,6 @@ int main(int argc, char* argv[])
#endif #endif
} }
#if 0
device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(in_nchw_desc, gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
...@@ -123,56 +110,6 @@ int main(int argc, char* argv[]) ...@@ -123,56 +110,6 @@ int main(int argc, char* argv[])
LeftPads{}, LeftPads{},
RightPads{}, RightPads{},
nrepeat); nrepeat);
#elif 1
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dummy_static_transform(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dummy_dynamic_transform_v1(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
device_dummy_dynamic_transform(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif
if(do_verification) if(do_verification)
{ {
#if 0 #if 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment