Commit 7da654a4 authored by Chao Liu

added dynamic col2im

parent 9d000309
@@ -119,6 +119,7 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
         // blockwise atomic accumulation
         auto blockwise_copy =
+#if 0
             BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
                                                    float,
                                                    float,
@@ -129,8 +130,6 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
                                                    BlockCopyClusterLengths_GemmK_GemmN,
                                                    BlockCopyThreadClusterArrangeOrder,
                                                    BlockCopySrcAccessOrder,
-                                                   BlockCopyDstAccessOrder,
-                                                   1,
                                                    1,
                                                    BlockCopyDataPerAccess_GemmN,
                                                    BlockCopyDataPerAccess_GemmN,
@@ -139,32 +138,39 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
                                                    InMemoryDataOperation::AtomicAdd,
                                                    1,
                                                    1>(
+#else
+            BlockwiseDynamicTensorSliceTransfer_v2<BlockSize,
+                                                   float,
+                                                   float,
+                                                   decltype(col_gemmk_gemmn_global_desc),
+                                                   decltype(img_gemmk_gemmn_global_desc),
+                                                   Sequence<GemmKPerBlock, GemmNPerBlock>,
+                                                   BlockCopySubLengths_GemmK_GemmN,
+                                                   BlockCopyClusterLengths_GemmK_GemmN,
+                                                   BlockCopyThreadClusterArrangeOrder,
+                                                   BlockCopySrcAccessOrder,
+                                                   BlockCopyDstAccessOrder,
+                                                   1,
+                                                   1,
+                                                   BlockCopyDataPerAccess_GemmN,
+                                                   BlockCopyDataPerAccess_GemmN,
+                                                   AddressSpace::Global,
+                                                   AddressSpace::Global,
+                                                   InMemoryDataOperation::AtomicAdd,
+                                                   1,
+                                                   1>(
+#endif
                 col_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global),
                 img_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global));
 
-        auto col_gemmk_gemmn_coord =
-            make_dynamic_tensor_coordinate(col_gemmk_gemmn_global_desc, make_multi_index(0, 0));
-        auto img_gemmk_gemmn_coord =
-            make_dynamic_tensor_coordinate(img_gemmk_gemmn_global_desc, make_multi_index(0, 0));
-
-        const auto col_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step(
-            col_gemmk_gemmn_global_desc, make_multi_index(GemmKPerBlock, 0));
-        const auto img_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step(
-            img_gemmk_gemmn_global_desc, make_multi_index(GemmKPerBlock, 0));
-
-        for(index_t gemmk = 0; gemmk < GemmK - GemmKPerBlock; gemmk += GemmKPerBlock)
+        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
         {
             blockwise_copy.Run(p_col_global, p_img_global);
 
-            move_dynamic_tensor_coordinate(
-                col_gemmk_gemmn_global_desc, col_gemmk_gemmn_coord, col_gemmk_gemmn_coord_step);
-            move_dynamic_tensor_coordinate(
-                img_gemmk_gemmn_global_desc, img_gemmk_gemmn_coord, img_gemmk_gemmn_coord_step);
+            blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
+            blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
         }
     }
 };
......
@@ -459,7 +459,7 @@ struct DynamicMerge
                 // this should be saved in SGPR as well
                 index_t idx_low_length_minus_idx_diff_low_const =
                     low_lengths_[i] - idx_diff_low_const[i];
-#if 0
+#if 1
                 index_t idx_low_length_plus_idx_diff_low_const =
                     low_lengths_[i] + idx_diff_low_const[i];
 #endif
@@ -467,21 +467,20 @@ struct DynamicMerge
                 index_t idx_low_tmp = idx_low_old[i] + carry;
 
                 bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
-#if 0
+#if 1
                 bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
 #endif
 
                 idx_diff_low(i) =
                     do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
-#if 0
-                idx_diff_low(i) =
-                    do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
+#if 1
+                idx_diff_low(i) = do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
 #endif
 
                 idx_diff_low(i) += carry;
 
                 carry = do_carry ? 1 : 0;
-#if 0
+#if 1
                 carry = do_borrow ? -1 : carry;
 #endif
             });
......
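Aside: the #if 0 to #if 1 flips above enable the borrow path in DynamicMerge's index update, so a negative step through a merged dimension can propagate toward the slower dimensions just as a positive step carries. Below is a minimal stand-alone C++ sketch of that carry/borrow arithmetic, written in the obvious branching form for clarity. It is an illustration only; the function name, loop direction, and data types are invented here and are not the CK implementation, which does the per-dimension update branchlessly with precomputed constants.

#include <array>
#include <cassert>
#include <cstdio>

// Update a mixed-radix ("merged") index by a per-dimension step, propagating
// carry (step crosses the upper bound) and borrow (negative step crosses zero)
// from the fastest dimension to the slowest.
template <std::size_t N>
std::array<int, N> move_merged_index(std::array<int, N> idx,
                                     const std::array<int, N>& lengths,
                                     const std::array<int, N>& diff)
{
    int carry = 0; // +1 for carry, -1 for borrow
    for(int i = static_cast<int>(N) - 1; i >= 0; --i)
    {
        int v  = idx[i] + diff[i] + carry;
        carry  = 0;
        if(v >= lengths[i]) { v -= lengths[i]; carry = 1; }  // carry into next-slower dim
        else if(v < 0)      { v += lengths[i]; carry = -1; } // borrow from next-slower dim
        idx[i] = v;
    }
    assert(carry == 0 && "step moved outside the merged dimension");
    return idx;
}

int main()
{
    // lengths {4, 3}: the merged index runs over 12 elements
    auto idx = move_merged_index<2>({1, 2}, {4, 3}, {0, 2});  // crosses the dim-1 bound -> carry
    std::printf("%d %d\n", idx[0], idx[1]);                   // prints "2 1"
    idx = move_merged_index<2>({2, 0}, {4, 3}, {0, -1});      // negative step -> borrow
    std::printf("%d %d\n", idx[0], idx[1]);                   // prints "1 2"
    return 0;
}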
@@ -9,6 +9,116 @@
 namespace ck {
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
typename BlockSrcDesc,
typename BlockDstDesc,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDstDimAccessOrder,
index_t SrcDstVectoReadDim,
index_t SrcDataPerRead,
index_t DstDataPerWrite,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v1
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(block_src_desc,
make_zero_multi_index<nDim>(),
block_dst_desc,
make_zero_multi_index<nDim>())
{
static_assert(
nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<BlockDstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() && nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
threadwise_transfer_.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
}
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.Run(p_block_src, p_block_dst);
}
}
__device__ void MoveSrcSliceWindow(const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(step);
}
}
__device__ void MoveDstSliceWindow(const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(step);
}
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer = ThreadwiseDynamicTensorSliceTransfer_v1<BlockSrcDesc,
BlockDstDesc,
ThreadSliceLengths,
SrcDstDimAccessOrder,
SrcDstVectoReadDim,
SrcDataPerRead,
DstDataPerWrite,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcDataStride,
DstDataStride>;
ThreadwiseTransfer threadwise_transfer_;
};
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,
@@ -20,7 +130,7 @@ template <index_t BlockSize,
           typename ThreadClusterArrangeOrder,
           typename SrcDimAccessOrder,
           typename DstDimAccessOrder,
-          index_t SrcVectoReadDim,
+          index_t SrcVectorReadDim,
           index_t DstVectorWriteDim,
           index_t SrcDataPerRead,
           index_t DstDataPerWrite,
@@ -29,17 +139,26 @@ template <index_t BlockSize,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride,
           index_t DstDataStride>
-struct BlockwiseDynamicTensorSliceTransfer_v1
+struct BlockwiseDynamicTensorSliceTransfer_v2
 {
     static constexpr index_t nDim =
         remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
 
     using Index = MultiIndex<nDim>;
 
-    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
+    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2(const BlockSrcDesc& block_src_desc,
                                                                 const Index& src_block_slice_origin,
                                                                 const BlockDstDesc& block_dst_desc,
                                                                 const Index& dst_block_slice_origin)
+        : threadwise_read_(block_src_desc,
+                           make_zero_multi_index<nDim>(),
+                           thread_buffer_desc_,
+                           make_zero_multi_index<nDim>()),
+          threadwise_write_(thread_buffer_desc_,
+                            make_zero_multi_index<nDim>(),
+                            block_dst_desc,
+                            make_zero_multi_index<nDim>())
     {
         static_assert(
             nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
@@ -131,7 +250,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v1
                                           decltype(thread_buffer_desc_),
                                           ThreadSliceLengths,
                                           SrcDimAccessOrder,
-                                          SrcVectoReadDim,
+                                          SrcVectorReadDim,
                                           SrcDataPerRead,
                                           1,
                                           SrcAddressSpace,
......
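For reference, this is how these blockwise transfers divide work: the block-wide slice is tiled by ThreadSliceLengths into a ThreadClusterLengths grid of tiles, the linear thread id is decoded into a cluster index, and each thread's slice origin is cluster_index * ThreadSliceLengths, which is what the constructor above computes before calling SetSrcSliceOrigin/SetDstSliceOrigin. A small host-side sketch of that mapping follows; the sizes mirror the new col2im configuration (Sequence<1, 8> sub-lengths, Sequence<8, 16> cluster lengths, so 128 working threads out of a 256-thread block, which is why the constructor, Run, and the move methods guard on get_thread_local_1d_id()). A plain row-major decomposition of the thread id is assumed here; the actual arrangement is governed by ThreadClusterArrangeOrder.

#include <array>
#include <cstdio>

int main()
{
    const std::array<int, 2> thread_slice_lengths   = {1, 8};  // per-thread tile
    const std::array<int, 2> thread_cluster_lengths = {8, 16}; // tiles per block
    // block slice = {8, 128}, served by 8 * 16 = 128 threads

    const int working_threads = thread_cluster_lengths[0] * thread_cluster_lengths[1];
    for(int tid = 0; tid < working_threads; ++tid)
    {
        // row-major decomposition of the linear thread id into a cluster index
        const std::array<int, 2> cluster_idx = {tid / thread_cluster_lengths[1],
                                                tid % thread_cluster_lengths[1]};
        // each thread starts its copy at cluster_idx * thread_slice_lengths
        const std::array<int, 2> data_origin = {cluster_idx[0] * thread_slice_lengths[0],
                                                cluster_idx[1] * thread_slice_lengths[1]};
        if(tid < 3)
            std::printf("thread %d copies the [%d..%d) x [%d..%d) sub-slice\n",
                        tid,
                        data_origin[0], data_origin[0] + thread_slice_lengths[0],
                        data_origin[1], data_origin[1] + thread_slice_lengths[1]);
    }
    return 0;
}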
#ifndef CK_GRIDWISE_TENSOR_CONTRACTION_HPP
#define CK_GRIDWISE_TENSOR_CONTRACTION_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
namespace ck {
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename AccFloat,
typename AGlobalDesc,
typename BGlobalDesc,
typename CGlobalDesc,
typename CBlockLengths,
index_t KPerBlock,
InMemoryDataOperation CGlobalMemoryDataOperation>
struct GridwiseTensorContraction_v1
{
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() {}
__device__ void Run(const Float* __restrict__ p_a_global,
const Float* __restrict__ p_b_global,
Float* __restrict__ p_c_global,
Float* __restrict__ p_shared_block) const
{
/// \todo sanity-check on AGlobalDesc, BGlobalDesc, CGlobalDesc length consistency
/// \todo sanity-check on CBlockLengths
constexpr auto True = integral_constant<bool, true>{};
constexpr auto a_global_desc = AGlobalDesc{};
constexpr auto b_global_desc = BGlobalDesc{};
constexpr auto c_global_desc = CGlobalDesc{};
constexpr auto K = a_global_desc.GetLengths()[0];
// don't do anything if K == 0
if(K == 0)
{
return;
}
// lds max alignment
constexpr index_t max_lds_align = math::lcm(ABlockCopyDstDataPerWrite_M,
BBlockCopyDstDataPerWrite_N,
ThreadGemmAThreadCopySrcDataPerRead_M,
ThreadGemmBThreadCopySrcDataPerRead_N);
// divide block work by [M, N]
static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0,
"wrong! cannot divide work evenly among block");
constexpr index_t MBlockWork = M / MPerBlock;
constexpr index_t NBlockWork = N / NPerBlock;
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t m_block_data_on_global = block_work_id[0] * MPerBlock;
const index_t n_block_data_on_global = block_work_id[1] * NPerBlock;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k_m_block_desc = make_native_tensor_descriptor_aligned(
Sequence<KPerBlock, MPerBlock>{}, Number<max_lds_align>{});
// A matrix blockwise copy
auto a_blockwise_copy =
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
AGlobalDesc,
decltype(a_block_desc),
decltype(a_k_m_block_desc.GetLengths()),
ABlockCopyThreadSliceLengths_K_M,
ABlockCopyThreadClusterLengths_K_M,
ABlockCopyThreadClusterArrangeOrder,
ABlockCopySrcAccessOrder,
Sequence<0, 1>,
ABlockCopySrcVectorReadDim,
1,
ABlockCopySrcDataPerRead,
ABlockCopyDstDataPerWrite_M,
AddressSpace::Global,
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
{0, m_block_data_on_global}, {0, 0});
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k_n_block_desc = make_native_tensor_descriptor_aligned(
Sequence<KPerBlock, NPerBlock>{}, Number<max_lds_align>{});
// B matrix blockwise copy
auto b_blockwise_copy =
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(b_k_n_global_desc),
decltype(b_k_n_block_desc),
decltype(b_k_n_block_desc.GetLengths()),
BBlockCopyThreadSliceLengths_K_N,
BBlockCopyThreadClusterLengths_K_N,
BBlockCopyThreadClusterArrangeOrder,
BBlockCopySrcAccessOrder,
Sequence<0, 1>,
BBlockCopySrcVectorReadDim,
1,
BBlockCopySrcDataPerRead,
BBlockCopyDstDataPerWrite_N,
AddressSpace::Global,
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
{0, n_block_data_on_global}, {0, 0});
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[KPerBlock, MPerBlock] is in LDS
// b_mtx[KPerBlock, NPerBlock] is in LDS
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(a_k_m_block_desc);
constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(b_k_n_block_desc);
// sanity check
static_assert(MPerBlock % (MPerThread * MLevel0Cluster * MLevel1Cluster) == 0 &&
NPerBlock % (NPerThread * NLevel0Cluster * NLevel1Cluster) == 0,
"wrong!");
constexpr index_t GemmMRepeat = MPerBlock / (MPerThread * MLevel0Cluster * MLevel1Cluster);
constexpr index_t GemmNRepeat = NPerBlock / (NPerThread * NLevel0Cluster * NLevel1Cluster);
// c_thread_mtx definition: this is a mess
// TODO: more elegant way of defining c_thread_mtx
constexpr auto c_m0m1_n0n1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
Number<GemmMRepeat * MPerThread>{}, Number<GemmNRepeat * NPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
BlockSize,
decltype(a_k_m_block_mtx_desc),
decltype(b_k_n_block_mtx_desc),
decltype(c_m0m1_n0n1_thread_mtx_desc),
MPerThread,
NPerThread,
MLevel0Cluster,
NLevel0Cluster,
MLevel1Cluster,
NLevel1Cluster,
KPerThread,
ThreadGemmAThreadCopySrcDataPerRead_M,
ThreadGemmBThreadCopySrcDataPerRead_N>{};
// LDS allocation for A and B: be careful of alignment
constexpr index_t a_block_space =
math::integer_least_multiple(a_k_m_block_desc.GetElementSpace(), max_lds_align);
constexpr index_t b_block_space =
math::integer_least_multiple(b_k_n_block_desc.GetElementSpace(), max_lds_align);
Float* p_a_block_double = p_shared_block;
Float* p_b_block_double = p_shared_block + 2 * a_block_space;
// register allocation for output
AccFloat p_c_thread[c_m0m1_n0n1_thread_mtx_desc.GetElementSpace()];
// zero out threadwise output
threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
// LDS double buffer: preload data into LDS
{
a_blockwise_copy.Run(p_a_global, p_a_block_double);
b_blockwise_copy.Run(p_b_global, p_b_block_double);
}
constexpr auto a_block_slice_copy_steps = Sequence<KPerBlock, 0>{};
constexpr auto b_block_slice_copy_steps = Sequence<KPerBlock, 0>{};
// LDS double buffer: main body
for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K;
k_block_data_begin += 2 * KPerBlock)
{
#pragma unroll
for(index_t iloop = 0; iloop < 2; ++iloop)
{
const bool even_loop = (iloop % 2 == 0);
Float* p_a_block_now =
even_loop ? p_a_block_double : p_a_block_double + a_block_space;
Float* p_b_block_now =
even_loop ? p_b_block_double : p_b_block_double + b_block_space;
Float* p_a_block_next =
even_loop ? p_a_block_double + a_block_space : p_a_block_double;
Float* p_b_block_next =
even_loop ? p_b_block_double + b_block_space : p_b_block_double;
Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];
a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer);
b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_a_block_now, p_b_block_now, p_c_thread);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next);
b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next);
}
}
// LDS double buffer: tail
{
constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0);
if(has_two_iteration_left) // if has 2 iteration left
{
Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];
a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);
__syncthreads();
// LDS double buffer: load last data from device mem
a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer);
b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
// LDS double buffer: store last data to LDS
a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer,
p_a_block_double + a_block_space);
b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer,
p_b_block_double + b_block_space);
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
p_a_block_double + a_block_space, p_b_block_double + b_block_space, p_c_thread);
}
else // if has 1 iteration left
{
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
}
}
// output: register to global memory
{
constexpr index_t M1 = MPerThread * MLevel0Cluster * MLevel1Cluster;
constexpr index_t M0 = M / M1;
constexpr index_t N1 = NPerThread * NLevel0Cluster * NLevel1Cluster;
constexpr index_t N0 = N / N1;
// define input tensor descriptor for threadwise copy
// thread input tensor, src of threadwise copy
constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
Sequence<GemmMRepeat, MPerThread, GemmNRepeat, NPerThread>{});
constexpr auto c_m0_m1_n0_n1_global_desc = transform_tensor_descriptor(
c_m_n_global_desc,
make_tuple(UnMerge<Sequence<M0, M1>>{}, UnMerge<Sequence<N0, N1>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
// calculate origin of thread input tensor on global memory
// blockwise GEMM c matrix starting index
const auto c_thread_mtx_on_block =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const index_t m_thread_data_on_global =
m_block_data_on_global + c_thread_mtx_on_block.row;
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
ThreadwiseGenericTensorSliceCopy_v4r2<decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
decltype(c_m0_m1_n0_n1_thread_desc.GetLengths()),
CThreadCopySrcDstAccessOrder,
CThreadCopySrcDstVectorReadWriteDim,
1,
CThreadCopyDstDataPerWrite,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation>(
{0, 0, 0, 0},
{m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
n_thread_data_on_global / N1,
n_thread_data_on_global % N1})
.Run(p_c_thread, p_c_global);
}
}
__device__ void Run(const Float* __restrict__ p_a_global,
const Float* __restrict__ p_b_global,
Float* __restrict__ p_c_global) const
{
constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(Float);
__shared__ Float p_shared_block[shared_block_size];
Run(p_a_global, p_b_global, p_c_global, p_shared_block);
}
};
} // namespace ck
#endif
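The main loop above is the usual LDS double-buffer schedule: preload the first K-tile, then on every iteration issue the load of the next tile into the idle buffer while the blockwise GEMM consumes the current one, and finish with a tail that processes whatever was preloaded last. The host-code sketch below shows the same ping-pong schedule in isolation; it is an illustration only (all names are invented) and it collapses the kernel's two-iteration tail into a single tail step.

#include <cstdio>
#include <vector>

int main()
{
    const int K = 40, KPerBlock = 8;
    std::vector<int> tiles(K / KPerBlock);
    for(std::size_t i = 0; i < tiles.size(); ++i) tiles[i] = static_cast<int>(i);

    int buf[2];                        // stands in for the two LDS buffers
    std::size_t next = 0;
    buf[0] = tiles[next++];            // preload the first tile into buffer 0

    long long acc = 0;
    int cur = 0;
    while(next < tiles.size())
    {
        buf[cur ^ 1] = tiles[next++];  // "load next tile" into the idle buffer
        acc += buf[cur];               // "GEMM" on the current buffer
        cur ^= 1;                      // swap buffers
    }
    acc += buf[cur];                   // tail: consume the last preloaded tile

    std::printf("processed %zu tiles, acc = %lld\n", tiles.size(), acc);
    return 0;
}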
@@ -58,14 +58,28 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
     bool forward_dim1 = true;
 
     // hardcoded for 2d loop for now
-#pragma unroll 1
+#pragma unroll
     for(int j0 = 0; j0 < J0; ++j0)
     {
-#pragma unroll 1
+#pragma unroll
         for(int j1 = 0; j1 < J1; ++j1)
         {
             // do work
-            p_dst[dst_coord.GetOffset()] = p_src[src_coord.GetOffset()];
+            transfer_data<SrcData,
+                          1,
+                          SrcAddressSpace,
+                          DstAddressSpace,
+                          DstInMemOp,
+                          SrcScalarStrideInVector,
+                          DstScalarStrideInVector>(
+                p_src,
+                src_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
+                src_desc.GetElementSpaceSize(),
+                p_dst,
+                dst_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
+                dst_desc.GetElementSpaceSize());
 
             // move dim1 iterator
             if(j1 < J1 - 1)
......
@@ -66,11 +66,11 @@ void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
 #if 1
     constexpr index_t BlockSize = 256;
-    constexpr index_t GemmKPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 8;
     constexpr index_t GemmNPerBlock = 128;
 
-    using BlockCopySubLengths_GemmK_GemmN = Sequence<8, 8>;
-    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<16, 16>;
+    using BlockCopySubLengths_GemmK_GemmN = Sequence<1, 8>;
+    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;
     using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
     using BlockCopySrcAccessOrder = Sequence<0, 1>;            // [GemmK, GemmN]
     using BlockCopyDstAccessOrder = Sequence<0, 1>;            // [GemmK, GemmN]
......
@@ -21,14 +21,14 @@ void host_col2im(const Tensor<T>& in_eb,
     int N = in_nchw.mDesc.GetLengths()[0];
     int C = in_nchw.mDesc.GetLengths()[1];
-    int HI = in_nchw.mDesc.GetLengths()[2];
-    int WI = in_nchw.mDesc.GetLengths()[3];
+    int Hi = in_nchw.mDesc.GetLengths()[2];
+    int Wi = in_nchw.mDesc.GetLengths()[3];
 
     int Y = FilterSizes{}[0];
     int X = FilterSizes{}[1];
 
-    int HO = OutputSizes{}[0];
-    int WO = OutputSizes{}[1];
+    int Ho = OutputSizes{}[0];
+    int Wo = OutputSizes{}[1];
 
     auto f = [&](auto n, auto c, auto hi, auto wi) {
         double v = 0;
@@ -37,22 +37,28 @@ void host_col2im(const Tensor<T>& in_eb,
             {
                 int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
 
-                if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
+                if(h_tmp % ConvStrides{}[0] == 0)
                 {
                     int ho = h_tmp / ConvStrides{}[0];
 
-                    for(int x = 0; x < X; ++x)
+                    if(ho >= 0 && ho < Ho)
                     {
-                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
-
-                        if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
+                        for(int x = 0; x < X; ++x)
                         {
-                            int wo = w_tmp / ConvStrides{}[1];
+                            int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
+
+                            if(w_tmp % ConvStrides{}[1] == 0)
+                            {
+                                int wo = w_tmp / ConvStrides{}[1];
 
-                            int e = c * (Y * X) + y * X + x;
-                            int b = n * (HO * WO) + ho * WO + wo;
+                                if(wo >= 0 && wo < Wo && w_tmp % ConvStrides{}[1] == 0)
+                                {
+                                    int e = c * (Y * X) + y * X + x;
+                                    int b = n * (Ho * Wo) + ho * Wo + wo;
 
-                            v += in_eb(e, b);
+                                    v += in_eb(e, b);
+                                }
+                            }
                         }
                     }
                 }
             }
......
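The host reference above gathers: for a fixed image pixel (hi, wi) it solves for the output positions (ho, wo) that could have contributed to it, and the range checks added in this diff on ho and wo are exactly the validity conditions of that solve. The equivalent scatter form below makes the underlying index relation explicit; it is a stand-alone sketch with made-up shapes and loop names, not the code under test.

#include <cstdio>
#include <vector>

// For every output position (ho, wo) and filter tap (y, x), the column entry
// col(e, b) with
//     e = c * (Y * X) + y * X + x
//     b = n * (Ho * Wo) + ho * Wo + wo
// accumulates into image pixel
//     hi = ho * stride + y * dilation - pad
//     wi = wo * stride + x * dilation - pad.
int main()
{
    const int N = 1, C = 2, Hi = 5, Wi = 5, Y = 3, X = 3;
    const int stride = 1, dilation = 1, pad = 1;
    const int Ho = (Hi + 2 * pad - dilation * (Y - 1) - 1) / stride + 1; // 5
    const int Wo = (Wi + 2 * pad - dilation * (X - 1) - 1) / stride + 1; // 5

    std::vector<double> col(C * Y * X * N * Ho * Wo, 1.0); // col(e, b), all ones
    std::vector<double> img(N * C * Hi * Wi, 0.0);         // NCHW output

    for(int n = 0; n < N; ++n)
        for(int c = 0; c < C; ++c)
            for(int y = 0; y < Y; ++y)
                for(int x = 0; x < X; ++x)
                    for(int ho = 0; ho < Ho; ++ho)
                        for(int wo = 0; wo < Wo; ++wo)
                        {
                            const int hi = ho * stride + y * dilation - pad;
                            const int wi = wo * stride + x * dilation - pad;
                            if(hi < 0 || hi >= Hi || wi < 0 || wi >= Wi)
                                continue; // this tap falls into padding
                            const int e = c * (Y * X) + y * X + x;
                            const int b = n * (Ho * Wo) + ho * Wo + wo;
                            img[((n * C + c) * Hi + hi) * Wi + wi] += col[e * (N * Ho * Wo) + b];
                        }

    // with an all-ones column matrix, each interior pixel is hit Y * X times
    std::printf("img[center of channel 0] = %f (expect %d)\n",
                img[(0 * C + 0) * Hi * Wi + 2 * Wi + 2], Y * X);
    return 0;
}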
@@ -19,21 +19,6 @@ int main(int argc, char* argv[])
     using namespace ck;
 
 #if 1
-    // 1x1, 8x8
-    constexpr index_t N = 2;
-    constexpr index_t C = 128;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K = 128;
-    constexpr index_t Y = 1;
-    constexpr index_t X = 1;
-
-    using ConvStrides = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    using LeftPads = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
-#elif 0
     // 3x3, 71x71
     constexpr index_t N = 128;
     constexpr index_t C = 192;
@@ -541,7 +526,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #else
         col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
@@ -587,7 +572,7 @@ int main(int argc, char* argv[])
         check_error(img_nchw_host, img_nchw_device);
 
-#if 1
+#if 0
         LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
         LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
......
@@ -250,17 +250,17 @@ int main(int argc, char* argv[])
 #elif 1
     device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
 #endif
     (in_nchw_desc,
      in_nchw_device,
      wei_kcyx_desc,
      wei_kcyx,
      out_nkhw_desc,
      out_nkhw,
      ConvStrides{},
      ConvDilations{},
      LeftPads{},
      RightPads{},
      nrepeat);
 
     if(do_verification)
     {
......