Commit 23c626a9 authored by Chao Liu's avatar Chao Liu
Browse files

changed to dynamic LDS allocation

parent 7c098ddc
...@@ -220,7 +220,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -220,7 +220,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 128; constexpr index_t BlockSize = 128;
#elif 0 #elif 1
// 1x1, 14x14, Vega 20, hack CPerBlock = 1 // 1x1, 14x14, Vega 20, hack CPerBlock = 1
constexpr index_t BPerBlock = 64; constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128; constexpr index_t KPerBlock = 128;
...@@ -306,6 +306,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -306,6 +306,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
float time = launch_kernel(gridwise_conv.Run, float time = launch_kernel(gridwise_conv.Run,
dim3(GridSize), dim3(GridSize),
dim3(BlockSize), dim3(BlockSize),
gridwise_conv.GetSharedMemoryUsage(),
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()), static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()), static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer())); static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
......
...@@ -29,14 +29,14 @@ struct KernelTimer ...@@ -29,14 +29,14 @@ struct KernelTimer
}; };
template <typename... Args, typename F> template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args) float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{ {
KernelTimer timer; KernelTimer timer;
#if DEVICE_BACKEND_HIP #if DEVICE_BACKEND_HIP
timer.Start(); timer.Start();
hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...); hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
timer.End(); timer.End();
...@@ -47,7 +47,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args) ...@@ -47,7 +47,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
timer.Start(); timer.Start();
cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0, 0); cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, 0);
timer.End(); timer.End();
......
...@@ -37,7 +37,7 @@ template <index_t GridSize, ...@@ -37,7 +37,7 @@ template <index_t GridSize,
class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
{ {
public: public:
__host__ __device__ static index_t GetSharedMemorySize() __host__ __device__ constexpr index_t GetInputBlockElementSpace() const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -46,7 +46,6 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn ...@@ -46,7 +46,6 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
constexpr auto in_chwn_global_desc = InGlobalDesc{}; constexpr auto in_chwn_global_desc = InGlobalDesc{};
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
constexpr index_t Hi = in_chwn_global_desc.GetLength(I1); constexpr index_t Hi = in_chwn_global_desc.GetLength(I1);
constexpr index_t Wi = in_chwn_global_desc.GetLength(I2); constexpr index_t Wi = in_chwn_global_desc.GetLength(I2);
...@@ -56,29 +55,59 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn ...@@ -56,29 +55,59 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1); constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
// tensor view of blockwise input and weight // tensor view of blockwise input
// be careful of alignment // be careful of alignment
constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{}); Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
// LDS: be careful of alignment
constexpr index_t max_align =
mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
return in_cb_block_desc.GetElementSpace(Number<max_align>{});
}
__host__ __device__ constexpr index_t GetWeightBlockElementSpace() const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_chwn_global_desc = InGlobalDesc{};
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr index_t Hi = in_chwn_global_desc.GetLength(I1);
constexpr index_t Wi = in_chwn_global_desc.GetLength(I2);
constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1);
constexpr index_t X = wei_cyxk_global_desc.GetLength(I2);
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
// tensor view of blockwise weight
// be careful of alignment
constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{}); Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
// tensor view of threadwise output in register // LDS: be careful of alignment
constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
constexpr index_t max_align = constexpr index_t max_align =
mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
// LDS: be careful of alignment return wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t in_block_element_space = }
in_cb_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t wei_block_element_space = __host__ __device__ constexpr index_t GetSharedMemoryUsage() const
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{}); {
return (in_block_element_space + wei_block_element_space) * sizeof(Float); return (GetInputBlockElementSpace() + GetWeightBlockElementSpace()) * sizeof(Float);
}
__device__ constexpr static Float* GetSharedMemoryBegin()
{
extern __shared__ Float s[];
return s;
} }
__global__ static void Run(const Float* const __restrict__ p_in_global, __global__ static void Run(const Float* const __restrict__ p_in_global,
...@@ -251,8 +280,8 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn ...@@ -251,8 +280,8 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
constexpr index_t wei_block_element_space = constexpr index_t wei_block_element_space =
wei_cyxk_block_desc.GetElementSpace(Number<max_align>{}); wei_cyxk_block_desc.GetElementSpace(Number<max_align>{});
__shared__ Float p_in_block[in_block_element_space]; Float* const p_in_block = GetSharedMemoryBegin();
__shared__ Float p_wei_block[wei_block_element_space]; Float* const p_wei_block = GetSharedMemoryBegin() + in_block_element_space;
const Float* p_in_global_block_offset = const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
...@@ -288,7 +317,7 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn ...@@ -288,7 +317,7 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
blockwise_gemm.Run blockwise_gemm.Run
#elif 1 #elif 1
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#elif 0 #elif 1
blockwise_gemm.Run_asm blockwise_gemm.Run_asm
#endif #endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment