Commit 9a2744a6 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 18328e2f
......@@ -86,15 +86,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -116,15 +107,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -147,15 +129,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -177,15 +150,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -207,15 +171,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
......@@ -257,8 +212,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
CPerBlock,
BPerThread,
KPerThread,
GemmThreadPerColumnPerCluster,
GemmThreadPerRowPerCluster,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
......@@ -266,10 +219,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>,
dim3(GridSize),
......
......@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 1x1 filter, 14x14 image, C = 128
// 1x1 filter, 14x14 image, C = 512
constexpr index_t N = 128;
constexpr index_t C = 128;
constexpr index_t C = 512;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 512;
......
......@@ -21,21 +21,45 @@ struct is_same<T, T>
static const bool value = true;
};
#if DEVICE_BACKEND_CUDA
template <typename T>
__host__ __device__ constexpr T max(T a, T b)
__host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
{
return a > b ? a : b;
return (a + b - 1) / b;
}
template <typename T>
__host__ __device__ constexpr T min(T a, T b)
namespace mod_conv {
template <class T>
__host__ __device__ constexpr T max(T x, T y)
{
return a < b ? a : b;
return x > y ? x : y;
}
#endif
__host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b)
template <class T, class... Ts>
__host__ __device__ constexpr T max(T x, Ts... xs)
{
return (a + b - 1) / b;
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = max(xs...);
static_assert(is_same<decltype(y), T>::value, "not the same type");
return x > y ? x : y;
}
template <class T>
__host__ __device__ constexpr T min(T x, T y)
{
return x < y ? x : y;
}
template <class T, class... Ts>
__host__ __device__ constexpr T min(T x, Ts... xs)
{
static_assert(sizeof...(xs) > 0, "not enough argument");
auto y = min(xs...);
static_assert(is_same<decltype(y), T>::value, "not the same type");
return x < y ? x : y;
}
}
......@@ -19,8 +19,6 @@ template <index_t GridSize,
index_t CPerBlock,
index_t BPerThread,
index_t KPerThread,
index_t GemmThreadPerColumnPerCluster,
index_t GemmThreadPerRowPerCluster,
index_t GemmMPerThreadSubC,
index_t GemmNPerThreadSubC,
index_t GemmMLevel0Cluster,
......@@ -28,10 +26,6 @@ template <index_t GridSize,
index_t GemmMLevel1Cluster,
index_t GemmNLevel1Cluster,
index_t GemmKPerThreadLoop,
index_t InBlockCopyThreadPerDim0,
index_t InBlockCopyThreadPerDim1,
index_t WeiBlockCopyThreadPerDim0,
index_t WeiBlockCopyThreadPerDim1,
index_t InBlockCopyDataPerRead,
index_t WeiBlockCopyDataPerRead>
__global__ void
......@@ -111,57 +105,23 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
}
#endif
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#endif
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#elif 1
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
......@@ -198,16 +158,27 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
constexpr index_t wei_block_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
constexpr index_t max_align =
mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
// LDS
// LDS
#if 1
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
#if 1
const Float* p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;
const Float* const p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;
#else // debug
constexpr index_t lds_byte = 8 * 1024;
constexpr index_t in_block_space = max_align * ((in_block_size + max_align - 1) / max_align);
constexpr index_t wei_block_space = max_align * ((wei_block_size + max_align - 1) / max_align);
static_assert(lds_byte >= (in_block_space + wei_block_space) * sizeof(Float),
"lds allocation not enough");
__shared__ Float p_lds_begin[lds_byte / sizeof(Float)];
Float* const p_in_block = p_lds_begin;
Float* const p_wei_block = p_lds_begin + in_block_space;
#endif
const Float* p_in_global_block_offset =
......@@ -240,11 +211,11 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
for(index_t x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1
#if 0
blockwise_gemm.Run
#elif 0
blockwise_gemm.Run_asm
#elif 0
#elif 1
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
......
......@@ -55,17 +55,66 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
#elif 1
static_assert(NCol == 4, "only for NCol == 4");
using vector_t = typename vector_type<Float, 4>::MemoryType;
for(index_t i = 0; i < NRow; ++i)
{
const index_t src_index = src_mtx.Get1dIndex(i, 0);
const index_t dst_index = dst_mtx.Get1dIndex(i, 0);
#if 1
using vector_t = typename vector_type<Float, 4>::MemoryType;
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index));
#elif 1
#elif 0
// ds_read_b32
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index + 1])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 1) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index + 2])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b32 %0, %1 \n \
"
: "=v"(p_dst[dst_index + 3])
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 3) - p_lds_begin))));
#elif 0
// ds_read_b64
using vector_t = typename vector_type<Float, 2>::MemoryType;
asm volatile(
"\n \
ds_read_b64 %0, %1 \n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
asm volatile(
"\n \
ds_read_b64 %0, %1 \n \
"
: "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index + 2)))
: "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index + 2) - p_lds_begin))));
#elif 0
// ds_read_b128
using vector_t = typename vector_type<Float, 4>::MemoryType;
asm volatile(
"\n \
ds_read_b128 %0, %1 \n \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment