Commit c7a6545e authored by Chao Liu

experimenting global and buffer load/store

parent 9f46cdf5
@@ -487,6 +487,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
 #else
+        // hardcoded: global to register
         mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
 #endif
     }
@@ -497,7 +498,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
+        // hardcoded: register to LDS
+        mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
 #endif
     }
@@ -506,13 +508,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
     {
         TData p_buffer[GetRegisterBufferSize()];
-#if 0
-        mThreadwiseLoad.Run(p_src, p_buffer);
-        mThreadwiseStore.Run(p_buffer, p_dst);
-#else
-        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
-        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
-#endif
+        RunLoadRegisterBuffer(p_src, p_buffer);
+        RunStoreRegisterBuffer(p_buffer, p_dst);
     }
     template <typename T, bool PositiveDirection>
......
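For readers of this hunk: the integer template arguments to Run_amd_experiment are memory-space ids. The diff only shows the numeric values, so the names below are hypothetical, inferred from the "global to register" / "register to LDS" comments and from the `SrcMemorySpace == 2` / `DstMemorySpace == 2` checks later in this commit. A minimal sketch under that assumption, not the library's definition:

    #include <cstdio>

    // Hypothetical names for the memory-space ids used above (assumption).
    enum MemorySpaceId : int
    {
        RegisterSpace = 0, // thread-private register buffer
        LdsSpace      = 1, // workgroup LDS
        GlobalSpace   = 2  // global memory
    };

    int main()
    {
        // Mirrors the two hardcoded calls in this hunk:
        //   Run_amd_experiment<TData, 2, 0>  (load stage)
        //   Run_amd_experiment<TData, 0, 1>  (store stage)
        std::printf("load : %d -> %d (global to register)\n", GlobalSpace, RegisterSpace);
        std::printf("store: %d -> %d (register to LDS)\n", RegisterSpace, LdsSpace);
        return 0;
    }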
@@ -819,38 +819,38 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
                 src_vector_t vector_data;
+                // Read vector from src.
+                // 1. Source code version can take src of all kinds of memory-space
+                // 2. Inline asm versions using global_load or buffer_load can only take
+                //    src from global-memory
+                //
+                // Comment for loading from global-memory:
+                // When
+                //   1) using source code, in order for the compiler to emit an optimal
+                //      load instruction, or
+                //   2) using inline asm (global_load or buffer_load), in order
+                //      for the inline asm to be valid,
+                // the following assumptions need to be satisfied:
+                //   1. p_src needs to be block-invariant (assumption)
+                //   2. src_normal_offset must be calculated at compile time (guaranteed)
+                //   3. src_merged_offset can be a runtime value (no assumption imposed)
                 static_if<SrcMemorySpace == 2>{}([&](auto) {
 #if 1 // source code
-                    // Load vector from src.
-                    // src can be all kinds of memory-space.
-                    // In order for optimized global_load to be emitted by compiler, need to
-                    // assume:
-                    //   1. p_src need to be block-invariant (assumption)
-                    //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                    //   3. src_merged_offset can be runtime value (no assumption imposed)
                     vector_data = *reinterpret_cast<const src_vector_t*>(
                         &p_src[src_normal_offset + src_merged_offset]);
-#else // inline asm using buffer_load
-                    // Load vector from src
-                    // src's memory-space can only be global-memory (buffer_load inline-asm is
-                    // used)
-                    // In order for buffer_load to be valid, need to assume:
-                    //   1. p_src need to be block-invariant (assumption)
-                    //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                    //   3. src_merged_offset can be runtime value (no assumption imposed)
-                    vector_data = buffer_load<TData, SrcDataPerAccess>(
+#elif 1 // inline asm using global_load
+                    vector_data = __global_load<TData, SrcDataPerAccess>(
+                        p_src,
+                        static_cast<uint32_t>(src_merged_offset),
+                        static_cast<uint32_t>(src_normal_offset));
+#elif 1 // inline asm using buffer_load
+                    vector_data = __buffer_load<TData, SrcDataPerAccess>(
                         p_src,
                         static_cast<uint32_t>(src_merged_offset),
                         static_cast<uint32_t>(src_normal_offset));
 #endif
                 }).Else([&](auto) {
-                    // Load vector from src.
                     // src can be all kinds of memory-space.
-                    // In order for optimized global_load to be emitted by compiler, need to
-                    // assume:
-                    //   1. p_src need to be block-invariant (assumption)
-                    //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                    //   3. src_merged_offset can be runtime value (no assumption imposed)
                     vector_data = *reinterpret_cast<const src_vector_t*>(
                         &p_src[src_normal_offset + src_merged_offset]);
                 });
@@ -924,36 +924,34 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
                 const index_t dst_normal_offset =
                     DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+                // Write vector into dst.
+                // 1. Source code version can take dst of all kinds of memory-space
+                // 2. Inline asm versions using global_store or buffer_store can only take
+                //    dst from global-memory
+                //
+                // Comment for storing into global-memory:
+                // When
+                //   1) using source code, in order for the compiler to emit an optimal
+                //      store instruction, or
+                //   2) using inline asm (global_store or buffer_store), in order
+                //      for the inline asm to be valid,
+                // the following assumptions need to be satisfied:
+                //   1. p_dst needs to be block-invariant (assumption)
+                //   2. dst_normal_offset must be calculated at compile time (guaranteed)
+                //   3. dst_merged_offset can be a runtime value (no assumption imposed)
                 static_if<DstMemorySpace == 2>{}([&](auto) {
 #if 1 // source code
-                    // Write vector into dst.
-                    // dst can be all kinds of memory-space
-                    // In order for optmized global_store to be emitted by compiler, need to
-                    // assume:
-                    //   1. p_dst need to be block-invariant (assumption)
-                    //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                    //   3. dst_merged_offset can be runtime value (no assumption imposed)
                     *reinterpret_cast<dst_vector_t*>(
                         &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#else // inline asm using buffer_store
-                    // Write vector into dst.
-                    // dst's memory-space need to be global-memory (buffer_store is used)
-                    // In order for optmized global_store to be emitted by compiler, need to
-                    // assume:
-                    //   1. p_dst need to be block-invariant (assumption)
-                    //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                    //   3. dst_merged_offset can be runtime value (no assumption imposed)
-                    buffer_store<TData, DstDataPerAccess>(
+#elif 1 // inline asm using global_store
+                    __global_store<TData, DstDataPerAccess>(
+                        vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#elif 1 // inline asm using buffer_store
+                    __buffer_store<TData, DstDataPerAccess>(
                         vector_data, p_dst, dst_merged_offset, dst_normal_offset);
 #endif
                 }).Else([&](auto) {
-                    // Write vector into dst.
                     // dst can be all kinds of memory-space
-                    // In order for optmized global_store to be emitted by compiler, need to
-                    // assume:
-                    //   1. p_dst need to be block-invariant (assumption)
-                    //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                    //   3. dst_merged_offset can be runtime value (no assumption imposed)
                     *reinterpret_cast<dst_vector_t*>(
                         &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
                 });
......
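The comments added above boil down to one addressing pattern: the pointer and the "normal" part of the offset are uniform across the block (and the normal part is a compile-time constant), while only the "merged" part varies per thread. A minimal sketch of that pattern, assuming nothing beyond what the comments state (the function name and template are illustrative, not part of the library):

    #include <hip/hip_runtime.h>

    // Sketch: the shape of access the source-code path relies on. With p_src
    // block-invariant and NormalOffset a compile-time constant, base + NormalOffset
    // can live on the scalar side and only merged_offset needs a VGPR, which is
    // the precondition the comments describe for an optimal load.
    template <typename T, unsigned NormalOffset>
    __device__ T load_with_split_offset(const T* p_src, unsigned merged_offset)
    {
        return p_src[NormalOffset + merged_offset];
    }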
@@ -8,21 +8,169 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
-// buffer_load and buffer_store
+// global_load and global_store
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType
-buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
 template <typename T, index_t VectorSize>
-__device__ void buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
                                T* p_dst_block,
                                uint32_t dst_thread_offset,
                                uint32_t dst_const_offset);
 template <>
-__device__ float buffer_load<float, 1>(const float* p_src_block,
+__device__ float __global_load<float, 1>(const float* p_src_block,
                                          uint32_t src_thread_offset,
                                          uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    float dst;
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+    return dst;
+#else // compute on SALU
+    float dst;
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+    return dst;
+#endif
+}
+template <>
+__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(const float* p_src_block,
+                                                                      uint32_t src_thread_offset,
+                                                                      uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 2>::MemoryType dst;
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+    return dst;
+#else // compute on SALU
+    vector_type<float, 2>::MemoryType dst;
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+    return dst;
+#endif
+}
+template <>
+__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(const float* p_src_block,
+                                                                      uint32_t src_thread_offset,
+                                                                      uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 4>::MemoryType dst;
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+    return dst;
+#else // compute on SALU
+    vector_type<float, 4>::MemoryType dst;
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+    return dst;
+#endif
+}
+template <>
+__device__ void __global_store<float, 1>(const float& src,
+                                         float* p_dst_block,
+                                         uint32_t dst_thread_offset,
+                                         uint32_t dst_const_offset)
+{
+#if 0 // compute on VALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block));
+#else // compute on SALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);
+    float* p_dst_block_with_offset = p_dst_block + dst_const_offset;
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
+#endif
+}
+// __buffer_load and __buffer_store
+template <typename T, index_t VectorSize>
+__device__ typename vector_type<T, VectorSize>::MemoryType
+__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+template <typename T, index_t VectorSize>
+__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);
+template <>
+__device__ float __buffer_load<float, 1>(const float* p_src_block,
+                                         uint32_t src_thread_offset,
+                                         uint32_t src_const_offset)
 {
     float dst;
@@ -35,7 +183,7 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
     reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
     asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -45,9 +193,9 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
 }
 template <>
-__device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float* p_src_block,
+__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float* p_src_block,
                                                                       uint32_t src_thread_offset,
                                                                       uint32_t src_const_offset)
 {
     vector_type<float, 2>::MemoryType dst;
@@ -60,7 +208,7 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
     reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
     asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -70,9 +218,9 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
 }
 template <>
-__device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float* p_src_block,
+__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float* p_src_block,
                                                                       uint32_t src_thread_offset,
                                                                       uint32_t src_const_offset)
 {
     vector_type<float, 4>::MemoryType dst;
@@ -85,7 +233,7 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
     reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
     asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -95,10 +243,10 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
 }
 template <>
-__device__ void buffer_store<float, 1>(const float& src,
+__device__ void __buffer_store<float, 1>(const float& src,
                                          float* p_dst_block,
                                          uint32_t dst_thread_offset,
                                          uint32_t dst_const_offset)
 {
     int32x4_t dst_block_setting{0};
     // fill in byte 0 - 1
@@ -109,7 +257,7 @@ __device__ void buffer_store<float, 1>(const float& src,
     reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
     asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
+    __buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  :
......
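For context on the buffer_load/buffer_store path: the "s" operand passed to the asm above is a 128-bit buffer resource descriptor. Only two facts about it are visible in this diff (the base pointer is filled into the low dwords, and dword 3 is set to 0x00027000); the field names and the num_records slot in the sketch below follow the usual gfx9-style layout and are an assumption, as is the helper itself. The GCN mnemonics themselves remain the plain buffer_load_dword / buffer_store_dword.

    #include <cstdint>
    #include <cstring>

    // Stand-in for the library's int32x4_t (a 4 x i32 vector).
    typedef int32_t int32x4_sketch_t __attribute__((ext_vector_type(4)));

    // Hypothetical helper: assemble the 128-bit buffer resource (V#) the way the
    // specializations above appear to, under an assumed gfx9-style layout:
    //   dword 0-1 : 64-bit base address of the buffer
    //   dword 2   : num_records (buffer size in bytes for a raw buffer)
    //   dword 3   : stride/format/config bits; 0x00027000 is the constant used above
    inline int32x4_sketch_t make_buffer_resource(const void* base, uint32_t size_bytes)
    {
        int32x4_sketch_t rsrc{0, 0, 0, 0};
        std::memcpy(&rsrc, &base, sizeof(base));                                  // dwords 0-1
        reinterpret_cast<int32_t*>(&rsrc)[2] = static_cast<int32_t>(size_bytes);  // dword 2
        reinterpret_cast<int32_t*>(&rsrc)[3] = 0x00027000;                        // dword 3
        return rsrc;
    }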
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
-#if 1
+#if 0
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;
@@ -82,7 +82,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
     // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data
@@ -156,7 +156,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif
......
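The tuning change above (WeiBlockCopySrcDataPerRead_E: 1 -> 4) is what exercises the wider specializations added in this commit: the per-read count is inferred to become the vector width of each per-thread access, so 4 floats per read corresponds to the dwordx4 flavor of the new primitives. A small hedged sketch of that correspondence (the mapping is inferred from the template parameters in this diff, not stated in it):

    #include <cstdio>

    // Inferred (assumption): DataPerRead selects the vector width of one
    // per-thread access, which picks the matching ISA instruction.
    static const char* global_load_mnemonic(int data_per_read)
    {
        switch(data_per_read)
        {
        case 1: return "global_load_dword";   // __global_load<float, 1>
        case 2: return "global_load_dwordx2"; // __global_load<float, 2>
        case 4: return "global_load_dwordx4"; // __global_load<float, 4>
        default: return "no specialization in this commit";
        }
    }

    int main()
    {
        std::printf("DataPerRead_E = 4 -> %s\n", global_load_mnemonic(4));
        return 0;
    }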
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
......
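Putting the pieces together, a hypothetical caller of the new primitives could look like the kernel below, assuming the header extended by this commit is included. Nothing about it is taken from the commit except the two specialized signatures; in particular the choice to pass the per-thread offset in bytes is an assumption (the VGPR operand of global_load_dword/global_store_dword is a byte offset, while the SALU variants apply the constant offset through pointer arithmetic, so the units this experiment ultimately intends are not pinned down). The constant offset is kept at 0 and p_src/p_dst are kernel arguments, so the block-invariance and compile-time-constant assumptions from the comments are satisfied.

    #include <hip/hip_runtime.h>

    // Hypothetical kernel (not part of the commit): one float per thread through
    // the new __global_load / __global_store specializations.
    __global__ void copy_one_float_per_thread(const float* p_src, float* p_dst)
    {
        // Per-thread offset, assumed to be in bytes (see note above).
        uint32_t thread_byte_offset =
            (blockIdx.x * blockDim.x + threadIdx.x) * sizeof(float);

        float v = __global_load<float, 1>(p_src, thread_byte_offset, 0u);
        __global_store<float, 1>(v, p_dst, thread_byte_offset, 0u);
    }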