"web/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "a3d7bb0a30b9ea07d3d4beff41e7f35c34f7f48a"
Commit 5b7a18c5 authored by Chao Liu's avatar Chao Liu
Browse files

experimenting global and buffer load/store

parent c7a6545e
...@@ -838,7 +838,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 ...@@ -838,7 +838,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
#if 1 // source code #if 1 // source code
vector_data = *reinterpret_cast<const src_vector_t*>( vector_data = *reinterpret_cast<const src_vector_t*>(
&p_src[src_normal_offset + src_merged_offset]); &p_src[src_normal_offset + src_merged_offset]);
#elif 1 // inline asm using global_load #elif 0 // inline asm using global_load
vector_data = __global_load<TData, SrcDataPerAccess>( vector_data = __global_load<TData, SrcDataPerAccess>(
p_src, p_src,
static_cast<uint32_t>(src_merged_offset), static_cast<uint32_t>(src_merged_offset),
...@@ -943,7 +943,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 ...@@ -943,7 +943,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
#if 1 // source code #if 1 // source code
*reinterpret_cast<dst_vector_t*>( *reinterpret_cast<dst_vector_t*>(
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#elif 1 // inline asm using global_store #elif 0 // inline asm using global_store
__global_store<TData, DstDataPerAccess>( __global_store<TData, DstDataPerAccess>(
vector_data, p_dst, dst_merged_offset, dst_normal_offset); vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#elif 1 // inline asm using buffer_store #elif 1 // inline asm using buffer_store
......
...@@ -183,8 +183,8 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block, ...@@ -183,8 +183,8 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000; reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \ asm volatile("\n \
__buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \ ;;s_waitcnt 0 \n \
" "
: "=v"(dst) : "=v"(dst)
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
...@@ -208,8 +208,8 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float ...@@ -208,8 +208,8 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000; reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \ asm volatile("\n \
__buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \ ;;s_waitcnt 0 \n \
" "
: "=v"(dst) : "=v"(dst)
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
...@@ -233,8 +233,8 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float ...@@ -233,8 +233,8 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000; reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \ asm volatile("\n \
__buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \ ;;s_waitcnt 0 \n \
" "
: "=v"(dst) : "=v"(dst)
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
...@@ -257,8 +257,8 @@ __device__ void __buffer_store<float, 1>(const float& src, ...@@ -257,8 +257,8 @@ __device__ void __buffer_store<float, 1>(const float& src,
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000; reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \ asm volatile("\n \
__buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
s_waitcnt 0 \n \ ;;s_waitcnt 0 \n \
" "
: :
: "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset)); : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));
......
...@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0 #if 1
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
......
...@@ -103,7 +103,7 @@ int main(int argc, char* argv[]) ...@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 1 #elif 0
// 1x1 filter, 8x8 image // 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr index_t N = 64; constexpr index_t N = 64;
...@@ -295,7 +295,7 @@ int main(int argc, char* argv[]) ...@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 0 #elif 1
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr index_t N = 128; constexpr index_t N = 128;
...@@ -341,7 +341,7 @@ int main(int argc, char* argv[]) ...@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<3, 0>; using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>;
#elif 1 #elif 0
// 1x7 filter, 0x3 pad, 17x17 input // 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 128; constexpr index_t C = 128;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment