Commit b3108646 authored by Jing Zhang

add asm

parent 5be6a5f9
@@ -8,7 +8,7 @@
 #define NO_GLB_READ 0
 // cast a pointer of LDS to its address
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
+extern "C" __attribute__((address_space(3))) void* __to_local(const void* p)[[hc]];
 __device__ void vmcnt(index_t cnt)
 {
@@ -721,18 +721,17 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
 #endif
 }
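The `const void*` change above lets `__to_local` be called on the const pointers used by the new wrappers below. The helper only reinterprets a generic pointer as an LDS (address_space(3)) pointer, which is the form the address operand of the `ds_*` instructions expects. A minimal usage sketch, assuming a HIP/HCC-style compiler; `example_to_local_usage` and `lds_tile` are illustrative names, not part of the commit:

__device__ float example_to_local_usage()
{
    __shared__ float lds_tile[64]; // hypothetical LDS buffer, for illustration only
    // Reinterpret the generic pointer as a 32-bit LDS address; this is what the
    // "v"(__to_local(lds)) operands in the ds_write_* wrappers below hand to asm.
    __attribute__((address_space(3))) void* lds_addr = __to_local(lds_tile);
    (void)lds_addr; // silence unused-variable warnings in this sketch
    return lds_tile[0];
}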
-__device__ void global_load(vector_type<float, 4>::MemoryType& r,
-                            const vector_type<float, 4>::MemoryType* ptr,
-                            index_t offset = 0)
+__device__ void global_loadx4(void* r, const void* ptr, index_t offset = 0)
 {
 #if !NO_GLB_READ
     if(offset == 0)
     {
+        //*(vector_type<float, 4>::MemoryType*)(r) = *(vector_type<float, 4>::MemoryType*)(ptr);
         asm volatile("\n \
     global_load_dwordx4 %0, %1, off \n \
     "
-                     : "=v"(r)
-                     : "v"(ptr));
+                     : "=v"(*(vector_type<float, 4>::MemoryType*)(r))
+                     : "r"(ptr));
     }
     else
     {
@@ -741,17 +740,129 @@ __device__ void global_load(vector_type<float, 4>::MemoryType& r,
 #endif
 }
-__device__ void
-ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
+__device__ void global_loadx2(void* r, const void* ptr, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        asm volatile("\n \
+    global_load_dwordx2 %0, %1, off \n \
+    "
+                     : "=v"(*(vector_type<float, 2>::MemoryType*)(r))
+                     : "r"(ptr));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+__device__ void global_loadx1(void* r, const void* ptr, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(float*)(r) = *(float*)(ptr);
+        asm volatile("\n \
+    global_load_dword %0, %1, off \n \
+    "
+                     : "=v"(*(float*)(r))
+                     : "r"(ptr));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
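The three wrappers above hand-issue global_load_dword, global_load_dwordx2 and global_load_dwordx4 through inline asm instead of letting the compiler select the load; the commented-out assignments show the intended effect. A plain C++ sketch of that behavior for the x4 and x1 cases (illustration only; `global_loadx4_fallback` and `global_loadx1_fallback` are not part of the commit, and the types follow the file's vector_type<> convention):

__device__ void global_loadx4_fallback(void* r, const void* ptr)
{
    using float4_t = vector_type<float, 4>::MemoryType;
    // 16-byte load from global memory into the register-side buffer
    *reinterpret_cast<float4_t*>(r) = *reinterpret_cast<const float4_t*>(ptr);
}

__device__ void global_loadx1_fallback(void* r, const void* ptr)
{
    // single-dword load
    *reinterpret_cast<float*>(r) = *reinterpret_cast<const float*>(ptr);
}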
+__device__ void global_storex4(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(vector_type<float, 4>::MemoryType*)(ptr) = *(vector_type<float, 4>::MemoryType*)(r);
+        asm volatile("\n \
+    global_store_dwordx4 %0, %1, off \n \
+    "
+                     :
+                     : "r"(ptr), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+__device__ void global_storex2(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        asm volatile("\n \
+    global_store_dwordx2 %0, %1, off \n \
+    "
+                     :
+                     : "r"(ptr), "v"(*(vector_type<float, 2>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+__device__ void global_storex1(const void* ptr, const void* r, index_t offset = 0)
+{
+#if !NO_GLB_READ
+    if(offset == 0)
+    {
+        //*(float*)(ptr) = *(float*)(r);
+        asm volatile("\n \
+    global_store_dword %0, %1, off \n \
+    "
+                     :
+                     : "r"(ptr), "v"(*(float*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
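The store wrappers mirror the loads: the destination pointer comes first, the register-side source second, and only offset == 0 is supported. A usage sketch (illustrative only; `copy_one_float4` is not in the commit) that moves one float4 through a register using the wrappers above:

__device__ void copy_one_float4(const float* p_src, float* p_dst)
{
    // assumes both pointers are 16-byte aligned, as the dwordx4 instructions require
    vector_type<float, 4>::MemoryType reg; // register-resident staging value
    global_loadx4(&reg, p_src);            // global_load_dwordx4: global memory -> register
    global_storex4(p_dst, &reg);           // global_store_dwordx4: register -> global memory
}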
+__device__ void ds_write_b128(const void* lds, const void* r, index_t offset = 0)
 {
 #if !NO_DS_WRITE
     if(offset == 0)
     {
+        //*(vector_type<float, 4>::MemoryType*)(lds) = *(vector_type<float, 4>::MemoryType*)(r);
         asm volatile("\n \
     ds_write_b128 %0, %1 \n \
     "
                      :
-                     : "v"(__to_local(lds)), "v"(r));
+                     : "v"(__to_local(lds)), "v"(*(vector_type<float, 4>::MemoryType*)(r)));
+    }
+    else
+    {
+        assert(false);
+    }
+#endif
+}
+__device__ void ds_write_b32(const void* lds, const void* r, index_t offset = 0)
+{
+#if !NO_DS_WRITE
+    if(offset == 0)
+    {
+        //*(float*)(lds) = *(float*)(r);
+        asm volatile("\n \
+    ds_write_b32 %0, %1 \n \
+    "
+                     :
+                     : "v"(__to_local(lds)), "v"(*(float*)(r)));
     }
     else
     {
...
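ds_write_b128 and ds_write_b32 store a register value into LDS; `__to_local` converts the generic `lds` pointer into the LDS address the instruction's address operand expects, and the commented-out assignments again show the plain-C++ meaning. A minimal non-asm sketch of the 128-bit variant (illustrative; `ds_write_b128_fallback` is not part of the commit, and it assumes offset == 0 and a 16-byte aligned LDS address):

__device__ void ds_write_b128_fallback(void* lds, const void* r)
{
    using float4_t = vector_type<float, 4>::MemoryType;
    // plain-C++ equivalent of the wrapper above: copy 16 bytes from the
    // register-side buffer into the LDS destination
    *reinterpret_cast<float4_t*>(lds) = *reinterpret_cast<const float4_t*>(r);
}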
@@ -460,6 +460,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
         blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
                                                     p_wei_register_clipboard);
+        // vmcnt(0);
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
         blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                      p_wei_block_double);
@@ -504,6 +505,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
             // LDS double buffer: GEMM on current data
             run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
+            // vmcnt(0);
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                         p_in_block_next);
@@ -535,6 +537,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kc1x1_nkhw
         // LDS double buffer: GEMM on current data
         run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
+        // vmcnt(0);
        // LDS double buffer: store next data to LDS
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                     p_in_block_double + in_block_space);
...
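Each `// vmcnt(0);` added (but left commented out) above precedes a RunStoreRegisterClipboard call; enabling it would force the global loads issued earlier by the asm wrappers to complete before the clipboard registers are written to LDS. The body of `vmcnt` is not shown in this diff; a typical implementation of the zero-count case looks like the sketch below (an assumption for illustration; `vmcnt_0` is not a function in the repo):

__device__ void vmcnt_0()
{
    // wait until no vector-memory (global) loads/stores remain in flight
    asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
}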
@@ -97,3 +97,135 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
     });
 #endif
 }
+template <class Float,
+          class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t DataPerAccess,
+          index_t OpType>
+__device__ void threadwise_generic_tensor_slice_copy_v1_asm(
+    SrcDesc,
+    const Float* __restrict__ p_src,
+    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
+    DstDesc,
+    Float* __restrict__ p_dst,
+    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
+    SliceLengths,
+    DimAccessOrder,
+    Number<DataPerAccess>,
+    Number<OpType>)
+{
+    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
+                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
+                  "wrong! # of dimensions not the same");
+    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
+#if 0
+    // doesn't compile, because merged-tensor reordering is not implemented
+    // TODO: implement tensor desc ops for merged-tensor
+    constexpr auto src_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+    constexpr auto dst_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+    // check src/dst stride on the lowest access dimension
+    static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
+                      (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
+                  "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
+                  "vectorized read/write");
+#endif
+    constexpr auto slice_lengths_in_access_order =
+        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
+    // check slice length on the lowest access dimension
+    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
+                  "wrong! slice length on the lowest access dimension should be evenly divided by "
+                  "DataPerAccess");
+    constexpr index_t num_access_on_lowest_access_dimension =
+        slice_lengths_in_access_order.Back() / DataPerAccess;
+    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
+        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
+    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
+#if 1
+    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        auto data_multi_id_in_access_order = access_multi_id;
+        data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+        static_assert(DataPerAccess == 1 || DataPerAccess == 4, "unsupported DataPerAccess");
+        static_assert(OpType == 1, "unsupported OpType");
+        if(DataPerAccess == 4)
+        {
+            if(OpType == 1)
+            {
+                global_loadx4(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else if(OpType == 2)
+            {
+                global_storex4(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else
+            {
+                ds_write_b128(&p_dst[dst_index], &p_src[src_index]);
+            }
+        }
+        if(DataPerAccess == 1)
+        {
+            if(OpType == 1)
+            {
+                global_loadx1(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else if(OpType == 2)
+            {
+                global_storex1(&p_dst[dst_index], &p_src[src_index]);
+            }
+            else
+            {
+                ds_write_b32(&p_dst[dst_index], &p_src[src_index]);
+            }
+        }
+        //*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+        //    *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#else
+    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
+        constexpr auto data_multi_id_in_access_order =
+            access_multi_id.Modify(Number<nDim - 1>{}, Number<itmp>{});
+        constexpr auto data_multi_id = reorder_array_given_old2new(
+            sequence2array(data_multi_id_in_access_order), DimAccessOrder{});
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#endif
+}
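The new `threadwise_generic_tensor_slice_copy_v1_asm` walks the slice in `DimAccessOrder`, computes per-access source and destination offsets, and dispatches to one of the asm wrappers based on `DataPerAccess` (1 or 4) and `OpType`; from the branches, OpType == 1 selects the global loads, OpType == 2 the global stores, and anything else the LDS writes, although the `static_assert(OpType == 1, ...)` currently restricts instantiations to the load path. The dispatch in isolation (a sketch, not part of the commit; `dispatch_one_access` is an illustrative name and float is assumed for Float):

template <index_t DataPerAccess, index_t OpType>
__device__ void dispatch_one_access(float* p_dst, const float* p_src)
{
    if(DataPerAccess == 4)
    {
        if(OpType == 1)      global_loadx4(p_dst, p_src);  // global -> register
        else if(OpType == 2) global_storex4(p_dst, p_src); // register -> global
        else                 ds_write_b128(p_dst, p_src);  // register -> LDS
    }
    else // DataPerAccess == 1
    {
        if(OpType == 1)      global_loadx1(p_dst, p_src);
        else if(OpType == 2) global_storex1(p_dst, p_src);
        else                 ds_write_b32(p_dst, p_src);
    }
}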