Commit a6ba3d2a authored by aska-0096

Add __restrict__ qualifier to all flows of the LDS buffer

parent c6a03cde
@@ -123,7 +123,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1
     template <typename DstBuffer, index_t ThreadScratchId = 0>
     __device__ void RunWrite(const DstDesc& dst_desc,
-                             DstBuffer& dst_buf,
+                             DstBuffer& __restrict__ dst_buf,
                              Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
     {
         if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
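For context: on a reference parameter such as `DstBuffer& __restrict__ dst_buf`, the qualifier promises the compiler that the object the reference binds to is not reached through any other name in that scope. A minimal sketch of the effect (a hypothetical function, not part of this commit):

```cpp
#include <hip/hip_runtime.h>

// Hypothetical example: with __restrict__ the compiler may assume `in` and
// `out` never alias, so loads of in[i] can be hoisted or kept in registers
// across the stores to out[i]. Without it, every store could potentially
// invalidate the next load.
__device__ void scale(const float* __restrict__ in,
                      float* __restrict__ out,
                      int n,
                      float s)
{
    for(int i = 0; i < n; ++i)
    {
        out[i] = in[i] * s;
    }
}
```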
...
@@ -27,10 +27,11 @@ __global__ void
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
     GridwiseGemm::template Run<HasMainKBlockLoop>(
-        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared, karg);
+        karg.p_a_grid, karg.p_b_grid, karg.p_c_grid, p_shared_0, p_shared_1, karg);
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
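The key change here: the single `__shared__` allocation that held both pipeline stages is split into two, `p_shared_0` and `p_shared_1`, each sized for one stage. A self-contained sketch of the pattern (the constant size and the toy kernel body are made up for illustration):

```cpp
#include <hip/hip_runtime.h>

constexpr unsigned kStageBytes = 4096; // stand-in for GetSharedMemoryNumberOfByte()

__global__ void gemm_kernel_sketch(const float* a, const float* b, float* c)
{
    // two distinct LDS objects: the compiler can now see that writes into
    // the ping buffer cannot touch data being read from the pong buffer
    __shared__ char lds_ping[kStageBytes];
    __shared__ char lds_pong[kStageBytes];

    // stand-in for the real pipeline: stage 0 fills ping while stage 1
    // consumes pong, then the buffers swap roles each K iteration
    float* tile_ping = reinterpret_cast<float*>(lds_ping);
    float* tile_pong = reinterpret_cast<float*>(lds_pong);
    tile_ping[threadIdx.x] = a[threadIdx.x];
    tile_pong[threadIdx.x] = b[threadIdx.x];
    __syncthreads();
    c[threadIdx.x] = tile_ping[threadIdx.x] + tile_pong[threadIdx.x];
}
```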
@@ -52,9 +53,10 @@ __global__ void
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid, p_b_grid, p_c_grid, p_shared, problem);
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid, p_b_grid, p_c_grid, p_shared_0, p_shared_1, problem);
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -569,7 +571,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         constexpr auto c_block_size =
             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();

-        return math::max(2 * (a_block_space_size_aligned * sizeof(ComputeTypeA) +
+        return math::max((a_block_space_size_aligned * sizeof(ComputeTypeA) +
                               b_block_space_size_aligned * sizeof(ComputeTypeB)),
                          c_block_size * sizeof(FloatCShuffle));
     }
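Since each kernel now makes two LDS allocations of this size, the `2 *` moves out of the formula: each allocation covers one A-tile plus B-tile stage, and the C-shuffle buffer reuses `p_shared_0` only. The totals differ when the C tile dominates, as the back-of-the-envelope check below shows (the byte counts are assumptions for illustration, not taken from the kernel):

```cpp
#include <algorithm>
#include <cstddef>

// Assumed tile sizes for illustration only.
constexpr std::size_t a_plus_b_bytes = 16 * 1024; // one A-tile + B-tile stage
constexpr std::size_t c_bytes        = 24 * 1024; // C-shuffle block

// before: one allocation holding both ping and pong stages
constexpr std::size_t old_total = std::max(2 * a_plus_b_bytes, c_bytes); // 32 KiB
// after: two allocations, each holding one stage
constexpr std::size_t new_total = 2 * std::max(a_plus_b_bytes, c_bytes); // 48 KiB

static_assert(old_total == 32 * 1024 && new_total == 48 * 1024,
              "two per-stage allocations cost extra LDS when the C tile dominates");
```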
@@ -707,7 +709,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     __device__ static void Run(const FloatA* __restrict__ p_a_grid,
                                const FloatB* __restrict__ p_b_grid,
                                FloatC* __restrict__ p_c_grid,
-                               void* __restrict__ p_shared,
+                               void* __restrict__ p_shared_0,
+                               void* __restrict__ p_shared_1,
                                const Problem& problem)
     {
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
@@ -868,24 +871,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
             a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);

-        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
-
         auto a_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeA*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ComputeTypeA* __restrict__>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf_ping = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeB*>(p_shared) + a_block_space_size_aligned,
+            static_cast<ComputeTypeB* __restrict__>(p_shared_0) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());

         auto a_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeA*>(p_shared) + a_block_space_size_aligned +
-                b_block_space_size_aligned,
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+            static_cast<ComputeTypeA* __restrict__>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());

         auto b_block_buf_pong = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ComputeTypeB*>(p_shared) + a_block_space_size_aligned +
-                a_block_space_size_aligned + b_block_space_size_aligned,
+            static_cast<ComputeTypeB* __restrict__>(p_shared_1) + a_block_space_size_aligned,
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());

         auto a_block_bufs = make_tuple(a_block_buf_ping, a_block_buf_pong);
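The resulting LDS layout, sketched with simplified types (`float` standing in for `ComputeTypeA`/`ComputeTypeB`, and the function name made up): ping tiles are carved out of `p_shared_0`, pong tiles out of `p_shared_1`, with the B tile starting right after the aligned A tile in each block. Because the two blocks are separate `__restrict__`-qualified allocations, a store into one stage cannot alias a load from the other:

```cpp
#include <hip/hip_runtime.h>

// Layout sketch (not CK code):
//   p_shared_0: [ A ping | B ping ]
//   p_shared_1: [ A pong | B pong ]
__device__ void carve_lds(void* __restrict__ p_shared_0,
                          void* __restrict__ p_shared_1,
                          int a_space_aligned) // aligned A-tile size in elements
{
    float* a_ping = static_cast<float*>(p_shared_0);
    float* b_ping = static_cast<float*>(p_shared_0) + a_space_aligned;
    float* a_pong = static_cast<float*>(p_shared_1);
    float* b_pong = static_cast<float*>(p_shared_1) + a_space_aligned;

    // the K loop alternates: compute from (a_ping, b_ping) while prefetching
    // into (a_pong, b_pong), then swap roles
    (void)a_ping; (void)b_ping; (void)a_pong; (void)b_pong;
}
```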
@@ -948,7 +945,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
             GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();

         auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<FloatCShuffle*>(p_shared),
+            static_cast<FloatCShuffle*>(p_shared_0),
             c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());

         constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
...
@@ -1050,7 +1050,7 @@ struct ThreadwiseTensorSliceTransfer_v4
               typename DstBuffer>
     __device__ void Run(const SrcDesc&,
                         const SrcRefToOriginDisplacement&,
-                        const SrcBuffer& src_buf,
+                        const SrcBuffer& __restrict__ src_buf,
                         const DstDesc&,
                         const DstOriginIdx&,
                         DstBuffer& dst_buf) const
...
@@ -382,7 +382,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
     template <typename DstBuffer, index_t ThreadScratchId = 0>
     __device__ void RunWrite(const DstDesc& dst_desc,
-                             DstBuffer& dst_buf,
+                             DstBuffer& __restrict__ dst_buf,
                              Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
     {
         // if there is transpose, it's done here
...
@@ -29,12 +29,12 @@ struct DynamicBuffer
     ElementSpaceSize element_space_size_;
     T invalid_element_value_ = T{0};

-    __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
+    __host__ __device__ constexpr DynamicBuffer(T* __restrict__ p_data, ElementSpaceSize element_space_size)
         : p_data_{p_data}, element_space_size_{element_space_size}
     {
     }

-    __host__ __device__ constexpr DynamicBuffer(T* p_data,
+    __host__ __device__ constexpr DynamicBuffer(T* __restrict__ p_data,
                                                 ElementSpaceSize element_space_size,
                                                 T invalid_element_value)
         : p_data_{p_data},
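A minimal sketch of the DynamicBuffer pattern above (simplified: CK's real class also carries address-space, coherence, and invalid-element template machinery). Note that `__restrict__` on a constructor parameter only constrains aliasing within the constructor body itself; here it mainly documents the intent that the wrapped pointer is the sole handle to that memory:

```cpp
#include <hip/hip_runtime.h>

template <typename T>
struct BufferSketch
{
    T* p_data_;
    int element_space_size_;

    // the qualifier asserts p_data is not aliased by anything else visible
    // in this constructor's scope
    __host__ __device__ constexpr BufferSketch(T* __restrict__ p_data,
                                               int element_space_size)
        : p_data_{p_data}, element_space_size_{element_space_size}
    {
    }

    __host__ __device__ constexpr T& operator[](int i) const { return p_data_[i]; }
};
```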
@@ -424,7 +424,7 @@ template <
           typename X,
           typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
 __host__ __device__ constexpr auto
-make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
+make_dynamic_buffer(T* __restrict__ p, ElementSpaceSize element_space_size, X invalid_element_value)
 {
     return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false, coherence>{
         p, element_space_size, invalid_element_value};
...