Unverified commit d0e3a70a authored by darren-amd, committed by GitHub

Statically Cast Pointer Offset (#1631)

* explicit cast ptr offset

* formatting change
parent b6e74be1
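
Every hunk below applies the same change: the batch offset returned by the compute_ptr_offset_of_batch helper is explicitly widened with static_cast<long_index_t>(...) before it is handed to the wave-uniform broadcast (amd_wave_read_first_lane or __builtin_amdgcn_readfirstlane), so the broadcast operates on the full 64-bit value rather than on a possibly narrowed one. Below is a minimal host-side sketch of the narrowing hazard this guards against; the read_first_lane overloads are hypothetical identity stand-ins, not the real intrinsics, and the 32/64-bit widths assume CK's usual index_t/long_index_t definitions:

#include <cassert>
#include <cstdint>

using index_t      = int32_t; // CK's usual 32-bit index type
using long_index_t = int64_t; // CK's usual 64-bit index type

// Host-side stand-ins for a wave-uniform broadcast with 32- and 64-bit
// overloads (identity functions here so the sketch runs anywhere).
index_t      read_first_lane(index_t v) { return v; }
long_index_t read_first_lane(long_index_t v) { return v; }

int main()
{
    // A batch offset that needs more than 32 bits:
    const long_index_t offset = (long_index_t{1} << 33) + 128;

    // If the value reaches the broadcast through the 32-bit path, the
    // high bits are gone before the result is ever widened back:
    const long_index_t truncated = read_first_lane(static_cast<index_t>(offset));
    assert(truncated != offset); // only the low 32 bits (128) survive

    // Casting to long_index_t first selects the 64-bit overload, so the
    // full offset survives the broadcast:
    const long_index_t intact = read_first_lane(static_cast<long_index_t>(offset));
    assert(intact == offset);
    return 0;
}

If an implicit conversion at the call boundary ever routes the offset through a 32-bit path, no later widening can recover the lost high bits, which is presumably why the patch pins the 64-bit path explicitly at every call site.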
@@ -93,12 +93,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
...
@@ -60,12 +60,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
@@ -117,12 +117,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     // Pass two lds pointer is the key to tell compiler that ds_read/write
     // operate on different lds chunk at same time without order dependecy
...
@@ -98,12 +98,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t c_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t c_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
...
@@ -60,12 +60,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
@@ -155,12 +155,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
-    const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
...
@@ -121,10 +121,10 @@ struct GridwiseTensorRearrange
             __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
         // Global Memory
-        const index_t a_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-        const index_t c_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+        const index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+        const index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
         const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
...
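
For a sense of when this matters in practice, a quick numeric check (hypothetical sizes, not taken from the patch): with a batch stride of 2^20 elements, batch index 4096 already pushes the element offset to 2^32, one bit past what a 32-bit index_t can hold. A minimal sketch, again assuming index_t is int32_t and long_index_t is int64_t:

#include <cassert>
#include <cstdint>

using index_t      = int32_t;
using long_index_t = int64_t;

int main()
{
    const long_index_t stride = long_index_t{1} << 20; // elements per batch
    const index_t      g_idx  = 4096;                  // batch index

    // True offset, computed in 64 bits: 4096 * 2^20 = 2^32.
    const long_index_t wide = static_cast<long_index_t>(g_idx) * stride;

    // The same value squeezed through 32 bits wraps (modulo 2^32) to 0,
    // i.e. batch 0's data would be addressed instead of batch 4096's.
    const index_t narrow = static_cast<index_t>(wide);

    assert(wide == (long_index_t{1} << 32));
    assert(narrow == 0);
    return 0;
}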