"docs/git@developer.sourcefind.cn:guobj/qwen_lmdeploy.git" did not exist on "c3290cadcd8818deeaf9281f0494174308e04d79"
Commit 33b4b52c authored by Bartlomiej Kocot's avatar Bartlomiej Kocot
Browse files

Improve performance

parent 925ec9ea
...@@ -501,7 +501,7 @@ struct DeviceColumnToImageImpl ...@@ -501,7 +501,7 @@ struct DeviceColumnToImageImpl
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>( BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
arg.out_grid_desc_m_k_container_[i]); arg.out_grid_desc_m_k_container_[i]);
const index_t grid_size = const index_t grid_size =
block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]); block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
elapsed_time += launch_and_time_kernel(stream_config, elapsed_time += launch_and_time_kernel(stream_config,
kernel, kernel,
dim3(grid_size), dim3(grid_size),
......
...@@ -262,8 +262,9 @@ struct DeviceImageToColumnImpl ...@@ -262,8 +262,9 @@ struct DeviceImageToColumnImpl
const auto block_2_tile_map = const auto block_2_tile_map =
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>( BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
arg.out_grid_desc_m_k_); arg.out_grid_desc_m_k_);
const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_); const index_t grid_size =
const auto kernel = kernel_tensor_rearrange<InputGridDesc, block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_) * arg.G_;
const auto kernel = kernel_tensor_rearrange<InputGridDesc,
InputDataType, InputDataType,
OutputGridDesc, OutputGridDesc,
OutputDataType, OutputDataType,
......
...@@ -113,22 +113,24 @@ struct GridwiseTensorRearrange ...@@ -113,22 +113,24 @@ struct GridwiseTensorRearrange
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
tensor_operation::element_wise::PassThrough{}}; tensor_operation::element_wise::PassThrough{}};
for(index_t idx = 0; idx < batch_count; idx++) const index_t num_blocks_per_batch =
{ __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
// Global Memory const index_t g_idx =
const index_t a_batch_offset = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(idx));
const index_t c_batch_offset = // Global Memory
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(idx)); const index_t a_batch_offset =
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( const index_t c_batch_offset =
p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize()); __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize()); const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
copy_global_to_global.Run( auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf)); p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
}
copy_global_to_global.Run(
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
} }
__host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc, __host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment