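Improve performance: scale the launch grid size by G and have each block process a single batch instead of looping over every batch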

33b4b52c · Bartlomiej Kocot · 925ec9ea
Commit 33b4b52c authored Oct 26, 2023 by Bartlomiej Kocot
3 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
@@ -501,7 +501,7 @@ struct DeviceColumnToImageImpl
                    BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
                        arg.out_grid_desc_m_k_container_[i]);
                const index_t grid_size =
-                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]);
+                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
                elapsed_time += launch_and_time_kernel(stream_config,
                                                       kernel,
                                                       dim3(grid_size),

--- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
@@ -262,8 +262,9 @@ struct DeviceImageToColumnImpl
            const auto block_2_tile_map =
                BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
                    arg.out_grid_desc_m_k_);
-            const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_);
-            const auto kernel       = kernel_tensor_rearrange<InputGridDesc,
+            const index_t grid_size =
+                block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_) * arg.G_;
+            const auto kernel = kernel_tensor_rearrange<InputGridDesc,
                                                        InputDataType,
                                                        OutputGridDesc,
                                                        OutputDataType,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
@@ -113,22 +113,24 @@ struct GridwiseTensorRearrange
                make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
                tensor_operation::element_wise::PassThrough{}};

-        for(index_t idx = 0; idx < batch_count; idx++)
-        {
-            // Global Memory
-            const index_t a_batch_offset =
-                __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(idx));
-            const index_t c_batch_offset =
-                __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(idx));
-
-            const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
-            auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
-
-            copy_global_to_global.Run(
-                tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
-        }
+        const index_t num_blocks_per_batch =
+            __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+        const index_t g_idx =
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+
+        // Global Memory
+        const index_t a_batch_offset =
+            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        const index_t c_batch_offset =
+            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+
+        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
+        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
+
+        copy_global_to_global.Run(
+            tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
    }

    __host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc,