Commit 2f5ae075 authored by carlushuang's avatar carlushuang
Browse files

improve perf a little bit by swizzle block idx

parent 9860dad8
...@@ -202,12 +202,6 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout, ...@@ -202,12 +202,6 @@ struct DeviceGemmXdlStreamK : public DeviceGemm<ALayout,
hip_check_error(rtn); hip_check_error(rtn);
num_cu = dev_prop.multiProcessorCount; num_cu = dev_prop.multiProcessorCount;
printf("XXX occupancy:%d, num_cu:%d, BLOCK_SIZE:%d, LDS:%d\n",
occupancy,
num_cu,
BlockSize,
GridwiseGemm::GetSharedMemoryNumberOfByte());
return Argument{p_a, return Argument{p_a,
p_b, p_b,
p_c, p_c,
......
...@@ -635,7 +635,7 @@ struct BlockToCTileMap_3DGrid_KSplit ...@@ -635,7 +635,7 @@ struct BlockToCTileMap_3DGrid_KSplit
return true; return true;
} }
}; };
#include <stdlib.h>
template <uint32_t MPerBlock_, uint32_t NPerBlock_, uint32_t KPerBlock_> template <uint32_t MPerBlock_, uint32_t NPerBlock_, uint32_t KPerBlock_>
struct BlockToCTileMap_GemmStreamK struct BlockToCTileMap_GemmStreamK
{ {
...@@ -657,11 +657,22 @@ struct BlockToCTileMap_GemmStreamK ...@@ -657,11 +657,22 @@ struct BlockToCTileMap_GemmStreamK
uint32_t k_iters_per_big_block; uint32_t k_iters_per_big_block;
MDiv k_iters_per_tile; MDiv k_iters_per_tile;
MDiv n_tiles; MDiv n_tiles;
MDiv tile_swizzle_sub_m;
MDiv tile_swizzle_sub_m_rem;
//-------------------------------------- //--------------------------------------
static int env_get_int(const char* var_name, int default_int)
{
char* v = getenv(var_name);
int r = default_int;
if(v)
r = atoi(v);
return r;
}
// prefer construct on host // prefer construct on host
BlockToCTileMap_GemmStreamK( BlockToCTileMap_GemmStreamK(
uint32_t m, uint32_t n, uint32_t k, uint32_t num_cu, uint32_t occupancy) uint32_t m, uint32_t n, uint32_t k, uint32_t num_cu, uint32_t occupancy, uint32_t tile_swizzle_sub_m_factor = 8)
{ {
uint32_t num_tiles = uint32_t num_tiles =
math::integer_divide_ceil(m, MPerBlock) * math::integer_divide_ceil(n, NPerBlock); math::integer_divide_ceil(m, MPerBlock) * math::integer_divide_ceil(n, NPerBlock);
...@@ -760,6 +771,8 @@ struct BlockToCTileMap_GemmStreamK ...@@ -760,6 +771,8 @@ struct BlockToCTileMap_GemmStreamK
sk_num_blocks = 0; sk_num_blocks = 0;
} }
sk_num_blocks = env_get_int("sk_num_blocks", sk_num_blocks);
if(sk_num_blocks == 0) if(sk_num_blocks == 0)
{ {
sk_num_big_blocks = 0; sk_num_big_blocks = 0;
...@@ -791,13 +804,17 @@ struct BlockToCTileMap_GemmStreamK ...@@ -791,13 +804,17 @@ struct BlockToCTileMap_GemmStreamK
} }
} }
n_tiles = MDiv(math::integer_divide_ceil(n, NPerBlock)); n_tiles = MDiv(math::integer_divide_ceil(n, NPerBlock));
tile_swizzle_sub_m = MDiv(tile_swizzle_sub_m_factor);
tile_swizzle_sub_m_rem = MDiv(math::integer_divide_ceil(m, MPerBlock) % tile_swizzle_sub_m_factor);
printf("cu:%d, occupancy:%d, grids:%d, sk_num_big_blocks:%d, sk_num_blocks:%d, " printf("cu:%d, occupancy:%d, grids:%d, num_tiles:%d, dp_tiles:%d, sk_num_big_blocks:%d, sk_num_blocks:%d, "
"sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, dp_num_blocks:%d, " "sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, dp_num_blocks:%d, "
"k_iters_per_tile:%d, k_iters_per_big_block:%d\n", "k_iters_per_tile:%d, k_iters_per_big_block:%d\n",
num_cu, num_cu,
occupancy, occupancy,
get_grid_dims().x, get_grid_dims().x,
num_tiles,
dp_tiles,
sk_num_big_blocks, sk_num_big_blocks,
sk_num_blocks, sk_num_blocks,
sk_total_iters, sk_total_iters,
...@@ -859,12 +876,30 @@ struct BlockToCTileMap_GemmStreamK ...@@ -859,12 +876,30 @@ struct BlockToCTileMap_GemmStreamK
k_iters_per_tile.divmod(iter, tile_idx, iter_offset); k_iters_per_tile.divmod(iter, tile_idx, iter_offset);
} }
__device__ auto tile_to_spatial(uint32_t tile_idx) const __device__ auto tile_to_spatial(uint32_t tile_idx, uint32_t m, uint32_t /*n*/) const
{ {
// TODO:
uint32_t m_tile_idx, n_tile_idx; uint32_t m_tile_idx, n_tile_idx;
n_tiles.divmod(tile_idx, m_tile_idx, n_tile_idx); n_tiles.divmod(tile_idx, m_tile_idx, n_tile_idx);
return make_tuple(m_tile_idx, n_tile_idx); // return make_tuple(m_tile_idx, n_tile_idx);
// swizzle tile
uint32_t m_tiles = math::integer_divide_ceil(m, MPerBlock);
// uint32_t n_tiles = math::integer_divide_ceil(n, NPerBlock);
uint32_t quo_sub_m, rem_sub_m;
tile_swizzle_sub_m.divmod(m_tile_idx, quo_sub_m, rem_sub_m);
const auto sub_m_adapt = (m_tile_idx < (m_tiles - tile_swizzle_sub_m_rem.get())) ?
tile_swizzle_sub_m : tile_swizzle_sub_m_rem;
uint32_t m_tile_idx_sub0, m_tile_idx_sub1;
tile_swizzle_sub_m.divmod(m_tile_idx, m_tile_idx_sub0, m_tile_idx_sub1);
uint32_t tile_idx_local = n_tile_idx + m_tile_idx_sub1 * n_tiles.get();
uint32_t m_tile_idx_with_adapt, n_tile_idx_with_adapt;
sub_m_adapt.divmod(tile_idx_local, n_tile_idx_with_adapt, m_tile_idx_with_adapt);
return make_tuple(m_tile_idx_with_adapt + m_tile_idx_sub0 * tile_swizzle_sub_m.get(),
n_tile_idx_with_adapt);
} }
}; };
......
...@@ -474,7 +474,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk ...@@ -474,7 +474,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
block_mapping.get_block_itr(block_idx, iter_start, iter_end); block_mapping.get_block_itr(block_idx, iter_start, iter_end);
uint32_t total_iter_length = iter_end - iter_start; uint32_t total_iter_length = iter_end - iter_start;
// if(threadIdx.x == 0) // if(threadIdx.x == 0)
// printf("xxx bid:%d\n", static_cast<int>(blockIdx.x)); // printf("xxx bid:%d, is_sk_block:%d, is_dp_block:%d\n", static_cast<int>(blockIdx.x), is_sk_block, is_dp_block);
if(!is_sk_block && !is_dp_block) if(!is_sk_block && !is_dp_block)
return; return;
...@@ -485,7 +485,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk ...@@ -485,7 +485,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
uint32_t tile_idx, iter_offset; uint32_t tile_idx, iter_offset;
block_mapping.get_tile_idx_with_offset(iter_end - 1, tile_idx, iter_offset); block_mapping.get_tile_idx_with_offset(iter_end - 1, tile_idx, iter_offset);
iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1); iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
auto spatial_idx = block_mapping.tile_to_spatial(tile_idx); auto spatial_idx = block_mapping.tile_to_spatial(tile_idx, m, n);
const index_t m_block_data_idx_on_grid = const index_t m_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(spatial_idx[I0] * MPerBlock); __builtin_amdgcn_readfirstlane(spatial_idx[I0] * MPerBlock);
...@@ -496,13 +496,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk ...@@ -496,13 +496,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
const index_t k0_block_data_idx_on_grid = const index_t k0_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(iter_offset * K0PerBlock); __builtin_amdgcn_readfirstlane(iter_offset * K0PerBlock);
// if(threadIdx.x == 0) // if(threadIdx.x == 0)
// printf("[%s], bid:%d, block_idx:%d, tile_idx:%d(%d, %d, %d), iter_start:%d(%d | // printf("[%s], bid:%d, block_idx:%d, tile_idx:%d(%d, %d, %d), iter_start:%d(%d | %d), iter_end:%d, len:%d\n",
// %d), iter_end:%d, len:%d\n", // is_sk_block ? "sk_block" : (is_dp_block ? "dp_block" : "other "),
// is_sk_block ? "sk_block" : (is_dp_block ? "dp_block" : "other "), // static_cast<int>(blockIdx.x), block_idx, tile_idx, m_block_data_idx_on_grid,
// static_cast<int>(blockIdx.x), block_idx, tile_idx, m_block_data_idx_on_grid, // n_block_data_idx_on_grid, k0_block_data_idx_on_grid, iter_end -
// n_block_data_idx_on_grid, k0_block_data_idx_on_grid, iter_end - // current_iter_length, iter_offset, iter_start, iter_end, current_iter_length);
// current_iter_length, iter_offset, iter_start, iter_end, current_iter_length);
// A matrix blockwise copy // A matrix blockwise copy
auto a_blockwise_copy = auto a_blockwise_copy =
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment