"vscode:/vscode.git/clone" did not exist on "2d43094ffc9b1ee377651c6c8a358c81f0c96005"
Commit 3644f0ec authored by Adam Osewski's avatar Adam Osewski
Browse files

Launch grid size which is min of occupancy vs tile count

parent 3d345953
...@@ -567,13 +567,12 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo ...@@ -567,13 +567,12 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
EDataType, EDataType,
HasMainKBlockLoop, HasMainKBlockLoop,
CGlobalMemoryDataOperation>; CGlobalMemoryDataOperation>;
return LaunchKernel(kernel, arg, stream_config); return LaunchKernel(kernel, arg, dev_gemm_args, stream_config);
} }
template <typename KernelFunction> template <typename KernelFunction>
float LaunchKernel(const KernelFunction& kernel, int CalculateMaxOccupancyGridSize(const KernelFunction& kernel,
const Argument& arg, const StreamConfig& stream_config) const
const StreamConfig& stream_config) const
{ {
// Calculate max number of workgroups that can simultaneously reside on the CU. // Calculate max number of workgroups that can simultaneously reside on the CU.
int num_blocks = 0; int num_blocks = 0;
...@@ -592,13 +591,29 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo ...@@ -592,13 +591,29 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
<< std::endl; << std::endl;
} }
return cu_count * ck::math::min(num_blocks, CU_BLOCKS) * BLOCK_SUBSCRIPTION_FACTOR;
}
template <typename KernelFunction>
float LaunchKernel(const KernelFunction& kernel,
const Argument& arg,
const void* dev_gemm_args,
const StreamConfig& stream_config) const
{
int max_occupancy_grid_size = CalculateMaxOccupancyGridSize(kernel, stream_config);
// We launch the smaller of two workgroup counts: the number of tiles actually needed and
// the number of workgroups that maximizes GPU occupancy. This is because for some tile
// configurations the former is smaller than the latter. Launching too many workgroups
// means some of them would have to iterate through all gemm problem descriptors just to
// find out they have nothing to do, which is of course a waste of GPU cycles.
return launch_and_time_kernel( return launch_and_time_kernel(
stream_config, stream_config,
kernel, kernel,
dim3(cu_count * ck::math::min(num_blocks, CU_BLOCKS) * BLOCK_SUBSCRIPTION_FACTOR), dim3(ck::math::min(arg.grid_size_, max_occupancy_grid_size)),
dim3(BlockSize), dim3(BlockSize),
0, 0,
arg.p_workspace_, dev_gemm_args,
arg.grid_size_, arg.grid_size_,
arg.K_BATCH); arg.K_BATCH);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment