Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
3644f0ec
Commit
3644f0ec
authored
Jul 20, 2023
by
Adam Osewski
Browse files
Launch grid size which is min of occupancy vs tile count
parent
3d345953
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
21 additions
and
6 deletions
+21
-6
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle_tile_loop.hpp
...mpl/device_grouped_gemm_xdl_splitk_cshuffle_tile_loop.hpp
+21
-6
No files found.
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle_tile_loop.hpp
View file @
3644f0ec
...
...
@@ -567,12 +567,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
EDataType
,
HasMainKBlockLoop
,
CGlobalMemoryDataOperation
>
;
return
LaunchKernel
(
kernel
,
arg
,
stream_config
);
return
LaunchKernel
(
kernel
,
arg
,
dev_gemm_args
,
stream_config
);
}
template
<
typename
KernelFunction
>
float
LaunchKernel
(
const
KernelFunction
&
kernel
,
const
Argument
&
arg
,
int
CalculateMaxOccupancyGridSize
(
const
KernelFunction
&
kernel
,
const
StreamConfig
&
stream_config
)
const
{
// Calculate max number of workgroups that can simultaneously reside on the CU.
...
...
@@ -592,13 +591,29 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
<<
std
::
endl
;
}
return
cu_count
*
ck
::
math
::
min
(
num_blocks
,
CU_BLOCKS
)
*
BLOCK_SUBSCRIPTION_FACTOR
;
}
template
<
typename
KernelFunction
>
float
LaunchKernel
(
const
KernelFunction
&
kernel
,
const
Argument
&
arg
,
const
void
*
dev_gemm_args
,
const
StreamConfig
&
stream_config
)
const
{
int
max_occupancy_grid_size
=
CalculateMaxOccupancyGridSize
(
kernel
,
stream_config
);
// We launch the smaller number of workgroups from acutally needed tiles and the
// number of workgroups that maximize the GPU occupancy. That is because for some tile
// configuration the first is smaller than the latter. Launching too many workgroups
// mean some of them will have to iterate through all gemm problem descriptors just to
// find out they have nothing to do which is of course waste of GPU cycles.
return
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
cu_count
*
ck
::
math
::
min
(
num_blocks
,
CU_BLOCKS
)
*
BLOCK_SUBSCRIPTION_FACTOR
),
dim3
(
ck
::
math
::
min
(
arg
.
grid_size_
,
max_occupancy_grid_size
)
),
dim3
(
BlockSize
),
0
,
arg
.
p_workspace_
,
dev_gemm_args
,
arg
.
grid_size_
,
arg
.
K_BATCH
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment