Unverified Commit 136d750f authored by czhu-cohere's avatar czhu-cohere Committed by GitHub
Browse files

[Kernel] Improve machete memory bound perf (#21556)


Signed-off-by: default avatarczhu-cohere <conway.zhu@cohere.com>
parent b3caeb82
......@@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
Shape_NKL shape_mkl) {
auto layout = TVbNbKL_to_offset(shape_mkl);
return make_layout(coalesce(get<0>(layout)), get<1>(layout),
get<2>(layout));
// for 4-bit elements, having >= 64 values per column
// allows TMA to load full 32-byte sectors
auto inner_layout =
make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
return make_layout(inner_layout, get<1>(layout), get<2>(layout));
}
// ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment