Commit 281110cf authored by Qianfeng Zhang's avatar Qianfeng Zhang
Browse files

Move the codes for storing the first v_lds tile some later

parent 5f4bfa4a
...@@ -457,31 +457,6 @@ struct BlockFmhaPipelineQRKSVSAsync ...@@ -457,31 +457,6 @@ struct BlockFmhaPipelineQRKSVSAsync
auto p_compute = make_static_distributed_tensor<SMPLComputeDataType>( auto p_compute = make_static_distributed_tensor<SMPLComputeDataType>(
s.get_tile_distribution()); // Pcompute{j} s.get_tile_distribution()); // Pcompute{j}
__builtin_amdgcn_sched_barrier(0);
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
{
auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
shuffle_tile(v_shuffle_tmp, v_tiles[I0]);
auto v_lds_window_tmp =
get_slice_tile(v_lds_window, sequence<0, 0>{}, sequence<kN1, kK1>{});
store_tile(
v_lds_window_tmp,
tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch
}
else
{
auto v_lds_window_tmp =
get_slice_tile(v_lds_window, sequence<0, 0>{}, sequence<kN1, kK1>{});
store_tile(v_lds_window_tmp,
tile_elementwise_in(v_element_func, v_tiles[I0])); // store the prefetch
}
__builtin_amdgcn_sched_barrier(0);
static const auto get_validated_m = [](SMPLComputeDataType raw_m) { static const auto get_validated_m = [](SMPLComputeDataType raw_m) {
/// NOTICE: bias might be materialized mask including -inf values, need /// NOTICE: bias might be materialized mask including -inf values, need
/// consideration /// consideration
...@@ -565,6 +540,31 @@ struct BlockFmhaPipelineQRKSVSAsync ...@@ -565,6 +540,31 @@ struct BlockFmhaPipelineQRKSVSAsync
smem_ptr, seqlen_k_start + i_total_loops * kN0, p_compute, randval_dram_window); smem_ptr, seqlen_k_start + i_total_loops * kN0, p_compute, randval_dram_window);
} }
__builtin_amdgcn_sched_barrier(0);
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
{
auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
shuffle_tile(v_shuffle_tmp, v_tiles[I0]);
auto v_lds_window_tmp =
get_slice_tile(v_lds_window, sequence<0, 0>{}, sequence<kN1, kK1>{});
store_tile(
v_lds_window_tmp,
tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch
}
else
{
auto v_lds_window_tmp =
get_slice_tile(v_lds_window, sequence<0, 0>{}, sequence<kN1, kK1>{});
store_tile(v_lds_window_tmp,
tile_elementwise_in(v_element_func, v_tiles[I0])); // store the prefetch
}
__builtin_amdgcn_sched_barrier(0);
const auto p = const auto p =
cast_tile<PDataType>(tile_elementwise_in(p_compute_element_func, p_compute)); cast_tile<PDataType>(tile_elementwise_in(p_compute_element_func, p_compute));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment