Commit 55d1b19c authored by Qianfeng Zhang
Browse files

Move the store_tile() of the first v_tile earlier, ahead of the rowsum_p reduction

parent f881fa70
......@@ -624,6 +624,26 @@ struct BlockFmhaPipelineQRKSVSAsync
});
});
__builtin_amdgcn_sched_barrier(0x7f);
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
{
auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
shuffle_tile(v_shuffle_tmp, v_tiles[I0]);
store_tile(
v_lds_windows[I0],
tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch
}
else
{
store_tile(v_lds_windows[I0],
tile_elementwise_in(v_element_func, v_tiles[I0])); // store the prefetch
}
__builtin_amdgcn_sched_barrier(0);
auto rowsum_p = block_tile_reduce<SMPLComputeDataType>(
p_compute, sequence<1>{}, f_sum, SMPLComputeDataType{0}); // rowsum(Pcompute{j})
......@@ -666,26 +686,6 @@ struct BlockFmhaPipelineQRKSVSAsync
smem_ptr, seqlen_k_start + i_total_loops * kN0, p_compute, randval_dram_window);
}
__builtin_amdgcn_sched_barrier(0x7f);
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
{
auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
shuffle_tile(v_shuffle_tmp, v_tiles[I0]);
store_tile(
v_lds_windows[I0],
tile_elementwise_in(v_element_func, v_shuffle_tmp)); // store the prefetch
}
else
{
store_tile(v_lds_windows[I0],
tile_elementwise_in(v_element_func, v_tiles[I0])); // store the prefetch
}
__builtin_amdgcn_sched_barrier(0);
const auto p =
cast_tile<PDataType>(tile_elementwise_in(p_compute_element_func, p_compute));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment