Unverified commit 59965aff, authored by Vadim Gimpelson, committed by GitHub
Browse files

[BUGFIX] Fix `_dummy_run` missing `prepare_inputs_event` synchronization (#34866)


Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
parent b1c4f0b2
......@@ -4771,8 +4771,13 @@ class GPUModelRunner(
ubatch_slices=ubatch_slices_padded,
)
# If force_attention is True, we always capture attention. Otherwise,
# it only happens for cudagraph_runtime_mode=FULL.
# _dummy_run shares pinned CPU buffers (seq_lens, query_start_loc,
# etc.) with execute_model. It must participate in the same event
# protocol so that back-to-back dummy/real steps don't overwrite
# pinned memory while a prior non_blocking H2D DMA is still reading.
with self.synchronize_input_prep():
# If force_attention is True, we always capture attention.
# Otherwise, it only happens for cudagraph_runtime_mode=FULL.
if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
if create_mixed_batch:
# In the mixed batch mode (used for FI warmup), we use
......@@ -4795,7 +4800,7 @@ class GPUModelRunner(
num_tokens_padded=num_tokens_padded if pad_attn else None,
num_reqs=num_reqs_padded,
max_query_len=max_query_len,
ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices,
ubatch_slices=(ubatch_slices_padded if pad_attn else ubatch_slices),
for_cudagraph_capture=is_graph_capturing,
slot_mappings=slot_mappings_by_group,
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment