Unverified commit ad720aef, authored by Elvir Crnčević, committed by GitHub
Browse files

[Bugfix] Fix V1 dummy run writing NaN to KV cache null block (#39444)


Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Co-authored-by: Claude Sonnet 4 <noreply@anthropic.com>
parent 270e8a41
@@ -5356,12 +5356,18 @@ class GPUModelRunner(
         attn_metadata: PerLayerAttnMetadata | None = None
         slot_mappings_by_group, slot_mappings = self._get_slot_mappings(
-            num_tokens_padded=num_tokens,
+            num_tokens_padded=num_tokens_padded,
             num_reqs_padded=num_reqs_padded,
             num_tokens_unpadded=num_tokens_unpadded,
             ubatch_slices=ubatch_slices_padded,
         )
+        # Dummy runs have no real slot assignments — fill with -1 so
+        # concat_and_cache kernels skip the KV write.
+        if slot_mappings_by_group is not None:
+            for sm in slot_mappings_by_group.values():
+                sm.fill_(-1)
         # _dummy_run shares pinned CPU buffers (seq_lens, query_start_loc,
         # etc.) with execute_model. It must participate in the same event
         # protocol so that back-to-back dummy/real steps don't overwrite
...
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment