Unverified commit f186cfe7, authored by Woosuk Kwon, committed by GitHub
Browse files

[MRV2] Fix hanging issue with DeepSeek V3.2 by setting `skip_attn=False` (#39098)


Signed-off-by: WoosukKwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
parent dfa5062a
...@@ -391,12 +391,17 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -391,12 +391,17 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self, self,
num_tokens: int, num_tokens: int,
*args, *args,
skip_attn: bool = True, skip_attn: bool = False,
uniform_decode: bool = False, uniform_decode: bool = False,
skip_eplb: bool = False, skip_eplb: bool = False,
is_profile: bool = False, is_profile: bool = False,
**kwargs, **kwargs,
) -> tuple[torch.Tensor | None, torch.Tensor | None]: ) -> tuple[torch.Tensor | None, torch.Tensor | None]:
if skip_attn and not is_profile:
raise ValueError(
"skip_attn must only be True for initial memory profiling."
)
# Create a dummy scheduler output. # Create a dummy scheduler output.
num_reqs = min(num_tokens, self.max_num_reqs) num_reqs = min(num_tokens, self.max_num_reqs)
if uniform_decode: if uniform_decode:
...@@ -988,6 +993,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -988,6 +993,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if not skip_attn_for_dummy_run: if not skip_attn_for_dummy_run:
block_tables, slot_mappings = self.prepare_dummy_attn(input_batch) block_tables, slot_mappings = self.prepare_dummy_attn(input_batch)
else: else:
assert batch_desc.cg_mode != CUDAGraphMode.FULL, (
"Attention metadata must be prepared for dummy runs when using "
"FULL cudagraph mode."
)
block_tables = None block_tables = None
slot_mappings = None slot_mappings = None
# FIXME(woosuk): Fix warmup for LoRA. # FIXME(woosuk): Fix warmup for LoRA.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment