Commit 89da8d9d, authored by Tao He, committed by simon-mo
Browse files

[Qwen3Next] Fixes the cuda graph capture conditions under large batch sizes (#24660) (#24667)


Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
parent 01085b13
@@ -209,7 +209,8 @@ class GDNAttentionMetadataBuilder(
 # prepare tensors for cudagraph
 if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0
-and num_spec_decodes <= self.decode_cudagraph_max_bs):
+and num_spec_decodes <= self.decode_cudagraph_max_bs
+and m.num_actual_tokens <= self.decode_cudagraph_max_bs):
 num_total_tokens = self.vllm_config.pad_for_cudagraph(
 m.num_actual_tokens)
 batch_size = num_total_tokens // (self.num_spec + 1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment