Unverified commit af7fc84f, authored by Woosuk Kwon and committed by GitHub
Browse files

[BugFix][Minor] Fix full cuda graph bug when max_num_seqs < 512 (#19171)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent 0678b522
@@ -1737,7 +1737,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 # has num_tokens in total.
 assert num_tokens <= self.scheduler_config.max_num_batched_tokens
 max_num_reqs = self.scheduler_config.max_num_seqs
-num_reqs = max_num_reqs if num_tokens >= max_num_reqs else num_tokens
+num_reqs = min(num_tokens, max_num_reqs)
 min_tokens_per_req = num_tokens // num_reqs
 num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
 num_scheduled_tokens_list[-1] += num_tokens % num_reqs
@@ -1765,7 +1765,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 self.kv_cache_config.kv_cache_groups):
 attn_metadata_i = (
 self.attn_metadata_builders[kv_cache_group_id].build(
-num_reqs=num_tokens,
+num_reqs=num_reqs,
 num_actual_tokens=num_tokens,
 max_query_len=num_tokens,
 common_prefix_len=0,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment