Commit fc7980db authored by zhuwenwen
Browse files

Merge tag 'v0.15.1' into v0.15.1-ori

parents 3eab7fef 1892993b
...@@ -1284,7 +1284,7 @@ class Scheduler(SchedulerInterface): ...@@ -1284,7 +1284,7 @@ class Scheduler(SchedulerInterface):
scheduled_spec_token_ids = ( scheduled_spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(req_id) scheduler_output.scheduled_spec_decode_tokens.get(req_id)
) )
if scheduled_spec_token_ids: if scheduled_spec_token_ids and generated_token_ids:
num_draft_tokens = len(scheduled_spec_token_ids) num_draft_tokens = len(scheduled_spec_token_ids)
num_accepted = len(generated_token_ids) - 1 num_accepted = len(generated_token_ids) - 1
num_rejected = num_draft_tokens - num_accepted num_rejected = num_draft_tokens - num_accepted
......
...@@ -1382,12 +1382,14 @@ class GPUModelRunner( ...@@ -1382,12 +1382,14 @@ class GPUModelRunner(
num_scheduled_tokens: dict[str, int], num_scheduled_tokens: dict[str, int],
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
num_reqs: int, num_reqs: int,
for_cudagraph_capture: bool = False,
) -> tuple[torch.Tensor | None, np.ndarray | None]: ) -> tuple[torch.Tensor | None, np.ndarray | None]:
if not isinstance(kv_cache_spec, CrossAttentionSpec): if not isinstance(kv_cache_spec, CrossAttentionSpec):
return None, None return None, None
# Zero out buffer for padding requests that are not actually scheduled (CGs) # Zero out buffer for padding requests that are not actually scheduled (CGs)
self.encoder_seq_lens.np[:num_reqs] = 0 self.encoder_seq_lens.np[:num_reqs] = 0
# Build encoder_seq_lens array mapping request indices to # Build encoder_seq_lens array mapping request indices to
# encoder lengths for inputs scheduled in this batch # encoder lengths for inputs scheduled in this batch
for req_id in num_scheduled_tokens: for req_id in num_scheduled_tokens:
...@@ -1404,6 +1406,15 @@ class GPUModelRunner( ...@@ -1404,6 +1406,15 @@ class GPUModelRunner(
feature.mm_position.length for feature in req_state.mm_features feature.mm_position.length for feature in req_state.mm_features
) )
self.encoder_seq_lens.np[req_index] = encoder_input_tokens self.encoder_seq_lens.np[req_index] = encoder_input_tokens
if for_cudagraph_capture:
# During CUDA graph capture, we need to use realistic encoder lengths
# so that max_seqlen_k is captured with the correct value.
max_encoder_len = getattr(
self.model_config.hf_config,
"max_source_positions",
self.max_encoder_len,
)
self.encoder_seq_lens.np[:num_reqs] = max_encoder_len
self.encoder_seq_lens.copy_to_gpu(num_reqs) self.encoder_seq_lens.copy_to_gpu(num_reqs)
encoder_seq_lens = self.encoder_seq_lens.gpu[:num_reqs] encoder_seq_lens = self.encoder_seq_lens.gpu[:num_reqs]
...@@ -1821,6 +1832,7 @@ class GPUModelRunner( ...@@ -1821,6 +1832,7 @@ class GPUModelRunner(
num_scheduled_tokens or {}, num_scheduled_tokens or {},
kv_cache_group.kv_cache_spec, kv_cache_group.kv_cache_spec,
num_reqs_padded, num_reqs_padded,
for_cudagraph_capture=for_cudagraph_capture,
) )
if kv_cache_gid > 0: if kv_cache_gid > 0:
cm.block_table_tensor = _get_block_table(kv_cache_gid) cm.block_table_tensor = _get_block_table(kv_cache_gid)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment