Commit 02689420 authored by xuxz
Browse files

Merge branch 'v0.9.2-dev' into 'v0.9.2-dev-add_connector'

# Conflicts:
#   vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
parents ef362942 fa683b07
...@@ -465,7 +465,7 @@ class V1ZeroModelRunner(GPUModelRunner): ...@@ -465,7 +465,7 @@ class V1ZeroModelRunner(GPUModelRunner):
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim. # make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
if self.ep_sp: if self.ep_sp or self.enable_dp_attention:
num_input_tokens = round_up(num_scheduled_tokens, tp_size) num_input_tokens = round_up(num_scheduled_tokens, tp_size)
if (self.use_cuda_graph if (self.use_cuda_graph
and num_input_tokens <= self.cudagraph_batch_sizes[-1]): and num_input_tokens <= self.cudagraph_batch_sizes[-1]):
...@@ -796,6 +796,7 @@ class V1ZeroModelRunner(GPUModelRunner): ...@@ -796,6 +796,7 @@ class V1ZeroModelRunner(GPUModelRunner):
req_state = self.requests[req_id] req_state = self.requests[req_id]
token_idx = self.last_sampled_token_lens[req_idx] token_idx = self.last_sampled_token_lens[req_idx]
if token_idx == -1: if token_idx == -1:
self.fix_sampled_token_ids[req_idx].clear()
continue continue
fix_len = len(self.fix_sampled_token_ids[req_idx]) fix_len = len(self.fix_sampled_token_ids[req_idx])
req_state.output_token_ids[token_idx:token_idx + fix_len] = self.fix_sampled_token_ids[req_idx] req_state.output_token_ids[token_idx:token_idx + fix_len] = self.fix_sampled_token_ids[req_idx]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment