Commit 596c18f6 authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev' of http://10.16.6.30/dcutoolkit/deeplearing/vllm into v0.9.2-dev

parents dc2aff4c 5e1ca86d
...@@ -38,6 +38,7 @@ class CachedRequestState: ...@@ -38,6 +38,7 @@ class CachedRequestState:
block_ids: tuple[list[int], ...] block_ids: tuple[list[int], ...]
num_computed_tokens: int num_computed_tokens: int
output_token_ids: list[int] output_token_ids: list[int]
spec_token_ids: list[int] = None
mrope_positions: Optional[torch.Tensor] = None mrope_positions: Optional[torch.Tensor] = None
mrope_position_delta: Optional[int] = None mrope_position_delta: Optional[int] = None
...@@ -288,9 +289,16 @@ class InputBatch: ...@@ -288,9 +289,16 @@ class InputBatch:
end_idx = start_idx + len(request.output_token_ids) end_idx = start_idx + len(request.output_token_ids)
self.token_ids_cpu[req_index, self.token_ids_cpu[req_index,
start_idx:end_idx] = request.output_token_ids start_idx:end_idx] = request.output_token_ids
num_spec_tokens = 0
if request.spec_token_ids != None:
num_spec_tokens = len(request.spec_token_ids)
self.token_ids_cpu[req_index,
end_idx:end_idx + num_spec_tokens] = request.spec_token_ids
# Number of token ids in token_ids_cpu. # Number of token ids in token_ids_cpu.
# NOTE(woosuk): This may include spec decode tokens. # NOTE(woosuk): This may include spec decode tokens.
self.num_tokens[req_index] = request.num_tokens self.num_tokens[req_index] = request.num_tokens + num_spec_tokens
# Number of tokens without spec decode tokens. # Number of tokens without spec decode tokens.
self.num_tokens_no_spec[req_index] = request.num_tokens self.num_tokens_no_spec[req_index] = request.num_tokens
......
...@@ -478,6 +478,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -478,6 +478,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Update the cached states. # Update the cached states.
req_state.num_computed_tokens = num_computed_tokens req_state.num_computed_tokens = num_computed_tokens
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
if not is_last_rank: if not is_last_rank:
# When using PP, the scheduler sends the sampled tokens back, # When using PP, the scheduler sends the sampled tokens back,
...@@ -494,6 +496,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -494,6 +496,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
elif num_new_tokens > 0: elif num_new_tokens > 0:
req_state.output_token_ids.extend( req_state.output_token_ids.extend(
new_token_ids[-num_new_tokens:]) new_token_ids[-num_new_tokens:])
if len(spec_token_ids) > 0:
req_state.spec_token_ids = spec_token_ids
# Update the block IDs. # Update the block IDs.
if not resumed_from_preemption: if not resumed_from_preemption:
...@@ -533,8 +537,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -533,8 +537,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.input_batch.num_tokens[req_index] = end_token_index self.input_batch.num_tokens[req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu. # Add spec_token_ids to token_ids_cpu.
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
if spec_token_ids: if spec_token_ids:
num_spec_tokens = len(spec_token_ids) num_spec_tokens = len(spec_token_ids)
start_index = self.input_batch.num_tokens_no_spec[req_index] start_index = self.input_batch.num_tokens_no_spec[req_index]
...@@ -631,7 +633,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -631,7 +633,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# where M is the max_model_len. # where M is the max_model_len.
token_indices = (positions_np + token_indices = (positions_np +
req_indices * self.input_batch.token_ids_cpu.shape[1]) req_indices * self.input_batch.token_ids_cpu.shape[1])
# NOTE(woosuk): We use torch.index_select instead of np.take here # NOTE(woosuk): We use torch.index_select instead of np.take here
# because torch.index_select is much faster than np.take for large # because torch.index_select is much faster than np.take for large
# tensors. # tensors.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment