Commit a74f053c authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-lzg' into 'v0.9.2-dev'

fix issue from merge

See merge request dcutoolkit/deeplearing/vllm!184
parents 683c4868 2e66885c
...@@ -348,7 +348,6 @@ class V1ZeroModelRunner(GPUModelRunner): ...@@ -348,7 +348,6 @@ class V1ZeroModelRunner(GPUModelRunner):
else: else:
block_table = None block_table = None
num_rejected_tokens = None
if spec_decode_metadata is None: if spec_decode_metadata is None:
# input_ids can be None for multimodal models. # input_ids can be None for multimodal models.
target_token_ids = self.input_ids[:num_scheduled_tokens] target_token_ids = self.input_ids[:num_scheduled_tokens]
...@@ -364,21 +363,15 @@ class V1ZeroModelRunner(GPUModelRunner): ...@@ -364,21 +363,15 @@ class V1ZeroModelRunner(GPUModelRunner):
cu_num_tokens = eagle_attn_metadata.query_start_loc cu_num_tokens = eagle_attn_metadata.query_start_loc
else: else:
# TODO(woosuk): Refactor this. # TODO(woosuk): Refactor this.
num_draft_tokens = spec_decode_metadata.num_draft_tokens num_accepted_tokens = [len(s) - 1 for s in sampled_token_ids]
num_rejected_tokens = [ num_accepted_tokens_tensor = async_tensor_h2d(
n + 1 - len(sampled_token_ids[i]) if n > 0 else 0 num_accepted_tokens,
for i, n in enumerate(num_draft_tokens)
]
num_rejected_tokens_tensor = async_tensor_h2d(
num_rejected_tokens,
dtype=torch.int32, dtype=torch.int32,
target_device=self.device, target_device=self.device,
pin_memory=True) pin_memory=True)
num_tokens = num_scheduled_tokens - sum(num_rejected_tokens)
cu_num_tokens, token_indices = self.drafter.prepare_inputs( cu_num_tokens, token_indices = self.drafter.prepare_inputs(
eagle_attn_metadata.query_start_loc, eagle_attn_metadata.query_start_loc,
num_rejected_tokens_tensor, num_accepted_tokens_tensor,
num_tokens,
) )
target_token_ids = self.input_ids[token_indices] target_token_ids = self.input_ids[token_indices]
# TODO(woosuk): Support M-RoPE. # TODO(woosuk): Support M-RoPE.
...@@ -399,7 +392,7 @@ class V1ZeroModelRunner(GPUModelRunner): ...@@ -399,7 +392,7 @@ class V1ZeroModelRunner(GPUModelRunner):
cu_num_tokens=cu_num_tokens, cu_num_tokens=cu_num_tokens,
block_table=block_table, block_table=block_table,
sampling_metadata=sampling_metadata, sampling_metadata=sampling_metadata,
num_rejected_tokens=num_rejected_tokens decoding=spec_decode_metadata is not None
) )
spec_token_ids = np.ones(draft_token_ids.shape, dtype=int).tolist() spec_token_ids = np.ones(draft_token_ids.shape, dtype=int).tolist()
self.last_draft_token_ids = draft_token_ids self.last_draft_token_ids = draft_token_ids
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment