Commit 4f359a91 authored by zhuwenwen
Browse files

fix issue from merge

parent e85207b7
......@@ -347,7 +347,6 @@ class V1ZeroModelRunner(GPUModelRunner):
 else:
 block_table = None
-num_rejected_tokens = None
 if spec_decode_metadata is None:
 # input_ids can be None for multimodal models.
 target_token_ids = self.input_ids[:num_scheduled_tokens]
......@@ -363,21 +362,15 @@ class V1ZeroModelRunner(GPUModelRunner):
 cu_num_tokens = eagle_attn_metadata.query_start_loc
 else:
 # TODO(woosuk): Refactor this.
-num_draft_tokens = spec_decode_metadata.num_draft_tokens
-num_rejected_tokens = [
-n + 1 - len(sampled_token_ids[i]) if n > 0 else 0
-for i, n in enumerate(num_draft_tokens)
-]
-num_rejected_tokens_tensor = async_tensor_h2d(
-num_rejected_tokens,
+num_accepted_tokens = [len(s) - 1 for s in sampled_token_ids]
+num_accepted_tokens_tensor = async_tensor_h2d(
+num_accepted_tokens,
 dtype=torch.int32,
 target_device=self.device,
 pin_memory=True)
-num_tokens = num_scheduled_tokens - sum(num_rejected_tokens)
 cu_num_tokens, token_indices = self.drafter.prepare_inputs(
 eagle_attn_metadata.query_start_loc,
-num_rejected_tokens_tensor,
-num_tokens,
+num_accepted_tokens_tensor,
 )
 target_token_ids = self.input_ids[token_indices]
 # TODO(woosuk): Support M-RoPE.
......@@ -398,7 +391,7 @@ class V1ZeroModelRunner(GPUModelRunner):
 cu_num_tokens=cu_num_tokens,
 block_table=block_table,
 sampling_metadata=sampling_metadata,
-num_rejected_tokens=num_rejected_tokens
+decoding=spec_decode_metadata is not None
 )
 spec_token_ids = np.ones(draft_token_ids.shape, dtype=int).tolist()
 self.last_draft_token_ids = draft_token_ids
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment