Unverified commit d0b40297, authored by Ekagra Ranjan and committed by GitHub
Browse files

[Bugfix][Spec Decode] Avoid double call of Ngram CPU (#36952)


Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
parent 6341d430
......@@ -4247,15 +4247,6 @@ class GPUModelRunner(
self.input_batch.token_ids_cpu,
slot_mappings=slot_mappings,
)
if isinstance(self.drafter, NgramProposer):
assert isinstance(sampled_token_ids, list), (
"sampled_token_ids should be a python list when ngram is used."
)
draft_token_ids = self.drafter.propose(
sampled_token_ids,
self.input_batch.num_tokens_no_spec,
self.input_batch.token_ids_cpu,
)
elif spec_config.use_ngram_gpu():
assert isinstance(self.drafter, NgramProposerGPU)
(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment