Commit 0ee425a6 authored by lizhigong
Browse files

Pause speculative decoding with zero-overhead scheduling; develop tbo first

parent 7d224eb2
......@@ -208,6 +208,9 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
if draft_model_config.hf_config.model_type == "eagle":
enable_lm_head_weight_load = True
if is_zero_overhead():
assert False, (
"speculative decoding not support zero overhead scheduler yet"
)
from vllm.zero_overhead.spec_decode.muti_step_worker import ZeroOverheadMultiStepWorker
proposer_worker = ZeroOverheadMultiStepWorker(**draft_worker_kwargs)
else:
......
......@@ -301,6 +301,7 @@ class ZeroOverheadEngine(LLMEngine):
) = self.scheduler[virtual_engine].schedule()
if self.last_record is not None:
last_sampler = self.last_record[1]
spec_step = get_spec_step()
if spec_step == SpecStepKind.KIND_DEFAULT:
self.async_d2h = last_sampler.sampled_token_ids_tensor.to('cpu', non_blocking=True)
elif spec_step == SpecStepKind.SCORE_DECODE:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment