Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0ee425a6
Commit
0ee425a6
authored
May 08, 2025
by
lizhigong
Browse files
pause speculative decoding with zero overhead scheduling, develop tbo first
parent
7d224eb2
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
4 additions
and
0 deletions
+4
-0
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+3
-0
vllm/zero_overhead/llm_engine.py
vllm/zero_overhead/llm_engine.py
+1
-0
No files found.
vllm/spec_decode/spec_decode_worker.py
View file @
0ee425a6
...
@@ -208,6 +208,9 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
...
@@ -208,6 +208,9 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
if
draft_model_config
.
hf_config
.
model_type
==
"eagle"
:
if
draft_model_config
.
hf_config
.
model_type
==
"eagle"
:
enable_lm_head_weight_load
=
True
enable_lm_head_weight_load
=
True
if
is_zero_overhead
():
if
is_zero_overhead
():
assert
False
,
(
"speculative decoding not support zero overhead scheduler yet"
)
from
vllm.zero_overhead.spec_decode.muti_step_worker
import
ZeroOverheadMultiStepWorker
from
vllm.zero_overhead.spec_decode.muti_step_worker
import
ZeroOverheadMultiStepWorker
proposer_worker
=
ZeroOverheadMultiStepWorker
(
**
draft_worker_kwargs
)
proposer_worker
=
ZeroOverheadMultiStepWorker
(
**
draft_worker_kwargs
)
else
:
else
:
...
...
vllm/zero_overhead/llm_engine.py
View file @
0ee425a6
...
@@ -301,6 +301,7 @@ class ZeroOverheadEngine(LLMEngine):
...
@@ -301,6 +301,7 @@ class ZeroOverheadEngine(LLMEngine):
)
=
self
.
scheduler
[
virtual_engine
].
schedule
()
)
=
self
.
scheduler
[
virtual_engine
].
schedule
()
if
self
.
last_record
is
not
None
:
if
self
.
last_record
is
not
None
:
last_sampler
=
self
.
last_record
[
1
]
last_sampler
=
self
.
last_record
[
1
]
spec_step
=
get_spec_step
()
if
spec_step
==
SpecStepKind
.
KIND_DEFAULT
:
if
spec_step
==
SpecStepKind
.
KIND_DEFAULT
:
self
.
async_d2h
=
last_sampler
.
sampled_token_ids_tensor
.
to
(
'cpu'
,
non_blocking
=
True
)
self
.
async_d2h
=
last_sampler
.
sampled_token_ids_tensor
.
to
(
'cpu'
,
non_blocking
=
True
)
elif
spec_step
==
SpecStepKind
.
SCORE_DECODE
:
elif
spec_step
==
SpecStepKind
.
SCORE_DECODE
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment