Commit 024e595d authored by lizhigong's avatar lizhigong
Browse files

add server zero overhead engine

parent 08c4bafa
...@@ -280,6 +280,8 @@ class _AsyncLLMEngine(LLMEngine): ...@@ -280,6 +280,8 @@ class _AsyncLLMEngine(LLMEngine):
""" """
# these are cached outputs from previous iterations. None if on first # these are cached outputs from previous iterations. None if on first
# iteration # iteration
if self.zero_overhead:
return self.zero_overhead_step()
cached_outputs = self.cached_scheduler_outputs[virtual_engine] cached_outputs = self.cached_scheduler_outputs[virtual_engine]
seq_group_metadata_list = cached_outputs.seq_group_metadata_list seq_group_metadata_list = cached_outputs.seq_group_metadata_list
scheduler_outputs = cached_outputs.scheduler_outputs scheduler_outputs = cached_outputs.scheduler_outputs
......
...@@ -60,7 +60,7 @@ from vllm.worker.model_runner_base import ( ...@@ -60,7 +60,7 @@ from vllm.worker.model_runner_base import (
_init_attn_metadata_from_tensor_dict, _init_attn_metadata_from_tensor_dict,
_init_sampling_metadata_from_tensor_dict) _init_sampling_metadata_from_tensor_dict)
from vllm.model_executor.layers.ops.update_input import UpdateInputTokens from vllm.model_executor.layers.update_input import UpdateInputTokens
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment