Unverified Commit c5832d2a authored by Murali Andoorveedu's avatar Murali Andoorveedu Committed by GitHub
Browse files

[Core] Pipeline Parallel Support (#4412)


Signed-off-by: default avatarMuralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
parent 15aba081
......@@ -12,7 +12,8 @@ from vllm.distributed import broadcast_tensor_dict
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model
from vllm.sampling_params import SamplingParams
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData,
SequenceGroupMetadata)
from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad
from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
from vllm.worker.model_runner_base import (
......@@ -190,6 +191,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
virtual_engine: int = 0,
) -> ModelInputForXPU:
multi_modal_input = None
if self.is_driver_worker:
......@@ -334,6 +336,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
self,
model_input: ModelInputForXPU,
kv_caches: List[torch.Tensor],
intermediate_tensors: Optional[IntermediateTensors] = None,
num_steps: int = 1,
) -> Optional[List[SamplerOutput]]:
if num_steps > 1:
......
......@@ -85,8 +85,8 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
)
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: CacheEngine
self.gpu_cache: List[torch.Tensor]
self.cache_engine: List[CacheEngine]
self.gpu_cache: Optional[List[List[torch.Tensor]]]
def init_device(self) -> None:
if self.device_config.device.type == "xpu" and is_xpu():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment