Unverified commit 9b018700, authored by Li, Jiang and committed by GitHub
Browse files

[Bugfix] Fix cuda event usage with CPU model runner (#23643)


Signed-off-by: jiang1.li <jiang1.li@intel.com>
parent 44ac25ea
...@@ -11,6 +11,7 @@ from vllm.logger import init_logger ...@@ -11,6 +11,7 @@ from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1 from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1
from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from vllm.v1.worker.utils import CpuGpuBuffer
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
...@@ -21,7 +22,8 @@ logger = init_logger(__name__) ...@@ -21,7 +22,8 @@ logger = init_logger(__name__)
class CPUModelRunner(GPUModelRunner): class CPUModelRunner(GPUModelRunner):
def __init__(self, vllm_config: VllmConfig, device: torch.device): def __init__(self, vllm_config: VllmConfig, device: torch.device):
super().__init__(vllm_config, device) with _torch_cuda_wrapper():
super().__init__(vllm_config, device)
assert device == torch.device("cpu") assert device == torch.device("cpu")
assert self.speculative_config is None, "spec decode is not supported." assert self.speculative_config is None, "spec decode is not supported."
...@@ -71,8 +73,8 @@ class CPUModelRunner(GPUModelRunner): ...@@ -71,8 +73,8 @@ class CPUModelRunner(GPUModelRunner):
setattr(obj, device_attr_name, cpu_tensor) setattr(obj, device_attr_name, cpu_tensor)
for k, v in vars(self).items(): for k, v in vars(self).items():
if k.endswith("_cpu") and isinstance(v, torch.Tensor): if isinstance(v, CpuGpuBuffer):
replace_tensor(self, k, k[:-4]) v.gpu = v.cpu
for k, v in vars(self.input_batch).items(): for k, v in vars(self.input_batch).items():
if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor): if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor):
...@@ -108,6 +110,26 @@ class CPUModelRunner(GPUModelRunner): ...@@ -108,6 +110,26 @@ class CPUModelRunner(GPUModelRunner):
def _sync_device(self) -> None: def _sync_device(self) -> None:
pass pass
def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
return sampled_token_ids.tolist()
@contextmanager
def _torch_cuda_wrapper():
class _EventPlaceholder:
def __init__(self, *args, **kwargs) -> None:
self.record = lambda: None
self.synchronize = lambda: None
try:
cuda_event = torch.cuda.Event
torch.cuda.Event = _EventPlaceholder
yield
finally:
torch.cuda.Event = cuda_event
@contextmanager @contextmanager
def _set_global_compilation_settings(config: VllmConfig): def _set_global_compilation_settings(config: VllmConfig):
......
...@@ -321,7 +321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -321,7 +321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
(self.max_model_len, 1), (self.max_model_len, 1),
dtype=torch.int64, dtype=torch.int64,
device="cpu", device="cpu",
pin_memory=True) pin_memory=self.pin_memory)
def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer:
return CpuGpuBuffer(*args, return CpuGpuBuffer(*args,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment