Unverified commit 59389c92, authored by Nick Hill, committed by GitHub
Browse files

[BugFix][CPU] Fix CPU worker dependency on cumem_allocator (#20696)


Signed-off-by: Nick Hill <nhill@redhat.com>
parent 8f2720de
...@@ -11,7 +11,6 @@ import torch.nn as nn ...@@ -11,7 +11,6 @@ import torch.nn as nn
import vllm.envs as envs import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.distributed import (ensure_model_parallel_initialized, from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment, init_distributed_environment,
set_custom_all_reduce) set_custom_all_reduce)
...@@ -79,6 +78,8 @@ class Worker(WorkerBase): ...@@ -79,6 +78,8 @@ class Worker(WorkerBase):
self.profiler = None self.profiler = None
def sleep(self, level: int = 1) -> None: def sleep(self, level: int = 1) -> None:
from vllm.device_allocator.cumem import CuMemAllocator
free_bytes_before_sleep = torch.cuda.mem_get_info()[0] free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
# Save the buffers before level 2 sleep # Save the buffers before level 2 sleep
...@@ -101,6 +102,8 @@ class Worker(WorkerBase): ...@@ -101,6 +102,8 @@ class Worker(WorkerBase):
used_bytes / GiB_bytes) used_bytes / GiB_bytes)
def wake_up(self, tags: Optional[list[str]] = None) -> None: def wake_up(self, tags: Optional[list[str]] = None) -> None:
from vllm.device_allocator.cumem import CuMemAllocator
allocator = CuMemAllocator.get_instance() allocator = CuMemAllocator.get_instance()
allocator.wake_up(tags) allocator.wake_up(tags)
...@@ -174,6 +177,8 @@ class Worker(WorkerBase): ...@@ -174,6 +177,8 @@ class Worker(WorkerBase):
# to hijack tensor allocation. # to hijack tensor allocation.
def load_model(self) -> None: def load_model(self) -> None:
if self.vllm_config.model_config.enable_sleep_mode: if self.vllm_config.model_config.enable_sleep_mode:
from vllm.device_allocator.cumem import CuMemAllocator
allocator = CuMemAllocator.get_instance() allocator = CuMemAllocator.get_instance()
assert allocator.get_current_usage() == 0, ( assert allocator.get_current_usage() == 0, (
"Sleep mode can only be " "Sleep mode can only be "
...@@ -241,7 +246,10 @@ class Worker(WorkerBase): ...@@ -241,7 +246,10 @@ class Worker(WorkerBase):
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config.""" """Allocate GPU KV cache with the specified kv_cache_config."""
if self.vllm_config.model_config.enable_sleep_mode: if self.vllm_config.model_config.enable_sleep_mode:
from vllm.device_allocator.cumem import CuMemAllocator
allocator = CuMemAllocator.get_instance() allocator = CuMemAllocator.get_instance()
context = allocator.use_memory_pool(tag="kv_cache") context = allocator.use_memory_pool(tag="kv_cache")
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment