"vllm/vscode:/vscode.git/clone" did not exist on "541a2ef892720489f770569417bc1bc4436dbb21"
Unverified Commit 59d53066 authored by Wei Zhao's avatar Wei Zhao Committed by GitHub
Browse files

[Feature] Support CPU Offloading without Pytorch Pinned Memory that leads to...


[Feature] Support CPU Offloading without Pytorch Pinned Memory that leads to doubled allocation (#32993)
Signed-off-by: default avatarwzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: default avatarMichael Goin <mgoin64@gmail.com>
parent 4a9952ec
......@@ -2,33 +2,58 @@
#include <torch/cuda.h>
#include <cuda_runtime.h>
// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
// memory, and that UVA (Unified Virtual Addressing) is enabled.
// This function assumes that `cpu_tensor` is a CPU tensor,
// and that UVA (Unified Virtual Addressing) is enabled.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
// Get raw host pointer from CPU tensor
void* host_ptr = cpu_tensor.data_ptr();
// handle empty tensor
if (cpu_tensor.numel() == 0) {
return torch::empty(cpu_tensor.sizes(),
cpu_tensor.options().device(torch::kCUDA));
}
if (cpu_tensor.is_pinned()) {
// If CPU tensor is pinned, directly get the device pointer.
void* host_ptr = const_cast<void*>(cpu_tensor.data_ptr());
void* device_ptr = nullptr;
cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
TORCH_CHECK(err == cudaSuccess,
"cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
return torch::from_blob(
device_ptr, cpu_tensor.sizes(), cpu_tensor.strides(),
[base = cpu_tensor](void*) {}, // keep cpu tensor alive
cpu_tensor.options().device(torch::kCUDA));
}
// If CPU tensor is not pinned, allocate a new pinned memory buffer.
torch::Tensor contiguous_cpu = cpu_tensor.contiguous();
size_t nbytes = contiguous_cpu.nbytes();
void* host_ptr = nullptr;
cudaError_t err = cudaHostAlloc(&host_ptr, nbytes, cudaHostAllocMapped);
if (err != cudaSuccess) {
AT_ERROR("cudaHostAlloc failed: ", cudaGetErrorString(err));
}
err = cudaMemcpy(host_ptr, contiguous_cpu.data_ptr(), nbytes,
cudaMemcpyDefault);
if (err != cudaSuccess) {
cudaFreeHost(host_ptr);
AT_ERROR("cudaMemcpy failed: ", cudaGetErrorString(err));
}
// Get a device pointer corresponding to the pinned host memory
void* device_ptr = nullptr;
cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
TORCH_CHECK(err == cudaSuccess,
"cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
// We'll use the same sizes, strides, and dtype as the CPU tensor.
// TODO: check if layout is respected.
auto sizes = cpu_tensor.sizes();
auto strides = cpu_tensor.strides();
auto options = cpu_tensor.options().device(torch::kCUDA);
// use default no-op deleter, since the memory is owned by the original CPU
// tensor
torch::Tensor cuda_tensor =
torch::from_blob(device_ptr, sizes, strides, options);
TORCH_CHECK(cuda_tensor.device().is_cuda(),
"Resulting tensor is not on CUDA device");
return cuda_tensor;
}
err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
if (err != cudaSuccess) {
cudaFreeHost(host_ptr);
AT_ERROR("cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
}
auto deleter = [host_ptr](void*) { cudaFreeHost(host_ptr); };
return torch::from_blob(device_ptr, contiguous_cpu.sizes(),
contiguous_cpu.strides(), deleter,
contiguous_cpu.options().device(torch::kCUDA));
}
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ..utils import compare_two_settings
def test_cpu_offload():
@pytest.mark.parametrize("disable_pin_memory", [False, True])
@pytest.mark.parametrize("disable_uva", [False, True])
def test_cpu_offload(disable_pin_memory, disable_uva):
env_vars = {
"VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": str(int(disable_pin_memory)),
"VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": str(int(disable_uva)),
}
args = ["--cpu-offload-gb", "1"]
# cuda graph only works with UVA offloading
if disable_uva:
args.append("--enforce-eager")
compare_two_settings(
"hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"]
model="hmellor/tiny-random-LlamaForCausalLM",
arg1=[],
arg2=args,
env1=None,
env2=env_vars,
)
......@@ -230,6 +230,8 @@ if TYPE_CHECKING:
VLLM_USE_V2_MODEL_RUNNER: bool = False
VLLM_LOG_MODEL_INSPECTION: bool = False
VLLM_DEBUG_MFU_METRICS: bool = False
VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY: bool = False
VLLM_WEIGHT_OFFLOADING_DISABLE_UVA: bool = False
VLLM_DISABLE_LOG_LOGO: bool = False
VLLM_LORA_DISABLE_PDL: bool = False
......@@ -1542,6 +1544,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DEBUG_MFU_METRICS": lambda: bool(
int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0"))
),
# Disable using pytorch's pin memory for CPU offloading.
"VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY": lambda: bool(
int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY", "0"))
),
# Disable using UVA (Unified Virtual Addressing) for CPU offloading.
"VLLM_WEIGHT_OFFLOADING_DISABLE_UVA": lambda: bool(
int(os.getenv("VLLM_WEIGHT_OFFLOADING_DISABLE_UVA", "0"))
),
# Disable logging of vLLM logo at server startup time.
"VLLM_DISABLE_LOG_LOGO": lambda: bool(int(os.getenv("VLLM_DISABLE_LOG_LOGO", "0"))),
# Disable PDL for LoRA, as enabling PDL with LoRA on SM100 causes
......
......@@ -11,6 +11,7 @@ import torch
from torch import nn
from typing_extensions import assert_never
import vllm.envs as envs
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention, MLAAttention
......@@ -25,6 +26,7 @@ from vllm.model_executor.model_loader.reload import (
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.tracing import instrument
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor
logger = init_logger(__name__)
......@@ -111,7 +113,8 @@ def process_weights_after_loading(
):
# TODO(lucas): see if there is a way to unify the signatures
# of process_weights_after_loading
module.process_weights_after_loading(model_config.dtype)
with device_loading_context(module, target_device):
module.process_weights_after_loading(model_config.dtype)
# Needed for torchao model reloading via model.reload_weights
# @kylesayrs @jerryzh168 this can be removed if callers move to `reload_weights`
......@@ -127,38 +130,41 @@ def device_loading_context(module: torch.nn.Module, target_device: torch.device)
return
original_device_states: dict[str, torch.device] = {}
uva_offloaded_parameters: list[str] = []
# Store original device states and move parameters to GPU if they're on CPU
for name, p in module.named_parameters():
if p.device.type == "cpu":
original_device_states[name] = p.device
p.data = p.data.to(target_device)
if getattr(p, "_vllm_is_uva_offloaded", False):
uva_offloaded_parameters.append(name)
# Parameters already on target device are not touched
try:
yield module
finally:
use_pin_memory = (
is_pin_memory_available()
and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY
)
# Restore parameters to their original devices, ignoring new parameters
pin_memory = is_pin_memory_available()
for name, p in module.named_parameters():
if name in original_device_states:
original_device: torch.device = original_device_states[name]
if original_device.type == "cpu":
# `torch.empty_like` does not support `pin_memory` argument
cpu_data = torch.empty_strided(
size=p.data.size(),
stride=p.data.stride(),
dtype=p.data.dtype,
layout=p.data.layout,
device="cpu",
pin_memory=pin_memory,
)
cpu_data.copy_(p.data)
p.data = cpu_data
else:
p.data = p.data.to(original_device)
# New parameters or parameters already on target device are untouched
p.data = p.data.to(original_device)
# parameter is UVA offloaded, but was replaced with a new device tensor
# re-offload it to CPU using UVA
if name in uva_offloaded_parameters and not getattr(
p, "_vllm_is_uva_offloaded", False
):
cpu_data = p.data.to(device="cpu")
if use_pin_memory:
cpu_data = cpu_data.pin_memory()
p.data = get_accelerator_view_from_cpu_tensor(cpu_data)
p._vllm_is_uva_offloaded = True
_MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]()
......
......@@ -13,6 +13,7 @@ from torch.func import functional_call
from torch.nn.modules.module import register_module_module_registration_hook
from transformers import PretrainedConfig
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.distributed import (
get_tensor_model_parallel_rank,
......@@ -633,11 +634,10 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
return module
pin_memory = is_pin_memory_available()
uva_available = is_uva_available()
assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
uva_offloading = True
pin_memory = (
is_pin_memory_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY
)
uva_offloading = is_uva_available() and not envs.VLLM_WEIGHT_OFFLOADING_DISABLE_UVA
# offload parameters to CPU
# use pin_memory if possible, which helps cudagraph capture speed
......@@ -648,22 +648,16 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
# one module might have some parameters offloaded and some not
break
# `torch.empty_like` does not support `pin_memory` argument
cpu_data = torch.empty_strided(
size=p.data.size(),
stride=p.data.stride(),
dtype=p.data.dtype,
layout=p.data.layout,
device="cpu",
pin_memory=pin_memory,
)
cpu_data.copy_(p.data)
cpu_data = p.data.to(device="cpu")
if pin_memory:
cpu_data = cpu_data.pin_memory()
if not uva_offloading:
p.data = cpu_data
else:
# keep the cpu data alive
p._vllm_offloaded_cpu_data = cpu_data
p.data = get_accelerator_view_from_cpu_tensor(cpu_data)
p._vllm_is_uva_offloaded = True
_CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
offloaded_parameters = True
......@@ -678,7 +672,12 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
k: v.to(device, non_blocking=True)
for k, v in module.state_dict().items()
}
output = functional_call(module, device_state, args=args, kwargs=kwargs)
# set `tie_weights=False` as tied weights in original model
# become untied when calling .to(device) individually
output = functional_call(
module, device_state, args=args, kwargs=kwargs, tie_weights=False
)
module.forward = forward
return output
......
......@@ -678,12 +678,18 @@ def get_accelerator_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tens
"""
Get an accelerator view of a CPU tensor using Unified Virtual Addressing (UVA).
"""
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
from vllm.platforms import current_platform
if current_platform.is_xpu():
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
return torch.ops._C.get_xpu_view_from_cpu_tensor(cpu_tensor)
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
elif current_platform.is_cuda():
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
else:
raise ValueError(
f"`get_accelerator_view_from_cpu_tensor` is currently "
f"not supported in: {current_platform.device_name}"
)
# Helper function used in testing.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment