Unverified Commit 6ac5e06f authored by Isotr0py's avatar Isotr0py Committed by GitHub
Browse files

[Chore] Clean up pytorch helper functions in `vllm.utils` (#26908)


Signed-off-by: default avatarIsotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: default avatarisotr0py <2037008807@qq.com>
parent 5c2acb27
...@@ -64,7 +64,7 @@ from vllm.model_executor.models.utils import ( ...@@ -64,7 +64,7 @@ from vllm.model_executor.models.utils import (
) )
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import direct_register_custom_op from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
......
...@@ -71,7 +71,7 @@ from vllm.platforms import current_platform ...@@ -71,7 +71,7 @@ from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import Qwen3NextConfig from vllm.transformers_utils.configs import Qwen3NextConfig
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils import direct_register_custom_op from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
from .interfaces import ( from .interfaces import (
......
...@@ -29,7 +29,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE ...@@ -29,7 +29,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.models.interfaces import MixtureOfExperts from vllm.model_executor.models.interfaces import MixtureOfExperts
from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.models.utils import maybe_prefix
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import direct_register_custom_op from vllm.utils.torch_utils import direct_register_custom_op
from .utils import log_replacement from .utils import log_replacement
......
...@@ -24,11 +24,13 @@ from vllm.multimodal import NestedTensors ...@@ -24,11 +24,13 @@ from vllm.multimodal import NestedTensors
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import ( from vllm.utils import (
cdiv, cdiv,
direct_register_custom_op,
get_cuda_view_from_cpu_tensor,
is_pin_memory_available, is_pin_memory_available,
is_uva_available, is_uva_available,
) )
from vllm.utils.torch_utils import (
direct_register_custom_op,
get_cuda_view_from_cpu_tensor,
)
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import ( ...@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
...@@ -53,6 +52,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder ...@@ -53,6 +52,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.processor import cached_get_processor
from vllm.utils.jsontree import json_map_leaves from vllm.utils.jsontree import json_map_leaves
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
from .utils import ( from .utils import (
......
...@@ -7,7 +7,7 @@ from typing import Any ...@@ -7,7 +7,7 @@ from typing import Any
import torch import torch
from vllm.utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
def set_random_seed(seed: int) -> None: def set_random_seed(seed: int) -> None:
......
...@@ -7,8 +7,8 @@ from typing import TYPE_CHECKING ...@@ -7,8 +7,8 @@ from typing import TYPE_CHECKING
from vllm import envs from vllm import envs
from vllm.plugins import PLATFORM_PLUGINS_GROUP, load_plugins_by_group from vllm.plugins import PLATFORM_PLUGINS_GROUP, load_plugins_by_group
from vllm.utils import supports_xccl
from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.torch_utils import supports_xccl
from .interface import CpuArchEnum, Platform, PlatformEnum from .interface import CpuArchEnum, Platform, PlatformEnum
......
...@@ -16,7 +16,8 @@ from typing_extensions import ParamSpec ...@@ -16,7 +16,8 @@ from typing_extensions import ParamSpec
import vllm._C # noqa import vllm._C # noqa
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless, import_pynvml from vllm.utils import import_pynvml
from vllm.utils.torch_utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum from .interface import DeviceCapability, Platform, PlatformEnum
......
...@@ -9,7 +9,7 @@ import torch ...@@ -9,7 +9,7 @@ import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless from vllm.utils.torch_utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum from .interface import DeviceCapability, Platform, PlatformEnum
......
...@@ -21,7 +21,8 @@ import torch ...@@ -21,7 +21,8 @@ import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties from vllm.utils import cuda_get_device_properties
from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.version import __version__ as VLLM_VERSION from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -35,12 +35,11 @@ from argparse import ( ...@@ -35,12 +35,11 @@ from argparse import (
from collections import defaultdict from collections import defaultdict
from collections.abc import ( from collections.abc import (
Callable, Callable,
Collection,
Iterator, Iterator,
Sequence, Sequence,
) )
from concurrent.futures.process import ProcessPoolExecutor from concurrent.futures.process import ProcessPoolExecutor
from functools import cache, lru_cache, partial, wraps from functools import cache, partial, wraps
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, TextIO, TypeVar from typing import TYPE_CHECKING, Any, TextIO, TypeVar
from urllib.parse import urlparse from urllib.parse import urlparse
...@@ -48,8 +47,6 @@ from uuid import uuid4 ...@@ -48,8 +47,6 @@ from uuid import uuid4
import cbor2 import cbor2
import cloudpickle import cloudpickle
import numpy as np
import numpy.typing as npt
import psutil import psutil
import regex as re import regex as re
import setproctitle import setproctitle
...@@ -57,9 +54,6 @@ import torch ...@@ -57,9 +54,6 @@ import torch
import yaml import yaml
import zmq import zmq
import zmq.asyncio import zmq.asyncio
from packaging import version
from packaging.version import Version
from torch.library import Library
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import enable_trace_function_call, init_logger from vllm.logger import enable_trace_function_call, init_logger
...@@ -69,13 +63,11 @@ if TYPE_CHECKING: ...@@ -69,13 +63,11 @@ if TYPE_CHECKING:
from argparse import Namespace from argparse import Namespace
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
from vllm.sequence import IntermediateTensors
else: else:
Namespace = object Namespace = object
ModelConfig = object ModelConfig = object
VllmConfig = object VllmConfig = object
IntermediateTensors = object
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -105,46 +97,6 @@ STR_INVALID_VAL: str = "INVALID" ...@@ -105,46 +97,6 @@ STR_INVALID_VAL: str = "INVALID"
CYAN = "\033[1;36m" CYAN = "\033[1;36m"
RESET = "\033[0;0m" RESET = "\033[0;0m"
STR_DTYPE_TO_TORCH_DTYPE = {
"float32": torch.float32,
"half": torch.half,
"bfloat16": torch.bfloat16,
"float": torch.float,
"fp8": torch.uint8,
"fp8_e4m3": torch.uint8,
"fp8_e5m2": torch.uint8,
"int8": torch.int8,
"fp8_inc": torch.float8_e4m3fn,
"fp8_ds_mla": torch.uint8,
}
TORCH_DTYPE_TO_NUMPY_DTYPE = {
torch.float16: np.float16,
torch.float32: np.float32,
torch.float64: np.float64,
torch.uint8: np.uint8,
torch.int32: np.int32,
torch.int64: np.int64,
}
@contextlib.contextmanager
def set_default_torch_num_threads(num_threads: int):
"""Sets the default number of threads for PyTorch to the given value."""
old_num_threads = torch.get_num_threads()
torch.set_num_threads(num_threads)
yield
torch.set_num_threads(old_num_threads)
def kv_cache_dtype_str_to_dtype(
kv_cache_dtype: str, model_config: ModelConfig
) -> torch.dtype:
if kv_cache_dtype == "auto":
# Model config may not be specified for unit tests, default to float16
return model_config.dtype if model_config else torch.half
return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
T = TypeVar("T") T = TypeVar("T")
U = TypeVar("U") U = TypeVar("U")
...@@ -407,141 +359,6 @@ def round_down(x: int, y: int) -> int: ...@@ -407,141 +359,6 @@ def round_down(x: int, y: int) -> int:
return (x // y) * y return (x // y) * y
def _generate_random_fp8(
tensor: torch.Tensor,
low: float,
high: float,
) -> None:
# NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type,
# it may occur Inf or NaN if we directly use torch.randint
# to generate random data for fp8 data.
# For example, s.11111.00 in fp8e5m2 format represents Inf.
# | E4M3 | E5M2
# -----|-------------|-------------------
# Inf | N/A | s.11111.00
# NaN | s.1111.111 | s.11111.{01,10,11}
from vllm import _custom_ops as ops
tensor_tmp = torch.empty_like(tensor, dtype=torch.float16)
tensor_tmp.uniform_(low, high)
ops.convert_fp8(tensor, tensor_tmp)
del tensor_tmp
def get_kv_cache_torch_dtype(
cache_dtype: str | torch.dtype | None,
model_dtype: str | torch.dtype | None = None,
) -> torch.dtype:
if isinstance(cache_dtype, str):
if cache_dtype == "auto":
if isinstance(model_dtype, str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
elif isinstance(model_dtype, torch.dtype):
torch_dtype = model_dtype
else:
raise ValueError(f"Invalid model dtype: {model_dtype}")
elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
else:
raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
elif isinstance(cache_dtype, torch.dtype):
torch_dtype = cache_dtype
else:
raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
return torch_dtype
def create_kv_caches_with_random_flash(
num_blocks: int,
block_size: int,
num_layers: int,
num_heads: int,
head_size: int,
cache_dtype: str | torch.dtype | None,
model_dtype: str | torch.dtype | None = None,
seed: int | None = None,
device: str | None = "cuda",
cache_layout: str | None = "NHD",
) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
from vllm.platforms import current_platform
current_platform.seed_everything(seed)
dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
assert cache_layout in ("NHD", "HND")
stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2, 4)
kv_cache_allocation_shape = tuple(generic_kv_cache_shape[i] for i in stride_order)
scale = head_size**-0.5
key_caches: list[torch.Tensor] = []
value_caches: list[torch.Tensor] = []
for _ in range(num_layers):
key_value_cache = torch.empty(
size=kv_cache_allocation_shape, dtype=dtype, device=device
).permute(*stride_order)
if cache_dtype in ["auto", "half", "bfloat16", "float"]:
key_value_cache.uniform_(-scale, scale)
elif cache_dtype == "fp8":
_generate_random_fp8(key_value_cache, -scale, scale)
else:
raise ValueError(f"Does not support key cache of type {cache_dtype}")
key_caches.append(key_value_cache[:, 0])
value_caches.append(key_value_cache[:, 1])
return key_caches, value_caches
def create_kv_caches_with_random(
num_blocks: int,
block_size: int,
num_layers: int,
num_heads: int,
head_size: int,
cache_dtype: str | torch.dtype | None,
model_dtype: str | torch.dtype | None = None,
seed: int | None = None,
device: str | None = "cuda",
) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
if cache_dtype == "fp8" and head_size % 16:
raise ValueError(
f"Does not support key cache of type fp8 with head_size {head_size}"
)
from vllm.platforms import current_platform
current_platform.seed_everything(seed)
dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
scale = head_size**-0.5
x = 16 // torch.tensor([], dtype=dtype).element_size()
key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
key_caches: list[torch.Tensor] = []
for _ in range(num_layers):
key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device=device)
if cache_dtype in ["auto", "half", "bfloat16", "float"]:
key_cache.uniform_(-scale, scale)
elif cache_dtype == "fp8":
_generate_random_fp8(key_cache, -scale, scale)
else:
raise ValueError(f"Does not support key cache of type {cache_dtype}")
key_caches.append(key_cache)
value_cache_shape = (num_blocks, num_heads, head_size, block_size)
value_caches: list[torch.Tensor] = []
for _ in range(num_layers):
value_cache = torch.empty(size=value_cache_shape, dtype=dtype, device=device)
if cache_dtype in ["auto", "half", "bfloat16", "float"]:
value_cache.uniform_(-scale, scale)
elif cache_dtype == "fp8":
_generate_random_fp8(value_cache, -scale, scale)
else:
raise ValueError(f"Does not support value cache of type {cache_dtype}")
value_caches.append(value_cache)
return key_caches, value_caches
@cache @cache
def is_pin_memory_available() -> bool: def is_pin_memory_available() -> bool:
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -557,121 +374,6 @@ def is_uva_available() -> bool: ...@@ -557,121 +374,6 @@ def is_uva_available() -> bool:
return is_pin_memory_available() return is_pin_memory_available()
def make_ndarray_with_pad(
x: list[list[T]],
pad: T,
dtype: npt.DTypeLike,
*,
max_len: int | None = None,
) -> npt.NDArray:
"""
Make a padded array from 2D inputs.
The padding is applied to the end of each inner list until it reaches
`max_len`.
"""
if max_len is None:
# Unlike for most functions, map is faster than a genexpr over `len`
max_len = max(map(len, x), default=0)
padded_x = np.full((len(x), max_len), pad, dtype=dtype)
for ind, blocktb in enumerate(x):
assert len(blocktb) <= max_len
padded_x[ind, : len(blocktb)] = blocktb
return padded_x
def make_tensor_with_pad(
x: list[list[T]],
pad: T,
dtype: torch.dtype,
*,
max_len: int | None = None,
device: str | torch.device | None = None,
pin_memory: bool = False,
) -> torch.Tensor:
"""
Make a padded tensor from 2D inputs.
The padding is applied to the end of each inner list until it reaches
`max_len`.
"""
np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len)
tensor = torch.from_numpy(padded_x).to(device)
if pin_memory:
tensor = tensor.pin_memory()
return tensor
def async_tensor_h2d(
data: list,
dtype: torch.dtype,
target_device: str | torch.device,
pin_memory: bool,
) -> torch.Tensor:
"""Asynchronously create a tensor and copy it from host to device."""
t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
return t.to(device=target_device, non_blocking=True)
def get_dtype_size(dtype: torch.dtype) -> int:
"""Get the size of the data type in bytes."""
return torch.tensor([], dtype=dtype).element_size()
# bool = 0, int = 1, float = 2, complex = 3
def _get_precision_level(dtype: torch.dtype) -> int:
# NOTE: Complex dtypes return `is_floating_point=False`
return (dtype != torch.bool) + dtype.is_floating_point + dtype.is_complex * 2
def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype):
"""
Test whether it is lossless to cast a tensor from
`src_dtype` to `tgt_dtype`.
"""
if src_dtype == tgt_dtype:
return True
src_level = _get_precision_level(src_dtype)
tgt_level = _get_precision_level(tgt_dtype)
if src_level < tgt_level:
return True
if src_level > tgt_level:
return False
# Compare integral types
if not src_dtype.is_floating_point and not src_dtype.is_complex:
src_info = torch.iinfo(src_dtype)
tgt_info = torch.iinfo(tgt_dtype)
return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
# Compare floating-point types
src_info = torch.finfo(src_dtype)
tgt_info = torch.finfo(tgt_dtype)
return (
src_info.min >= tgt_info.min
and src_info.max <= tgt_info.max
and src_info.resolution >= tgt_info.resolution
)
def common_broadcastable_dtype(dtypes: Collection[torch.dtype]):
"""
Get the common `dtype` where all of the other `dtypes` can be
cast to it without losing any information.
"""
return max(
dtypes,
key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes),
)
# TODO: This function can be removed if transformer_modules classes are # TODO: This function can be removed if transformer_modules classes are
# serialized by value when communicating between processes # serialized by value when communicating between processes
def init_cached_hf_modules() -> None: def init_cached_hf_modules() -> None:
...@@ -767,60 +469,6 @@ def find_nccl_include_paths() -> list[str] | None: ...@@ -767,60 +469,6 @@ def find_nccl_include_paths() -> list[str] | None:
return out or None return out or None
prev_set_stream = torch.cuda.set_stream
_current_stream_tls = threading.local()
def _patched_set_stream(stream: torch.cuda.Stream) -> None:
_current_stream_tls.value = stream
prev_set_stream(stream)
torch.cuda.set_stream = _patched_set_stream
class _StreamPlaceholder:
def __init__(self):
self.synchronize = lambda: None
def current_stream() -> torch.cuda.Stream:
"""
replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
it turns out that `torch.cuda.current_stream()` is quite expensive,
as it will construct a new stream object at each call.
here we patch `torch.cuda.set_stream` to keep track of the current stream
directly, so that we can avoid calling `torch.cuda.current_stream()`.
the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
from C/C++ code.
"""
from vllm.platforms import current_platform
if not hasattr(_current_stream_tls, "value") or _current_stream_tls.value is None:
# when this function is called before any stream is set,
# we return the default stream.
# On ROCm using the default 0 stream in combination with RCCL
# is hurting performance. Therefore creating a dedicated stream
# per process
if current_platform.is_rocm():
# torch.cuda.set_stream here is the alias of _pathed_set_stream
torch.cuda.set_stream(torch.cuda.Stream())
elif current_platform.is_cpu():
_current_stream_tls.value = _StreamPlaceholder()
else:
current_stream = current_platform.current_stream
if current_stream is not None:
_current_stream_tls.value = current_stream()
else:
raise ValueError(
"Fail to set current stream, current platform "
"may not support current_stream with torch API"
)
return _current_stream_tls.value
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None: def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
"""Set up function tracing for the current thread, """Set up function tracing for the current thread,
if enabled via the VLLM_TRACE_FUNCTION environment variable if enabled via the VLLM_TRACE_FUNCTION environment variable
...@@ -842,48 +490,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None: ...@@ -842,48 +490,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
enable_trace_function_call(log_path) enable_trace_function_call(log_path)
@lru_cache(maxsize=8)
def _cuda_device_count_stateless(cuda_visible_devices: str | None = None) -> int:
# Note: cuda_visible_devices is not used, but we keep it as an argument for
# LRU Cache purposes.
# Code below is based on
# https://github.com/pytorch/pytorch/blob/
# c1cd946818442aca8c7f812b16d187ce1586c3bc/
# torch/cuda/__init__.py#L831C1-L831C17
import torch.cuda
from vllm.platforms import current_platform
if not torch.cuda._is_compiled():
return 0
if current_platform.is_rocm():
# ROCm uses amdsmi instead of nvml for stateless device count
# This requires a sufficiently modern version of Torch 2.4.0
raw_count = (
torch.cuda._device_count_amdsmi()
if (hasattr(torch.cuda, "_device_count_amdsmi"))
else -1
)
else:
raw_count = torch.cuda._device_count_nvml()
r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
return r
def cuda_device_count_stateless() -> int:
"""Get number of CUDA devices, caching based on the value of
CUDA_VISIBLE_DEVICES at the time of call.
This should be used instead of torch.cuda.device_count()
unless CUDA_VISIBLE_DEVICES has already been set to the desired
value."""
# This can be removed and simply replaced with torch.cuda.get_device_count
# after https://github.com/pytorch/pytorch/pull/122815 is released.
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
def cuda_is_initialized() -> bool: def cuda_is_initialized() -> bool:
"""Check if CUDA is initialized.""" """Check if CUDA is initialized."""
if not torch.cuda._is_compiled(): if not torch.cuda._is_compiled():
...@@ -1411,27 +1017,6 @@ class FlexibleArgumentParser(ArgumentParser): ...@@ -1411,27 +1017,6 @@ class FlexibleArgumentParser(ArgumentParser):
return processed_args return processed_args
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
def supports_dynamo() -> bool:
base_torch_version = Version(Version(torch.__version__).base_version)
return base_torch_version >= Version("2.4.0")
# Supports xccl with PyTorch versions >= 2.8.0.dev for XPU platform
def supports_xccl() -> bool:
return (
is_torch_equal_or_newer("2.8.0.dev") and torch.distributed.is_xccl_available()
)
# Some backends use pytorch version < 2.4.0 which doesn't
# support `torch.library.custom_op`.
def supports_custom_op() -> bool:
return hasattr(torch.library, "custom_op")
class AtomicCounter: class AtomicCounter:
"""An atomic, thread-safe counter""" """An atomic, thread-safe counter"""
...@@ -1457,118 +1042,6 @@ class AtomicCounter: ...@@ -1457,118 +1042,6 @@ class AtomicCounter:
return self._value return self._value
def weak_ref_tensor(tensor: Any) -> Any:
"""
Create a weak reference to a tensor.
The new tensor will share the same data as the original tensor,
but will not keep the original tensor alive.
"""
if isinstance(tensor, torch.Tensor):
return torch.ops._C.weak_ref_tensor(tensor)
else:
return tensor
def weak_ref_tensors(
tensors: torch.Tensor
| list[torch.Tensor]
| tuple[torch.Tensor]
| IntermediateTensors,
) -> torch.Tensor | list[Any] | tuple[Any] | Any:
"""
Convenience function to create weak references to tensors,
for single tensor, list of tensors or tuple of tensors.
"""
if isinstance(tensors, torch.Tensor):
return weak_ref_tensor(tensors)
if isinstance(tensors, list):
return [weak_ref_tensor(t) for t in tensors]
if isinstance(tensors, tuple):
return tuple(weak_ref_tensor(t) for t in tensors)
# For IntermediateTensors used in pipeline parallelism
from vllm.sequence import IntermediateTensors
if isinstance(tensors, IntermediateTensors):
ret = IntermediateTensors(
{key: weak_ref_tensor(val) for key, val in tensors.tensors.items()}
)
return ret
raise ValueError("Invalid type for tensors")
def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
"""
Get a CUDA view of a CPU tensor using Unified Virtual Addressing (UVA).
"""
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
# create a library to hold the custom op
vllm_lib = Library("vllm", "FRAGMENT") # noqa
def direct_register_custom_op(
op_name: str,
op_func: Callable,
mutates_args: list[str] | None = None,
fake_impl: Callable | None = None,
target_lib: Library | None = None,
dispatch_key: str | None = None,
tags: tuple[torch.Tag, ...] = (),
):
"""
`torch.library.custom_op` can have significant overhead because it
needs to consider complicated dispatching logic. This function
directly registers a custom op and dispatches it to the CUDA backend.
See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
for more details.
By default, the custom op is registered to the vLLM library. If you
want to register it to a different library, you can pass the library
object to the `target_lib` argument.
IMPORTANT: the lifetime of the operator is tied to the lifetime of the
library object. If you want to bind the operator to a different library,
make sure the library object is alive when the operator is used.
"""
if not supports_custom_op():
from vllm.platforms import current_platform
assert not current_platform.is_cuda_alike(), (
"cuda platform needs torch>=2.4 to support custom op, "
"chances are you are using an old version of pytorch "
"or a custom build of pytorch. It is recommended to "
"use vLLM in a fresh new environment and let it install "
"the required dependencies."
)
return
if mutates_args is None:
mutates_args = []
if dispatch_key is None:
from vllm.platforms import current_platform
dispatch_key = current_platform.dispatch_key
import torch.library
if hasattr(torch.library, "infer_schema"):
schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
else:
# for pytorch 2.4
import torch._custom_op.impl
schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
my_lib = target_lib or vllm_lib
my_lib.define(op_name + schema_str, tags=tags)
my_lib.impl(op_name, op_func, dispatch_key=dispatch_key)
if fake_impl is not None:
my_lib._register_fake(op_name, fake_impl)
def kill_process_tree(pid: int): def kill_process_tree(pid: int):
""" """
Kills all descendant processes of the given pid by sending SIGKILL. Kills all descendant processes of the given pid by sending SIGKILL.
...@@ -2063,55 +1536,6 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]: ...@@ -2063,55 +1536,6 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
raise ValueError(f"Unsupported hash function: {hash_fn_name}") raise ValueError(f"Unsupported hash function: {hash_fn_name}")
def is_torch_equal_or_newer(target: str) -> bool:
"""Check if the installed torch version is >= the target version.
Args:
target: a version string, like "2.6.0".
Returns:
Whether the condition meets.
"""
try:
return _is_torch_equal_or_newer(str(torch.__version__), target)
except Exception:
# Fallback to PKG-INFO to load the package info, needed by the doc gen.
return Version(importlib.metadata.version("torch")) >= Version(target)
# Helper function used in testing.
def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
torch_version = version.parse(torch_version)
return torch_version >= version.parse(target)
def _is_torch_equal(target: str) -> bool:
assert target.count(".") == 2
torch_version = str(torch.__version__)
torch_version = version.parse(torch_version)
# torch version is like "2.6.0.dev20240101" or "2.6.0.dev20240101+cpu"
# or "2.6.0+cu128" but never "2.6.0.1"
return (
torch_version >= version.parse(target)
and version.parse(target + ".1") > torch_version
)
def is_torch_equal(target: str) -> bool:
"""Check if the installed torch version is == the target version.
Args:
target: a version string, like "2.6.0".
Returns:
Whether the condition meets.
"""
try:
return _is_torch_equal(target)
except Exception:
return Version(importlib.metadata.version("torch")) == Version(target)
@cache @cache
def _has_module(module_name: str) -> bool: def _has_module(module_name: str) -> bool:
"""Return True if *module_name* can be found in the current environment. """Return True if *module_name* can be found in the current environment.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import importlib.metadata
import threading
from collections.abc import Callable, Collection
from functools import lru_cache
from typing import TYPE_CHECKING, Any, TypeVar
import numpy as np
import numpy.typing as npt
import torch
from packaging import version
from packaging.version import Version
from torch.library import Library
import vllm.envs as envs
if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.sequence import IntermediateTensors
else:
ModelConfig = object
IntermediateTensors = object
STR_DTYPE_TO_TORCH_DTYPE = {
"float32": torch.float32,
"half": torch.half,
"bfloat16": torch.bfloat16,
"float": torch.float,
"fp8": torch.uint8,
"fp8_e4m3": torch.uint8,
"fp8_e5m2": torch.uint8,
"int8": torch.int8,
"fp8_inc": torch.float8_e4m3fn,
"fp8_ds_mla": torch.uint8,
}
TORCH_DTYPE_TO_NUMPY_DTYPE = {
torch.float16: np.float16,
torch.float32: np.float32,
torch.float64: np.float64,
torch.uint8: np.uint8,
torch.int32: np.int32,
torch.int64: np.int64,
}
T = TypeVar("T")
@contextlib.contextmanager
def set_default_torch_dtype(dtype: torch.dtype):
"""Sets the default torch dtype to the given dtype."""
old_dtype = torch.get_default_dtype()
torch.set_default_dtype(dtype)
yield
torch.set_default_dtype(old_dtype)
@contextlib.contextmanager
def set_default_torch_num_threads(num_threads: int):
"""Sets the default number of threads for PyTorch to the given value."""
old_num_threads = torch.get_num_threads()
torch.set_num_threads(num_threads)
yield
torch.set_num_threads(old_num_threads)
def get_dtype_size(dtype: torch.dtype) -> int:
"""Get the size of the data type in bytes."""
return torch.tensor([], dtype=dtype).element_size()
# bool = 0, int = 1, float = 2, complex = 3
def _get_precision_level(dtype: torch.dtype) -> int:
# NOTE: Complex dtypes return `is_floating_point=False`
return (dtype != torch.bool) + dtype.is_floating_point + dtype.is_complex * 2
def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype):
"""
Test whether it is lossless to cast a tensor from
`src_dtype` to `tgt_dtype`.
"""
if src_dtype == tgt_dtype:
return True
src_level = _get_precision_level(src_dtype)
tgt_level = _get_precision_level(tgt_dtype)
if src_level < tgt_level:
return True
if src_level > tgt_level:
return False
# Compare integral types
if not src_dtype.is_floating_point and not src_dtype.is_complex:
src_info = torch.iinfo(src_dtype)
tgt_info = torch.iinfo(tgt_dtype)
return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max
# Compare floating-point types
src_info = torch.finfo(src_dtype)
tgt_info = torch.finfo(tgt_dtype)
return (
src_info.min >= tgt_info.min
and src_info.max <= tgt_info.max
and src_info.resolution >= tgt_info.resolution
)
def common_broadcastable_dtype(dtypes: Collection[torch.dtype]):
"""
Get the common `dtype` where all of the other `dtypes` can be
cast to it without losing any information.
"""
return max(
dtypes,
key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes),
)
def _generate_random_fp8(
tensor: torch.Tensor,
low: float,
high: float,
) -> None:
# NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type,
# it may occur Inf or NaN if we directly use torch.randint
# to generate random data for fp8 data.
# For example, s.11111.00 in fp8e5m2 format represents Inf.
# | E4M3 | E5M2
# -----|-------------|-------------------
# Inf | N/A | s.11111.00
# NaN | s.1111.111 | s.11111.{01,10,11}
from vllm import _custom_ops as ops
tensor_tmp = torch.empty_like(tensor, dtype=torch.float16)
tensor_tmp.uniform_(low, high)
ops.convert_fp8(tensor, tensor_tmp)
del tensor_tmp
def get_kv_cache_torch_dtype(
cache_dtype: str | torch.dtype | None,
model_dtype: str | torch.dtype | None = None,
) -> torch.dtype:
if isinstance(cache_dtype, str):
if cache_dtype == "auto":
if isinstance(model_dtype, str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
elif isinstance(model_dtype, torch.dtype):
torch_dtype = model_dtype
else:
raise ValueError(f"Invalid model dtype: {model_dtype}")
elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
else:
raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
elif isinstance(cache_dtype, torch.dtype):
torch_dtype = cache_dtype
else:
raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
return torch_dtype
def kv_cache_dtype_str_to_dtype(
kv_cache_dtype: str, model_config: ModelConfig
) -> torch.dtype:
if kv_cache_dtype == "auto":
# Model config may not be specified for unit tests, default to float16
return model_config.dtype if model_config else torch.half
return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
def create_kv_caches_with_random_flash(
num_blocks: int,
block_size: int,
num_layers: int,
num_heads: int,
head_size: int,
cache_dtype: str | torch.dtype | None,
model_dtype: str | torch.dtype | None = None,
seed: int | None = None,
device: str | None = "cuda",
cache_layout: str | None = "NHD",
) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
from vllm.platforms import current_platform
current_platform.seed_everything(seed)
dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
assert cache_layout in ("NHD", "HND")
stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2, 4)
kv_cache_allocation_shape = tuple(generic_kv_cache_shape[i] for i in stride_order)
scale = head_size**-0.5
key_caches: list[torch.Tensor] = []
value_caches: list[torch.Tensor] = []
for _ in range(num_layers):
key_value_cache = torch.empty(
size=kv_cache_allocation_shape, dtype=dtype, device=device
).permute(*stride_order)
if cache_dtype in ["auto", "half", "bfloat16", "float"]:
key_value_cache.uniform_(-scale, scale)
elif cache_dtype == "fp8":
_generate_random_fp8(key_value_cache, -scale, scale)
else:
raise ValueError(f"Does not support key cache of type {cache_dtype}")
key_caches.append(key_value_cache[:, 0])
value_caches.append(key_value_cache[:, 1])
return key_caches, value_caches
def create_kv_caches_with_random(
num_blocks: int,
block_size: int,
num_layers: int,
num_heads: int,
head_size: int,
cache_dtype: str | torch.dtype | None,
model_dtype: str | torch.dtype | None = None,
seed: int | None = None,
device: str | None = "cuda",
) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
if cache_dtype == "fp8" and head_size % 16:
raise ValueError(
f"Does not support key cache of type fp8 with head_size {head_size}"
)
from vllm.platforms import current_platform
current_platform.seed_everything(seed)
dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
scale = head_size**-0.5
x = 16 // torch.tensor([], dtype=dtype).element_size()
key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
key_caches: list[torch.Tensor] = []
for _ in range(num_layers):
key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device=device)
if cache_dtype in ["auto", "half", "bfloat16", "float"]:
key_cache.uniform_(-scale, scale)
elif cache_dtype == "fp8":
_generate_random_fp8(key_cache, -scale, scale)
else:
raise ValueError(f"Does not support key cache of type {cache_dtype}")
key_caches.append(key_cache)
value_cache_shape = (num_blocks, num_heads, head_size, block_size)
value_caches: list[torch.Tensor] = []
for _ in range(num_layers):
value_cache = torch.empty(size=value_cache_shape, dtype=dtype, device=device)
if cache_dtype in ["auto", "half", "bfloat16", "float"]:
value_cache.uniform_(-scale, scale)
elif cache_dtype == "fp8":
_generate_random_fp8(value_cache, -scale, scale)
else:
raise ValueError(f"Does not support value cache of type {cache_dtype}")
value_caches.append(value_cache)
return key_caches, value_caches
def async_tensor_h2d(
data: list,
dtype: torch.dtype,
target_device: str | torch.device,
pin_memory: bool,
) -> torch.Tensor:
"""Asynchronously create a tensor and copy it from host to device."""
t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
return t.to(device=target_device, non_blocking=True)
def make_ndarray_with_pad(
x: list[list[T]],
pad: T,
dtype: npt.DTypeLike,
*,
max_len: int | None = None,
) -> npt.NDArray:
"""
Make a padded array from 2D inputs.
The padding is applied to the end of each inner list until it reaches
`max_len`.
"""
if max_len is None:
# Unlike for most functions, map is faster than a genexpr over `len`
max_len = max(map(len, x), default=0)
padded_x = np.full((len(x), max_len), pad, dtype=dtype)
for ind, blocktb in enumerate(x):
assert len(blocktb) <= max_len
padded_x[ind, : len(blocktb)] = blocktb
return padded_x
def make_tensor_with_pad(
x: list[list[T]],
pad: T,
dtype: torch.dtype,
*,
max_len: int | None = None,
device: str | torch.device | None = None,
pin_memory: bool = False,
) -> torch.Tensor:
"""
Make a padded tensor from 2D inputs.
The padding is applied to the end of each inner list until it reaches
`max_len`.
"""
np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype]
padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len)
tensor = torch.from_numpy(padded_x).to(device)
if pin_memory:
tensor = tensor.pin_memory()
return tensor
prev_set_stream = torch.cuda.set_stream
_current_stream_tls = threading.local()
def _patched_set_stream(stream: torch.cuda.Stream) -> None:
_current_stream_tls.value = stream
prev_set_stream(stream)
torch.cuda.set_stream = _patched_set_stream
class _StreamPlaceholder:
def __init__(self):
self.synchronize = lambda: None
def current_stream() -> torch.cuda.Stream:
"""
replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
it turns out that `torch.cuda.current_stream()` is quite expensive,
as it will construct a new stream object at each call.
here we patch `torch.cuda.set_stream` to keep track of the current stream
directly, so that we can avoid calling `torch.cuda.current_stream()`.
the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
from C/C++ code.
"""
from vllm.platforms import current_platform
if not hasattr(_current_stream_tls, "value") or _current_stream_tls.value is None:
# when this function is called before any stream is set,
# we return the default stream.
# On ROCm using the default 0 stream in combination with RCCL
# is hurting performance. Therefore creating a dedicated stream
# per process
if current_platform.is_rocm():
# torch.cuda.set_stream here is the alias of _pathed_set_stream
torch.cuda.set_stream(torch.cuda.Stream())
elif current_platform.is_cpu():
_current_stream_tls.value = _StreamPlaceholder()
else:
current_stream = current_platform.current_stream
if current_stream is not None:
_current_stream_tls.value = current_stream()
else:
raise ValueError(
"Fail to set current stream, current platform "
"may not support current_stream with torch API"
)
return _current_stream_tls.value
@lru_cache(maxsize=8)
def _cuda_device_count_stateless(cuda_visible_devices: str | None = None) -> int:
# Note: cuda_visible_devices is not used, but we keep it as an argument for
# LRU Cache purposes.
# Code below is based on
# https://github.com/pytorch/pytorch/blob/
# c1cd946818442aca8c7f812b16d187ce1586c3bc/
# torch/cuda/__init__.py#L831C1-L831C17
import torch.cuda
import torch.version
from vllm.platforms import current_platform
if not torch.cuda._is_compiled():
return 0
if current_platform.is_rocm():
# ROCm uses amdsmi instead of nvml for stateless device count
# This requires a sufficiently modern version of Torch 2.4.0
raw_count = (
torch.cuda._device_count_amdsmi()
if (hasattr(torch.cuda, "_device_count_amdsmi"))
else -1
)
else:
raw_count = torch.cuda._device_count_nvml()
r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
return r
def cuda_device_count_stateless() -> int:
"""Get number of CUDA devices, caching based on the value of
CUDA_VISIBLE_DEVICES at the time of call.
This should be used instead of torch.cuda.device_count()
unless CUDA_VISIBLE_DEVICES has already been set to the desired
value."""
# This can be removed and simply replaced with torch.cuda.get_device_count
# after https://github.com/pytorch/pytorch/pull/122815 is released.
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
def weak_ref_tensor(tensor: Any) -> Any:
"""
Create a weak reference to a tensor.
The new tensor will share the same data as the original tensor,
but will not keep the original tensor alive.
"""
if isinstance(tensor, torch.Tensor):
return torch.ops._C.weak_ref_tensor(tensor)
else:
return tensor
def weak_ref_tensors(
tensors: torch.Tensor
| list[torch.Tensor]
| tuple[torch.Tensor]
| IntermediateTensors,
) -> torch.Tensor | list[Any] | tuple[Any] | Any:
"""
Convenience function to create weak references to tensors,
for single tensor, list of tensors or tuple of tensors.
"""
if isinstance(tensors, torch.Tensor):
return weak_ref_tensor(tensors)
if isinstance(tensors, list):
return [weak_ref_tensor(t) for t in tensors]
if isinstance(tensors, tuple):
return tuple(weak_ref_tensor(t) for t in tensors)
# For IntermediateTensors used in pipeline parallelism
from vllm.sequence import IntermediateTensors
if isinstance(tensors, IntermediateTensors):
ret = IntermediateTensors(
{key: weak_ref_tensor(val) for key, val in tensors.tensors.items()}
)
return ret
raise ValueError("Invalid type for tensors")
def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
"""
Get a CUDA view of a CPU tensor using Unified Virtual Addressing (UVA).
"""
assert cpu_tensor.is_pinned(), "CPU tensor must be pinned"
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
# Helper function used in testing.
def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
torch_version = version.parse(torch_version)
return torch_version >= version.parse(target)
def is_torch_equal_or_newer(target: str) -> bool:
"""Check if the installed torch version is >= the target version.
Args:
target: a version string, like "2.6.0".
Returns:
Whether the condition meets.
"""
try:
return _is_torch_equal_or_newer(str(torch.__version__), target)
except Exception:
# Fallback to PKG-INFO to load the package info, needed by the doc gen.
return Version(importlib.metadata.version("torch")) >= Version(target)
def _is_torch_equal(target: str) -> bool:
assert target.count(".") == 2
torch_version = str(torch.__version__)
torch_version = version.parse(torch_version)
# torch version is like "2.6.0.dev20240101" or "2.6.0.dev20240101+cpu"
# or "2.6.0+cu128" but never "2.6.0.1"
return (
torch_version >= version.parse(target)
and version.parse(target + ".1") > torch_version
)
def is_torch_equal(target: str) -> bool:
"""Check if the installed torch version is == the target version.
Args:
target: a version string, like "2.6.0".
Returns:
Whether the condition meets.
"""
try:
return _is_torch_equal(target)
except Exception:
return Version(importlib.metadata.version("torch")) == Version(target)
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
def supports_dynamo() -> bool:
return is_torch_equal_or_newer("2.4.0")
# Supports xccl with PyTorch versions >= 2.8.0.dev for XPU platform
def supports_xccl() -> bool:
return (
is_torch_equal_or_newer("2.8.0.dev") and torch.distributed.is_xccl_available()
)
# Some backends use pytorch version < 2.4.0 which doesn't
# support `torch.library.custom_op`.
def supports_custom_op() -> bool:
return hasattr(torch.library, "custom_op")
# create a library to hold the custom op
vllm_lib = Library("vllm", "FRAGMENT") # noqa
def direct_register_custom_op(
op_name: str,
op_func: Callable,
mutates_args: list[str] | None = None,
fake_impl: Callable | None = None,
target_lib: Library | None = None,
dispatch_key: str | None = None,
tags: tuple[torch.Tag, ...] = (),
):
"""
`torch.library.custom_op` can have significant overhead because it
needs to consider complicated dispatching logic. This function
directly registers a custom op and dispatches it to the CUDA backend.
See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
for more details.
By default, the custom op is registered to the vLLM library. If you
want to register it to a different library, you can pass the library
object to the `target_lib` argument.
IMPORTANT: the lifetime of the operator is tied to the lifetime of the
library object. If you want to bind the operator to a different library,
make sure the library object is alive when the operator is used.
"""
if not supports_custom_op():
from vllm.platforms import current_platform
assert not current_platform.is_cuda_alike(), (
"cuda platform needs torch>=2.4 to support custom op, "
"chances are you are using an old version of pytorch "
"or a custom build of pytorch. It is recommended to "
"use vLLM in a fresh new environment and let it install "
"the required dependencies."
)
return
if mutates_args is None:
mutates_args = []
if dispatch_key is None:
from vllm.platforms import current_platform
dispatch_key = current_platform.dispatch_key
import torch.library
if hasattr(torch.library, "infer_schema"):
schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
else:
# for pytorch 2.4
import torch._custom_op.impl
schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
my_lib = target_lib or vllm_lib
my_lib.define(op_name + schema_str, tags=tags)
my_lib.impl(op_name, op_func, dispatch_key=dispatch_key)
if fake_impl is not None:
my_lib._register_fake(op_name, fake_impl)
...@@ -28,7 +28,8 @@ from vllm.logger import init_logger ...@@ -28,7 +28,8 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import ( from vllm.model_executor.layers.batch_invariant import (
vllm_is_batch_invariant, vllm_is_batch_invariant,
) )
from vllm.utils import cdiv, is_torch_equal_or_newer from vllm.utils import cdiv
from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.utils import ( from vllm.v1.attention.backends.utils import (
AttentionMetadataBuilder, AttentionMetadataBuilder,
CommonAttentionMetadata, CommonAttentionMetadata,
......
...@@ -29,7 +29,7 @@ if current_platform.is_rocm(): ...@@ -29,7 +29,7 @@ if current_platform.is_rocm():
import aiter import aiter
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils import direct_register_custom_op from vllm.utils.torch_utils import direct_register_custom_op
@triton.jit @triton.jit
def _vllm_layout_trans_kernel( def _vllm_layout_trans_kernel(
......
...@@ -10,7 +10,8 @@ from typing_extensions import Self ...@@ -10,7 +10,8 @@ from typing_extensions import Self
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import cdiv, get_dtype_size from vllm.utils import cdiv
from vllm.utils.torch_utils import get_dtype_size
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -4,7 +4,8 @@ ...@@ -4,7 +4,8 @@
import torch import torch
from vllm.model_executor.layers.utils import apply_penalties from vllm.model_executor.layers.utils import apply_penalties
from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad
def apply_all_penalties( def apply_all_penalties(
......
...@@ -71,16 +71,18 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask ...@@ -71,16 +71,18 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.utils import ( from vllm.utils import (
cdiv, cdiv,
check_use_alibi, check_use_alibi,
get_dtype_size,
is_pin_memory_available, is_pin_memory_available,
kv_cache_dtype_str_to_dtype,
length_from_prompt_token_ids_or_embeds, length_from_prompt_token_ids_or_embeds,
round_up, round_up,
supports_dynamo,
) )
from vllm.utils.jsontree import json_map_leaves from vllm.utils.jsontree import json_map_leaves
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import DeviceMemoryProfiler from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.utils.torch_utils import (
get_dtype_size,
kv_cache_dtype_str_to_dtype,
supports_dynamo,
)
from vllm.v1.attention.backends.flash_attn import AttentionMetadata from vllm.v1.attention.backends.flash_attn import AttentionMetadata
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
from vllm.v1.attention.backends.utils import ( from vllm.v1.attention.backends.utils import (
......
...@@ -25,7 +25,8 @@ from vllm.model_executor import set_random_seed ...@@ -25,7 +25,8 @@ from vllm.model_executor import set_random_seed
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.platforms.tpu import USE_TPU_INFERENCE from vllm.platforms.tpu import USE_TPU_INFERENCE
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv from vllm.utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import ModelRunnerOutput
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from vllm import forward_context from vllm import forward_context
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.utils import current_stream from vllm.utils.torch_utils import current_stream
_THREAD_ID_TO_CONTEXT: dict = {} _THREAD_ID_TO_CONTEXT: dict = {}
_CURRENT_CONTEXTS: list[Optional["UBatchContext"]] = [None, None] _CURRENT_CONTEXTS: list[Optional["UBatchContext"]] = [None, None]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment