"vscode:/vscode.git/clone" did not exist on "ea49e6a3c82bdd48f377efc79bfe44f37f13b3ad"
__init__.py 13.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import contextlib
5
import datetime
Woosuk Kwon's avatar
Woosuk Kwon committed
6
import enum
7
import getpass
8
import inspect
9
import multiprocessing
10
import os
11
import signal
12
import sys
13
14
import tempfile
import threading
Zhuohan Li's avatar
Zhuohan Li committed
15
import uuid
16
import warnings
17
from collections.abc import Callable
Cyrus Leung's avatar
Cyrus Leung committed
18
from functools import partial, wraps
19
from typing import TYPE_CHECKING, Any, TypeVar
Zhuohan Li's avatar
Zhuohan Li committed
20

21
import cloudpickle
22
import psutil
Zhuohan Li's avatar
Zhuohan Li committed
23
import torch
24

25
import vllm.envs as envs
26
from vllm.logger import enable_trace_function_call, init_logger
27
from vllm.ray.lazy_utils import is_in_ray_actor
28

29
30
31
32
33
# Names that used to be available directly from this module, mapped to the
# `vllm.utils` submodule that provides them now. Consulted by the
# module-level `__getattr__` and `__dir__` hooks.
_DEPRECATED_MAPPINGS = {
    "cprofile": "profiling",
    "cprofile_context": "profiling",
    "get_open_port": "network_utils",
}
34
35
36


def __getattr__(name: str) -> Any:  # noqa: D401 - short deprecation docstring
    """Module-level getattr to handle deprecated utilities.

    Python invokes this hook (PEP 562) only when regular attribute lookup on
    the module fails. Names listed in ``_DEPRECATED_MAPPINGS`` are fetched
    from the submodule they moved to, after a ``DeprecationWarning`` has been
    emitted.

    Args:
        name: Attribute being looked up on this module.

    Returns:
        The attribute called ``name`` taken from its new submodule.

    Raises:
        AttributeError: If ``name`` is not a known deprecated utility.
    """
    submodule_name = _DEPRECATED_MAPPINGS.get(name)
    if submodule_name is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # stacklevel=2 attributes the warning to the code that accessed the name
    warnings.warn(
        f"vllm.utils.{name} is deprecated and will be removed in a future version. "
        f"Use vllm.utils.{submodule_name}.{name} instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Imported locally: only needed on this rarely taken compatibility path
    import importlib

    # import_module returns the leaf submodule itself, so no `fromlist`
    # workaround is needed as with a direct __import__ call
    module = importlib.import_module(f"vllm.utils.{submodule_name}")
    return getattr(module, name)


def __dir__() -> list[str]:
    """Return the sorted module attribute names plus the deprecated names."""
    # deprecated names are listed too, for better UX/tab-completion
    names = [*globals(), *_DEPRECATED_MAPPINGS]
    names.sort()
    return names
54
55


56
# The config classes are imported for static type checkers only. At runtime
# both names are bound to `object`, so annotations in this module that
# mention them still evaluate.
if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig
else:
    ModelConfig = VllmConfig = object
61

62
63
# Module-level logger, named after this module
logger = init_logger(__name__)

64
65
66
67
68
69
# Default maximum number of batched tokens. The value is chosen as a balance
# between ITL (inter-token latency) and TTFT (time to first token). Note it
# is not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
# Values of the same setting for pooling and multimodal models (per their
# names)
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120

70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Constants related to forcing the attention backend selection

# Name of the environment variable which may be set in order to
# force the attention backend that the Attention wrapper would
# otherwise auto-select
STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"

# Possible string values of the STR_BACKEND_ENV_VAR environment
# variable, corresponding to possible backends
STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA"
STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
STR_INVALID_VAL: str = "INVALID"

85

86
# ANSI color codes
87
88
CYAN = "\033[1;36m"  # SGR 1 (bold) + 36 (cyan foreground)
RESET = "\033[0;0m"  # SGR 0: back to the default rendition
89

90

91
# Generic type variables for annotations in this module
T = TypeVar("T")
U = TypeVar("U")
93

94

Woosuk Kwon's avatar
Woosuk Kwon committed
95
96
97
98
99
class Device(enum.Enum):
    """Device kinds: GPU or CPU. Member values are auto-assigned."""

    GPU = enum.auto()
    CPU = enum.auto()


100
101
102
103
104
class LayerBlockType(enum.Enum):
    """Kinds of layer block; each member's value is its own lowercase name."""

    attention = "attention"
    mamba = "mamba"


Woosuk Kwon's avatar
Woosuk Kwon committed
105
106
107
108
class Counter:
    """Integer counter that is advanced by calling ``next()`` on it.

    Only ``__next__`` is defined (no ``__iter__``), and no locking is done.
    """

    def __init__(self, start: int = 0) -> None:
        """Start counting from ``start``."""
        self.counter = start

    def __next__(self) -> int:
        """Return the current count, then advance it by one."""
        current = self.counter
        self.counter = current + 1
        return current

    def reset(self) -> None:
        """Set the count back to 0 (regardless of the initial ``start``)."""
        self.counter = 0
Zhuohan Li's avatar
Zhuohan Li committed
116

117

Cyrus Leung's avatar
Cyrus Leung committed
118
119
class AtomicCounter:
    """A counter whose updates are serialized by a lock."""

    def __init__(self, initial=0):
        """Create the counter, starting from ``initial``."""
        self._lock = threading.Lock()
        self._value = initial

    def inc(self, num=1):
        """Add ``num`` while holding the lock and return the updated value."""
        with self._lock:
            self._value += num
            updated = self._value
        return updated

    def dec(self, num=1):
        """Subtract ``num`` while holding the lock and return the updated value."""
        with self._lock:
            self._value -= num
            updated = self._value
        return updated

    @property
    def value(self):
        """The current value; read without taking the lock."""
        return self._value
141
142


Cyrus Leung's avatar
Cyrus Leung committed
143
144
def random_uuid() -> str:
    """Return a random (version 4) UUID as a 32-character hex string."""
    # `.hex` is the dash-free hexadecimal form and is already a `str`
    return uuid.uuid4().hex
145
146


147
148
# TODO: This function can be removed if transformer_modules classes are
# serialized by value when communicating between processes
149
def init_cached_hf_modules() -> None:
    """
    Lazily initialize the Hugging Face modules.

    Delegates to ``transformers.dynamic_module_utils.init_hf_modules``;
    ``transformers`` is only imported once this function is called.
    """
    from transformers import dynamic_module_utils

    dynamic_module_utils.init_hf_modules()
156
157


158
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
    """Set up function tracing for the current thread.

    Does nothing unless ``envs.VLLM_TRACE_FUNCTION`` is set. Otherwise the
    trace goes to a log file under
    ``<tmp>/<user>/vllm/vllm-instance-<instance_id>/`` whose name carries the
    process id, the thread id and the current timestamp.
    """
    if not envs.VLLM_TRACE_FUNCTION:
        return

    # add username to the temp dir to avoid permission issues
    user_tmp_dir = os.path.join(tempfile.gettempdir(), getpass.getuser())

    raw_name = (
        f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
        f"_thread_{threading.get_ident()}_"
        f"at_{datetime.datetime.now()}.log"
    )
    # the rendered timestamp contains a space; swap it for an underscore
    filename = raw_name.replace(" ", "_")

    instance_dir_name = f"vllm-instance-{vllm_config.instance_id}"
    log_path = os.path.join(user_tmp_dir, "vllm", instance_dir_name, filename)

    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    enable_trace_function_call(log_path)
177
178


179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def kill_process_tree(pid: int):
    """
    Sends SIGKILL to every descendant of the given pid, then to the pid itself.

    Returns silently if no process with this pid exists. Processes that are
    already gone by the time they are signalled are skipped.

    Args:
        pid (int): Process ID of the root of the process tree
    """
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return

    # Descendants (collected recursively) come first, the root goes last
    targets = [proc.pid for proc in root.children(recursive=True)]
    targets.append(pid)

    for target in targets:
        with contextlib.suppress(ProcessLookupError):
            os.kill(target, signal.SIGKILL)
202
203


204
# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
205
def set_ulimit(target_soft_limit=65535):
    """Raise the soft limit on open file descriptors when it is too low.

    Skipped on Windows. When the current soft ``RLIMIT_NOFILE`` is below
    ``target_soft_limit``, an attempt is made to raise it to that value while
    keeping the hard limit as is. A ``ValueError`` from that attempt is
    logged as a warning instead of being propagated.
    """
    if sys.platform.startswith("win"):
        logger.info("Windows detected, skipping ulimit adjustment.")
        return

    # imported after the platform check above, which returns early on Windows
    import resource

    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    if soft >= target_soft_limit:
        return

    try:
        resource.setrlimit(resource.RLIMIT_NOFILE, (target_soft_limit, hard))
    except ValueError as err:
        logger.warning(
            "Found ulimit of %s and failed to automatically increase "
            "with error %s. This can cause fd limit errors like "
            "`OSError: [Errno 24] Too many open files`. Consider "
            "increasing with ulimit -n",
            soft,
            err,
        )
227
228


229
230
231
232
233
234
235
def _maybe_force_spawn():
    """Force the `spawn` multiprocessing start method when it is required.

    Nothing happens if ``VLLM_WORKER_MULTIPROC_METHOD`` is already ``spawn``.
    Otherwise, running inside a Ray actor, or having CUDA or XPU already
    initialized, makes this function log a warning and set the environment
    variable to ``spawn``.
    """
    if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
        return

    spawn_reasons = []

    if is_in_ray_actor():
        # The subprocess must know how to connect to the ray cluster, even
        # when spawned. Env vars are inherited by subprocesses, so the
        # address is handed over through RAY_ADDRESS.
        import ray

        os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address
        spawn_reasons.append("In a Ray actor and can only be spawned")

    from .platform_utils import cuda_is_initialized, xpu_is_initialized

    if cuda_is_initialized():
        spawn_reasons.append("CUDA is initialized")
    elif xpu_is_initialized():
        spawn_reasons.append("XPU is initialized")

    if not spawn_reasons:
        return

    logger.warning(
        "We must use the `spawn` multiprocessing start method. "
        "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
        "See https://docs.vllm.ai/en/latest/usage/"
        "troubleshooting.html#python-multiprocessing "
        "for more information. Reasons: %s",
        "; ".join(spawn_reasons),
    )
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


def get_mp_context():
    """Return the multiprocessing context for the configured start method.

    The method (spawn or fork) is taken from ``VLLM_WORKER_MULTIPROC_METHOD``
    (fork by default, per the original note; the default itself is defined in
    ``vllm.envs``). ``_maybe_force_spawn()`` runs first and may override that
    variable to ``spawn`` under certain conditions.
    """
    _maybe_force_spawn()
    return multiprocessing.get_context(envs.VLLM_WORKER_MULTIPROC_METHOD)
275
276


277
278
def run_method(
    obj: Any,
279
    method: str | bytes | Callable,
280
281
282
    args: tuple[Any],
    kwargs: dict[str, Any],
) -> Any:
283
284
285
286
287
288
289
290
291
292
293
294
295
    """
    Run a method of an object with the given arguments and keyword arguments.
    If the method is string, it will be converted to a method using getattr.
    If the method is serialized bytes and will be deserialized using
    cloudpickle.
    If the method is a callable, it will be called directly.
    """
    if isinstance(method, bytes):
        func = partial(cloudpickle.loads(method), obj)
    elif isinstance(method, str):
        try:
            func = getattr(obj, method)
        except AttributeError:
296
297
298
            raise NotImplementedError(
                f"Method {method!r} is not implemented."
            ) from None
299
300
301
    else:
        func = partial(method, obj)  # type: ignore
    return func(*args, **kwargs)
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322


def import_pynvml():
    """
    Return the `pynvml` module that is vendored under `vllm.third_party`.

    Background (historical):

    libnvml.so is the library behind nvidia-smi, and pynvml is a Python
    wrapper around it. We use it to get GPU status without initializing a
    CUDA context in the current process.

    Two packages have provided a module called `pynvml`:
    - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): the official
        wrapper. It is a dependency of vLLM and is installed along with it.
        Its `pynvml` module is a standalone Python file.
    - `pynvml` (https://pypi.org/project/pynvml/): an unofficial wrapper.
        Before version 12.0 it also shipped a module named `pynvml`, which
        conflicts with the official one. Worse, that module is a Python
        package and therefore takes priority over the official standalone
        file, causing errors when both are installed. Since version 12.0
        it uses a module named `pynvml_utils` to avoid the conflict.

    Many packages in the community use the unofficial one by mistake, and
    we have to handle this case. For example,
    `nvcr.io/nvidia/pytorch:24.12-py3` uses the unofficial one, which causes
    errors; see https://github.com/vllm-project/vllm/issues/12847.
    After all the troubles, we decided to copy the official `pynvml` module
    into our codebase and use it directly.
    """
    from vllm.third_party import pynvml

    return pynvml
334
335


336
def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
    """
    A replacement for `abc.ABC`.
    When we use `abc.ABC`, subclasses will fail to instantiate
    if they do not implement all abstract methods.
    Here, we only require `raise NotImplementedError` in the
    base class, and log a message (at debug level) if the method is not
    implemented in the subclass.

    The check runs at the end of every `__init__` call: each public bound
    method whose source code mentions `NotImplementedError` is reported.

    Args:
        cls: The class to decorate; its `__init__` is wrapped in place.

    Returns:
        The same class object.
    """

    original_init = cls.__init__

    def find_unimplemented_methods(self: object):
        unimplemented_methods = []
        for attr_name in dir(self):
            # bypass inner method
            if attr_name.startswith("_"):
                continue

            try:
                attr = getattr(self, attr_name)
            except AttributeError:
                continue

            # Only bound methods expose `__func__`. Plain data attributes,
            # static methods and other callables are skipped, so a function
            # left over from a previous iteration is never inspected.
            attr_func = getattr(attr, "__func__", None)
            if attr_func is None or not callable(attr):
                continue

            try:
                src = inspect.getsource(attr_func)
            except (OSError, TypeError):
                # source code is not available for this callable
                continue
            if "NotImplementedError" in src:
                unimplemented_methods.append(attr_name)
        if unimplemented_methods:
            method_names = ",".join(unimplemented_methods)
            msg = f"Methods {method_names} not implemented in {self}"
            logger.debug(msg)

    @wraps(original_init)
    def wrapped_init(self, *args, **kwargs) -> None:
        original_init(self, *args, **kwargs)
        find_unimplemented_methods(self)

    # written via type.__setattr__ to go around any custom metaclass
    # __setattr__
    type.__setattr__(cls, "__init__", wrapped_init)
    return cls
377
378


379
380
# Only relevant for models using ALiBi (e.g., MPT)
def check_use_alibi(model_config: ModelConfig) -> bool:
    """Return whether the model's configuration indicates ALiBi is used.

    Any one of the following is sufficient:
    - `hf_text_config.alibi` is truthy (Falcon)
    - `"BloomForCausalLM"` is among `hf_config.architectures` (Bloom)
    - `hf_text_config.position_encoding_type == "alibi"` (codellm_1b_alibi)
    - `hf_text_config.attn_config` has a truthy `alibi` entry, with
      `attn_config` being either a dict or an object (MPT)

    Args:
        model_config: Object exposing `hf_text_config` and `hf_config`.

    Returns:
        True if any of the checks above matches, otherwise False.
    """
    cfg = model_config.hf_text_config

    # Falcon
    if getattr(cfg, "alibi", False):
        return True

    # Bloom. `architectures` may exist but be None, which must not be fed to
    # the `in` operator.
    architectures = getattr(model_config.hf_config, "architectures", None) or ()
    if "BloomForCausalLM" in architectures:
        return True

    # codellm_1b_alibi
    if getattr(cfg, "position_encoding_type", "") == "alibi":
        return True

    # MPT: `attn_config` is either a dict or an attribute-style object. A
    # missing `attn_config` yields None, which has no `alibi` attribute.
    attn_config = getattr(cfg, "attn_config", None)
    if isinstance(attn_config, dict):
        return bool(attn_config.get("alibi", False))
    return bool(getattr(attn_config, "alibi", False))
402
403


404
def length_from_prompt_token_ids_or_embeds(
405
406
    prompt_token_ids: list[int] | None,
    prompt_embeds: torch.Tensor | None,
407
) -> int:
408
    """Calculate the request length (in number of tokens) give either
409
410
    prompt_token_ids or prompt_embeds.
    """
411
412
    prompt_token_len = None if prompt_token_ids is None else len(prompt_token_ids)
    prompt_embeds_len = None if prompt_embeds is None else len(prompt_embeds)
413
414
415

    if prompt_token_len is None:
        if prompt_embeds_len is None:
416
            raise ValueError("Neither prompt_token_ids nor prompt_embeds were defined.")
417
418
        return prompt_embeds_len
    else:
419
        if prompt_embeds_len is not None and prompt_embeds_len != prompt_token_len:
420
421
422
            raise ValueError(
                "Prompt token ids and prompt embeds had different lengths"
                f" prompt_token_ids={prompt_token_len}"
423
424
                f" prompt_embeds={prompt_embeds_len}"
            )
425
        return prompt_token_len