__init__.py 2.92 KB
Newer Older
1
2
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""

3
4
5
6
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.llm import LLM
7
from vllm.executor.ray_utils import initialize_ray_cluster
8
from vllm.inputs import PromptType, TextPrompt, TokensPrompt
9
from vllm.model_executor.models import ModelRegistry
10
11
12
13
14
from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
                          CompletionOutput, EmbeddingOutput,
                          EmbeddingRequestOutput, PoolingOutput,
                          PoolingRequestOutput, RequestOutput, ScoringOutput,
                          ScoringRequestOutput)
15
from vllm.pooling_params import PoolingParams
16
from vllm.sampling_params import SamplingParams
Woosuk Kwon's avatar
Woosuk Kwon committed
17

18
from .version import __version__, __version_tuple__
Woosuk Kwon's avatar
Woosuk Kwon committed
19

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

def configure_as_vllm_process():
    """Apply the process-wide settings shared by every vLLM process.

    Intended for all processes created by vLLM as well as any process
    that interacts with vLLM workers. The function mutates ``os.environ``
    and torch's inductor/dynamo configuration in place and returns
    ``None``.
    """
    import os

    import torch

    env = os.environ

    # NCCL workaround, see https://github.com/NVIDIA/nccl/issues/1234
    env['NCCL_CUMEM_ENABLE'] = '0'

    # Limit inductor to one compile thread, both through the environment
    # (https://github.com/vllm-project/vllm/issues/10480) and through the
    # in-process config (https://github.com/vllm-project/vllm/issues/10619).
    env['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
    torch._inductor.config.compile_threads = 1

    # Imported only after the settings above have been applied.
    from vllm.platforms import current_platform

    if current_platform.is_xpu():
        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
        torch._dynamo.config.disable = True
        return

    if not current_platform.is_hpu():
        return

    # NOTE(kzawora): the PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) does not
    # support torch.compile; the eager backend (PT_HPU_LAZY_MODE = 0) must
    # be selected for torch.compile support. Lazy mode is assumed when the
    # variable is unset.
    if env.get('PT_HPU_LAZY_MODE', '1') != '1':
        return

    torch._dynamo.config.disable = True
    # NOTE(kzawora): multi-HPU inference with HPUGraphs (lazy-only)
    # requires enabling lazy collectives, see
    # https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
    env['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'


Woosuk Kwon's avatar
Woosuk Kwon committed
58
# Public API of the package: the names exported by ``from vllm import *``.
# Every entry is bound by an import at the top of this module or by a
# definition in this module.
__all__ = [
    # from .version
    "__version__",
    "__version_tuple__",
    # from vllm.entrypoints.llm / vllm.model_executor.models
    "LLM",
    "ModelRegistry",
    # from vllm.inputs
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    # from vllm.sampling_params / vllm.outputs
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    # from vllm.engine
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    # from vllm.executor.ray_utils
    "initialize_ray_cluster",
    # from vllm.pooling_params
    "PoolingParams",
    # defined in this module
    "configure_as_vllm_process",
]