__init__.py 2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
3
4
5
import os

import torch
6

7
8
9
10
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.llm import LLM
11
from vllm.executor.ray_utils import initialize_ray_cluster
12
from vllm.inputs import PromptType, TextPrompt, TokensPrompt
13
from vllm.model_executor.models import ModelRegistry
14
15
16
17
18
from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
                          CompletionOutput, EmbeddingOutput,
                          EmbeddingRequestOutput, PoolingOutput,
                          PoolingRequestOutput, RequestOutput, ScoringOutput,
                          ScoringRequestOutput)
19
from vllm.pooling_params import PoolingParams
20
from vllm.sampling_params import SamplingParams
Woosuk Kwon's avatar
Woosuk Kwon committed
21

22
from .version import __version__, __version_tuple__
Woosuk Kwon's avatar
Woosuk Kwon committed
23

24
25
26
27
# set some common config/environment variables that should be set
# for all processes created by vllm and all processes
# that interact with vllm workers.
# they are executed whenever `import vllm` is called.
28

29
30
# see https://github.com/NVIDIA/nccl/issues/1234
os.environ['NCCL_CUMEM_ENABLE'] = '0'
31

32
33
34
35
# see https://github.com/vllm-project/vllm/issues/10480
os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
# see https://github.com/vllm-project/vllm/issues/10619
torch._inductor.config.compile_threads = 1
36

Woosuk Kwon's avatar
Woosuk Kwon committed
37
# Public API of the package: the names exported by `from vllm import *`.
# Every entry is bound by one of the imports at the top of this module.
__all__ = [
    "__version__",
    "__version_tuple__",
    "LLM",
    "ModelRegistry",
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    "initialize_ray_cluster",
    "PoolingParams",
]