__init__.py 1.97 KB
Newer Older
1
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
2
3
4
import os

import torch
5

6
7
8
9
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.llm import LLM
10
from vllm.executor.ray_utils import initialize_ray_cluster
11
from vllm.inputs import PromptType, TextPrompt, TokensPrompt
12
from vllm.model_executor.models import ModelRegistry
13
14
15
16
17
from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
                          CompletionOutput, EmbeddingOutput,
                          EmbeddingRequestOutput, PoolingOutput,
                          PoolingRequestOutput, RequestOutput, ScoringOutput,
                          ScoringRequestOutput)
18
from vllm.pooling_params import PoolingParams
19
from vllm.sampling_params import SamplingParams
Woosuk Kwon's avatar
Woosuk Kwon committed
20

21
from .version import __version__, __version_tuple__
Woosuk Kwon's avatar
Woosuk Kwon committed
22

23
24
25
26
# Set common config/environment variables that must apply to every
# process created by vLLM and to every process that interacts with
# vLLM workers.
# These assignments run whenever `import vllm` is executed.
27

28
29
# Force NCCL_CUMEM_ENABLE to "0", overwriting any value already present in
# the environment.
# Background: https://github.com/NVIDIA/nccl/issues/1234
os.environ["NCCL_CUMEM_ENABLE"] = "0"
30

31
32
33
34
# see https://github.com/vllm-project/vllm/issues/10480
os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
# see https://github.com/vllm-project/vllm/issues/10619
torch._inductor.config.compile_threads = 1
35

Woosuk Kwon's avatar
Woosuk Kwon committed
36
# Public API of the `vllm` package: the names exported by
# `from vllm import *`. Entry order matches the original listing.
__all__ = [
    # Version metadata (from .version)
    "__version__",
    "__version_tuple__",
    # Entry point and model registry
    "LLM",
    "ModelRegistry",
    # Prompt input types (from vllm.inputs)
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    # Sampling parameters
    "SamplingParams",
    # Output types (from vllm.outputs)
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    # Engines and their argument containers (from vllm.engine)
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    # Ray cluster helper (from vllm.executor.ray_utils)
    "initialize_ray_cluster",
    # Pooling parameters
    "PoolingParams",
]