setup.py 6.05 KB
Newer Older
1
2
3
import io
import os
import re
4
5
import subprocess
from typing import List, Set
6

7
from packaging.version import parse, Version
Woosuk Kwon's avatar
Woosuk Kwon committed
8
import setuptools
Woosuk Kwon's avatar
Woosuk Kwon committed
9
import torch
10
11
12
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

# Absolute directory containing this setup script. Resolving it up front keeps
# the file lookups below independent of the current working directory.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

# Pass the same _GLIBCXX_USE_CXX11_ABI value that the installed PyTorch was
# built with to both the host compiler and nvcc. The public helper
# torch.compiled_with_cxx11_abi() is used instead of the private torch._C
# attribute.
ABI = 1 if torch.compiled_with_cxx11_abi() else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
22

Cody Yu's avatar
Cody Yu committed
23
# The CUDA extensions cannot be built without a CUDA toolkit, so fail early
# with an explicit message when torch could not locate one.
if CUDA_HOME is None:
    raise RuntimeError(
        "Cannot find CUDA_HOME. CUDA must be available in order to build the package.")
Woosuk Kwon's avatar
Woosuk Kwon committed
26

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Return the CUDA release reported by the nvcc found under ``cuda_dir``.

    Runs ``<cuda_dir>/bin/nvcc -V`` and parses the token that follows the
    word "release" in its output, with anything after the first comma removed.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    nvcc_binary = cuda_dir + "/bin/nvcc"
    raw_output = subprocess.check_output([nvcc_binary, "-V"],
                                         universal_newlines=True)
    tokens = raw_output.split()
    # The version token sits directly after "release"; drop the part that
    # follows the comma before parsing.
    version_token = tokens[tokens.index("release") + 1]
    release_text = version_token.partition(",")[0]
    return parse(release_text)


# Record the compute capability of every GPU torch can see, encoded as
# major * 10 + minor. Any device below 7.0 aborts the build.
compute_capabilities: Set[int] = set()
device_count = torch.cuda.device_count()
for device_index in range(device_count):
    capability = torch.cuda.get_device_capability(device_index)
    major, minor = capability
    if not major >= 7:
        raise RuntimeError(
            "GPUs with compute capability less than 7.0 are not supported.")
    compute_capabilities.add(10 * major + minor)

# Validate the NVCC CUDA version.
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if nvcc_cuda_version < Version("11.0"):
    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")

# (compute capability, minimum CUDA version, error message), checked in order.
_CAPABILITY_CUDA_REQUIREMENTS = (
    (86, Version("11.1"),
     "CUDA 11.1 or higher is required for GPUs with compute capability 8.6."),
    (90, Version("11.8"),
     "CUDA 11.8 or higher is required for GPUs with compute capability 9.0."),
)
for _capability, _min_cuda, _message in _CAPABILITY_CUDA_REQUIREMENTS:
    if _capability in compute_capabilities and nvcc_cuda_version < _min_cuda:
        raise RuntimeError(_message)
Woosuk Kwon's avatar
Woosuk Kwon committed
61

62
63
64
65
66
67
68
69
70
71
72
73
# If no GPU is available, add all supported compute capabilities.
if not compute_capabilities:
    compute_capabilities = {70, 75, 80}
    # 8.6 and 9.0 are only targeted when nvcc is at least 11.1 / 11.8.
    for _min_cuda, _arch in (("11.1", 86), ("11.8", 90)):
        if nvcc_cuda_version >= Version(_min_cuda):
            compute_capabilities.add(_arch)

# Add target compute capabilities to NVCC flags.
for capability in compute_capabilities:
    NVCC_FLAGS.extend(
        ("-gencode", f"arch=compute_{capability},code=sm_{capability}"))

Woosuk Kwon's avatar
Woosuk Kwon committed
74
75
76
77
78
# Use NVCC threads to parallelize the build. The flag is only passed when the
# detected nvcc is version 11.2 or newer.
if nvcc_cuda_version >= Version("11.2"):
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single thread instead of failing inside min().
    num_threads = min(os.cpu_count() or 1, 8)
    NVCC_FLAGS += ["--threads", str(num_threads)]

Woosuk Kwon's avatar
Woosuk Kwon committed
79
80
81
def _cuda_extension(name: str, sources: List[str]):
    """Create a CUDAExtension that is compiled with the shared CXX/NVCC flags."""
    return CUDAExtension(
        name=name,
        sources=sources,
        extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
    )


# Cache operations.
cache_extension = _cuda_extension(
    "vllm.cache_ops",
    ["csrc/cache.cpp", "csrc/cache_kernels.cu"],
)

# Attention kernels.
attention_extension = _cuda_extension(
    "vllm.attention_ops",
    ["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
)

# Positional encoding kernels.
positional_encoding_extension = _cuda_extension(
    "vllm.pos_encoding_ops",
    ["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
)

# Layer normalization kernels.
layernorm_extension = _cuda_extension(
    "vllm.layernorm_ops",
    ["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
)

# Activation kernels.
activation_extension = _cuda_extension(
    "vllm.activation_ops",
    ["csrc/activation.cpp", "csrc/activation_kernels.cu"],
)

# Extensions handed to setuptools, in the order they were defined above.
ext_modules = [
    cache_extension,
    attention_extension,
    positional_encoding_extension,
    layernorm_extension,
    activation_extension,
]

121

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
def get_path(*filepath) -> str:
    """Return the given path components joined onto ROOT_DIR."""
    components = (ROOT_DIR, *filepath)
    return os.path.join(*components)


def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py

    Args:
        filepath: Path of a text file containing a line of the form
            ``__version__ = "<version>"`` (single or double quotes).

    Returns:
        The text between the quotes of the first matching line.

    Raises:
        RuntimeError: If no such line is present in the file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # of the machine running the build.
    with open(filepath, encoding="utf-8") as fp:
        contents = fp.read()

    # re.M lets "^" anchor at the start of any line, not only the first one.
    version_match = re.search(
        r"^__version__ = ['\"]([^'\"]*)['\"]", contents, re.M)
    if version_match is None:
        raise RuntimeError("Unable to find version string.")
    return version_match.group(1)


def read_readme() -> str:
    """Read the README file.

    Returns:
        The full contents of README.md (resolved through get_path), decoded
        as UTF-8.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(get_path("README.md"), "r", encoding="utf-8") as f:
        return f.read()


144
145
def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt.

    Returns:
        One entry per non-blank line of requirements.txt (resolved through
        get_path), with surrounding whitespace removed. An empty file yields
        an empty list.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    with open(get_path("requirements.txt"), encoding="utf-8") as f:
        stripped_lines = [line.strip() for line in f]
    # Blank lines carry no requirement; dropping them avoids "" entries.
    return [line for line in stripped_lines if line]


Woosuk Kwon's avatar
Woosuk Kwon committed
151
# Static package metadata, kept apart from the setup() call for readability.
_PROJECT_URLS = {
    "Homepage": "https://github.com/vllm-project/vllm",
    "Documentation": "https://vllm.readthedocs.io/en/latest/",
}
_CLASSIFIERS = [
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "License :: OSI Approved :: Apache Software License",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
# Top-level directories that are not part of the installed package.
_EXCLUDED_PACKAGES = (
    "assets", "benchmarks", "csrc", "docs", "examples", "tests")

setuptools.setup(
    name="vllm",
    version=find_version(get_path("vllm", "__init__.py")),
    author="vLLM Team",
    license="Apache 2.0",
    description="A high-throughput and memory-efficient inference and serving engine for LLMs",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls=_PROJECT_URLS,
    classifiers=_CLASSIFIERS,
    packages=setuptools.find_packages(exclude=_EXCLUDED_PACKAGES),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)