"docs/vscode:/vscode.git/clone" did not exist on "9dc0a801f56c1e972987451347955782902cc6fa"
setup.py 7.13 KB
Newer Older
1
2
3
import io
import os
import re
4
5
import subprocess
from typing import List, Set
6

7
from packaging.version import parse, Version
Woosuk Kwon's avatar
Woosuk Kwon committed
8
import setuptools
Woosuk Kwon's avatar
Woosuk Kwon committed
9
import torch
10
11
12
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

ROOT_DIR = os.path.dirname(__file__)

# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

# Match the C++ ABI of the installed PyTorch build so the extensions link.
ABI = int(bool(torch._C._GLIBCXX_USE_CXX11_ABI))
abi_flag = f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"
CXX_FLAGS.append(abi_flag)
NVCC_FLAGS.append(abi_flag)
# Building the CUDA extensions below requires a CUDA toolkit on this machine.
_CUDA_HOME_MISSING = (
    "Cannot find CUDA_HOME. CUDA must be available to build the package.")
if CUDA_HOME is None:
    raise RuntimeError(_CUDA_HOME_MISSING)
def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    # `nvcc -V` prints e.g. "... release 11.8, V11.8.89"; the token right
    # after "release" holds the version, with a trailing comma to strip.
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                         universal_newlines=True)
    tokens = raw_output.split()
    version_token = tokens[tokens.index("release") + 1]
    return parse(version_token.split(",")[0])


# Collect the compute capabilities of all available GPUs.
device_count = torch.cuda.device_count()
compute_capabilities: Set[int] = set()
for i in range(device_count):
    major, minor = torch.cuda.get_device_capability(i)
    if major < 7:
        raise RuntimeError(
            "GPUs with compute capability less than 7.0 are not supported.")
    # Encode (major, minor) as a two-digit int, e.g. (8, 6) -> 86.
    compute_capabilities.add(major * 10 + minor)

# Validate the NVCC CUDA version.
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if nvcc_cuda_version < Version("11.0"):
    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
if 86 in compute_capabilities and nvcc_cuda_version < Version("11.1"):
    raise RuntimeError(
        "CUDA 11.1 or higher is required for GPUs with compute capability 8.6."
    )
if 89 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
    # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
    # However, GPUs with compute capability 8.9 can also run the code generated by
    # the previous versions of CUDA 11 and targeting compute capability 8.0.
    # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
    # instead of 8.9.
    compute_capabilities.remove(89)
    compute_capabilities.add(80)
if 90 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
    raise RuntimeError(
        "CUDA 11.8 or higher is required for GPUs with compute capability 9.0."
    )

# If no GPU is available, add all supported compute capabilities.
if not compute_capabilities:
    compute_capabilities = {70, 75, 80}
    if nvcc_cuda_version >= Version("11.1"):
        compute_capabilities.add(86)
    if nvcc_cuda_version >= Version("11.8"):
        compute_capabilities.add(89)
        compute_capabilities.add(90)

# Add target compute capabilities to NVCC flags.
for capability in compute_capabilities:
    NVCC_FLAGS += [
        "-gencode", f"arch=compute_{capability},code=sm_{capability}"
    ]

# Use NVCC threads to parallelize the build.
if nvcc_cuda_version >= Version("11.2"):
    # os.cpu_count() may return None on exotic platforms; fall back to 1
    # so min() does not raise a TypeError.
    num_threads = min(os.cpu_count() or 1, 8)
    NVCC_FLAGS += ["--threads", str(num_threads)]
ext_modules = []


def _cuda_extension(name: str, sources: List[str]) -> CUDAExtension:
    """Build a CUDAExtension wired up with the shared C++/NVCC flags."""
    return CUDAExtension(
        name=name,
        sources=sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
            "nvcc": NVCC_FLAGS,
        },
    )


# Cache operations.
cache_extension = _cuda_extension(
    "vllm.cache_ops", ["csrc/cache.cpp", "csrc/cache_kernels.cu"])
ext_modules.append(cache_extension)

# Attention kernels.
attention_extension = _cuda_extension(
    "vllm.attention_ops",
    ["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"])
ext_modules.append(attention_extension)

# Positional encoding kernels.
positional_encoding_extension = _cuda_extension(
    "vllm.pos_encoding_ops",
    ["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"])
ext_modules.append(positional_encoding_extension)

# Layer normalization kernels.
layernorm_extension = _cuda_extension(
    "vllm.layernorm_ops", ["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"])
ext_modules.append(layernorm_extension)

# Activation kernels.
activation_extension = _cuda_extension(
    "vllm.activation_ops", ["csrc/activation.cpp", "csrc/activation_kernels.cu"])
ext_modules.append(activation_extension)

# Quantization kernels.
quantization_extension = _cuda_extension(
    "vllm.quantization_ops",
    [
        "csrc/quantization.cpp",
        "csrc/quantization/awq/gemm_kernels.cu",
    ])
ext_modules.append(quantization_extension)

def get_path(*filepath) -> str:
    """Join *filepath* components onto the repository root directory."""
    joined = os.path.join(ROOT_DIR, *filepath)
    return joined

def find_version(filepath: str):
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
    """
    with open(filepath) as fp:
        contents = fp.read()
    # Look for a module-level `__version__ = "..."` assignment.
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", contents, re.M)
    if match is None:
        raise RuntimeError("Unable to find version string.")
    return match.group(1)


def read_readme() -> str:
    """Read the README file.

    Uses a context manager so the file handle is closed even on error
    (the previous `io.open(...).read()` form leaked the handle).
    """
    with open(get_path("README.md"), "r", encoding="utf-8") as f:
        return f.read()


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt."""
    with open(get_path("requirements.txt")) as f:
        contents = f.read()
    # One requirement specifier per line.
    return contents.strip().split("\n")


# Package definition: metadata, source layout, and the CUDA extension build.
setuptools.setup(
    # Core metadata.
    name="vllm",
    version=find_version(get_path("vllm", "__init__.py")),
    author="vLLM Team",
    license="Apache 2.0",
    description=("A high-throughput and memory-efficient inference and "
                 "serving engine for LLMs"),
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    # Ship only the library package; exclude repo-level support directories.
    packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
                                               "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    # Compile the CUDA kernels defined above with torch's build machinery.
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)