setup.py 6.55 KB
Newer Older
1
2
3
import io
import os
import re
4
5
import subprocess
from typing import List, Set
6

7
from packaging.version import parse, Version
Woosuk Kwon's avatar
Woosuk Kwon committed
8
import setuptools
Woosuk Kwon's avatar
Woosuk Kwon committed
9
import torch
10
11
12
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

# Repository root: the directory that contains this setup.py.
ROOT_DIR = os.path.dirname(__file__)

# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

# Match PyTorch's C++11 ABI setting so the extensions link against
# libtorch without symbol mismatches.
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS.append(f"-D_GLIBCXX_USE_CXX11_ABI={ABI}")
NVCC_FLAGS.append(f"-D_GLIBCXX_USE_CXX11_ABI={ABI}")
22

Cody Yu's avatar
Cody Yu committed
23
# Building this package compiles CUDA kernels, so a CUDA toolkit is mandatory.
if CUDA_HOME is None:
    # NOTE: was an f-string with no placeholders; plain literal is equivalent.
    raise RuntimeError(
        "Cannot find CUDA_HOME. CUDA must be available to build the package.")

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py

    Args:
        cuda_dir: CUDA installation root (e.g. the value of ``CUDA_HOME``).

    Returns:
        The parsed CUDA release version, e.g. ``Version("11.8")``.

    Raises:
        subprocess.CalledProcessError: If ``nvcc -V`` exits with a
            non-zero status.
        ValueError: If the output contains no "release" token.
    """
    # Build the path portably instead of concatenating with "/".
    nvcc_path = os.path.join(cuda_dir, "bin", "nvcc")
    nvcc_output = subprocess.check_output([nvcc_path, "-V"],
                                          universal_newlines=True)
    # `nvcc -V` ends with e.g. "... release 11.8, V11.8.89": take the token
    # following "release" and strip its trailing comma.
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


# Collect the compute capabilities of all available GPUs.
device_count = torch.cuda.device_count()
compute_capabilities: Set[int] = set()
for device_id in range(device_count):
    major, minor = torch.cuda.get_device_capability(device_id)
    if major < 7:
        raise RuntimeError(
            "GPUs with compute capability less than 7.0 are not supported.")
    compute_capabilities.add(major * 10 + minor)

# Validate the NVCC CUDA version.
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if nvcc_cuda_version < Version("11.0"):
    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
if 86 in compute_capabilities and nvcc_cuda_version < Version("11.1"):
    raise RuntimeError(
        "CUDA 11.1 or higher is required for GPUs with compute capability 8.6.")
if 89 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
    # Emitting code for compute capability 8.9 needs CUDA 11.8, but 8.9 GPUs
    # can run binaries built for 8.0 by earlier CUDA 11 toolkits, so fall
    # back to targeting 8.0 instead of failing the build.
    compute_capabilities.remove(89)
    compute_capabilities.add(80)
if 90 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
    raise RuntimeError(
        "CUDA 11.8 or higher is required for GPUs with compute capability 9.0.")
Woosuk Kwon's avatar
Woosuk Kwon committed
69

70
71
72
73
74
75
# If no GPU is available, add all supported compute capabilities.
if not compute_capabilities:
    compute_capabilities = {70, 75, 80}
    if nvcc_cuda_version >= Version("11.1"):
        compute_capabilities.add(86)
    if nvcc_cuda_version >= Version("11.8"):
        # Ada (8.9) and Hopper (9.0) targets only exist from CUDA 11.8 on.
        compute_capabilities.update({89, 90})

# Add target compute capabilities to NVCC flags.
for cap in compute_capabilities:
    NVCC_FLAGS += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]

Woosuk Kwon's avatar
Woosuk Kwon committed
83
84
85
86
87
# Use NVCC threads to parallelize the build (--threads exists since CUDA 11.2).
if nvcc_cuda_version >= Version("11.2"):
    # os.cpu_count() may return None on exotic platforms; min(None, 8) would
    # raise TypeError, so fall back to a single thread in that case.
    num_threads = min(os.cpu_count() or 1, 8)
    NVCC_FLAGS += ["--threads", str(num_threads)]

Woosuk Kwon's avatar
Woosuk Kwon committed
88
89
90
ext_modules = []


def _make_cuda_extension(name: str, sources: List[str]) -> CUDAExtension:
    """Build a CUDAExtension that shares the common compiler flags.

    All five kernel extensions below differ only in name and sources, so the
    shared ``extra_compile_args`` wiring lives here (DRY).
    """
    return CUDAExtension(
        name=name,
        sources=sources,
        extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
    )


# Cache operations.
cache_extension = _make_cuda_extension(
    "vllm.cache_ops", ["csrc/cache.cpp", "csrc/cache_kernels.cu"])
ext_modules.append(cache_extension)

# Attention kernels.
attention_extension = _make_cuda_extension(
    "vllm.attention_ops",
    ["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"])
ext_modules.append(attention_extension)

# Positional encoding kernels.
positional_encoding_extension = _make_cuda_extension(
    "vllm.pos_encoding_ops",
    ["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"])
ext_modules.append(positional_encoding_extension)

# Layer normalization kernels.
layernorm_extension = _make_cuda_extension(
    "vllm.layernorm_ops", ["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"])
ext_modules.append(layernorm_extension)

# Activation kernels.
activation_extension = _make_cuda_extension(
    "vllm.activation_ops",
    ["csrc/activation.cpp", "csrc/activation_kernels.cu"])
ext_modules.append(activation_extension)

130

131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def get_path(*filepath) -> str:
    """Join *filepath* components onto the repository root directory."""
    parts = (ROOT_DIR,) + filepath
    return os.path.join(*parts)


def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py

    Args:
        filepath: Path to a file containing a line-anchored
            ``__version__ = "..."`` assignment.

    Returns:
        The version string, e.g. ``"0.1.0"``.

    Raises:
        RuntimeError: If no version assignment is found.
    """
    # Read as UTF-8 explicitly so the build does not depend on the locale.
    with open(filepath, encoding="utf-8") as fp:
        version_match = re.search(
            r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")


def read_readme() -> str:
    """Read the README file (used as the package long description)."""
    # Use a context manager so the handle is closed deterministically; the
    # previous io.open(...).read() left closing to the garbage collector.
    with open(get_path("README.md"), "r", encoding="utf-8") as f:
        return f.read()


153
154
def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt.

    Returns:
        One requirement specifier per line of the file (leading/trailing
        whitespace of the whole file stripped first).
    """
    # Read as UTF-8 explicitly so the build does not depend on the locale.
    with open(get_path("requirements.txt"), encoding="utf-8") as f:
        requirements = f.read().strip().split("\n")
    return requirements


Woosuk Kwon's avatar
Woosuk Kwon committed
160
# Package metadata and build wiring; BuildExtension drives the NVCC builds
# of the CUDAExtension modules collected in ext_modules above.
setuptools.setup(
    name="vllm",
    version=find_version(get_path("vllm", "__init__.py")),
    author="vLLM Team",
    license="Apache 2.0",
    description="A high-throughput and memory-efficient inference and serving engine for LLMs",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=setuptools.find_packages(
        exclude=("assets", "benchmarks", "csrc", "docs", "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)