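"""Setup script for vLLM.

Builds the custom CUDA kernels as PyTorch C++/CUDA extensions. The extensions
can typically be built in place with `python setup.py build_ext --inplace`.
"""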
import io
import os
import re
import subprocess
from typing import List, Set

from packaging.version import parse, Version
import setuptools
import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

ROOT_DIR = os.path.dirname(__file__)

# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

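# Match the C++ ABI of the installed PyTorch so the extensions link against
# libtorch correctly.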
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]

if CUDA_HOME is None:
    raise RuntimeError(
        f"Cannot find CUDA_HOME. CUDA must be available in order to build the package.")


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                          universal_newlines=True)
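    # nvcc -V prints a line like "Cuda compilation tools, release 11.8,
    # V11.8.89"; take the token after "release" and strip the trailing comma.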
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


# Collect the compute capabilities of all available GPUs.
device_count = torch.cuda.device_count()
compute_capabilities: Set[int] = set()
for i in range(device_count):
    major, minor = torch.cuda.get_device_capability(i)
    if major < 7:
        raise RuntimeError(
            "GPUs with compute capability less than 7.0 are not supported.")
    compute_capabilities.add(major * 10 + minor)
# If no GPU is available, add all supported compute capabilities.
if not compute_capabilities:
    compute_capabilities = {70, 75, 80, 86, 90}
# Add target compute capabilities to NVCC flags.
for capability in compute_capabilities:
    NVCC_FLAGS += ["-gencode", f"arch=compute_{capability},code=sm_{capability}"]

# Validate the NVCC CUDA version.
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if nvcc_cuda_version < Version("11.0"):
    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
if 86 in compute_capabilities and nvcc_cuda_version < Version("11.1"):
    raise RuntimeError(
        "CUDA 11.1 or higher is required for GPUs with compute capability 8.6.")
if 90 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
    raise RuntimeError(
        "CUDA 11.8 or higher is required for GPUs with compute capability 9.0.")

# Use NVCC threads to parallelize the build.
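# (nvcc's --threads option is only available in CUDA 11.2+.)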
if nvcc_cuda_version >= Version("11.2"):
    num_threads = min(os.cpu_count() or 1, 8)
    NVCC_FLAGS += ["--threads", str(num_threads)]

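# Each custom op below is built as its own extension: a C++ binding file
# paired with its CUDA kernels under csrc/.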
ext_modules = []

# Cache operations.
cache_extension = CUDAExtension(
    name="vllm.cache_ops",
    sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
ext_modules.append(cache_extension)

# Attention kernels.
attention_extension = CUDAExtension(
    name="vllm.attention_ops",
    sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
ext_modules.append(attention_extension)

# Positional encoding kernels.
positional_encoding_extension = CUDAExtension(
    name="vllm.pos_encoding_ops",
    sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
ext_modules.append(positional_encoding_extension)

# Layer normalization kernels.
layernorm_extension = CUDAExtension(
    name="vllm.layernorm_ops",
    sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
ext_modules.append(layernorm_extension)

# Activation kernels.
activation_extension = CUDAExtension(
    name="vllm.activation_ops",
    sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
    extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
ext_modules.append(activation_extension)


def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


def find_version(filepath: str):
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
    """
    with open(filepath) as fp:
        version_match = re.search(
            r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M)
        if version_match:
            return version_match.group(1)
        raise RuntimeError("Unable to find version string.")


def read_readme() -> str:
    """Read the README file."""
    with io.open(get_path("README.md"), "r", encoding="utf-8") as f:
        return f.read()


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt."""
    with open(get_path("requirements.txt")) as f:
        requirements = f.read().strip().split("\n")
    return requirements


setuptools.setup(
    name="vllm",
    version=find_version(get_path("vllm", "__init__.py")),
    author="vLLM Team",
    license="Apache 2.0",
    description="A high-throughput and memory-efficient inference and serving engine for LLMs",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=setuptools.find_packages(
        exclude=("assets", "benchmarks", "csrc", "docs", "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)