import io
import os
import re
import subprocess
from typing import List, Set

import setuptools
import torch
from packaging.version import parse, Version
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

ROOT_DIR = os.path.dirname(__file__)

# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

# Compile with the same C++ ABI setting as the installed PyTorch, otherwise
# the resulting extension modules fail to link against libtorch symbols.
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
for _flag_list in (CXX_FLAGS, NVCC_FLAGS):
    _flag_list.append(f"-D_GLIBCXX_USE_CXX11_ABI={ABI}")
# A CUDA toolkit is required to compile the extensions. Check for the
# toolkit (CUDA_HOME) rather than torch.cuda.is_available(): the latter
# requires a visible GPU at build time, which contradicts both the error
# message below and the no-GPU compute-capability fallback later in this
# file, and would wrongly fail builds on GPU-less build machines.
if CUDA_HOME is None:
    raise RuntimeError(
        f"Cannot find CUDA at CUDA_HOME: {CUDA_HOME}. "
        "CUDA must be available in order to build the package.")


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py

    Args:
        cuda_dir: Root of the CUDA toolkit installation (e.g. CUDA_HOME).

    Returns:
        The toolkit version parsed from the output of ``nvcc -V``.
    """
    # Use os.path.join instead of string concatenation so the path is
    # assembled portably regardless of trailing separators in cuda_dir.
    nvcc = os.path.join(cuda_dir, "bin", "nvcc")
    nvcc_output = subprocess.check_output([nvcc, "-V"],
                                          universal_newlines=True)
    # "nvcc -V" output ends with e.g. "... release 11.8, V11.8.89";
    # take the token after "release" and strip the trailing comma.
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


# Collect the compute capabilities of all available GPUs.
compute_capabilities: Set[int] = set()
for device_id in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(device_id)
    if major < 7:
        raise RuntimeError(
            "GPUs with compute capability less than 7.0 are not supported.")
    compute_capabilities.add(major * 10 + minor)
# If no GPU is available, add all supported compute capabilities.
if not compute_capabilities:
    compute_capabilities = {70, 75, 80, 86, 90}
# Emit one -gencode pair per targeted compute capability.
for capability in compute_capabilities:
    NVCC_FLAGS.extend(
        ["-gencode", f"arch=compute_{capability},code=sm_{capability}"])

# Validate the NVCC CUDA version against the targeted architectures.
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if nvcc_cuda_version < Version("11.0"):
    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
# Newer architectures require newer toolkits: sm_86 appeared in CUDA 11.1
# and sm_90 in CUDA 11.8.
for arch, min_version in ((86, "11.1"), (90, "11.8")):
    if arch in compute_capabilities and nvcc_cuda_version < Version(min_version):
        raise RuntimeError(
            f"CUDA {min_version} or higher is required for GPUs with "
            f"compute capability {arch / 10:.1f}.")
# NVCC supports multi-threaded compilation from CUDA 11.2 onwards; cap the
# thread count at 8 to avoid oversubscribing smaller build machines.
if nvcc_cuda_version >= Version("11.2"):
    NVCC_FLAGS += ["--threads", str(min(os.cpu_count(), 8))]

# Each compiled extension pairs a C++ binding file with its CUDA kernel
# sources; all extensions share the same compiler flags.
_CUDA_EXTENSIONS = (
    # Cache operations.
    ("vllm.cache_ops", ["csrc/cache.cpp", "csrc/cache_kernels.cu"]),
    # Attention kernels.
    ("vllm.attention_ops",
     ["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"]),
    # Positional encoding kernels.
    ("vllm.pos_encoding_ops",
     ["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"]),
    # Layer normalization kernels.
    ("vllm.layernorm_ops",
     ["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"]),
    # Activation kernels.
    ("vllm.activation_ops",
     ["csrc/activation.cpp", "csrc/activation_kernels.cu"]),
)

ext_modules = [
    CUDAExtension(
        name=ext_name,
        sources=ext_sources,
        extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
    )
    for ext_name, ext_sources in _CUDA_EXTENSIONS
]

def get_path(*filepath) -> str:
    """Join *filepath* components onto the project root directory."""
    parts = (ROOT_DIR,) + filepath
    return os.path.join(*parts)


def find_version(filepath: str):
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py

    Args:
        filepath: Path to a Python file containing a module-level
            ``__version__ = "..."`` assignment.

    Returns:
        The version string (e.g. ``"0.1.0"``).

    Raises:
        RuntimeError: If no ``__version__`` assignment is found.
    """
    # Read as UTF-8 explicitly (consistent with read_readme) so the result
    # does not depend on the platform's default locale encoding.
    with open(filepath, encoding="utf-8") as fp:
        version_match = re.search(
            r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M)
        if version_match:
            return version_match.group(1)
        raise RuntimeError("Unable to find version string.")


def read_readme() -> str:
    """Read the README file."""
    # Use a context manager so the file handle is closed deterministically;
    # the previous io.open(...).read() form leaked the handle until GC.
    with open(get_path("README.md"), encoding="utf-8") as f:
        return f.read()


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt.

    Returns:
        One requirement specifier per non-empty line of the file.
    """
    # Read as UTF-8 explicitly for consistency with the other file readers
    # in this script.
    with open(get_path("requirements.txt"), encoding="utf-8") as f:
        requirements = f.read().strip().split("\n")
    return requirements


# Package metadata and build wiring. The compiled CUDA extensions built
# above are attached via ``ext_modules`` and compiled by PyTorch's
# BuildExtension (which injects the correct torch include/link flags).
setuptools.setup(
    name="vllm",
    version=find_version(get_path("vllm", "__init__.py")),
    author="vLLM Team",
    author_email="vllm@gmail.com",  # FIXME
    license="Apache 2.0",
    description="vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention",  # FIXME
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/WoosukKwon/vllm",
    project_urls={
        "Homepage": "https://github.com/WoosukKwon/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",  # FIXME
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    # Ship only the Python package; exclude repo-level auxiliary directories.
    packages=setuptools.find_packages(
        exclude=("assets", "benchmarks", "csrc", "docs", "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)