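"""Setup script for vLLM: builds the vllm._C CUDA/HIP extension (when a GPU
toolkit is available) and packages the Python library."""
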
import io
import os
import re
import subprocess
from typing import List, Set
import warnings

from packaging.version import parse, Version
import setuptools
import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME

ROOT_DIR = os.path.dirname(__file__)

MAIN_CUDA_VERSION = "12.1"

# Supported NVIDIA GPU architectures.
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
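# Supported AMD GPU architectures (LLVM amdgpu targets).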
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)


def _is_hip() -> bool:
    return torch.version.hip is not None


def _is_neuron() -> bool:
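    """Detect an AWS Neuron (Inferentia/Trainium) environment by probing
    the neuron-ls CLI."""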
    torch_neuronx_installed = True
    try:
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        torch_neuronx_installed = False
    return torch_neuronx_installed


def _is_cuda() -> bool:
    return (torch.version.cuda is not None) and not _is_neuron()


# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

if _is_hip():
    if ROCM_HOME is None:
        raise RuntimeError(
            "Cannot find ROCM_HOME. ROCm must be available to build the package."
        )
    NVCC_FLAGS += ["-DUSE_ROCM"]

if _is_cuda() and CUDA_HOME is None:
    raise RuntimeError(
        "Cannot find CUDA_HOME. CUDA must be available to build the package.")

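# Compile with the same libstdc++ dual-ABI setting as the installed PyTorch
# so the extension links against it correctly.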
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]


def get_amdgpu_offload_arch():
    command = "/opt/rocm/llvm/bin/amdgpu-offload-arch"
    try:
        output = subprocess.check_output([command])
        return output.decode('utf-8').strip()
    except subprocess.CalledProcessError as e:
        error_message = f"Error: {e}"
        raise RuntimeError(error_message) from e
    except FileNotFoundError as e:
        # If the command is not found, raise a clearer error.
        error_message = f"The command {command} was not found."
        raise RuntimeError(error_message) from e


def get_hipcc_rocm_version():
    # Run the hipcc --version command
    result = subprocess.run(['hipcc', '--version'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            text=True)

    # Check if the command was executed successfully
    if result.returncode != 0:
        print("Error running 'hipcc --version'")
        return None

    # Extract the version using a regular expression
    match = re.search(r'HIP version: (\S+)', result.stdout)
    if match:
        # Return the version string
        return match.group(1)
    else:
        print("Could not find HIP version in the output")
        return None


def get_neuronxcc_version():
    import sysconfig
    site_dir = sysconfig.get_paths()["purelib"]
    version_file = os.path.join(site_dir, "neuronxcc", "version", "__init__.py")

    # Read the version file bundled with the installed neuronxcc package
    with open(version_file, "rt") as fp:
        content = fp.read()

    # Extract the version using a regular expression
    match = re.search(r"__version__ = '(\S+)'", content)
    if match:
        # Return the version string
        return match.group(1)
    else:
        raise RuntimeError("Could not find Neuron version in the file.")


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                          universal_newlines=True)
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


def get_torch_arch_list() -> Set[str]:
    # TORCH_CUDA_ARCH_LIST can have one or more architectures,
    # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
    # compiler to additionally include PTX code that can be runtime-compiled
    # and executed on the 8.6 or newer architectures. While the PTX code will
    # not give the best performance on the newer architectures, it provides
    # forward compatibility.
    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
    if env_arch_list is None:
        return set()

    # List elements are separated by ";" or whitespace.
    torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
    if not torch_arch_list:
        return set()

    # Filter out the invalid architectures and print a warning.
    valid_archs = NVIDIA_SUPPORTED_ARCHS.union(
        {s + "+PTX"
         for s in NVIDIA_SUPPORTED_ARCHS})
    arch_list = torch_arch_list.intersection(valid_archs)
    # If none of the specified architectures are valid, raise an error.
    if not arch_list:
        raise RuntimeError(
            "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env "
            f"variable ({env_arch_list}) is supported. "
            f"Supported CUDA/ROCM architectures are: {valid_archs}.")
    invalid_arch_list = torch_arch_list - valid_archs
    if invalid_arch_list:
        warnings.warn(
            f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are "
            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
            f"({env_arch_list}). Supported CUDA/ROCM architectures are: "
            f"{valid_archs}.",
            stacklevel=2)
    return arch_list


# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
if _is_cuda() and not compute_capabilities:
    # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
    # GPUs on the current machine.
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 7:
            raise RuntimeError(
                "GPUs with compute capability below 7.0 are not supported.")
        compute_capabilities.add(f"{major}.{minor}")

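# Resolve the final set of target compute capabilities and validate them
# against the installed NVCC CUDA version.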
if _is_cuda():
    nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
    if not compute_capabilities:
        # If no GPU is specified or available, target all supported
        # architectures based on the NVCC CUDA version.
        compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy()
        if nvcc_cuda_version < Version("11.1"):
            compute_capabilities.remove("8.6")
        if nvcc_cuda_version < Version("11.8"):
            compute_capabilities.remove("8.9")
            compute_capabilities.remove("9.0")
    # Validate the NVCC CUDA version.
    if nvcc_cuda_version < Version("11.0"):
        raise RuntimeError(
            "CUDA 11.0 or higher is required to build the package.")
    if (nvcc_cuda_version < Version("11.1")
            and any(cc.startswith("8.6") for cc in compute_capabilities)):
        raise RuntimeError(
            "CUDA 11.1 or higher is required for compute capability 8.6.")
    if nvcc_cuda_version < Version("11.8"):
        if any(cc.startswith("8.9") for cc in compute_capabilities):
            # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
            # However, GPUs with compute capability 8.9 can also run the code generated by
            # the previous versions of CUDA 11 and targeting compute capability 8.0.
            # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
            # instead of 8.9.
            warnings.warn(
                "CUDA 11.8 or higher is required for compute capability 8.9. "
                "Targeting compute capability 8.0 instead.",
                stacklevel=2)
            compute_capabilities = set(cc for cc in compute_capabilities
                                       if not cc.startswith("8.9"))
            compute_capabilities.add("8.0+PTX")
        if any(cc.startswith("9.0") for cc in compute_capabilities):
            raise RuntimeError(
                "CUDA 11.8 or higher is required for compute capability 9.0.")

    # Add target compute capabilities to NVCC flags.
    for capability in compute_capabilities:
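        # "8.6" / "8.6+PTX" -> "86". code=sm_XX emits native SASS for that
        # arch; code=compute_XX (added for "+PTX") also embeds PTX for
        # forward compatibility with newer GPUs.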
        num = capability[0] + capability[2]
        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
        if capability.endswith("+PTX"):
            NVCC_FLAGS += [
                "-gencode", f"arch=compute_{num},code=compute_{num}"
            ]

    # Use NVCC threads to parallelize the build.
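    # (nvcc's --threads option is available starting with CUDA 11.2.)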
    if nvcc_cuda_version >= Version("11.2"):
        nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
        num_threads = min(os.cpu_count(), nvcc_threads)
        NVCC_FLAGS += ["--threads", str(num_threads)]

elif _is_hip():
    amd_arch = get_amdgpu_offload_arch()
    if amd_arch not in ROCM_SUPPORTED_ARCHS:
        raise RuntimeError(
            f"Only the following archs are supported: {ROCM_SUPPORTED_ARCHS}. "
            f"amdgpu_arch_found: {amd_arch}")

elif _is_neuron():
    neuronxcc_version = get_neuronxcc_version()

ext_modules = []

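# C++/CUDA sources compiled into the vllm._C extension.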
vllm_extension_sources = [
    "csrc/cache_kernels.cu",
    "csrc/attention/attention_kernels.cu",
    "csrc/pos_encoding_kernels.cu",
    "csrc/activation_kernels.cu",
    "csrc/layernorm_kernels.cu",
    "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
    "csrc/quantization/gptq/q_gemm.cu",
    "csrc/cuda_utils_kernels.cu",
    "csrc/pybind.cpp",
]

if _is_cuda():
    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")

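# The C++/CUDA extension is only built for CUDA and ROCm targets;
# Neuron builds skip it.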
if not _is_neuron():
    vllm_extension = CUDAExtension(
        name="vllm._C",
        sources=vllm_extension_sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
            "nvcc": NVCC_FLAGS,
        },
    )
    ext_modules.append(vllm_extension)


def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
    """
    with open(filepath) as fp:
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                                  fp.read(), re.M)
        if version_match:
            return version_match.group(1)
        raise RuntimeError("Unable to find version string.")


def get_vllm_version() -> str:
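    """Return the vLLM version, plus a backend suffix (e.g. +rocmXXX,
    +neuronXXX, or +cuXXX) when not building against MAIN_CUDA_VERSION."""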
    version = find_version(get_path("vllm", "__init__.py"))

    if _is_hip():
        # Get the HIP version
        hipcc_version = get_hipcc_rocm_version()
        if hipcc_version != MAIN_CUDA_VERSION:
            rocm_version_str = hipcc_version.replace(".", "")[:3]
            version += f"+rocm{rocm_version_str}"
    elif _is_neuron():
        # Get the Neuron version
        neuron_version = str(neuronxcc_version)
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"+neuron{neuron_version_str}"
    else:
        cuda_version = str(nvcc_cuda_version)
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
            version += f"+cu{cuda_version_str}"

    return version


def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
        return io.open(p, "r", encoding="utf-8").read()
    else:
        return ""


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt."""
    if _is_hip():
        with open(get_path("requirements-rocm.txt")) as f:
            requirements = f.read().strip().split("\n")
    elif _is_neuron():
        with open(get_path("requirements-neuron.txt")) as f:
            requirements = f.read().strip().split("\n")
    else:
        with open(get_path("requirements.txt")) as f:
            requirements = f.read().strip().split("\n")
    return requirements


package_data = {"vllm": ["py.typed"]}
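# VLLM_USE_PRECOMPILED skips compiling the extension and instead ships any
# prebuilt *.so binaries already present in the vllm package directory.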
if os.environ.get("VLLM_USE_PRECOMPILED"):
    ext_modules = []
    package_data["vllm"].append("*.so")

setuptools.setup(
    name="vllm",
    version=get_vllm_version(),
    author="vLLM Team",
    license="Apache 2.0",
    description=("A high-throughput and memory-efficient inference and "
                 "serving engine for LLMs"),
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
                                               "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {},
    package_data=package_data,
)