import contextlib
import io
import os
import re
import subprocess
import warnings
from pathlib import Path
from typing import List, Set

from packaging.version import parse, Version
import setuptools
import torch
import torch.utils.cpp_extension as torch_cpp_ext
from torch.utils.cpp_extension import (BuildExtension, CUDAExtension,
                                       CUDA_HOME, ROCM_HOME)

ROOT_DIR = os.path.dirname(__file__)

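# CUDA version treated as the default for builds: toolchains that differ from
# it get a "+cuXXX" local version suffix in get_vllm_version() below.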
MAIN_CUDA_VERSION = "12.1"

# Supported NVIDIA GPU architectures.
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
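# ROCm targets use LLVM AMDGPU arch names, e.g. gfx90a (MI200 series) and
# gfx908 (MI100).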
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)


def _is_hip() -> bool:
    return torch.version.hip is not None


def _is_neuron() -> bool:
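    # `neuron-ls` ships with the AWS Neuron SDK; if it cannot be found, assume
    # no Neuron devices are available.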
    torch_neuronx_installed = True
    try:
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except FileNotFoundError:
        torch_neuronx_installed = False
    return torch_neuronx_installed


def _is_cuda() -> bool:
    return (torch.version.cuda is not None) and not _is_neuron()


# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

if _is_hip():
    if ROCM_HOME is None:
        raise RuntimeError(
            "Cannot find ROCM_HOME. ROCm must be available to build the package."
        )
    NVCC_FLAGS += ["-DUSE_ROCM"]
    # Re-enable the half-precision operators/conversions that the HIP headers
    # disable by default.
    NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"]
    NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"]

if _is_cuda() and CUDA_HOME is None:
    raise RuntimeError(
        "Cannot find CUDA_HOME. CUDA must be available to build the package.")

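# Build with the same C++ ABI as the installed PyTorch so the extensions link
# against libtorch without symbol mismatches.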
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]


def get_amdgpu_offload_arch():
    command = "/opt/rocm/llvm/bin/amdgpu-offload-arch"
    try:
        output = subprocess.check_output([command])
        return output.decode('utf-8').strip()
    except subprocess.CalledProcessError as e:
        error_message = f"Error: {e}"
        raise RuntimeError(error_message) from e
    except FileNotFoundError as e:
        # The command is missing from the expected ROCm install path.
        error_message = f"The command {command} was not found."
        raise RuntimeError(error_message) from e


def get_hipcc_rocm_version():
    # Run the hipcc --version command
    result = subprocess.run(['hipcc', '--version'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            text=True)

    # Check if the command was executed successfully
    if result.returncode != 0:
        print("Error running 'hipcc --version'")
        return None

    # Extract the version using a regular expression
    match = re.search(r'HIP version: (\S+)', result.stdout)
    if match:
        # Return the version string
        return match.group(1)
    else:
        print("Could not find HIP version in the output")
        return None


def glob(pattern: str):
    # Path(__name__).parent resolves to the current working directory, so this
    # assumes the build is invoked from the repository root.
    root = Path(__name__).parent
    return [str(p) for p in root.glob(pattern)]


def get_neuronxcc_version():
    import sysconfig
    site_dir = sysconfig.get_paths()["purelib"]
    version_file = os.path.join(site_dir, "neuronxcc", "version",
                                "__init__.py")

    # Read the version file bundled with the neuronxcc package.
    with open(version_file, "rt") as fp:
        content = fp.read()

    # Extract the version using a regular expression
    match = re.search(r"__version__ = '(\S+)'", content)
    if match:
        # Return the version string
        return match.group(1)
    else:
        raise RuntimeError(
            "Could not find the neuronxcc version in the version file")


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                          universal_newlines=True)
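    # nvcc -V ends with a line like "Cuda compilation tools, release 12.1,
    # V12.1.105"; the token after "release" ("12.1,") parses to Version("12.1").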
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


def get_torch_arch_list() -> Set[str]:
    # TORCH_CUDA_ARCH_LIST can have one or more architectures,
    # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
    # compiler to additionally include PTX code that can be runtime-compiled
    # and executed on the 8.6 or newer architectures. While the PTX code will
    # not give the best performance on the newer architectures, it provides
    # forward compatibility.
    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
    if env_arch_list is None:
        return set()

    # Entries in the list are separated by ";" or whitespace.
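    # e.g. "7.5 8.0;8.6+PTX" -> {"7.5", "8.0", "8.6+PTX"}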
    torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
    if not torch_arch_list:
        return set()

    # Filter out the invalid architectures and print a warning.
    valid_archs = NVIDIA_SUPPORTED_ARCHS.union(
        {s + "+PTX"
         for s in NVIDIA_SUPPORTED_ARCHS})
    arch_list = torch_arch_list.intersection(valid_archs)
    # If none of the specified architectures are valid, raise an error.
    if not arch_list:
        raise RuntimeError(
            "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env "
            f"variable ({env_arch_list}) is supported. "
            f"Supported CUDA/ROCM architectures are: {valid_archs}.")
    invalid_arch_list = torch_arch_list - valid_archs
    if invalid_arch_list:
        warnings.warn(
            f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are "
            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
            f"({env_arch_list}). Supported CUDA/ROCM architectures are: "
            f"{valid_archs}.",
            stacklevel=2)
    return arch_list


# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
if _is_cuda() and not compute_capabilities:
    # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
    # GPUs on the current machine.
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 7:
            raise RuntimeError(
                "GPUs with compute capability below 7.0 are not supported.")
        compute_capabilities.add(f"{major}.{minor}")

ext_modules = []

if _is_cuda():
    nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
    if not compute_capabilities:
        # If no GPU is specified nor available, add all supported architectures
        # based on the NVCC CUDA version.
        compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy()
        if nvcc_cuda_version < Version("11.1"):
            compute_capabilities.remove("8.6")
        if nvcc_cuda_version < Version("11.8"):
            compute_capabilities.remove("8.9")
            compute_capabilities.remove("9.0")
    # Validate the NVCC CUDA version.
    if nvcc_cuda_version < Version("11.0"):
        raise RuntimeError(
            "CUDA 11.0 or higher is required to build the package.")
    if (nvcc_cuda_version < Version("11.1")
            and any(cc.startswith("8.6") for cc in compute_capabilities)):
        raise RuntimeError(
            "CUDA 11.1 or higher is required for compute capability 8.6.")
    if nvcc_cuda_version < Version("11.8"):
        if any(cc.startswith("8.9") for cc in compute_capabilities):
            # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
            # However, GPUs with compute capability 8.9 can also run the code generated by
            # the previous versions of CUDA 11 and targeting compute capability 8.0.
            # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
            # instead of 8.9.
            warnings.warn(
                "CUDA 11.8 or higher is required for compute capability 8.9. "
                "Targeting compute capability 8.0 instead.",
                stacklevel=2)
            compute_capabilities = set(cc for cc in compute_capabilities
                                       if not cc.startswith("8.9"))
            compute_capabilities.add("8.0+PTX")
        if any(cc.startswith("9.0") for cc in compute_capabilities):
            raise RuntimeError(
                "CUDA 11.8 or higher is required for compute capability 9.0.")

    NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy()

    # Add target compute capabilities to NVCC flags.
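    # For example, capability "8.6" gives num == "86" and emits
    # "-gencode arch=compute_86,code=sm_86"; an "8.6+PTX" entry additionally
    # emits "-gencode arch=compute_86,code=compute_86" to embed PTX.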
    for capability in compute_capabilities:
        num = capability[0] + capability[2]
        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
        if capability.endswith("+PTX"):
            NVCC_FLAGS += [
                "-gencode", f"arch=compute_{num},code=compute_{num}"
            ]
        if int(capability[0]) >= 8:
            NVCC_FLAGS_PUNICA += [
                "-gencode", f"arch=compute_{num},code=sm_{num}"
            ]
            if capability.endswith("+PTX"):
                NVCC_FLAGS_PUNICA += [
                    "-gencode", f"arch=compute_{num},code=compute_{num}"
                ]

    # Use NVCC threads to parallelize the build.
    if nvcc_cuda_version >= Version("11.2"):
        nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
        num_threads = min(os.cpu_count(), nvcc_threads)
        NVCC_FLAGS += ["--threads", str(num_threads)]

    # Flag adjustments for the Punica (multi-LoRA) kernels: copy PyTorch's
    # common NVCC flags (including the -D__CUDA_NO_* defines) into NVCC_FLAGS
    # for the main extension, then strip those defines from torch's shared
    # list so BuildExtension does not re-apply them to the Punica build, which
    # needs the half/bfloat16 operators enabled.
    NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS
    REMOVE_NVCC_FLAGS = [
        '-D__CUDA_NO_HALF_OPERATORS__',
        '-D__CUDA_NO_HALF_CONVERSIONS__',
        '-D__CUDA_NO_BFLOAT16_CONVERSIONS__',
        '-D__CUDA_NO_HALF2_OPERATORS__',
    ]
    for flag in REMOVE_NVCC_FLAGS:
        with contextlib.suppress(ValueError):
            torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag)

    # Punica kernels require compute capability >= 8.0 on every visible GPU;
    # set VLLM_INSTALL_PUNICA_KERNELS=0 to opt out of building them.
    install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "1")))
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 8:
            install_punica = False
            break
    if install_punica:
        ext_modules.append(
            CUDAExtension(
                name="vllm._punica_C",
                sources=["csrc/punica/punica_ops.cc"] +
                glob("csrc/punica/bgmv/*.cu"),
                extra_compile_args={
                    "cxx": CXX_FLAGS,
                    "nvcc": NVCC_FLAGS_PUNICA,
                },
            ))
elif _is_hip():
    amd_arch = get_amdgpu_offload_arch()
    if amd_arch not in ROCM_SUPPORTED_ARCHS:
        raise RuntimeError(
            f"Only the following archs are supported: {ROCM_SUPPORTED_ARCHS}. "
            f"amdgpu_arch_found: {amd_arch}")

elif _is_neuron():
    neuronxcc_version = get_neuronxcc_version()

vllm_extension_sources = [
    "csrc/cache_kernels.cu",
    "csrc/attention/attention_kernels.cu",
    "csrc/pos_encoding_kernels.cu",
    "csrc/activation_kernels.cu",
    "csrc/layernorm_kernels.cu",
    "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
    "csrc/quantization/gptq/q_gemm.cu",
    "csrc/cuda_utils_kernels.cu",
    "csrc/pybind.cpp",
]

if _is_cuda():
    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")

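# The core vllm._C extension is built for both CUDA and ROCm targets; under
# ROCm, PyTorch's CUDAExtension translates (hipifies) the .cu sources.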
if not _is_neuron():
    vllm_extension = CUDAExtension(
        name="vllm._C",
        sources=vllm_extension_sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
            "nvcc": NVCC_FLAGS,
        },
    )
    ext_modules.append(vllm_extension)


def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
    """
    with open(filepath) as fp:
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                                  fp.read(), re.M)
        if version_match:
            return version_match.group(1)
        raise RuntimeError("Unable to find version string.")


def get_vllm_version() -> str:
    version = find_version(get_path("vllm", "__init__.py"))
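    # A platform suffix is appended below whenever the build toolchain differs
    # from MAIN_CUDA_VERSION, e.g. a CUDA 11.8 build of "0.3.0" becomes
    # "0.3.0+cu118".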

    if _is_hip():
        # Get the HIP version
        hipcc_version = get_hipcc_rocm_version()
        if hipcc_version != MAIN_CUDA_VERSION:
            rocm_version_str = hipcc_version.replace(".", "")[:3]
            version += f"+rocm{rocm_version_str}"
    elif _is_neuron():
        # Get the Neuron version
        neuron_version = str(neuronxcc_version)
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"+neuron{neuron_version_str}"
    else:
        cuda_version = str(nvcc_cuda_version)
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
            version += f"+cu{cuda_version_str}"

    return version


def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
        with io.open(p, "r", encoding="utf-8") as f:
            return f.read()
    else:
        return ""


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt."""
    if _is_hip():
        with open(get_path("requirements-rocm.txt")) as f:
            requirements = f.read().strip().split("\n")
    elif _is_neuron():
        with open(get_path("requirements-neuron.txt")) as f:
            requirements = f.read().strip().split("\n")
    else:
        with open(get_path("requirements.txt")) as f:
            requirements = f.read().strip().split("\n")
    return requirements


package_data = {"vllm": ["py.typed"]}
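# When VLLM_USE_PRECOMPILED is set, skip compiling the extensions and package
# the prebuilt .so binaries found in the vllm directory instead.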
if os.environ.get("VLLM_USE_PRECOMPILED"):
    ext_modules = []
    package_data["vllm"].append("*.so")

setuptools.setup(
    name="vllm",
    version=get_vllm_version(),
    author="vLLM Team",
    license="Apache 2.0",
    description=("A high-throughput and memory-efficient inference and "
                 "serving engine for LLMs"),
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
                                               "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {},
    package_data=package_data,
)