import contextlib
import io
import os
import re
import subprocess
import warnings
from pathlib import Path
from typing import List, Optional, Set, Union

from packaging.version import parse, Version
import setuptools
import torch
import torch.utils.cpp_extension as torch_cpp_ext
from torch.utils.cpp_extension import (BuildExtension, CUDAExtension,
                                       CUDA_HOME, ROCM_HOME)

ROOT_DIR = os.path.dirname(__file__)

MAIN_CUDA_VERSION = "12.1"

# Supported NVIDIA GPU architectures.
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {
    "gfx90a", "gfx908", "gfx906", "gfx926", "gfx1030", "gfx1100"
}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)


def _is_hip() -> bool:
    return torch.version.hip is not None


def _is_neuron() -> bool:
    torch_neuronx_installed = True
    try:
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except FileNotFoundError:
        torch_neuronx_installed = False
    return torch_neuronx_installed


def _is_cuda() -> bool:
    return (torch.version.cuda is not None) and not _is_neuron()


# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
zhuwenwen's avatar
zhuwenwen committed
50
NVCC_FLAGS = ["-O2", "-std=c++17", "--gpu-max-threads-per-block=1024"]

if _is_hip():
    if ROCM_HOME is None:
        raise RuntimeError(
            "Cannot find ROCM_HOME. ROCm must be available to build the package."
        )
    NVCC_FLAGS += ["-DUSE_ROCM"]
    NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"]
    NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"]

if _is_cuda() and CUDA_HOME is None:
    raise RuntimeError(
        "Cannot find CUDA_HOME. CUDA must be available to build the package.")

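# Match the C++ ABI of the installed PyTorch: compiling the extensions with a
# different _GLIBCXX_USE_CXX11_ABI value than libtorch was built with would
# cause undefined-symbol errors when the module is imported.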
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]


# def get_amdgpu_offload_arch():
#     command = "/opt/rocm/llvm/bin/amdgpu-offload-arch"
#     try:
#         output = subprocess.check_output([command])
#         return output.decode('utf-8').strip()
#     except subprocess.CalledProcessError as e:
#         error_message = f"Error: {e}"
#         raise RuntimeError(error_message) from e
#     except FileNotFoundError as e:
#         # If the command is not found, print an error message
#         error_message = f"The command {command} was not found."
#         raise RuntimeError(error_message) from e

#     return None


def get_hipcc_rocm_version():
    # Run the hipcc --version command
    result = subprocess.run(['hipcc', '--version'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            text=True)

    # Check if the command was executed successfully
    if result.returncode != 0:
        print("Error running 'hipcc --version'")
        return None

    # Extract the version using a regular expression
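    # e.g. a line like "HIP version: 5.7.0" yields "5.7.0" (illustrative
    # value; the exact string depends on the installed ROCm release).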
    match = re.search(r'HIP version: (\S+)', result.stdout)
    if match:
        # Return the version string
        return match.group(1)
    else:
        print("Could not find HIP version in the output")
        return None


def glob(pattern: str):
    root = Path(__file__).parent
    return [str(p) for p in root.glob(pattern)]


def get_neuronxcc_version():
    import sysconfig
    site_dir = sysconfig.get_paths()["purelib"]
    version_file = os.path.join(site_dir, "neuronxcc", "version",
                                "__init__.py")

    # Read the installed neuronxcc version file
    with open(version_file, "rt") as fp:
        content = fp.read()

    # Extract the version using a regular expression
    match = re.search(r"__version__ = '(\S+)'", content)
    if match:
        # Return the version string
        return match.group(1)
    else:
        raise RuntimeError("Could not find HIP version in the output")


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                          universal_newlines=True)
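    # `nvcc -V` typically reports "... release 12.1, V12.1.105"; the token
    # after "release" is "12.1," and the trailing comma is stripped below.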
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


def get_torch_arch_list() -> Set[str]:
    # TORCH_CUDA_ARCH_LIST can have one or more architectures,
    # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
    # compiler to additionally include PTX code that can be runtime-compiled
    # and executed on the 8.6 or newer architectures. While the PTX code will
    # not give the best performance on the newer architectures, it provides
    # forward compatibility.
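    # e.g. TORCH_CUDA_ARCH_LIST="7.5 8.0;8.6+PTX" (an illustrative value, not
    # a default) is parsed into {"7.5", "8.0", "8.6+PTX"} below.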
    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
    if env_arch_list is None:
        return set()

    # List entries are separated by ";" or spaces.
    torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
    if not torch_arch_list:
        return set()

    # Filter out the invalid architectures and print a warning.
    valid_archs = NVIDIA_SUPPORTED_ARCHS.union(
        {s + "+PTX"
         for s in NVIDIA_SUPPORTED_ARCHS})
    arch_list = torch_arch_list.intersection(valid_archs)
    # If none of the specified architectures are valid, raise an error.
    if not arch_list:
        raise RuntimeError(
            "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env "
            f"variable ({env_arch_list}) is supported. "
            f"Supported CUDA/ROCM architectures are: {valid_archs}.")
    invalid_arch_list = torch_arch_list - valid_archs
    if invalid_arch_list:
        warnings.warn(
            f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are "
            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
            f"({env_arch_list}). Supported CUDA/ROCM architectures are: "
            f"{valid_archs}.",
            stacklevel=2)
    return arch_list


# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
if _is_cuda() and not compute_capabilities:
    # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
    # GPUs on the current machine.
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 7:
            raise RuntimeError(
                "GPUs with compute capability below 7.0 are not supported.")
        compute_capabilities.add(f"{major}.{minor}")

ext_modules = []

if _is_cuda():
    nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
    if not compute_capabilities:
        # If no GPU is specified nor available, add all supported architectures
        # based on the NVCC CUDA version.
        compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy()
        if nvcc_cuda_version < Version("11.1"):
            compute_capabilities.remove("8.6")
        if nvcc_cuda_version < Version("11.8"):
            compute_capabilities.remove("8.9")
            compute_capabilities.remove("9.0")
    # Validate the NVCC CUDA version.
    if nvcc_cuda_version < Version("11.0"):
        raise RuntimeError(
            "CUDA 11.0 or higher is required to build the package.")
    if (nvcc_cuda_version < Version("11.1")
            and any(cc.startswith("8.6") for cc in compute_capabilities)):
        raise RuntimeError(
            "CUDA 11.1 or higher is required for compute capability 8.6.")
    if nvcc_cuda_version < Version("11.8"):
        if any(cc.startswith("8.9") for cc in compute_capabilities):
            # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
            # However, GPUs with compute capability 8.9 can also run the code generated by
            # the previous versions of CUDA 11 and targeting compute capability 8.0.
            # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
            # instead of 8.9.
            warnings.warn(
                "CUDA 11.8 or higher is required for compute capability 8.9. "
                "Targeting compute capability 8.0 instead.",
                stacklevel=2)
            compute_capabilities = set(cc for cc in compute_capabilities
                                       if not cc.startswith("8.9"))
            compute_capabilities.add("8.0+PTX")
        if any(cc.startswith("9.0") for cc in compute_capabilities):
            raise RuntimeError(
                "CUDA 11.8 or higher is required for compute capability 9.0.")

    NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy()

    # Add target compute capabilities to NVCC flags.
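    # e.g. "8.6" becomes ["-gencode", "arch=compute_86,code=sm_86"]; a
    # "+PTX" suffix additionally emits
    # ["-gencode", "arch=compute_86,code=compute_86"], embedding PTX that
    # newer GPUs can JIT-compile.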
    for capability in compute_capabilities:
        num = capability[0] + capability[2]
        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
        if capability.endswith("+PTX"):
            NVCC_FLAGS += [
                "-gencode", f"arch=compute_{num},code=compute_{num}"
            ]
        if int(capability[0]) >= 8:
            NVCC_FLAGS_PUNICA += [
                "-gencode", f"arch=compute_{num},code=sm_{num}"
            ]
            if capability.endswith("+PTX"):
                NVCC_FLAGS_PUNICA += [
                    "-gencode", f"arch=compute_{num},code=compute_{num}"
                ]

    # Use NVCC threads to parallelize the build.
    if nvcc_cuda_version >= Version("11.2"):
        nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
        num_threads = min(os.cpu_count(), nvcc_threads)
        NVCC_FLAGS += ["--threads", str(num_threads)]

    if nvcc_cuda_version >= Version("11.8"):
        NVCC_FLAGS += ["-DENABLE_FP8_E5M2"]

    # The punica kernels need the half-precision operators that PyTorch's
    # -D__CUDA_NO_*__ defines disable, so drop those defines from the common
    # NVCC flags.
    NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS
    REMOVE_NVCC_FLAGS = [
        '-D__CUDA_NO_HALF_OPERATORS__',
        '-D__CUDA_NO_HALF_CONVERSIONS__',
        '-D__CUDA_NO_BFLOAT16_CONVERSIONS__',
        '-D__CUDA_NO_HALF2_OPERATORS__',
    ]
    for flag in REMOVE_NVCC_FLAGS:
        with contextlib.suppress(ValueError):
            torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag)

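    # Opt-in via the env flag, e.g. `VLLM_INSTALL_PUNICA_KERNELS=1 pip
    # install .`; the kernels are skipped below unless every visible GPU has
    # compute capability >= 8.0.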
    install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
    device_count = torch.cuda.device_count()
    for i in range(device_count):
        major, minor = torch.cuda.get_device_capability(i)
        if major < 8:
            install_punica = False
            break
    if install_punica:
        ext_modules.append(
            CUDAExtension(
                name="vllm._punica_C",
                sources=["csrc/punica/punica_ops.cc"] +
                glob("csrc/punica/bgmv/*.cu"),
                extra_compile_args={
                    "cxx": CXX_FLAGS,
                    "nvcc": NVCC_FLAGS_PUNICA,
                },
            ))
# elif _is_hip():
#     amd_archs = os.getenv("GPU_ARCHS")
#     if amd_archs is None:
#         amd_archs = get_amdgpu_offload_arch()
#     for arch in amd_archs.split(";"):
#         if arch not in ROCM_SUPPORTED_ARCHS:
#             raise RuntimeError(
#                 f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}"
#                 f"amdgpu_arch_found: {arch}")
#         NVCC_FLAGS += [f"--offload-arch={arch}"]

elif _is_neuron():
    neuronxcc_version = get_neuronxcc_version()

vllm_extension_sources = [
    "csrc/cache_kernels.cu",
    "csrc/attention/attention_kernels.cu",
    "csrc/pos_encoding_kernels.cu",
    "csrc/activation_kernels.cu",
    "csrc/layernorm_kernels.cu",
    "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
    "csrc/quantization/gptq/q_gemm.cu",
    "csrc/cuda_utils_kernels.cu",
    "csrc/moe_align_block_size_kernels.cu",
    "csrc/pybind.cpp",
]

if _is_cuda():
    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
    vllm_extension_sources.append("csrc/custom_all_reduce.cu")

if not _is_neuron():
    vllm_extension = CUDAExtension(
        name="vllm._C",
        sources=vllm_extension_sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
            "nvcc": NVCC_FLAGS,
        },
        libraries=["cuda"] if _is_cuda() else [],
    )
    ext_modules.append(vllm_extension)


def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


def find_version(filepath: str) -> str:
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
    """
    with open(filepath) as fp:
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                                  fp.read(), re.M)
        if version_match:
            return version_match.group(1)
        raise RuntimeError("Unable to find version string.")


def get_abi():
    """Return the C++11 ABI tag of the local GCC, e.g. "abi1".

    gcc's macro dump contains a line like "#define _GLIBCXX_USE_CXX11_ABI 1",
    whose last token is the ABI value.
    """
    try:
        command = ("echo '#include <string>' | gcc -x c++ -E -dM - "
                   "| fgrep _GLIBCXX_USE_CXX11_ABI")
        result = subprocess.run(command, shell=True, capture_output=True,
                                text=True)
        output = result.stdout.strip()
        return "abi" + output.split(" ")[-1]
    except Exception:
        return 'abiUnknown'


def get_sha(root: Union[str, Path]) -> str:
    try:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                       cwd=root).decode('ascii').strip()
    except Exception:
        return 'Unknown'

def get_version_add(sha: Optional[str] = None) -> str:
    vllm_root = os.path.dirname(os.path.abspath(__file__))
    add_version_path = os.path.join(vllm_root, "vllm", "version.py")
    if sha is None:
        sha = get_sha(vllm_root)
    # Always initialize the version so the suffixes below can be appended;
    # get_sha() returns 'Unknown' outside a git checkout, giving "gitUnknown".
    version = 'git' + sha[:7]

    # abi version
    version += "." + get_abi()

    # dtk version
    if os.getenv("ROCM_PATH"):
        rocm_path = os.getenv('ROCM_PATH', "")
        rocm_version_path = os.path.join(rocm_path, '.info', 'rocm_version')
        with open(rocm_version_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        rocm_version = lines[0][:-2].replace(".", "")
        version += ".dtk" + rocm_version

    # torch version
    version += ".torch" + torch.__version__[:3]

    with open(add_version_path, encoding="utf-8", mode="w") as file:
        file.write("__version__='0.3.0'\n")
        file.write("__dcu_version__='0.3.0+{}'\n".format(version))
    return version
    
    
def get_version():
    get_version_add()
    version_file = 'vllm/version.py'
    with open(version_file, encoding='utf-8') as f:
        exec(compile(f.read(), version_file, 'exec'))
    return locals()['__dcu_version__']


def get_vllm_version() -> str:
    version = find_version(get_path("vllm", "__init__.py"))

    if _is_hip():
        # Get the HIP version
        # hipcc_version = get_hipcc_rocm_version()
        # if hipcc_version != MAIN_CUDA_VERSION:
        #     rocm_version_str = hipcc_version.replace(".", "")[:3]
        #     version += f"+rocm{rocm_version_str}"
        version = get_version()
    elif _is_neuron():
        # Get the Neuron version
        neuron_version = str(neuronxcc_version)
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"+neuron{neuron_version_str}"
    else:
        cuda_version = str(nvcc_cuda_version)
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
            version += f"+cu{cuda_version_str}"

    return version


def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
    else:
        return ""


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt."""
    if _is_hip():
        with open(get_path("requirements-rocm.txt")) as f:
            requirements = f.read().strip().split("\n")
    elif _is_neuron():
        with open(get_path("requirements-neuron.txt")) as f:
            requirements = f.read().strip().split("\n")
    else:
        with open(get_path("requirements.txt")) as f:
            requirements = f.read().strip().split("\n")
    return requirements


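# e.g. `VLLM_USE_PRECOMPILED=1 pip install .` skips compiling the native
# extensions and packages prebuilt *.so binaries instead.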
package_data = {"vllm": ["py.typed"]}
if os.environ.get("VLLM_USE_PRECOMPILED"):
    ext_modules = []
    package_data["vllm"].append("*.so")

setuptools.setup(
    name="vllm",
    version=get_vllm_version(),
    author="vLLM Team",
    license="Apache 2.0",
    description=("A high-throughput and memory-efficient inference and "
                 "serving engine for LLMs"),
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
                                               "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {},
    package_data=package_data,
)