Merge branch 'v0.6.3.post1-dev'

ad385667 · zhuwenwen · be0967c1 · 903593d3 · ad385667 · ad385667
Commit ad385667 authored Oct 23, 2024 by zhuwenwen
20 changed files
--- a/format.sh
+++ b/format.sh
@@ -96,19 +96,7 @@ echo 'vLLM yapf: Done'
 # Run mypy
 echo 'vLLM mypy:'
-mypy --follow-imports skip  # Note that this is less strict than CI
+tools/mypy.sh
-mypy tests --follow-imports skip
-mypy vllm/attention --follow-imports skip
-mypy vllm/core --follow-imports skip
-mypy vllm/distributed --follow-imports skip
-mypy vllm/engine  --follow-imports skip
-mypy vllm/entrypoints --follow-imports skip
-mypy vllm/executor --follow-imports skip
-mypy vllm/lora --follow-imports skip
-mypy vllm/model_executor  --follow-imports skip
-mypy vllm/prompt_adapter --follow-imports skip
-mypy vllm/spec_decode --follow-imports skip
-mypy vllm/worker --follow-imports skip
 echo 'vLLM mypy: Done'
@@ -161,7 +149,7 @@ echo 'vLLM codespell: Done'
 # Lint specified files
 lint() {
-    ruff "$@"
+    ruff check "$@"
 }
 # Lint files that differ from main branch. Ignores dirs that are not slated
@@ -177,7 +165,7 @@ lint_changed() {
    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-             ruff
+             ruff check
    fi
 }
@@ -242,6 +230,11 @@ echo 'vLLM isort: Done'
 # NOTE: Keep up to date with .github/workflows/clang-format.yml
 CLANG_FORMAT_EXCLUDES=(
    'csrc/moe/topk_softmax_kernels.cu'
+    'csrc/quantization/gguf/ggml-common.h'
+    'csrc/quantization/gguf/dequantize.cuh'
+    'csrc/quantization/gguf/vecdotq.cuh'
+    'csrc/quantization/gguf/mmq.cuh'
+    'csrc/quantization/gguf/mmvq.cuh'
 )
 # Format specified files with clang-format
@@ -260,7 +253,7 @@ clang_format_changed() {
    MERGEBASE="$(git merge-base origin/main HEAD)"
    # Get the list of changed files, excluding the specified ones
-    changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}"))
+    changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e))
    if [ -n "$changed_files" ]; then
        echo "$changed_files" | xargs -P 5 clang-format -i
    fi
@@ -283,6 +276,9 @@ else
 fi
 echo 'vLLM clang-format: Done'
+echo 'vLLM actionlint:'
+tools/actionlint.sh -color
+echo 'vLLM actionlint: Done'
 if ! git diff --quiet &>/dev/null; then
    echo 'Reformatted files. Please review and stage the changes.'

--- a/pyproject.toml
+++ b/pyproject.toml
 [build-system]
 # Should be mirrored in requirements-build.txt
 requires = [
-    "cmake>=3.21",
+    "cmake>=3.26",
    "ninja",
    "packaging",
-    "setuptools >= 49.4.0",
+    "setuptools>=61",
+    "setuptools-scm>=8.0",
    "torch == 2.4.0",
    "wheel",
+    "jinja2",
 ]
 build-backend = "setuptools.build_meta"
+[tool.setuptools_scm]
+# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()`
 [tool.ruff]
 # Allow lines to be as long as 80.
 line-length = 80
@@ -18,6 +23,10 @@ exclude = [
    "examples/fp8/quantizer/quantize.py"
 ]
+[tool.ruff.lint.per-file-ignores]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
 [tool.ruff.lint]
 select = [
    # pycodestyle
@@ -41,6 +50,8 @@ ignore = [
    "E731",
    # Loop control variable not used within loop body
    "B007",
+    # f-string format
+    "UP032",
 ]
 [tool.mypy]
@@ -56,6 +67,8 @@ files = [
    "vllm/*.py",
    "vllm/adapter_commons",
    "vllm/assets",
+    "vllm/entrypoints",
+    "vllm/core",
    "vllm/inputs",
    "vllm/logging",
    "vllm/multimodal",
@@ -73,7 +86,7 @@ exclude = [
 [tool.codespell]
 ignore-words-list = "dout, te, indicies, subtile"
-skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
+skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
 [tool.isort]
 use_parentheses = true
@@ -82,5 +95,6 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
    "skip_global_cleanup",
-    "vlm: run tests for vision language models only",
+    "core_model: run this model test in each PR instead of just daily",
+    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
 ]
--- a/python_only_dev.py
+++ b/python_only_dev.py
+# Enable Python-only development.
+# Copy the compiled files directly into the current directory.
+import argparse
+import os
+import shutil
+import subprocess
+import sys
+import warnings
+parser = argparse.ArgumentParser(
+    description="Development mode for python-only code")
+parser.add_argument('-q',
+                    '--quit-dev',
+                    action='store_true',
+                    help='Set the flag to quit development mode')
+args = parser.parse_args()
+# Cannot `import vllm` directly, because that would try to
+# import from the current directory.
+output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"],
+                        capture_output=True)
+assert output.returncode == 0, "vllm is not installed"
+text = output.stdout.decode("utf-8")
+package_path = None
+for line in text.split("\n"):
+    if line.startswith("Location: "):
+        package_path = line.split(": ")[1]
+        break
+assert package_path is not None, "could not find package path"
+cwd = os.getcwd()
+assert cwd != package_path, "should not import from the current directory"
+files_to_copy = [
+    "vllm/_C.abi3.so",
+    "vllm/_core_C.abi3.so",
+    "vllm/_moe_C.abi3.so",
+    "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
+    "vllm/vllm_flash_attn/flash_attn_interface.py",
+    "vllm/vllm_flash_attn/__init__.py",
+    # "vllm/_version.py", # not available in nightly wheels yet
+]
+# Try to create _version.py to avoid version-related warnings.
+# Refer to https://github.com/vllm-project/vllm/pull/8771
+try:
+    from setuptools_scm import get_version
+    get_version(write_to="vllm/_version.py")
+except ImportError:
+    warnings.warn(
+        "To avoid warnings related to vllm._version, "
+        "you should install setuptools-scm by `pip install setuptools-scm`",
+        stacklevel=2)
+if not args.quit_dev:
+    for file in files_to_copy:
+        src = os.path.join(package_path, file)
+        dst = file
+        print(f"Copying {src} to {dst}")
+        shutil.copyfile(src, dst)
+    pre_built_vllm_path = os.path.join(package_path, "vllm")
+    tmp_path = os.path.join(package_path, "vllm_pre_built")
+    current_vllm_path = os.path.join(cwd, "vllm")
+    print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup")
+    os.rename(pre_built_vllm_path, tmp_path)
+    print(f"Linking {current_vllm_path} to {pre_built_vllm_path}")
+    os.symlink(current_vllm_path, pre_built_vllm_path)
+else:
+    vllm_symlink_path = os.path.join(package_path, "vllm")
+    vllm_backup_path = os.path.join(package_path, "vllm_pre_built")
+    current_vllm_path = os.path.join(cwd, "vllm")
+    print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}")
+    assert os.path.islink(
+        vllm_symlink_path
+    ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link"
+    assert current_vllm_path == os.readlink(
+        vllm_symlink_path
+    ), "current directory is not the source code of package"
+    os.unlink(vllm_symlink_path)
+    print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}")
+    os.rename(vllm_backup_path, vllm_symlink_path)
--- a/requirements-adag.txt
+++ b/requirements-adag.txt
-# Dependencies for Ray accelerated DAG
-cupy-cuda12x
-ray >= 2.32
\ No newline at end of file
--- a/requirements-build.txt
+++ b/requirements-build.txt
 # Should be mirrored in pyproject.toml
-cmake>=3.21
+cmake>=3.26
 ninja
 packaging
-setuptools>=49.4.0
+setuptools>=61
+setuptools-scm>=8
 torch==2.4.0
 wheel
+jinja2
--- a/requirements-common.txt
+++ b/requirements-common.txt
-cmake >= 3.21
-ninja  # For faster builds.
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
-requests
+requests >= 2.26.0
 tqdm
 py-cpuinfo
-transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfox.
+transformers >= 4.45.2  # Required for Llama 3.2 and Qwen2-VL.
 tokenizers >= 0.19.1  # Required for Llama 3.
-fastapi
+protobuf # Required by LlamaTokenizer.
+fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
+fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
 aiohttp
-openai
+openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
 uvicorn[standard]
-pydantic >= 2.0  # Required for OpenAI server.
+pydantic >= 2.9  # Required for fastapi >= 0.113.0
 pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.3
+lm-format-enforcer == 0.10.6
-outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+outlines >= 0.0.43, < 0.1
-typing_extensions
+typing_extensions >= 4.10
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+partial-json-parser # used for parsing partial JSON outputs
 pyzmq
+msgspec
+gguf == 0.10.0
+importlib_metadata
+mistral_common[opencv] >= 1.4.4
+pyyaml
+six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
+setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
+einops # Required for Qwen2-VL.
+compressed-tensors == 0.6.0 # required for compressed-tensors
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -7,5 +7,4 @@ nvidia-ml-py # for pynvml package
 torch == 2.4.0
 # These must be updated alongside torch
 torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.27.post2  # Requires PyTorch 2.4.0
+xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
-vllm-flash-attn == 2.6.1  # Requires PyTorch 2.4.0
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -2,13 +2,13 @@
 yapf==0.32.0
 toml==0.10.2
 tomli==2.0.1
-ruff==0.1.5
+ruff==0.6.5
 codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5
 # type checking
-mypy==1.9.0
+mypy==1.11.1
 types-PyYAML
 types-requests
 types-setuptools
--- a/requirements-mamba.txt
+++ b/requirements-mamba.txt
-# Mamba dependencies
-mamba-ssm>=1.2.2
-causal-conv1d>=1.2.0
--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt
 # Dependencies for Neuron devices
-transformers-neuronx >= 0.9.0
+transformers-neuronx >= 0.12.0
-torch-neuronx >= 2.1.0
+torch-neuronx >= 2.1.2
 neuronx-cc
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
 # Common dependencies
-# -r requirements-common.txt
+-r requirements-common.txt
-# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
-cmake >= 3.21
-ninja  # For faster builds.
-psutil
-sentencepiece  # Required for LLaMA tokenizer.
-numpy < 2.0.0
-requests
-tqdm
-py-cpuinfo
-transformers < 4.43
-tokenizers >= 0.19.1  # Required for Llama 3.
-fastapi
-aiohttp
-openai
-uvicorn[standard]
-pydantic >= 2.0  # Required for OpenAI server.
-pillow  # Required for image processing
-prometheus_client >= 0.18.0
-prometheus-fastapi-instrumentator >= 7.0.0
-tiktoken >= 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.3
-outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
-typing_extensions
-filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
-pyzmq
-# OpenVINO dependencies
+torch == 2.4.0 #  should be aligned with "common" vLLM torch version
-torch >= 2.1.2
+openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
-openvino ~= 2024.3.0.dev
-openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
+optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
-optimum-intel[openvino] >= 1.18.1
+optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -8,3 +8,11 @@ botocore
 ray >= 2.10.0
 peft
 pytest-asyncio
+tensorizer>=2.9.0
+setuptools_scm>=8
+torch == 2.3.0
+triton == 2.1.0
+flash_attn == 2.6.1
+xformers == 0.0.25
+lmslim == 0.1.2
\ No newline at end of file
--- a/requirements-test.txt
+++ b/requirements-test.txt
-# Needed for Ray accelerated DAG tests
-r requirements-adag.txt
 # testing
 pytest
 tensorizer>=2.9.0
@@ -11,17 +8,27 @@ pytest-shard
 # testing utils
 awscli
-einops # required for MPT
+einops # required for MPT, qwen-vl and Mamba
 httpx
+librosa # required for audio tests
+opencv-python # required for video tests
 peft
 requests
-ray
+ray[adag]==2.35
 sentence-transformers # required for embedding
-compressed-tensors==0.4.0 # required for compressed-tensors
+soundfile # required for audio test
 timm # required for internvl test
+transformers_stream_generator # required for qwen-vl test
+matplotlib # required for qwen-vl test
+datamodel_code_generator # required for minicpm3 test
+lm-eval[api]==0.4.4 # required for model evaluation test
+# TODO: Add this after fully implementing llava(mantis)
+# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
 # Benchmarking
 aiohttp
 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
\ No newline at end of file
+buildkite-test-collector==0.1.8
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -4,4 +4,4 @@
 # Dependencies for TPU
 # Currently, the TPU backend uses a nightly version of PyTorch XLA.
 # You can install the dependencies in Dockerfile.tpu.
-ray
+ray[default]
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
 # Common dependencies
 -r requirements-common.txt
-setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
+ray >= 2.9
+cmake>=3.26
-torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
+ninja
-intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
+packaging
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
+setuptools-scm>=8
+wheel
-triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+jinja2
+# The following packages are retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+torch == 2.3.1+cxx11.abi
+intel-extension-for-pytorch == 2.3.110+xpu
+oneccl_bind_pt == 2.3.100+xpu
+triton-xpu == 3.0.0b2
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import os
 import re
 import subprocess
 import sys
-import warnings
+from pathlib import Path
 from shutil import which
 from typing import Dict, List
@@ -13,6 +13,7 @@ import torch
 from packaging.version import Version, parse
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
+from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME
 from typing import Optional, Union
@@ -34,43 +35,18 @@ def load_module_from_path(module_name, path):
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
-def embed_commit_hash():
-    try:
-        if "BUILDKITE_COMMIT" in os.environ:
-            # ci build
-            commit_id = os.environ["BUILDKITE_COMMIT"]
-        else:
-            commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
-                                                encoding="utf-8").strip()
-        commit_contents = f'__commit__ = "{commit_id}"\n'
-        version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
-        with open(version_file, "w", encoding="utf-8") as f:
-            f.write(commit_contents)
-    except subprocess.CalledProcessError as e:
-        warnings.warn(f"Failed to get commit hash:\n{e}",
-                      RuntimeWarning,
-                      stacklevel=2)
-    except Exception as e:
-        warnings.warn(f"Failed to embed commit hash:\n{e}",
-                      RuntimeWarning,
-                      stacklevel=2)
-embed_commit_hash()
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
 envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
-# vLLM only supports Linux platform
+if not sys.platform.startswith("linux"):
-assert sys.platform.startswith(
+    logger.warning(
-    "linux"), "vLLM only supports Linux platform (including WSL)."
+        "vLLM only supports Linux platform (including WSL). "
+        "Building on %s, "
+        "so vLLM may not be able to run correctly", sys.platform)
+    VLLM_TARGET_DEVICE = "empty"
 MAIN_CUDA_VERSION = "12.1"
@@ -156,15 +132,8 @@ class cmake_build_ext(build_ext):
        default_cfg = "Debug" if self.debug else "RelWithDebInfo"
        cfg = envs.CMAKE_BUILD_TYPE or default_cfg
-        # where .so files will be written, should be the same for all extensions
-        # that use the same CMakeLists.txt.
-        outdir = os.path.abspath(
-            os.path.dirname(self.get_ext_fullpath(ext.name)))
        cmake_args = [
            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
-            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
-            '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
        ]
@@ -174,20 +143,27 @@ class cmake_build_ext(build_ext):
        if is_sccache_available():
            cmake_args += [
+                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
-                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
+                '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
            ]
        elif is_ccache_available():
            cmake_args += [
+                '-DCMAKE_C_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
+                '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
            ]
        # Pass the python executable to cmake so it can find an exact
        # match.
        cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
+        # Pass the python path to cmake so it can reuse the build dependencies
+        # on subsequent calls to python.
+        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
        #
        # Setup parallelism and build tool
        #
@@ -221,10 +197,12 @@ class cmake_build_ext(build_ext):
            os.makedirs(self.build_temp)
        targets = []
+        target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."),
+                                              "vllm_flash_attn.")
        # Build all the extensions
        for ext in self.extensions:
            self.configure(ext)
-            targets.append(remove_prefix(ext.name, "vllm."))
+            targets.append(target_name(ext.name))
        num_jobs, _ = self.compute_num_jobs()
@@ -237,6 +215,47 @@ class cmake_build_ext(build_ext):
        subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
+        # Install the libraries
+        for ext in self.extensions:
+            # Install the extension into the proper location
+            outdir = Path(self.get_ext_fullpath(ext.name)).parent.absolute()
+            # Skip if the install directory is the same as the build directory
+            if outdir == self.build_temp:
+                continue
+            # CMake appends the extension prefix to the install path,
+            # and outdir already contains that prefix, so we need to remove it.
+            prefix = outdir
+            for i in range(ext.name.count('.')):
+                prefix = prefix.parent
+            # prefix here should actually be the same for all components
+            install_args = [
+                "cmake", "--install", ".", "--prefix", prefix, "--component",
+                target_name(ext.name)
+            ]
+            subprocess.check_call(install_args, cwd=self.build_temp)
+    def run(self):
+        # First, run the standard build_ext command to compile the extensions
+        super().run()
+        # copy vllm/vllm_flash_attn/*.py from self.build_lib to current
+        # directory so that they can be included in the editable build
+        import glob
+        files = glob.glob(
+            os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py"))
+        for file in files:
+            dst_file = os.path.join("vllm/vllm_flash_attn",
+                                    os.path.basename(file))
+            print(f"Copying {file} to {dst_file}")
+            self.copy_file(file, dst_file)
+def _no_device() -> bool:
+    return VLLM_TARGET_DEVICE == "empty"
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
@@ -279,7 +298,7 @@ def _build_custom_ops() -> bool:
 def _build_core_ext() -> bool:
-    return not _is_neuron() and not _is_tpu()
+    return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
 def get_hipcc_rocm_version():
@@ -320,7 +339,7 @@ def get_neuronxcc_version():
        # Return the version string
        return match.group(1)
    else:
-        raise RuntimeError("Could not find HIP version in the output")
+        raise RuntimeError("Could not find Neuron version in the output")
 def get_nvcc_cuda_version() -> Version:
@@ -341,19 +360,6 @@ def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)
-def find_version(filepath: str) -> str:
-    """Extract version information from the given filepath.
-    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
-    """
-    with open(filepath) as fp:
-        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
-                                  fp.read(), re.M)
-        if version_match:
-            return version_match.group(1)
-        raise RuntimeError("Unable to find version string.")
 def get_sha(root: Union[str, Path]) -> str:
    try:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
@@ -364,13 +370,21 @@ def get_sha(root: Union[str, Path]) -> str:
 def get_version_add(sha: Optional[str] = None) -> str:
    vllm_root = os.path.dirname(os.path.abspath(__file__))
    add_version_path = os.path.join(os.path.join(vllm_root, "vllm"), "version.py")
+    major, minor, _ = torch.__version__.split('.')
    if add_git_version:
        if sha != 'Unknown':
            if sha is None:
                sha = get_sha(vllm_root)
-            version = 'das.opt1' + sha[:7]
+            # if (major, minor) == ('2', '1'):
+            #     version = 'das.opt1.' + sha[:7]
+            if (major, minor) == ('2', '3'):
+                version = 'das.opt1.' + sha[:7]
    else:
-        version = 'das.opt1'
+        # if (major, minor) == ('2', '1'):
+        #     version = 'das.opt1'
+        if (major, minor) == ('2', '3'):
+            version = 'das.opt1'
    # dtk version
    if os.getenv("ROCM_PATH"):
@@ -382,20 +396,20 @@ def get_version_add(sha: Optional[str] = None) -> str:
        version += ".dtk" + rocm_version
    new_version_content = f"""
-import warnings
 try:
-    import vllm.commit_id
+    __version__ = "0.6.3.post1"
-    __commit__ = vllm.commit_id.__commit__
+    __version_tuple__ = (0, 6, 3)
+    __dcu_version__ = f'0.6.3.post1+{version}' 
+    from vllm.version import __version__, __version_tuple__, __dcu_version__
 except Exception as e:
+    import warnings
    warnings.warn(f"Failed to read commit hash:\\n + str(e)",
                  RuntimeWarning,
                  stacklevel=2)
-    __commit__ = "COMMIT_HASH_PLACEHOLDER"
+    __version__ = "dev"
+    __version_tuple__ = (0, 0, __version__)
-__version__ = "0.5.4"
-__dcu_version__ = f'0.5.4+{version}' 
 """
    with open(add_version_path, encoding="utf-8",mode="w") as file:
@@ -412,34 +426,44 @@ def get_version():
 def get_vllm_version() -> str:
-    # version = find_version(get_path("vllm", "version.py"))
+    if not _is_hip():
+        version = get_version(
+            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+        )
-    if _is_cuda():
+        sep = "+" if "+" not in version else "."  # dev versions might contain +
+    if _no_device():
+        if envs.VLLM_TARGET_DEVICE == "empty":
+            version += f"{sep}empty"
+    elif _is_cuda():
        cuda_version = str(get_nvcc_cuda_version())
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
-            version += f"+cu{cuda_version_str}"
+            # skip this for source tarball, required for pypi
+            if "sdist" not in sys.argv:
+                version += f"{sep}cu{cuda_version_str}"
    elif _is_hip():
        # Get the HIP version
        # hipcc_version = get_hipcc_rocm_version()
        # if hipcc_version != MAIN_CUDA_VERSION:
        #     rocm_version_str = hipcc_version.replace(".", "")[:3]
-        #     version += f"+rocm{rocm_version_str}"
+        #     version += f"{sep}rocm{rocm_version_str}"
        version = get_version()
    elif _is_neuron():
        # Get the Neuron version
        neuron_version = str(get_neuronxcc_version())
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
-            version += f"+neuron{neuron_version_str}"
+            version += f"{sep}neuron{neuron_version_str}"
    elif _is_openvino():
-        version += "+openvino"
+        version += f"{sep}openvino"
    elif _is_tpu():
-        version += "+tpu"
+        version += f"{sep}tpu"
    elif _is_cpu():
-        version += "+cpu"
+        version += f"{sep}cpu"
    elif _is_xpu():
-        version += "+xpu"
+        version += f"{sep}xpu"
    else:
        raise RuntimeError("Unknown runtime environment")
@@ -465,11 +489,15 @@ def get_requirements() -> List[str]:
        for line in requirements:
            if line.startswith("-r "):
                resolved_requirements += _read_requirements(line.split()[1])
+            elif line.startswith("--"):
+                continue
            else:
                resolved_requirements.append(line)
        return resolved_requirements
-    if _is_cuda():
+    if _no_device():
+        requirements = _read_requirements("requirements-cuda.txt")
+    elif _is_cuda():
        requirements = _read_requirements("requirements-cuda.txt")
        cuda_major, cuda_minor = torch.version.cuda.split(".")
        modified_requirements = []
@@ -508,16 +536,26 @@ if _build_core_ext():
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+# if _is_hip():
+#     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
+if _is_cuda():
+    ext_modules.append(
+        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))
 package_data = {
-    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
+    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py"]
 }
 if envs.VLLM_USE_PRECOMPILED:
    ext_modules = []
    package_data["vllm"].append("*.so")
+if _no_device():
+    ext_modules = []
 setup(
    name="vllm",
    version=get_vllm_version(),
@@ -539,7 +577,11 @@ setup(
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "License :: OSI Approved :: Apache Software License",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Information Technology",
+        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Scientific/Engineering :: Information Analysis",
    ],
    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                    "tests*")),
@@ -548,6 +590,7 @@ setup(
    ext_modules=ext_modules,
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
+        "audio": ["librosa", "soundfile"]  # Required for audio processing
    },
    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
    package_data=package_data,

--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
 """vllm.entrypoints.api_server with some extra logging for testing."""
-from typing import Any, Dict
+from typing import Any, Dict, Iterable
 import uvicorn
 from fastapi.responses import JSONResponse, Response
@@ -18,9 +18,10 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
        super().__init__(*args, **kwargs)
        self._num_aborts = 0
-    async def abort(self, request_id: str) -> None:
+    async def _engine_abort(self, request_ids: Iterable[str]):
-        await super().abort(request_id)
+        ids = list(request_ids)
-        self._num_aborts += 1
+        self._num_aborts += len(ids)
+        await super()._engine_abort(ids)
    def testing_stats(self) -> Dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}

--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -25,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
+def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
-               worker_use_ray: bool):
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
@@ -35,8 +34,7 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
        "127.0.0.1", "--tokenizer-pool-size",
        str(tokenizer_pool_size)
    ]
-    if engine_use_ray:
-        commands.append("--engine-use-ray")
    if worker_use_ray:
        commands.append("--worker-use-ray")
    uvicorn_process = subprocess.Popen(commands)
@@ -46,9 +44,8 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
 @pytest.mark.parametrize("worker_use_ray", [False, True])
-@pytest.mark.parametrize("engine_use_ray", [False, True])
+def test_api_server(api_server, tokenizer_pool_size: int,
-def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
+                    worker_use_ray: bool):
-                    engine_use_ray: bool):
    """
    Run the API server and test it.

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
 import asyncio
+import os
+import uuid
+from asyncio import CancelledError
+from copy import copy
 from dataclasses import dataclass
+from typing import List, Optional
 import pytest
+import pytest_asyncio
 import torch
 from vllm import SamplingParams
 from vllm.config import ParallelConfig
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
+from vllm.outputs import RequestOutput as RealRequestOutput
+from vllm.sampling_params import RequestOutputKind
+from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
@@ -17,6 +26,11 @@ class RequestOutput:
    finished: bool = False
+@dataclass
+class MockModelConfig:
+    use_async_output_proc = True
 class MockEngine:
    def __init__(self):
@@ -26,6 +40,7 @@ class MockEngine:
        self.request_id = None
        # Ugly, remove dependency when possible
        self.parallel_config = ParallelConfig(1, 1, False)
+        self.model_config = MockModelConfig()
    async def step_async(self, virtual_engine):
        # PP size is 1, ignore virtual engine
@@ -66,24 +81,24 @@ class MockEngine:
 class MockAsyncLLMEngine(AsyncLLMEngine):
+    _engine_class = MockEngine
-    def _init_engine(self, *args, **kwargs):
-        return MockEngine()
 @pytest.mark.asyncio
 async def test_new_requests_event():
-    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    params = SamplingParams()
+    engine = MockAsyncLLMEngine()
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0
-    await engine.add_request("1", "", None)
+    await engine.add_request("1", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 1
    assert engine.engine.step_calls == 1
-    await engine.add_request("2", "", None)
+    await engine.add_request("2", "", params)
    engine.engine.generate("2")
    await asyncio.sleep(0)
    await asyncio.sleep(0)
@@ -98,7 +113,7 @@ async def test_new_requests_event():
    await asyncio.sleep(0.001)
    assert engine.engine.step_calls == old_step_calls
-    await engine.add_request("3", "", None)
+    await engine.add_request("3", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
@@ -106,39 +121,254 @@ async def test_new_requests_event():
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
-    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
+    engine = MockAsyncLLMEngine()
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None
-def test_asyncio_run():
+def start_engine():
    wait_for_gpu_memory_to_clear(
        devices=list(range(torch.cuda.device_count())),
        threshold_bytes=2 * 2**30,
        timeout_s=60,
    )
-    engine = AsyncLLMEngine.from_engine_args(
+    num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
-        AsyncEngineArgs(model="facebook/opt-125m"))
+    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
+    return AsyncLLMEngine.from_engine_args(
+        AsyncEngineArgs(model="facebook/opt-125m",
+                        enforce_eager=True,
+                        num_scheduler_steps=num_scheduler_steps))
+def uid() -> str:
+    return str(uuid.uuid4())
+@pytest_asyncio.fixture(scope="module")
+async def async_engine():
+    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
+                                                            func=start_engine)
+    try:
+        yield engine
+    finally:
+        engine.shutdown_background_loop()
+        del engine
+        await asyncio.sleep(0.1)
+        cleanup()
+@pytest.fixture()
+def should_do_global_cleanup_after_test(request) -> bool:
+    # So we can share the async engine fixture between these tests
+    return False
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_asyncio_run(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
    async def run(prompt: str):
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=32,
+            min_tokens=32,
+            stop=stop,
        )
-        async for output in engine.generate(prompt,
+        output_count = 0
-                                            sampling_params,
+        final_output = None
-                                            request_id=prompt):
+        async for output in async_engine.generate(prompt,
+                                                  sampling_params,
+                                                  request_id=uid()):
+            output_count += 1
            final_output = output
-        return final_output
+        return final_output, output_count
-    async def generate():
+    results = await asyncio.gather(
-        return await asyncio.gather(
+        run("test0"),
-            run("test0"),
+        run("test0"),
-            run("test1"),
+    )
-        )
-    results = asyncio.run(generate())
    assert len(results) == 2
+    first, second = results
+    # remove nondeterministic fields for comparison
+    first[0].metrics = None
+    second[0].metrics = None
+    first[0].request_id = None
+    second[0].request_id = None
+    assert str(first) == str(second)
+    output_count = results[0][1]
+    if num_scheduler_steps == 1:
+        assert output_count == 32
+    else:
+        assert 1 < output_count < 32
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_output_kinds(async_engine, stop):
+    """Test that output_kind works as expected and that
+    results are equivalent across different kinds."""
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_tokens=32,
+        min_tokens=32,
+        stop=stop,
+    )
+    async def run(prompt: str, kind: RequestOutputKind):
+        params = copy(sampling_params)
+        params.output_kind = kind
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            output_count += 1
+            final_output = output
+        assert final_output is not None
+        assert final_output.finished
+        return (final_output.prompt_token_ids,
+                final_output.outputs[0].token_ids,
+                final_output.outputs[0].text, output_count)
+    async def run_deltas(prompt: str):
+        params = copy(sampling_params)
+        params.output_kind = RequestOutputKind.DELTA
+        prompt_tokens = None
+        output_tokens: List[int] = []
+        output_text = ""
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            token_ids = output.outputs[0].token_ids
+            text = output.outputs[0].text
+            final_output = output
+            # Ensure we get prompt ids iff we haven't yet received output tokens
+            if output_tokens:
+                assert 1 <= len(token_ids) <= num_scheduler_steps
+                assert stop or text
+                assert not output.prompt_token_ids
+            else:
+                assert output.prompt_token_ids
+                prompt_tokens = output.prompt_token_ids
+            output_tokens.extend(token_ids)
+            output_text += text
+            output_count += 1
+        assert final_output is not None
+        assert final_output.finished
+        return prompt_tokens, output_tokens, output_text, output_count
+    results = await asyncio.gather(
+        run("common input prompt", RequestOutputKind.CUMULATIVE),
+        run("common input prompt", RequestOutputKind.FINAL_ONLY),
+        run_deltas("common input prompt"))
+    # Make sure outputs are the same
+    prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
+    assert len(prompt_set) == 1
+    text_set = set(text for _, _, text, _ in results)
+    assert len(text_set) == 1
+    tokens_set = set(tuple(ids) for _, ids, _, _ in results)
+    assert len(tokens_set) == 1
+    cumulative, final, deltas = results
+    # output message counts
+    assert cumulative[3] == deltas[3]
+    if num_scheduler_steps == 1:
+        assert cumulative[3] == 32
+    else:
+        assert 1 < cumulative[3] < 32
+    assert final[3] == 1
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_cancellation(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=13,
+        max_tokens=13,
+        stop=stop,
+    )
+    stop_at = 5 if num_scheduler_steps == 1 else 1
+    request_id = uid()
+    i = 0
+    with pytest.raises(CancelledError):
+        async for output in async_engine.generate("test2",
+                                                  sampling_params,
+                                                  request_id=request_id):
+            assert not output.finished
+            i += 1
+            if i == stop_at:
+                await async_engine.abort(request_id)
+    assert i == stop_at
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_delayed_generator(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+    if scheduler_config.num_scheduler_steps != 1:
+        pytest.skip("no need to test this one with multistep")
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+        stop=stop,
+    )
+    stream = async_engine.generate("test3", sampling_params, request_id=uid())
+    i = 0
+    final_output: Optional[RealRequestOutput] = None
+    async for output in stream:
+        final_output = output
+        if i == 0:
+            # wait for generation to complete before consuming
+            # the remaining messages
+            await asyncio.sleep(1)
+        if i < 9:
+            assert not output.finished
+        i += 1
+    assert i == 10
+    assert final_output is not None
+    assert len(final_output.outputs[0].token_ids) == 10
+    assert final_output.finished
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
-import openai  # use the official client for correctness check
-import pytest
-from ..utils import RemoteOpenAIServer
-# any model with a chat template should work here
-MODEL_NAME = "facebook/opt-125m"
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--max-model-len",
-        "2048",
-        "--enforce-eager",
-        "--engine-use-ray"
-    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-@pytest.fixture(scope="module")
-def client(server):
-    return server.get_async_client()
-@pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI):
-    models = await client.models.list()
-    models = models.data
-    served_model = models[0]
-    assert served_model.id == MODEL_NAME
-    assert all(model.root == MODEL_NAME for model in models)
-@pytest.mark.asyncio
-async def test_single_completion(client: openai.AsyncOpenAI):
-    completion = await client.completions.create(model=MODEL_NAME,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-    assert completion.id is not None
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) >= 5
-    assert completion.choices[0].finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 5
-@pytest.mark.asyncio
-async def test_single_chat_session(client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-    # test single completion
-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10,
-                                                           logprobs=True,
-                                                           top_logprobs=5)
-    assert chat_completion.id is not None
-    assert len(chat_completion.choices) == 1
-    choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
-    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=13, total_tokens=23)
-    message = choice.message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
-    messages.append({"role": "assistant", "content": message.content})
-    # test multi-turn dialogue
-    messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=10,
-    )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0