Merge tag 'v0.7.3' into v0.7.3-dev

ec5e299c · zhuwenwen · 47bd229c · ed6e9075 · ec5e299c · ec5e299c
Commit ec5e299c authored Feb 21, 2025 by zhuwenwen
20 changed files
--- a/examples/online_serving/prometheus_grafana/README.md
+++ b/examples/online_serving/prometheus_grafana/README.md
-# Prometheus and Grafana 
+# Prometheus and Grafana
-This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. 
+This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can check out other methods on the [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites.
+Install:
-Install: 
 - [`docker`](https://docs.docker.com/engine/install/)
 - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
 ## Launch
 Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
 ```bash
 vllm serve mistralai/Mistral-7B-v0.1 \
    --max-model-len 2048 \
@@ -16,11 +18,13 @@ vllm serve mistralai/Mistral-7B-v0.1 \
 ```
 Launch Prometheus and Grafana servers with `docker compose`:
 ```bash
 docker compose up
 ```
 Submit some sample requests to the server:
 ```bash
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -41,13 +45,13 @@ Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the de
 ### Add Prometheus Data Source
-Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. 
+Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
 On the Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates a DNS name for each container, so you can just use `http://prometheus:9090`.
 Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
-### Import Dashboard 
+### Import Dashboard
 Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:

--- a/examples/other/logging_configuration.md
+++ b/examples/other/logging_configuration.md
@@ -15,7 +15,6 @@ more-complex-and-more-flexible.
  - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and
    set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>`
 ## Logging Configuration Environment Variables
 ### `VLLM_CONFIGURE_LOGGING`
@@ -45,7 +44,6 @@ schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-
 If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is
 disabled, an error will occur while starting vLLM.
 ## Examples
 ### Example 1: Customize vLLM root logger
@@ -98,7 +96,6 @@ VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```
 ### Example 2: Silence a particular vLLM logger
 To silence a particular vLLM logger, it is necessary to provide custom logging
@@ -153,7 +150,6 @@ VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```
 ### Example 3: Disable vLLM default logging configuration
 To disable vLLM's default logging configuration and silence all vLLM loggers,
@@ -166,7 +162,6 @@ VLLM_CONFIGURE_LOGGING=0 \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```
 ## Additional resources
 - [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,8 +12,42 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
+[project]
+name = "vllm"
+authors = [{name = "vLLM Team"}]
+license = { "file"= "LICENSE" }
+readme = "README.md"
+description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
+classifiers = [
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "License :: OSI Approved :: Apache Software License",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Information Technology",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+]
+requires-python = ">=3.9"
+dynamic = [ "version", "dependencies", "optional-dependencies"]
+[project.urls]
+Homepage="https://github.com/vllm-project/vllm"
+Documentation="https://vllm.readthedocs.io/en/latest/"
+Slack="http://slack.vllm.ai/"
+[project.scripts]
+vllm = "vllm.entrypoints.cli.main:main"
 [tool.setuptools_scm]
-# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()`
+# no extra settings needed, presence enables setuptools-scm
+[tool.setuptools.packages.find]
+where = ["."]
+exclude = ["benchmarks", "csrc", "docs", "examples", "tests*"]
+namespaces = false
 [tool.yapfignore]
 ignore_patterns = [
@@ -59,7 +93,8 @@ ignore = [
    "UP032",
    # Python 3.8 typing
    "UP006", "UP035",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
 ]
 [tool.mypy]
@@ -92,7 +127,7 @@ exclude = [
 [tool.codespell]
 ignore-words-list = "dout, te, indicies, subtile, ElementE"
-skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
+skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
 [tool.isort]
 use_parentheses = true

--- a/requirements-common.txt
+++ b/requirements-common.txt
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
+numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding.
 requests >= 2.26.0
 tqdm
 blake3
@@ -8,12 +9,11 @@ py-cpuinfo
 transformers >= 4.48.2  # Required for Bamba model and Transformers backend.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
-fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
+fastapi[standard] >= 0.107.0, < 0.113.0; python_version < '3.9'
-fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
+fastapi[standard]  >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
 aiohttp
 openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
-uvicorn[standard]
+pydantic >= 2.9
-pydantic >= 2.9  # Required for fastapi >= 0.113.0
 prometheus_client >= 0.18.0
 pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
@@ -21,7 +21,7 @@ tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
 outlines == 0.1.11
 lark == 1.2.2 
-xgrammar >= 0.1.6; platform_machine == "x86_64"
+xgrammar == 0.1.11; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs

--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -2,8 +2,7 @@
 -r requirements-common.txt
 # Dependencies for NVIDIA GPUs
-ray[default] >= 2.9
+ray[adag] == 2.40.0 # Required for pipeline parallelism in V1.
-nvidia-ml-py >= 12.560.30 # for pynvml package
 torch == 2.5.1
 torchaudio==2.5.1
 # These must be updated alongside torch

--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@@ -2,6 +2,5 @@
 -r requirements-common.txt
 # Dependencies for Neuron devices
-transformers-neuronx >= 0.13.0
 torch-neuronx >= 2.5.0
 neuronx-cc
--- a/requirements-rocm-build.txt
+++ b/requirements-rocm-build.txt
+# Common dependencies
+-r requirements-common.txt
+--extra-index-url https://download.pytorch.org/whl/rocm6.2
+torch==2.5.1
+torchvision==0.20.1
+torchaudio==2.5.1
+cmake>=3.26
+ninja
+packaging
+setuptools>=61
+setuptools-scm>=8
+wheel
+jinja2
+amdsmi==6.2.4
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -10,6 +10,8 @@ ray >= 2.10.0
 peft
 pytest-asyncio
 tensorizer>=2.9.0
+runai-model-streamer==0.11.0
+runai-model-streamer-s3==0.11.0
 setuptools_scm>=8
 torch == 2.4.1
@@ -17,3 +19,4 @@ triton == 3.0.0
 flash_attn == 2.6.1
 lmslim == 0.2.0  
 numa
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -19,6 +19,7 @@ pqdm
 ray[adag]==2.40.0
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
+jiwer # required for audio tests
 timm # required for internvl test
 torch==2.5.1
 torchaudio==2.5.1
@@ -36,3 +37,5 @@ genai_perf==0.0.8
 tritonclient==2.51.0
 numpy < 2.0.0
+runai-model-streamer==0.11.0
+runai-model-streamer-s3==0.11.0
\ No newline at end of file
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -66,6 +66,7 @@ charset-normalizer==3.4.0
 click==8.1.7
    # via
    #   black
+    #   jiwer
    #   nltk
    #   ray
 colorama==0.4.6
@@ -170,6 +171,8 @@ huggingface-hub==0.26.2
    #   tokenizers
    #   transformers
    #   vocos
+humanize==4.11.0
+    # via runai-model-streamer
 idna==3.10
    # via
    #   anyio
@@ -187,6 +190,8 @@ jinja2==3.1.4
    # via
    #   datamodel-code-generator
    #   torch
+jiwer==3.0.5
+    # via -r requirements-test.in
 jmespath==1.0.1
    # via
    #   boto3
@@ -287,6 +292,7 @@ numpy==1.26.4
    #   patsy
    #   peft
    #   rouge-score
+    #   runai-model-streamer
    #   sacrebleu
    #   scikit-learn
    #   scipy
@@ -470,6 +476,8 @@ pyyaml==6.0.2
    #   timm
    #   transformers
    #   vocos
+rapidfuzz==3.12.1
+    # via jiwer
 ray[adag]==2.40.0
    # via -r requirements-test.in
 redis==5.2.0
@@ -509,6 +517,10 @@ rpds-py==0.20.1
    #   referencing
 rsa==4.7.2
    # via awscli
+runai-model-streamer==0.11.0
+    # via -r requirements-test.in
+runai-model-streamer-s3==0.11.0
+    # via -r requirements-test.in
 s3transfer==0.10.3
    # via
    #   awscli
@@ -589,6 +601,7 @@ torch==2.5.1
    #   encodec
    #   lm-eval
    #   peft
+    #   runai-model-streamer
    #   sentence-transformers
    #   tensorizer
    #   timm

--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ from typing import Dict, List
 import torch
 from packaging.version import Version, parse
-from setuptools import Extension, find_packages, setup
+from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
 from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
@@ -55,6 +55,12 @@ elif not (sys.platform.startswith("linux")
        "Building on %s, "
        "so vLLM may not be able to run correctly", sys.platform)
    VLLM_TARGET_DEVICE = "empty"
+elif (sys.platform.startswith("linux") and torch.version.cuda is None
+      and os.getenv("VLLM_TARGET_DEVICE") is None
+      and torch.version.hip is None):
+    # if neither cuda nor hip is available and VLLM_TARGET_DEVICE is not set,
+    # fall back to cpu
+    VLLM_TARGET_DEVICE = "cpu"
 MAIN_CUDA_VERSION = "12.1"
@@ -270,15 +276,34 @@ class cmake_build_ext(build_ext):
 class repackage_wheel(build_ext):
    """Extracts libraries and other files from an existing wheel."""
-    default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-    def run(self) -> None:
+    def get_base_commit_in_main_branch(self) -> str:
-        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
+        import subprocess
-                                   self.default_wheel)
+        try:
+            current_branch = subprocess.check_output(
+                ["git", "branch", "--show-current"]).decode("utf-8").strip()
+            base_commit = subprocess.check_output(
+                ["git", "merge-base", "main",
+                 current_branch]).decode("utf-8").strip()
+            return base_commit
+        except Exception as err:
+            logger.warning(
+                "Failed to get the base commit in the main branch. "
+                "Using the nightly wheel. The libraries in this "
+                "wheel may not be compatible with your dev branch: %s", err)
+            return "nightly"
+    def run(self) -> None:
        assert _is_cuda(
        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
+        if wheel_location is None:
+            base_commit = self.get_base_commit_in_main_branch()
+            wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
        import zipfile
        if os.path.isfile(wheel_location):
@@ -377,12 +402,7 @@ def _is_hip() -> bool:
 def _is_neuron() -> bool:
-    torch_neuronx_installed = True
+    return VLLM_TARGET_DEVICE == "neuron"
-    try:
-        subprocess.run(["neuron-ls"], capture_output=True, check=True)
-    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        torch_neuronx_installed = False
-    return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
 def _is_tpu() -> bool:
@@ -505,9 +525,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    new_version_content = f"""
 try:
-    __version__ = "0.7.2"
+    __version__ = "0.7.3"
-    __version_tuple__ = (0, 7, 2)
+    __version_tuple__ = (0, 7, 3)
-    __hcu_version__ = f'0.7.2+{version}' 
+    __hcu_version__ = f'0.7.3+{version}' 
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
@@ -551,10 +571,7 @@ def get_gaudi_sw_version():
 def get_vllm_version() -> str:
    if not _is_hip():
-        version = get_version(
+        version = get_version(write_to="vllm/_version.py")
-            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
-        )
        sep = "+" if "+" not in version else "."  # dev versions might contain +
    if _no_device():
@@ -593,7 +610,8 @@ def get_vllm_version() -> str:
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
-        version += f"{sep}cpu"
+        if envs.VLLM_TARGET_DEVICE == "cpu":
+            version += f"{sep}cpu"
    elif _is_xpu():
        version += f"{sep}xpu"
    else:
@@ -602,16 +620,6 @@ def get_vllm_version() -> str:
    return version
-def read_readme() -> str:
-    """Read the README file if present."""
-    p = get_path("README.md")
-    if os.path.isfile(p):
-        with open(get_path("README.md"), encoding="utf-8") as f:
-            return f.read()
-    else:
-        return ""
 def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt."""
@@ -705,36 +713,10 @@ else:
    }
 setup(
-    name="vllm",
+    # static metadata should rather go in pyproject.toml
    version=get_vllm_version(),
-    author="vLLM Team",
-    license="Apache 2.0",
-    description=("A high-throughput and memory-efficient inference and "
-                 "serving engine for LLMs"),
-    long_description=read_readme(),
-    long_description_content_type="text/markdown",
-    url="https://github.com/vllm-project/vllm",
-    project_urls={
-        "Homepage": "https://github.com/vllm-project/vllm",
-        "Documentation": "https://vllm.readthedocs.io/en/latest/",
-    },
-    classifiers=[
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-        "Programming Language :: Python :: 3.12",
-        "License :: OSI Approved :: Apache Software License",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Information Technology",
-        "Intended Audience :: Science/Research",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-        "Topic :: Scientific/Engineering :: Information Analysis",
-    ],
-    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
-                                    "tests*")),
-    python_requires=">=3.9",
-    install_requires=get_requirements(),
    ext_modules=ext_modules,
+    install_requires=get_requirements(),
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
@@ -743,9 +725,4 @@ setup(
    },
    cmdclass=cmdclass,
    package_data=package_data,
-    entry_points={
-        "console_scripts": [
-            "vllm=vllm.scripts:main",
-        ],
-    },
 )
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -9,6 +9,7 @@ import weakref
 import pytest
 from vllm import LLM
+from vllm.config import LoadFormat
 from vllm.platforms import current_platform
 from ..conftest import VllmRunner
@@ -19,7 +20,7 @@ from ..utils import models_path_prefix
 MODELS = [
    os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
-    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
 ]
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -35,7 +36,7 @@ def v1(run_with_both_engines):
 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"))
+    llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), load_format=LoadFormat.RUNAI_STREAMER)
    weak_llm = weakref.ref(llm)
    del llm
    # If there's any circular reference to vllm, this fails
@@ -97,13 +98,13 @@ def test_models(
 # @pytest.mark.parametrize(
 #     "model, distributed_executor_backend, attention_backend, "
 #     "test_suite", [
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4"),
+#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "L4"),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4"),
+#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "L4"),
 #         (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "ray", "", "L4"),
 #         (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "mp", "", "L4"),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100"),
+#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "A100"),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100"),
+#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "A100"),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "FLASHINFER", "A100"),
+#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "FLASHINFER", "A100"),
 #         (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray", "FLASHINFER", "A100"),
 #     ])
 # def test_models_distributed(
@@ -119,7 +120,7 @@ def test_models(
 #     if test_suite != TARGET_TEST_SUITE:
 #         pytest.skip(f"Skip test for {test_suite}")
-#     if model == os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+#     if model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct") and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
 #         # test ray adag
 #         os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
 #         os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
@@ -144,9 +145,10 @@ def test_models(
 #     with hf_runner(model, dtype=dtype) as hf_model:
 #         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    # check_outputs_equal(
+#     check_outputs_equal(
-    #     outputs_0_lst=hf_outputs,
+#         outputs_0_lst=hf_outputs,
-    #     outputs_1_lst=vllm_outputs,
+#         outputs_1_lst=vllm_outputs,
-    #     name_0="hf",
+#         name_0="hf",
-    #     name_1="vllm",
+#         name_1="vllm",
-    # )
+#     )
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -8,7 +8,6 @@ prefill requests are chunked.
 Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
 """
 import os
-from contextlib import nullcontext
 import pytest
@@ -22,7 +21,7 @@ from ..utils import models_path_prefix
 MODELS = [
    os.path.join(models_path_prefix, "facebook/opt-125m"),
-    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
 ]
@@ -94,7 +93,7 @@ def test_models_distributed(
 ) -> None:
    override_backend_env_variable(monkeypatch, attention_backend)
-    if (model == "meta-llama/Llama-2-7b-hf"
+    if (model == "meta-llama/Llama-3.2-1B-Instruct"
            and distributed_executor_backend == "ray"):
        # test ray adag
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -223,7 +222,7 @@ def test_with_prefix_caching(
    Checks exact match decode with and without prefix caching
    with chunked prefill enabled.
    """
-    model = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf")
+    model = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
    # NOTE(review): the common prompt had 142 tokens with the Llama-2
    # tokenizer; the model is now Llama-3.2-1B-Instruct, so re-verify the count.
    common_prompt = "You are a helpful AI assistant " * 20
    unique_prompts = [
@@ -235,7 +234,6 @@ def test_with_prefix_caching(
    max_num_batched_tokens = max_num_seqs = chunk_size
    outputs = {}  # type: ignore
-    check_result = True
    for enable in (True, False):
        with vllm_runner(
                model,
@@ -247,25 +245,17 @@ def test_with_prefix_caching(
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:
-            # It should fail when prefix caching is enable and chunk
-            # size is not a multiple of block size (16).
-            should_fail = chunk_size % 16 != 0 and enable
-            check_result &= not should_fail
            outputs[enable] = []
-            # Send the request one-by-one to ensure the cache is populated.
+            for prompt in full_prompts:
-            with pytest.raises(ValueError) if should_fail else nullcontext():
+                outputs[enable] += vllm_model.generate_greedy([prompt],
-                for prompt in full_prompts:
+                                                              max_tokens)
-                    outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                                  max_tokens)
+    check_outputs_equal(
+        outputs_0_lst=outputs[False],
-    # Check results only if we did not expect a failure.
+        outputs_1_lst=outputs[True],
-    if check_result:
+        name_0="w/o prefix caching",
-        check_outputs_equal(
+        name_1="with prefix caching",
-            outputs_0_lst=outputs[False],
+    )
-            outputs_1_lst=outputs[True],
-            name_0="w/o prefix caching",
-            name_1="with prefix caching",
-        )
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])

--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -5,5 +5,5 @@ from ..utils import compare_two_settings, models_path_prefix
 def test_cpu_offload():
-    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), [],
+    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), [],
                         ["--cpu-offload-gb", "1"])
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
 # SPDX-License-Identifier: Apache-2.0
+import pytest
 import torch
 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes
+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test
+@fork_new_process_for_each_test
+def test_python_error():
+    """
+    Test if Python error occurs when there's low-level
+    error happening from the C++ side.
+    """
+    allocator = CuMemAllocator.get_instance()
+    total_bytes = torch.cuda.mem_get_info()[1]
+    alloc_bytes = int(total_bytes * 0.7)
+    tensors = []
+    with allocator.use_memory_pool():
+        # allocate 70% of the total memory
+        x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+        tensors.append(x)
+    # release the memory
+    allocator.sleep()
+    # allocate another 70% outside the pool, so that the pool's 70% no
+    # longer fits in the total memory when the allocator wakes up
+    y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+    tensors.append(y)
+    with pytest.raises(RuntimeError):
+        # when the allocator is woken up, it should raise an error
+        # because we don't have enough memory
+        allocator.wake_up()
 @fork_new_process_for_each_test
 def test_basic_cumem():
    # some tensors from default memory pool
@@ -88,10 +117,23 @@ def test_cumem_with_cudagraph():
 @fork_new_process_for_each_test
-def test_end_to_end():
+@pytest.mark.parametrize(
+    "model, use_v1",
+    [
+        # sleep mode with safetensors
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
+        # sleep mode with pytorch checkpoint
+        ("facebook/opt-125m", False),
+    ])
+def test_end_to_end(model: str, use_v1: bool):
+    import os
+    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
    free, total = torch.cuda.mem_get_info()
    used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
+    load_format = LoadFormat.AUTO
+    if "Llama" in model:
+        load_format = LoadFormat.RUNAI_STREAMER
+    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
    prompt = "How are you?"
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    output = llm.generate(prompt, sampling_params)
@@ -112,3 +154,5 @@ def test_end_to_end():
    # cmp output
    assert output[0].outputs[0].text == output2[0].outputs[0].text
+    del os.environ["VLLM_USE_V1"]
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -20,7 +20,7 @@ from ..utils import models_path_prefix
 import os
 MODELS = [
-    os.path.join(models_path_prefix, "facebook/opt-125m"),
+    os.path.join(models_path_prefix, "distilbert/distilgpt2"),
 ]

--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -92,7 +92,7 @@ def test_simple_piecewise_compile():
            num_graphs_seen=1,  # one graph for the model
            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
-            num_inductor_compilations=3,  # num_piecewise_capturable_graphs_seen
+            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
            num_cudagraph_caputured=
            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):

--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -322,7 +322,7 @@ def test_toy_llama():
            num_graphs_seen=0,
            num_piecewise_graphs_seen=0,
            num_piecewise_capturable_graphs_seen=0,
-            num_inductor_compilations=0,
+            num_backend_compilations=0,
            num_cudagraph_caputured=0,
    ):
        outputs.append(run_model(llama_config, use_compile=False))
@@ -332,7 +332,7 @@ def test_toy_llama():
            num_graphs_seen=1,  # one graph for the model
            num_piecewise_graphs_seen=1,
            num_piecewise_capturable_graphs_seen=1,
-            num_inductor_compilations=1,  # num_piecewise_capturable_graphs_seen
+            num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
            num_cudagraph_caputured=
            2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
@@ -345,7 +345,7 @@ def test_toy_llama():
            1,  # 2 * num_layers + 1
            num_piecewise_capturable_graphs_seen=1 +
            llama_config.num_layers,  # 1 + num_layers
-            num_inductor_compilations=1 +
+            num_backend_compilations=1 +
            llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
            num_cudagraph_caputured=2 *
        (1 + llama_config.num_layers

--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -27,7 +27,7 @@ class TestSetting:
 test_settings = [
    # basic llama model
    TestSetting(
-        model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
+        model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
        model_args=[],
        pp_size=2,
        tp_size=2,

--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -6,7 +6,6 @@ import torch
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationLevel
 from vllm.platforms import current_platform
 import os
 from ..utils import models_path_prefix
@@ -17,14 +16,14 @@ TEST_MODELS = [
        "dtype": torch.float16,
        "quantization": "compressed-tensors"
    }),
-    # (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
+    # (os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic"), {
    #     "dtype": torch.float16,
    #     "quantization": "fp8"
    # }),
-    (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
+    (os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"), {
        "quantization": "compressed-tensors"
    }),
-    (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
+    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), {}),
 ]
 if is_quant_method_supported("aqlm"):
@@ -72,11 +71,6 @@ def check_full_graph_support(model,
    # make sure these models can be captured in full graph mode
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-    # The base meta llama uses too much memory.
-    if (model == "meta-llama/Meta-Llama-3-8B"
-            and optimization_level >= CompilationLevel.PIECEWISE):
-        return
    print(f"MODEL={model}")
    prompts = [