Merge tag 'v0.6.5' into v0.6.5-dev

4d3a2c28 · zhuwenwen · 92ec5d8e · 2d1b9baa · 4d3a2c28 · 4d3a2c28
Commit 4d3a2c28 authored Dec 30, 2024 by zhuwenwen
20 changed files
--- a/requirements-build.txt
+++ b/requirements-build.txt
 # Should be mirrored in pyproject.toml
 cmake>=3.26
 ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.4.0
+torch==2.5.1; platform_machine != 'aarch64'
 wheel
 jinja2
--- a/requirements-common.txt
+++ b/requirements-common.txt
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
-requests
+requests >= 2.26.0
 tqdm
+blake3
 py-cpuinfo
-transformers == 4.45.0  # Required for Llama 3.2.
+transformers >= 4.45.2  # Required for Llama 3.2 and Qwen2-VL.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
-fastapi < 0.113.0; python_version < '3.9'
+fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
-fastapi >= 0.114.1; python_version >= '3.9'
+fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
 aiohttp
-openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
+openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
 uvicorn[standard]
 pydantic >= 2.9  # Required for fastapi >= 0.113.0
 pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.6
+lm-format-enforcer >= 0.10.9, < 0.11
-outlines >= 0.0.43, < 0.1
+outlines == 0.1.11
+xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
-filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+filelock >= 3.16.1 # must include https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq
 msgspec
 gguf == 0.10.0
 importlib_metadata
-mistral_common >= 1.4.3
+mistral_common[opencv] >= 1.5.0
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
+compressed-tensors == 0.8.1 # required for models quantized with compressed-tensors
+depyf==0.18.0 # required for profiling and debugging torch.compile
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
 # Common dependencies
 -r requirements-common.txt
-# Dependencies for x86_64 CPUs
+# Dependencies for CPUs
-torch == 2.4.0+cpu; platform_machine != "ppc64le"
+torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" 
-torchvision; platform_machine != "ppc64le"   # required for the image processor of phi3v, this must be updated alongside torch
+torch==2.5.1; platform_machine == "aarch64"
+torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
+datasets # for benchmark scripts
\ No newline at end of file
--- a/requirements-cuda-arm64.txt
+++ b/requirements-cuda-arm64.txt
+--index-url https://download.pytorch.org/whl/nightly/cu124
+torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
+torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -3,8 +3,8 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
-nvidia-ml-py # for pynvml package
+nvidia-ml-py >= 12.560.30 # for pynvml package
-torch == 2.4.0
+torch == 2.5.1; platform_machine != 'aarch64'
 # These must be updated alongside torch
-torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
+xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
+# Common dependencies
+-r requirements-common.txt
+# Dependencies for HPU code
+ray
+triton
+pandas
+tabulate
+setuptools>=61
+setuptools-scm>=8
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
 # formatting
 yapf==0.32.0
 toml==0.10.2
-tomli==2.0.1
+tomli==2.0.2
 ruff==0.6.5
 codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5
+sphinx-lint==1.0.0
 # type checking
 mypy==1.11.1

--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
 # Common dependencies
 -r requirements-common.txt
-# OpenVINO dependencies
+torch == 2.5.1 # should be aligned with the "common" vLLM torch version
-torch >= 2.1.2
+openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
-openvino ~= 2024.3.0
-optimum-intel[openvino] >= 1.18.2
+optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
+optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -5,6 +5,7 @@
 awscli
 boto3
 botocore
+datasets
 ray >= 2.10.0
 peft
 pytest-asyncio
@@ -15,4 +16,4 @@ torch == 2.4.1
 triton == 3.0.0
 flash_attn == 2.6.1
 lmslim == 0.2.0  
 numa
\ No newline at end of file
--- a/requirements-test.in
+++ b/requirements-test.in
+# testing
+pytest
+tensorizer>=2.9.0
+pytest-forked
+pytest-asyncio
+pytest-rerunfailures
+pytest-shard
+# testing utils
+awscli
+decord # required for video tests
+einops # required for MPT, qwen-vl and Mamba
+httpx
+librosa # required for audio tests
+peft
+ray[adag]==2.40.0
+sentence-transformers # required for embedding tests
+soundfile # required for audio tests
+timm # required for internvl test
+torch==2.5.1
+transformers_stream_generator # required for qwen-vl test
+matplotlib # required for qwen-vl test
+mistral_common[opencv] >= 1.5.0 # required for pixtral test
+datamodel_code_generator # required for minicpm3 test
+lm-eval[api]==0.4.4 # required for model evaluation test
+# quantization
+bitsandbytes>=0.45.0
+buildkite-test-collector==0.1.9
+numpy < 2.0.0
--- a/requirements-test.txt
+++ b/requirements-test.txt
-# testing
+#
-pytest
+# This file is autogenerated by pip-compile with Python 3.12
-tensorizer>=2.9.0
+# by the following command:
-pytest-forked
+#
-pytest-asyncio
+#    python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
-pytest-rerunfailures
+#
-pytest-shard
+absl-py==2.1.0
-pytest-html
+    # via rouge-score
-pytest-timeout
+accelerate==1.0.1
+    # via
+    #   lm-eval
+    #   peft
+aiohappyeyeballs==2.4.3
+    # via aiohttp
+aiohttp==3.10.10
+    # via
+    #   datasets
+    #   fsspec
+    #   lm-eval
+aiosignal==1.3.1
+    # via
+    #   aiohttp
+    #   ray
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.6.2.post1
+    # via httpx
+argcomplete==3.5.1
+    # via datamodel-code-generator
+attrs==24.2.0
+    # via
+    #   aiohttp
+    #   jsonlines
+    #   jsonschema
+    #   referencing
+audioread==3.0.1
+    # via librosa
+awscli==1.35.23
+    # via -r requirements-test.in
+bitsandbytes>=0.45.0
+    # via -r requirements-test.in
+black==24.10.0
+    # via datamodel-code-generator
+boto3==1.35.57
+    # via tensorizer
+botocore==1.35.57
+    # via
+    #   awscli
+    #   boto3
+    #   s3transfer
+buildkite-test-collector==0.1.9
+    # via -r requirements-test.in
+certifi==2024.8.30
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==1.17.1
+    # via soundfile
+chardet==5.2.0
+    # via mbstrdecoder
+charset-normalizer==3.4.0
+    # via requests
+click==8.1.7
+    # via
+    #   black
+    #   nltk
+    #   ray
+colorama==0.4.6
+    # via
+    #   awscli
+    #   sacrebleu
+    #   tqdm-multiprocess
+contourpy==1.3.0
+    # via matplotlib
+cupy-cuda12x==13.3.0
+    # via ray
+cycler==0.12.1
+    # via matplotlib
+datamodel-code-generator==0.26.3
+    # via -r requirements-test.in
+dataproperty==1.0.1
+    # via
+    #   pytablewriter
+    #   tabledata
+datasets==3.0.2
+    # via
+    #   evaluate
+    #   lm-eval
+decorator==5.1.1
+    # via librosa
+decord==0.6.0
+    # via -r requirements-test.in
+dill==0.3.8
+    # via
+    #   datasets
+    #   evaluate
+    #   lm-eval
+    #   multiprocess
+dnspython==2.7.0
+    # via email-validator
+docutils==0.16
+    # via awscli
+einops==0.8.0
+    # via -r requirements-test.in
+email-validator==2.2.0
+    # via pydantic
+evaluate==0.4.3
+    # via lm-eval
+fastrlock==0.8.2
+    # via cupy-cuda12x
+filelock==3.16.1
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   ray
+    #   torch
+    #   transformers
+    #   triton
+fonttools==4.54.1
+    # via matplotlib
+frozenlist==1.5.0
+    # via
+    #   aiohttp
+    #   aiosignal
+    #   ray
+fsspec[http]==2024.9.0
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   torch
+genson==1.3.0
+    # via datamodel-code-generator
+h11==0.14.0
+    # via httpcore
+hiredis==3.0.0
+    # via tensorizer
+httpcore==1.0.6
+    # via httpx
+httpx==0.27.2
+    # via -r requirements-test.in
+huggingface-hub==0.26.2
+    # via
+    #   accelerate
+    #   datasets
+    #   evaluate
+    #   peft
+    #   sentence-transformers
+    #   timm
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via
+    #   anyio
+    #   email-validator
+    #   httpx
+    #   requests
+    #   yarl
+inflect==5.6.2
+    # via datamodel-code-generator
+iniconfig==2.0.0
+    # via pytest
+isort==5.13.2
+    # via datamodel-code-generator
+jinja2==3.1.4
+    # via
+    #   datamodel-code-generator
+    #   torch
+jmespath==1.0.1
+    # via
+    #   boto3
+    #   botocore
+joblib==1.4.2
+    # via
+    #   librosa
+    #   nltk
+    #   scikit-learn
+jsonlines==4.0.0
+    # via lm-eval
+jsonschema==4.23.0
+    # via
+    #   mistral-common
+    #   ray
+jsonschema-specifications==2024.10.1
+    # via jsonschema
+kiwisolver==1.4.7
+    # via matplotlib
+lazy-loader==0.4
+    # via librosa
+libnacl==2.1.0
+    # via tensorizer
+librosa==0.10.2.post1
+    # via -r requirements-test.in
+llvmlite==0.43.0
+    # via numba
+lm-eval[api]==0.4.4
+    # via -r requirements-test.in
+lxml==5.3.0
+    # via sacrebleu
+markupsafe==3.0.2
+    # via jinja2
+matplotlib==3.9.2
+    # via -r requirements-test.in
+mbstrdecoder==1.1.3
+    # via
+    #   dataproperty
+    #   pytablewriter
+    #   typepy
+mistral-common[opencv]==1.5.1
+    # via
+    #   -r requirements-test.in
+    #   mistral-common
+more-itertools==10.5.0
+    # via lm-eval
+mpmath==1.3.0
+    # via sympy
+msgpack==1.1.0
+    # via
+    #   librosa
+    #   ray
+multidict==6.1.0
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.16
+    # via
+    #   datasets
+    #   evaluate
+mypy-extensions==1.0.0
+    # via black
+networkx==3.2.1
+    # via torch
+nltk==3.9.1
+    # via rouge-score
+numba==0.60.0
+    # via librosa
+numexpr==2.10.1
+    # via lm-eval
+numpy==1.26.4
+    # via
+    #   -r requirements-test.in
+    #   accelerate
+    #   bitsandbytes
+    #   contourpy
+    #   cupy-cuda12x
+    #   datasets
+    #   decord
+    #   evaluate
+    #   librosa
+    #   matplotlib
+    #   mistral-common
+    #   numba
+    #   numexpr
+    #   opencv-python-headless
+    #   pandas
+    #   peft
+    #   rouge-score
+    #   sacrebleu
+    #   scikit-learn
+    #   scipy
+    #   soxr
+    #   tensorizer
+    #   torchvision
+    #   transformers
+nvidia-cublas-cu12==12.4.5.8
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.4.127
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.4.127
+    # via torch
+nvidia-cuda-runtime-cu12==12.4.127
+    # via torch
+nvidia-cudnn-cu12==9.1.0.70
+    # via torch
+nvidia-cufft-cu12==11.2.1.3
+    # via torch
+nvidia-curand-cu12==10.3.5.147
+    # via torch
+nvidia-cusolver-cu12==11.6.1.9
+    # via torch
+nvidia-cusparse-cu12==12.3.1.170
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.21.5
+    # via torch
+nvidia-nvjitlink-cu12==12.4.127
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvtx-cu12==12.4.127
+    # via torch
+opencv-python-headless==4.10.0.84
+    # via mistral-common
+packaging==24.1
+    # via
+    #   accelerate
+    #   black
+    #   datamodel-code-generator
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   lazy-loader
+    #   matplotlib
+    #   peft
+    #   pooch
+    #   pytest
+    #   pytest-rerunfailures
+    #   ray
+    #   transformers
+    #   typepy
+pandas==2.2.3
+    # via
+    #   datasets
+    #   evaluate
+pathspec==0.12.1
+    # via black
+pathvalidate==3.2.1
+    # via pytablewriter
+peft==0.13.2
+    # via
+    #   -r requirements-test.in
+    #   lm-eval
+pillow==10.4.0
+    # via
+    #   matplotlib
+    #   mistral-common
+    #   sentence-transformers
+    #   torchvision
+platformdirs==4.3.6
+    # via
+    #   black
+    #   pooch
+pluggy==1.5.0
+    # via pytest
+pooch==1.8.2
+    # via librosa
+portalocker==2.10.1
+    # via sacrebleu
+propcache==0.2.0
+    # via yarl
+protobuf==5.28.3
+    # via
+    #   ray
+    #   tensorizer
+psutil==6.1.0
+    # via
+    #   accelerate
+    #   peft
+    #   tensorizer
+py==1.11.0
+    # via pytest-forked
+pyarrow==18.0.0
+    # via datasets
+pyasn1==0.6.1
+    # via rsa
+pybind11==2.13.6
+    # via lm-eval
+pycparser==2.22
+    # via cffi
+pydantic[email]==2.9.2
+    # via
+    #   datamodel-code-generator
+    #   mistral-common
+pydantic-core==2.23.4
+    # via pydantic
+pyparsing==3.2.0
+    # via matplotlib
+pytablewriter==1.2.0
+    # via lm-eval
+pytest==8.3.3
+    # via
+    #   -r requirements-test.in
+    #   buildkite-test-collector
+    #   pytest-asyncio
+    #   pytest-forked
+    #   pytest-rerunfailures
+    #   pytest-shard
+pytest-asyncio==0.24.0
+    # via -r requirements-test.in
+pytest-forked==1.6.0
+    # via -r requirements-test.in
+pytest-rerunfailures==14.0
+    # via -r requirements-test.in
+pytest-shard==0.1.2
+    # via -r requirements-test.in
+python-dateutil==2.9.0.post0
+    # via
+    #   botocore
+    #   matplotlib
+    #   pandas
+    #   typepy
+pytz==2024.2
+    # via
+    #   pandas
+    #   typepy
+pyyaml==6.0.2
+    # via
+    #   accelerate
+    #   awscli
+    #   datamodel-code-generator
+    #   datasets
+    #   huggingface-hub
+    #   peft
+    #   ray
+    #   timm
+    #   transformers
+ray[adag]==2.40.0
+    # via -r requirements-test.in
+redis==5.2.0
+    # via tensorizer
+referencing==0.35.1
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+regex==2024.9.11
+    # via
+    #   nltk
+    #   sacrebleu
+    #   tiktoken
+    #   transformers
+requests==2.32.3
+    # via
+    #   buildkite-test-collector
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   lm-eval
+    #   mistral-common
+    #   pooch
+    #   ray
+    #   tiktoken
+    #   transformers
+rouge-score==0.1.2
+    # via lm-eval
+rpds-py==0.20.1
+    # via
+    #   jsonschema
+    #   referencing
+rsa==4.7.2
+    # via awscli
+s3transfer==0.10.3
+    # via
+    #   awscli
+    #   boto3
+sacrebleu==2.4.3
+    # via lm-eval
+safetensors==0.4.5
+    # via
+    #   accelerate
+    #   peft
+    #   timm
+    #   transformers
+scikit-learn==1.5.2
+    # via
+    #   librosa
+    #   lm-eval
+    #   sentence-transformers
+scipy==1.13.1
+    # via
+    #   librosa
+    #   scikit-learn
+    #   sentence-transformers
+sentence-transformers==3.2.1
+    # via -r requirements-test.in
+sentencepiece==0.2.0
+    # via mistral-common
+six==1.16.0
+    # via
+    #   python-dateutil
+    #   rouge-score
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   httpx
+soundfile==0.12.1
+    # via
+    #   -r requirements-test.in
+    #   librosa
+soxr==0.5.0.post1
+    # via librosa
+sqlitedict==2.1.0
+    # via lm-eval
+sympy==1.13.1
+    # via torch
+tabledata==1.3.3
+    # via pytablewriter
+tabulate==0.9.0
+    # via sacrebleu
+tcolorpy==0.1.6
+    # via pytablewriter
+tenacity==9.0.0
+    # via lm-eval
+tensorizer==2.9.0
+    # via -r requirements-test.in
+threadpoolctl==3.5.0
+    # via scikit-learn
+tiktoken==0.7.0
+    # via
+    #   lm-eval
+    #   mistral-common
+timm==1.0.11
+    # via -r requirements-test.in
+tokenizers==0.21.0
+    # via transformers
+torch==2.5.1
+    # via
+    #   -r requirements-test.in
+    #   accelerate
+    #   bitsandbytes
+    #   lm-eval
+    #   peft
+    #   sentence-transformers
+    #   tensorizer
+    #   timm
+    #   torchvision
+torchvision==0.20.1
+    # via timm
+tqdm==4.66.6
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   lm-eval
+    #   nltk
+    #   peft
+    #   sentence-transformers
+    #   tqdm-multiprocess
+    #   transformers
+tqdm-multiprocess==0.0.11
+    # via lm-eval
+transformers==4.47.0
+    # via
+    #   lm-eval
+    #   peft
+    #   sentence-transformers
+    #   transformers-stream-generator
+transformers-stream-generator==0.0.5
+    # via -r requirements-test.in
+triton==3.1.0
+    # via torch
+typepy[datetime]==1.3.2
+    # via
+    #   dataproperty
+    #   pytablewriter
+    #   tabledata
+typing-extensions==4.12.2
+    # via
+    #   huggingface-hub
+    #   librosa
+    #   mistral-common
+    #   pydantic
+    #   pydantic-core
+    #   torch
+tzdata==2024.2
+    # via pandas
+urllib3==1.26.20
+    # via
+    #   botocore
+    #   requests
+word2number==1.1
+    # via lm-eval
+xxhash==3.5.0
+    # via
+    #   datasets
+    #   evaluate
+yarl==1.17.1
+    # via aiohttp
+zstandard==0.23.0
+    # via lm-eval
-# testing utils
+# The following packages are considered to be unsafe in a requirements file:
-awscli
+# setuptools
-einops # required for MPT, qwen-vl and Mamba
\ No newline at end of file
-httpx
-librosa # required for audio test
-opencv-python # required for video test
-peft
-requests
-ray[adag]==2.35
-sentence-transformers # required for embedding
-soundfile # required for audio test
-compressed-tensors==0.4.0 # required for compressed-tensors
-timm # required for internvl test
-transformers_stream_generator # required for qwen-vl test
-matplotlib # required for qwen-vl test
-datamodel_code_generator # required for minicpm3 test
-# TODO: Add this after fully implementing llava(mantis)
-# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
-# Benchmarking
-aiohttp
-# quantization
-# bitsandbytes>=0.44.0
-buildkite-test-collector==0.1.8
--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -2,6 +2,22 @@
 -r requirements-common.txt
 # Dependencies for TPU
-# Currently, the TPU backend uses a nightly version of PyTorch XLA.
+cmake>=3.26
-# You can install the dependencies in Dockerfile.tpu.
+ninja
+packaging
+setuptools-scm>=8
+wheel
+jinja2
 ray[default]
+# Install torch_xla
+--pre
+--extra-index-url https://download.pytorch.org/whl/nightly/cpu
+--find-links https://storage.googleapis.com/libtpu-releases/index.html
+--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
+--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+torch==2.6.0.dev20241126+cpu
+torchvision==0.20.0.dev20241126+cpu
+torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl
+jaxlib==0.4.36.dev20241122
+jax==0.4.36.dev20241122
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
 # Common dependencies
 -r requirements-common.txt
-setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
 ray >= 2.9
-# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+cmake>=3.26
-torch == 2.3.1+cxx11.abi
+ninja
-intel-extension-for-pytorch == 2.3.110+xpu
+packaging
-oneccl_bind_pt == 2.3.100+xpu
+setuptools-scm>=8
+wheel
+jinja2
+torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl
-triton-xpu == 3.0.0b2
+triton-xpu == 3.0.0b1
--- a/setup.py
+++ b/setup.py
 import importlib.util
-import io
 import logging
 import os
 import re
@@ -63,12 +62,6 @@ def is_ninja_available() -> bool:
    return which("ninja") is not None
-def remove_prefix(text, prefix):
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
 class CMakeExtension(Extension):
    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
@@ -164,6 +157,14 @@ class cmake_build_ext(build_ext):
        # on subsequent calls to python.
        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
+        # Override the base directory for FetchContent downloads to $ROOT/.deps
+        # This allows sharing dependencies between profiles,
+        # and plays more nicely with sccache.
+        # To override this, set the FETCHCONTENT_BASE_DIR environment variable.
+        fc_base_dir = os.path.join(ROOT_DIR, ".deps")
+        fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
+        cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
        #
        # Setup parallelism and build tool
        #
@@ -197,8 +198,10 @@ class cmake_build_ext(build_ext):
            os.makedirs(self.build_temp)
        targets = []
-        target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."),
-                                              "vllm_flash_attn.")
+        def target_name(s: str) -> str:
+            return s.removeprefix("vllm.").removeprefix("vllm_flash_attn.")
        # Build all the extensions
        for ext in self.extensions:
            self.configure(ext)
@@ -253,6 +256,92 @@ class cmake_build_ext(build_ext):
            self.copy_file(file, dst_file)
+class repackage_wheel(build_ext):
+    """Extracts libraries and other files from an existing wheel.
+
+    The wheel is taken from the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment
+    variable, which may be a local file path or a URL; when it is unset,
+    `default_wheel` is used.
+    """
+    default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    def run(self) -> None:
+        """Locate (or download) the wheel and extract the selected members.
+
+        Extracted non-Python files are registered in the module-level
+        `package_data` dict under their package name.
+
+        Raises:
+            SetupError: if the wheel cannot be downloaded.
+        """
+        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
+                                   self.default_wheel)
+        assert _is_cuda(
+        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+        import contextlib
+        import tempfile
+        import zipfile
+        files_to_copy = frozenset((
+            "vllm/_C.abi3.so",
+            "vllm/_moe_C.abi3.so",
+            "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
+            "vllm/vllm_flash_attn/flash_attn_interface.py",
+            "vllm/vllm_flash_attn/__init__.py",
+            # "vllm/_version.py", # not available in nightly wheels yet
+        ))
+        # The ExitStack owns the temporary download directory (if any), so the
+        # downloaded wheel is removed once extraction has finished or failed.
+        with contextlib.ExitStack() as stack:
+            if os.path.isfile(wheel_location):
+                wheel_path = wheel_location
+                print(f"Using existing wheel={wheel_path}")
+            else:
+                # Download the wheel from a given URL, assume
+                # the filename is the last part of the URL
+                wheel_filename = wheel_location.split("/")[-1]
+                # create a temporary directory to store the wheel
+                temp_dir = stack.enter_context(
+                    tempfile.TemporaryDirectory(prefix="vllm-wheels"))
+                wheel_path = os.path.join(temp_dir, wheel_filename)
+                print(
+                    f"Downloading wheel from {wheel_location} to {wheel_path}")
+                from urllib.request import urlretrieve
+                try:
+                    urlretrieve(wheel_location, filename=wheel_path)
+                except Exception as e:
+                    from setuptools.errors import SetupError
+                    raise SetupError(
+                        f"Failed to get vLLM wheel from {wheel_location}"
+                    ) from e
+            with zipfile.ZipFile(wheel_path) as wheel:
+                file_members = [
+                    member for member in wheel.filelist
+                    if member.filename in files_to_copy
+                ]
+                for file in file_members:
+                    print(f"Extracting and including {file.filename} "
+                          "from existing wheel")
+                    package_name = os.path.dirname(file.filename).replace(
+                        "/", ".")
+                    file_name = os.path.basename(file.filename)
+                    if package_name not in package_data:
+                        package_data[package_name] = []
+                    # Extracts relative to the current working directory.
+                    wheel.extract(file)
+                    if file_name.endswith(".py"):
+                        # python files shouldn't be added to package_data
+                        continue
+                    package_data[package_name].append(file_name)
+def _is_hpu() -> bool:
+    """Report whether an HPU appears to be present or is the build target.
+
+    Probes, in order: a successful `hl-smi` run, the existence of one of the
+    `/dev/accel` device nodes, and finally a loaded `habanalabs` kernel
+    module. If none of these succeed, the result is whether
+    `VLLM_TARGET_DEVICE` equals "hpu".
+    """
+    try:
+        subprocess.run(["hl-smi"], capture_output=True, check=True)
+    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
+        hl_smi_ok = False
+    else:
+        hl_smi_ok = True
+    if hl_smi_ok:
+        return True
+    device_nodes = ('/dev/accel/accel0', '/dev/accel/accel_controlD0')
+    if any(os.path.exists(node) for node in device_nodes):
+        return True
+    # last resort...
+    try:
+        module_count = subprocess.check_output(
+            'lsmod | grep habanalabs | wc -l', shell=True)
+        driver_loaded = int(module_count) > 0
+    except (ValueError, FileNotFoundError, PermissionError,
+            subprocess.CalledProcessError):
+        driver_loaded = False
+    return driver_loaded or VLLM_TARGET_DEVICE == "hpu"
 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"
@@ -260,7 +349,7 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu()))
+            and not (_is_neuron() or _is_tpu() or _is_hpu()))
 def _is_hip() -> bool:
@@ -297,10 +386,6 @@ def _build_custom_ops() -> bool:
    return _is_cuda() or _is_hip() or _is_cpu()
-def _build_core_ext() -> bool:
-    return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
 def get_hipcc_rocm_version():
    # Run the hipcc --version command
    result = subprocess.run(['hipcc', '--version'],
@@ -330,7 +415,7 @@ def get_neuronxcc_version():
                                "__init__.py")
    # Read the contents of the version file
-    with open(version_file, "rt") as fp:
+    with open(version_file) as fp:
        content = fp.read()
    # Extract the version using a regular expression
@@ -339,7 +424,7 @@ def get_neuronxcc_version():
        # Return the version string
        return match.group(1)
    else:
-        raise RuntimeError("Could not find HIP version in the output")
+        raise RuntimeError("Could not find Neuron version in the output")
 def get_nvcc_cuda_version() -> Version:
@@ -375,15 +460,15 @@ def get_version_add(sha: Optional[str] = None) -> str:
        if sha != 'Unknown':
            if sha is None:
                sha = get_sha(vllm_root)
-            if (major, minor) == ('2', '3'):
+            # if (major, minor) == ('2', '3'):
-                version = 'das.opt1.' + sha[:7]
+            #     version = 'das.opt1.' + sha[:7]
            if (major, minor) == ('2', '4'):
-                version = 'das.opt2.' + sha[:7]
+                version = 'das.opt1.' + sha[:7]
    else:
-        if (major, minor) == ('2', '3'):
+        # if (major, minor) == ('2', '3'):
-            version = 'das.opt1'
+        #     version = 'das.opt1'
        if (major, minor) == ('2', '4'):
-            version = 'das.opt2'
+            version = 'das.opt1'
    # dtk version
@@ -397,9 +482,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    new_version_content = f"""
 try:
-    __version__ = "0.6.2"
+    __version__ = "0.6.5"
-    __version_tuple__ = (0, 6, 2)
+    __version_tuple__ = (0, 6, 5)
-    __hcu_version__ = f'0.6.2+{version}' 
+    __hcu_version__ = f'0.6.5+{version}' 
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
@@ -423,6 +508,22 @@ def get_version():
    return locals()['__hcu_version__']
+def get_gaudi_sw_version():
+    """
+    Returns the version string parsed from the `hl-smi` output.
+
+    The value is taken from the third line of the output. "0.0.0" is
+    returned when `hl-smi` cannot be run, exits with a non-zero status,
+    prints nothing, or prints output that does not have the expected layout.
+    """
+    # Enable console printing for `hl-smi` check. The variable is added on
+    # top of the inherited environment so PATH and library paths are kept.
+    env = {**os.environ, "ENABLE_CONSOLE": "true"}
+    try:
+        output = subprocess.run(["hl-smi"],
+                                text=True,
+                                capture_output=True,
+                                env=env)
+    except (FileNotFoundError, PermissionError):
+        return "0.0.0"  # when hl-smi is not available
+    if output.returncode == 0 and output.stdout:
+        try:
+            return output.stdout.split("\n")[2].replace(
+                " ", "").split(":")[1][:-1].split("-")[0]
+        except IndexError:
+            # Unexpected output layout: use the same fallback as a missing
+            # hl-smi instead of aborting the build.
+            pass
+    return "0.0.0"  # when hl-smi is not available
 def get_vllm_version() -> str:
    if not _is_hip():
        version = get_version(
@@ -435,12 +536,15 @@ def get_vllm_version() -> str:
        if envs.VLLM_TARGET_DEVICE == "empty":
            version += f"{sep}empty"
    elif _is_cuda():
-        cuda_version = str(get_nvcc_cuda_version())
+        if envs.VLLM_USE_PRECOMPILED:
-        if cuda_version != MAIN_CUDA_VERSION:
+            version += ".precompiled"
-            cuda_version_str = cuda_version.replace(".", "")[:3]
+        else:
-            # skip this for source tarball, required for pypi
+            cuda_version = str(get_nvcc_cuda_version())
-            if "sdist" not in sys.argv:
+            if cuda_version != MAIN_CUDA_VERSION:
-                version += f"{sep}cu{cuda_version_str}"
+                cuda_version_str = cuda_version.replace(".", "")[:3]
+                # skip this for source tarball, required for pypi
+                if "sdist" not in sys.argv:
+                    version += f"{sep}cu{cuda_version_str}"
    elif _is_hip():
        # Get the HIP version
        # hipcc_version = get_hipcc_rocm_version()
@@ -454,6 +558,12 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"{sep}neuron{neuron_version_str}"
+    elif _is_hpu():
+        # Get the Intel Gaudi Software Suite version
+        gaudi_sw_version = str(get_gaudi_sw_version())
+        if gaudi_sw_version != MAIN_CUDA_VERSION:
+            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
+            version += f"{sep}gaudi{gaudi_sw_version}"
    elif _is_openvino():
        version += f"{sep}openvino"
    elif _is_tpu():
@@ -472,7 +582,8 @@ def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
-        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+        with open(get_path("README.md"), encoding="utf-8") as f:
+            return f.read()
    else:
        return ""
@@ -487,6 +598,8 @@ def get_requirements() -> List[str]:
        for line in requirements:
            if line.startswith("-r "):
                resolved_requirements += _read_requirements(line.split()[1])
+            elif line.startswith("--"):
+                continue
            else:
                resolved_requirements.append(line)
        return resolved_requirements
@@ -509,6 +622,8 @@ def get_requirements() -> List[str]:
        requirements = _read_requirements("requirements-rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("requirements-neuron.txt")
+    elif _is_hpu():
+        requirements = _read_requirements("requirements-hpu.txt")
    elif _is_openvino():
        requirements = _read_requirements("requirements-openvino.txt")
    elif _is_tpu():
@@ -519,16 +634,13 @@ def get_requirements() -> List[str]:
        requirements = _read_requirements("requirements-xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, "
+            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
            "OpenVINO, or CPU.")
    return requirements
 ext_modules = []
-if _build_core_ext():
-    ext_modules.append(CMakeExtension(name="vllm._core_C"))
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
@@ -545,13 +657,18 @@ if _build_custom_ops():
 package_data = {
    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py","model_executor/layers/quantization/configs/*.json"]
 }
-if envs.VLLM_USE_PRECOMPILED:
-    ext_modules = []
-    package_data["vllm"].append("*.so")
 if _no_device():
    ext_modules = []
+if not ext_modules:
+    cmdclass = {}
+else:
+    cmdclass = {
+        "build_ext":
+        repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+    }
 setup(
    name="vllm",
    version=get_vllm_version(),
@@ -567,25 +684,28 @@ setup(
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
-        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "License :: OSI Approved :: Apache Software License",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Information Technology",
+        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Scientific/Engineering :: Information Analysis",
    ],
    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                    "tests*")),
-    python_requires=">=3.8",
+    python_requires=">=3.9",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
-        "video": ["opencv-python"],  # Required for video processing
+        "audio": ["librosa", "soundfile"],  # Required for audio processing
-        "audio": ["librosa", "soundfile"]  # Required for audio processing
+        "video": ["decord"]  # Required for video processing
    },
-    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
+    cmdclass=cmdclass,
    package_data=package_data,
    entry_points={
        "console_scripts": [

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -12,11 +12,11 @@ import torch
 from vllm import SamplingParams
 from vllm.config import ParallelConfig
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
 from vllm.outputs import RequestOutput as RealRequestOutput
 from vllm.sampling_params import RequestOutputKind
-from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
 import os
 from ..utils import models_path_prefix
@@ -87,17 +87,19 @@ class MockAsyncLLMEngine(AsyncLLMEngine):
 @pytest.mark.asyncio
 async def test_new_requests_event():
+    params = SamplingParams()
    engine = MockAsyncLLMEngine()
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0
-    await engine.add_request("1", "", None)
+    await engine.add_request("1", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 1
    assert engine.engine.step_calls == 1
-    await engine.add_request("2", "", None)
+    await engine.add_request("2", "", params)
    engine.engine.generate("2")
    await asyncio.sleep(0)
    await asyncio.sleep(0)
@@ -112,7 +114,7 @@ async def test_new_requests_event():
    await asyncio.sleep(0.001)
    assert engine.engine.step_calls == old_step_calls
-    await engine.add_request("3", "", None)
+    await engine.add_request("3", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
@@ -156,7 +158,7 @@ async def async_engine():
        engine.shutdown_background_loop()
        del engine
        await asyncio.sleep(0.1)
-        cleanup()
+        cleanup_dist_env_and_memory()
 @pytest.fixture()

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -11,22 +11,31 @@ from unittest.mock import patch
 import pytest
 from vllm import LLM
-from vllm.utils import is_hip
+from vllm.platforms import current_platform
 from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
+from ..conftest import VllmRunner
 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 import os
 from ..utils import models_path_prefix
 MODELS = [
-    os.path.join(models_path_prefix, "facebook/opt-125m"),
+    os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
-    os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
 ]
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """Autouse fixture that requests ``run_with_both_engines`` for each test.
+
+    The fixture body itself does nothing; its only effect is to depend on
+    ``run_with_both_engines`` so that fixture is applied to every test in
+    this module.
+    """
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
    llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"))
@@ -37,6 +46,7 @@ def test_vllm_gc_ed():
    assert weak_llm() is None
+@pytest.mark.skip_v1
 @pytest.mark.parametrize("model", MODELS)
 # @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@@ -45,8 +55,6 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("enforce_eager", [False, True])
 def test_models(
    hf_runner,
-    vllm_runner,
-    example_prompts,
    model: str,
    backend: str,
    dtype: str,
@@ -54,18 +62,30 @@ def test_models(
    enforce_eager: bool,
 ) -> None:
-    if backend == "FLASHINFER" and is_hip():
+    if backend == "FLASHINFER" and current_platform.is_rocm():
        pytest.skip("Flashinfer does not support ROCm/HIP.")
+    if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
+        pytest.skip(
+            "XFORMERS does not support gemma2 with full context length.")
    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    # 5042 tokens for gemma2
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    with vllm_runner(model,
+    with VllmRunner(model,
-                     dtype=dtype,
+                    max_model_len=8192,
-                     enforce_eager=enforce_eager,
+                    dtype=dtype,
-                     gpu_memory_utilization=0.7) as vllm_model:
+                    enforce_eager=enforce_eager,
+                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    check_outputs_equal(
@@ -110,6 +130,11 @@ def test_models(
 #     if attention_backend:
 #         os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+#     # Import VLLM_USE_V1 dynamically to handle patching
+#     from vllm.envs import VLLM_USE_V1
+#     if VLLM_USE_V1 and distributed_executor_backend != "mp":
+#         pytest.skip(f"Skip {distributed_executor_backend} for V1")
 #     dtype = "half"
 #     max_tokens = 5
@@ -135,6 +160,7 @@ def test_models(
 #     )
+@pytest.mark.skip_v1
 def test_model_with_failure(vllm_runner) -> None:
    try:
        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
@@ -159,3 +185,30 @@ def test_model_with_failure(vllm_runner) -> None:
                          ModelInputForGPUWithSamplingMetadata)
    finally:
        os.remove(filename)
+@pytest.mark.skip_v1
+def test_failure_with_async_out_proc(vllm_runner) -> None:
+    """Check that a failing forward pass reports a dumped-input file.
+
+    The OPT ``forward`` method is patched to raise ``ValueError``. The test
+    asserts that async output processing is enabled on the engine, that
+    generation raises ``ValueError``, and that the error message contains
+    ``input dumped to <path>.pkl``. The named file is removed afterwards.
+    """
+    # Resolve the model through models_path_prefix like the other tests in
+    # this module do.
+    model = os.path.join(models_path_prefix, "facebook/opt-125m")
+    filename = None
+    try:
+        with vllm_runner(model,
+                         dtype="half",
+                         enforce_eager=False,
+                         gpu_memory_utilization=0.7) as vllm_model,\
+             patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
+                       side_effect=ValueError()):
+            model_config = vllm_model.model.llm_engine.model_config
+            assert model_config.use_async_output_proc
+            with pytest.raises(ValueError) as exc_info:
+                vllm_model.generate_greedy('how to make pizza?', 250)
+            # The dot is escaped so only a literal ".pkl" suffix matches.
+            matches = re.search(r"input dumped to (.+)\.pkl",
+                                str(exc_info.value))
+            assert matches is not None
+            filename = f"{matches.group(1)}.pkl"
+    finally:
+        # Only remove the dump file if the error message actually named one.
+        if filename is not None:
+            os.remove(filename)
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -11,6 +11,9 @@ from contextlib import nullcontext
 import pytest
+from tests.kernels.utils import override_backend_env_variable
+from vllm.platforms import current_platform
 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
 import os
@@ -18,7 +21,7 @@ from ..utils import models_path_prefix
 MODELS = [
    os.path.join(models_path_prefix, "facebook/opt-125m"),
-    os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
 ]
@@ -30,6 +33,7 @@ MODELS = [
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
    hf_runner,
    vllm_runner,
@@ -40,11 +44,15 @@ def test_models(
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
+    attention_backend: str,
+    monkeypatch,
 ) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.
    """
+    override_backend_env_variable(monkeypatch, attention_backend)
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
@@ -73,13 +81,18 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 @pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
+    attention_backend: str,
+    monkeypatch,
 ) -> None:
+    override_backend_env_variable(monkeypatch, attention_backend)
    if (model == "meta-llama/Llama-2-7b-hf"
            and distributed_executor_backend == "ray"):
        # test ray adag
@@ -193,17 +206,17 @@ def test_models_distributed(
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("use_v2_block_manager", [False, True])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
+@pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
    vllm_runner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
-    use_v2_block_manager: bool,
    tensor_parallel_size: int,
+    dtype: str,
 ) -> None:
    """
    Checks exact match decode with and without prefix caching
@@ -225,12 +238,11 @@ def test_with_prefix_caching(
    for enable in (True, False):
        with vllm_runner(
                model,
-                dtype="half",
+                dtype=dtype,
                max_num_batched_tokens=max_num_batched_tokens,
                enable_chunked_prefill=True,
                enable_prefix_caching=enable,
                tensor_parallel_size=tensor_parallel_size,
-                use_v2_block_manager=use_v2_block_manager,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:
@@ -253,3 +265,61 @@ def test_with_prefix_caching(
            name_0="w/o prefix caching",
            name_1="with prefix caching",
        )
+@pytest.mark.parametrize(
+    "model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+@pytest.mark.parametrize("enforce_eager", [False])
+@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
+def test_models_cpu(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+    enforce_eager: bool,
+    attention_backend: str,
+    monkeypatch,
+) -> None:
+    """CPU-only variant of ``test_models``.
+
+    Skipped unless ``current_platform.is_cpu()`` is true. All arguments are
+    forwarded positionally to ``test_models``; the model path is resolved
+    through ``models_path_prefix`` like the ``MODELS`` list in this module.
+    """
+    test_models(
+        hf_runner,
+        vllm_runner,
+        example_prompts,
+        model,
+        dtype,
+        max_tokens,
+        chunked_prefill_token_size,
+        enforce_eager,
+        1,  # tensor_parallel_size
+        attention_backend,
+        monkeypatch,
+    )
+@pytest.mark.parametrize("max_tokens", [16])
+@pytest.mark.parametrize("enforce_eager", [False])
+@pytest.mark.parametrize("chunk_size", [30, 32])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
+def test_with_prefix_caching_cpu(
+    vllm_runner,
+    max_tokens: int,
+    enforce_eager: bool,
+    chunk_size: int,
+    dtype: str,
+) -> None:
+    """CPU-only variant of ``test_with_prefix_caching``.
+
+    Skipped unless ``current_platform.is_cpu()`` is true. Delegates to
+    ``test_with_prefix_caching`` with ``tensor_parallel_size`` fixed at 1.
+    """
+    test_with_prefix_caching(
+        vllm_runner=vllm_runner,
+        max_tokens=max_tokens,
+        enforce_eager=enforce_eager,
+        chunk_size=chunk_size,
+        tensor_parallel_size=1,
+        dtype=dtype,
+    )
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -3,5 +3,5 @@ from ..utils import compare_two_settings, models_path_prefix
 def test_cpu_offload():
+    """Run ``compare_two_settings`` on the Llama-3.2-1B model path.
+
+    The first setting is an empty argument list; the second passes
+    ``--cpu-offload-gb 1``. NOTE(review): presumably the helper checks that
+    both settings produce matching results -- confirm against ``..utils``.
+    """
-    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), [],
+    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), [],
-                         ["--cpu-offload-gb", "4"])
+                         ["--cpu-offload-gb", "1"])
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -26,9 +26,9 @@ MODELS = [
 @pytest.fixture(scope="module", autouse=True)
 def check_settings():
+    """Module-scoped autouse guard for the artificial-preemption setting.
+
+    Asserts that ``ENABLE_ARTIFICIAL_PREEMPT`` is ``True`` and, if not,
+    fails with a message showing the environment variable to set and the
+    command to run.
+    """
    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
-        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
-        "tests/basic_correctness/test_preemption.py`")
+        "pytest tests/basic_correctness/test_preemption.py`")
 @pytest.fixture
@@ -139,113 +139,6 @@ def test_preemption(
    assert total_preemption == total_recorded_preemption
-@pytest.mark.parametrize("model", MODELS)
-# @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [96])
-@pytest.mark.parametrize("beam_width", [4])
-def test_swap(
-    caplog_vllm,
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    beam_width: int,
-    worker_use_ray: bool,
-) -> None:
-    """Use beam search enables swapping."""
-    example_prompts = example_prompts[:1]
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            swap_space=10,
-            disable_log_stats=False,
-            worker_use_ray=worker_use_ray,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
-                                                       beam_width, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
-        total_preemption = (
-            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
-    for i in range(len(example_prompts)):
-        hf_output_ids, _ = hf_outputs[i]
-        vllm_output_ids, _ = vllm_outputs[i]
-        assert len(hf_output_ids) == len(vllm_output_ids)
-        for j in range(len(hf_output_ids)):
-            assert hf_output_ids[j] == vllm_output_ids[j], (
-                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
-                f"vLLM: {vllm_output_ids}")
-    assert ("is preempted by PreemptionMode.SWAP mode because there "
-            "is not enough KV cache space." in caplog_vllm.text)
-    # Ensure the count bucket of request-level histogram metrics matches
-    # the number of requests as a simple sanity check to ensure metrics are
-    # generated
-    preemption_metrics = None
-    for m in REGISTRY.collect():
-        if m.name == "vllm:num_preemptions":
-            preemption_metrics = m
-    assert preemption_metrics is not None
-    total_recorded_preemption = 0
-    for sample in preemption_metrics.samples:
-        total_recorded_preemption += sample.value
-    assert total_preemption == total_recorded_preemption
-@pytest.mark.parametrize("model", MODELS)
-# @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [96])
-@pytest.mark.parametrize("beam_width", [4])
-def test_swap_infeasible(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    beam_width: int,
-    worker_use_ray: bool,
-) -> None:
-    """Verify infeasible swap request will be ignored."""
-    BLOCK_SIZE = 16
-    prefill_blocks = 2
-    decode_blocks = max_tokens // BLOCK_SIZE
-    example_prompts = example_prompts[:1]
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            swap_space=10,
-            block_size=BLOCK_SIZE,
-            # Since beam search have more than 1 sequence, prefill +
-            # decode blocks are not enough to finish.
-            num_gpu_blocks_override=prefill_blocks + decode_blocks,
-            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
-            worker_use_ray=worker_use_ray,
-    ) as vllm_model:
-        sampling_params = SamplingParams(n=beam_width,
-                                         use_beam_search=True,
-                                         temperature=0.0,
-                                         max_tokens=max_tokens,
-                                         ignore_eos=True)
-        req_outputs = vllm_model.model.generate(
-            example_prompts,
-            sampling_params=sampling_params,
-        )
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
-    # Verify the request is ignored and not hang.
-    assert req_outputs[0].outputs[0].finish_reason == "length"
 @pytest.mark.parametrize("model", MODELS)
 # @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("dtype", ["half"])

--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
+from copy import deepcopy
+from typing import Callable, Union
+from torch import fx
+from vllm.compilation.inductor_pass import InductorPass
+class TestBackend:
+    """
+    This class provides a simple Inductor backend that can be used for testing.
+    It takes a list of custom passes and runs them after Inductor's passes.
+    It also saves the graph before and after the custom passes for inspection.
+    """
+    def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
+                                                             None]]):
+        """Store the custom passes and prepare the Inductor config overrides.
+
+        Args:
+            passes: ``InductorPass`` objects or plain callables; each one is
+                called with the graph, in the given order, by ``post_pass``.
+        """
+        self.custom_passes = list(passes)
+        # Function-scope import: torch._inductor is not imported when this
+        # module is loaded.
+        from torch._inductor import config
+        # Start from a shallow copy of the Inductor config, then override.
+        self.current_config = config.shallow_copy_dict()
+        self.current_config['force_disable_caches'] = True
+        # Point the 'post_grad_custom_post_pass' config entry at post_pass.
+        self.current_config['post_grad_custom_post_pass'] = self.post_pass
+    def __call__(self, graph: fx.GraphModule, example_inputs):
+        """Compile ``graph`` via ``compile_fx`` with the stored config patches.
+
+        Returns:
+            Whatever ``compile_fx`` returns for ``graph`` and
+            ``example_inputs``.
+        """
+        from torch._inductor.compile_fx import compile_fx
+        return compile_fx(graph,
+                          example_inputs,
+                          config_patches=self.current_config)
+    def post_pass(self, graph: fx.Graph):
+        """Apply every custom pass to ``graph``, snapshotting before and after.
+
+        Sets three attributes on the instance:
+        ``graph_pre_pass`` (deep copy taken before any custom pass runs),
+        ``graph_post_pass`` (deep copy taken after the last custom pass), and
+        ``final_graph`` (the ``graph`` object itself, not a copy).
+        """
+        self.graph_pre_pass = deepcopy(graph)
+        for pass_ in self.custom_passes:
+            pass_(graph)
+        self.graph_post_pass = deepcopy(graph)
+        # assign by reference, will reflect the final state of the graph
+        self.final_graph = graph