Merge tag 'v0.7.1' into v0.7.1-dev

afd0da21 · zhuwenwen · 1a11f127 · 4f4d427a · afd0da21 · afd0da21
Commit afd0da21 authored Feb 03, 2025 by zhuwenwen
20 changed files
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -5,7 +5,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers == 4.47.0  # Required for Llama 3.2 and Qwen2-VL.
+transformers >= 4.48.2  # Required for Bamba.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
 fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
@@ -19,7 +19,7 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.11 # Requires pytorch
+outlines == 0.1.11
 lark == 1.2.2 
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
@@ -34,6 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
+compressed-tensors == 0.9.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,7 +2,14 @@
 -r requirements-common.txt

 # Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" 
-torch==2.5.1; platform_machine == "aarch64"
-torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
-datasets # for benchmark scripts
\ No newline at end of file
+torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
+torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" 
+
+# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
+torchaudio; platform_machine != "ppc64le"
+torchaudio==2.5.1; platform_machine == "ppc64le"
+
+# required for the image processor of phi3v, this must be updated alongside torch
+torchvision; platform_machine != "ppc64le"
+torchvision==0.20.1; platform_machine == "ppc64le"
+datasets # for benchmark scripts
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,6 +5,7 @@
 ray[default] >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
 torch == 2.5.1
+torchaudio==2.5.1
 # These must be updated alongside torch
 torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -3,7 +3,7 @@

 # Dependencies for HPU code
 ray
-triton
+triton==3.1.0
 pandas
 tabulate
 setuptools>=61

--- a/requirements-lint.txt
+++ b/requirements-lint.txt
 # formatting
-yapf==0.32.0
-toml==0.10.2
-tomli==2.0.2
-ruff==0.6.5
-codespell==2.3.0
-isort==5.13.2
-clang-format==18.1.5
-sphinx-lint==1.0.0
-
-# type checking
-mypy==1.11.1
-types-PyYAML
-types-requests
-types-setuptools
+pre-commit==4.0.1
--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt

 # Dependencies for Neuron devices
-transformers-neuronx >= 0.12.0
-torch-neuronx >= 2.1.2
+transformers-neuronx >= 0.13.0
+torch-neuronx >= 2.5.0
 neuronx-cc
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -12,20 +12,27 @@ decord # required for video tests
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
+vector_quantize_pytorch # required for minicpmo_26 test
+vocos # required for minicpmo_26 test
 peft
+pqdm
 ray[adag]==2.40.0
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 timm # required for internvl test
 torch==2.5.1
+torchaudio==2.5.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.0 # required for pixtral test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
-
+transformers==4.48.2 
 # quantization
 bitsandbytes>=0.45.0
 buildkite-test-collector==0.1.9

+genai_perf==0.0.8
+tritonclient==2.51.0
+
 numpy < 2.0.0
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.12
 # by the following command:
 #
-#    python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
+# python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
 #
 absl-py==2.1.0
    # via rouge-score
@@ -37,7 +37,7 @@ audioread==3.0.1
    # via librosa
 awscli==1.35.23
    # via -r requirements-test.in
-bitsandbytes>=0.45.0
+bitsandbytes==0.45.0
    # via -r requirements-test.in
 black==24.10.0
    # via datamodel-code-generator
@@ -48,6 +48,8 @@ botocore==1.35.57
    #   awscli
    #   boto3
    #   s3transfer
+bounded-pool-executor==0.0.3
+    # via pqdm
 buildkite-test-collector==0.1.9
    # via -r requirements-test.in
 certifi==2024.8.30
@@ -73,6 +75,8 @@ colorama==0.4.6
    #   tqdm-multiprocess
 contourpy==1.3.0
    # via matplotlib
+cramjam==2.9.0
+    # via fastparquet
 cupy-cuda12x==13.3.0
    # via ray
 cycler==0.12.1
@@ -102,11 +106,21 @@ dnspython==2.7.0
 docutils==0.16
    # via awscli
 einops==0.8.0
-    # via -r requirements-test.in
+    # via
+    #   -r requirements-test.in
+    #   encodec
+    #   vector-quantize-pytorch
+    #   vocos
+einx==0.3.0
+    # via vector-quantize-pytorch
 email-validator==2.2.0
    # via pydantic
+encodec==0.1.1
+    # via vocos
 evaluate==0.4.3
    # via lm-eval
+fastparquet==2024.11.0
+    # via genai-perf
 fastrlock==0.8.2
    # via cupy-cuda12x
 filelock==3.16.1
@@ -119,6 +133,8 @@ filelock==3.16.1
    #   triton
 fonttools==4.54.1
    # via matplotlib
+frozendict==2.4.6
+    # via einx
 frozenlist==1.5.0
    # via
    #   aiohttp
@@ -128,8 +144,11 @@ fsspec[http]==2024.9.0
    # via
    #   datasets
    #   evaluate
+    #   fastparquet
    #   huggingface-hub
    #   torch
+genai-perf==0.0.8
+    # via -r requirements-test.in
 genson==1.3.0
    # via datamodel-code-generator
 h11==0.14.0
@@ -150,6 +169,7 @@ huggingface-hub==0.26.2
    #   timm
    #   tokenizers
    #   transformers
+    #   vocos
 idna==3.10
    # via
    #   anyio
@@ -184,6 +204,8 @@ jsonschema==4.23.0
    #   ray
 jsonschema-specifications==2024.10.1
    # via jsonschema
+kaleido==0.2.1
+    # via genai-perf
 kiwisolver==1.4.7
    # via matplotlib
 lazy-loader==0.4
@@ -198,6 +220,8 @@ lm-eval[api]==0.4.4
    # via -r requirements-test.in
 lxml==5.3.0
    # via sacrebleu
+markdown-it-py==3.0.0
+    # via rich
 markupsafe==3.0.2
    # via jinja2
 matplotlib==3.9.2
@@ -207,6 +231,8 @@ mbstrdecoder==1.1.3
    #   dataproperty
    #   pytablewriter
    #   typepy
+mdurl==0.1.2
+    # via markdown-it-py
 mistral-common[opencv]==1.5.1
    # via
    #   -r requirements-test.in
@@ -246,7 +272,11 @@ numpy==1.26.4
    #   cupy-cuda12x
    #   datasets
    #   decord
+    #   einx
+    #   encodec
    #   evaluate
+    #   fastparquet
+    #   genai-perf
    #   librosa
    #   matplotlib
    #   mistral-common
@@ -254,15 +284,19 @@ numpy==1.26.4
    #   numexpr
    #   opencv-python-headless
    #   pandas
+    #   patsy
    #   peft
    #   rouge-score
    #   sacrebleu
    #   scikit-learn
    #   scipy
    #   soxr
+    #   statsmodels
    #   tensorizer
    #   torchvision
    #   transformers
+    #   tritonclient
+    #   vocos
 nvidia-cublas-cu12==12.4.5.8
    # via
    #   nvidia-cudnn-cu12
@@ -304,30 +338,39 @@ packaging==24.1
    #   datamodel-code-generator
    #   datasets
    #   evaluate
+    #   fastparquet
    #   huggingface-hub
    #   lazy-loader
    #   matplotlib
    #   peft
+    #   plotly
    #   pooch
    #   pytest
    #   pytest-rerunfailures
    #   ray
+    #   statsmodels
    #   transformers
    #   typepy
 pandas==2.2.3
    # via
    #   datasets
    #   evaluate
+    #   fastparquet
+    #   genai-perf
+    #   statsmodels
 pathspec==0.12.1
    # via black
 pathvalidate==3.2.1
    # via pytablewriter
+patsy==1.0.1
+    # via statsmodels
 peft==0.13.2
    # via
    #   -r requirements-test.in
    #   lm-eval
 pillow==10.4.0
    # via
+    #   genai-perf
    #   matplotlib
    #   mistral-common
    #   sentence-transformers
@@ -336,12 +379,16 @@ platformdirs==4.3.6
    # via
    #   black
    #   pooch
+plotly==5.24.1
+    # via genai-perf
 pluggy==1.5.0
    # via pytest
 pooch==1.8.2
    # via librosa
 portalocker==2.10.1
    # via sacrebleu
+pqdm==0.2.0
+    # via -r requirements-test.in
 propcache==0.2.0
    # via yarl
 protobuf==5.28.3
@@ -356,7 +403,9 @@ psutil==6.1.0
 py==1.11.0
    # via pytest-forked
 pyarrow==18.0.0
-    # via datasets
+    # via
+    #   datasets
+    #   genai-perf
 pyasn1==0.6.1
    # via rsa
 pybind11==2.13.6
@@ -369,6 +418,8 @@ pydantic[email]==2.9.2
    #   mistral-common
 pydantic-core==2.23.4
    # via pydantic
+pygments==2.18.0
+    # via rich
 pyparsing==3.2.0
    # via matplotlib
 pytablewriter==1.2.0
@@ -377,14 +428,18 @@ pytest==8.3.3
    # via
    #   -r requirements-test.in
    #   buildkite-test-collector
+    #   genai-perf
    #   pytest-asyncio
    #   pytest-forked
+    #   pytest-mock
    #   pytest-rerunfailures
    #   pytest-shard
 pytest-asyncio==0.24.0
    # via -r requirements-test.in
 pytest-forked==1.6.0
    # via -r requirements-test.in
+pytest-mock==3.14.0
+    # via genai-perf
 pytest-rerunfailures==14.0
    # via -r requirements-test.in
 pytest-shard==0.1.2
@@ -395,6 +450,8 @@ python-dateutil==2.9.0.post0
    #   matplotlib
    #   pandas
    #   typepy
+python-rapidjson==1.20
+    # via tritonclient
 pytz==2024.2
    # via
    #   pandas
@@ -405,11 +462,14 @@ pyyaml==6.0.2
    #   awscli
    #   datamodel-code-generator
    #   datasets
+    #   genai-perf
    #   huggingface-hub
    #   peft
    #   ray
+    #   responses
    #   timm
    #   transformers
+    #   vocos
 ray[adag]==2.40.0
    # via -r requirements-test.in
 redis==5.2.0
@@ -434,8 +494,13 @@ requests==2.32.3
    #   mistral-common
    #   pooch
    #   ray
+    #   responses
    #   tiktoken
    #   transformers
+responses==0.25.3
+    # via genai-perf
+rich==13.9.4
+    # via genai-perf
 rouge-score==0.1.2
    # via lm-eval
 rpds-py==0.20.1
@@ -466,6 +531,8 @@ scipy==1.13.1
    #   librosa
    #   scikit-learn
    #   sentence-transformers
+    #   statsmodels
+    #   vocos
 sentence-transformers==3.2.1
    # via -r requirements-test.in
 sentencepiece==0.2.0
@@ -486,8 +553,12 @@ soxr==0.5.0.post1
    # via librosa
 sqlitedict==2.1.0
    # via lm-eval
+statsmodels==0.14.4
+    # via genai-perf
 sympy==1.13.1
-    # via torch
+    # via
+    #   einx
+    #   torch
 tabledata==1.3.3
    # via pytablewriter
 tabulate==0.9.0
@@ -495,7 +566,9 @@ tabulate==0.9.0
 tcolorpy==0.1.6
    # via pytablewriter
 tenacity==9.0.0
-    # via lm-eval
+    # via
+    #   lm-eval
+    #   plotly
 tensorizer==2.9.0
    # via -r requirements-test.in
 threadpoolctl==3.5.0
@@ -513,12 +586,21 @@ torch==2.5.1
    #   -r requirements-test.in
    #   accelerate
    #   bitsandbytes
+    #   encodec
    #   lm-eval
    #   peft
    #   sentence-transformers
    #   tensorizer
    #   timm
+    #   torchaudio
    #   torchvision
+    #   vector-quantize-pytorch
+    #   vocos
+torchaudio==2.5.1
+    # via
+    #   -r requirements-test.in
+    #   encodec
+    #   vocos
 torchvision==0.20.1
    # via timm
 tqdm==4.66.6
@@ -529,13 +611,16 @@ tqdm==4.66.6
    #   lm-eval
    #   nltk
    #   peft
+    #   pqdm
    #   sentence-transformers
    #   tqdm-multiprocess
    #   transformers
 tqdm-multiprocess==0.0.11
    # via lm-eval
-transformers==4.47.0
+transformers==4.48.2
    # via
+    #   -r requirements-test.in
+    #   genai-perf
    #   lm-eval
    #   peft
    #   sentence-transformers
@@ -544,6 +629,10 @@ transformers-stream-generator==0.0.5
    # via -r requirements-test.in
 triton==3.1.0
    # via torch
+tritonclient==2.51.0
+    # via
+    #   -r requirements-test.in
+    #   genai-perf
 typepy[datetime]==1.3.2
    # via
    #   dataproperty
@@ -551,18 +640,26 @@ typepy[datetime]==1.3.2
    #   tabledata
 typing-extensions==4.12.2
    # via
+    #   bitsandbytes
    #   huggingface-hub
    #   librosa
    #   mistral-common
+    #   pqdm
    #   pydantic
    #   pydantic-core
    #   torch
 tzdata==2024.2
    # via pandas
-urllib3==1.26.20
+urllib3==2.2.3
    # via
    #   botocore
    #   requests
+    #   responses
+    #   tritonclient
+vector-quantize-pytorch==1.21.2
+    # via -r requirements-test.in
+vocos==0.1.0
+    # via -r requirements-test.in
 word2number==1.1
    # via lm-eval
 xxhash==3.5.0

--- a/requirements-tpu.txt
+++ b/requirements-tpu.txt
@@ -13,11 +13,11 @@ ray[default]
 # Install torch_xla
 --pre
 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+--find-links https://storage.googleapis.com/libtpu-wheels/index.html
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.6.0.dev20241126+cpu
-torchvision==0.20.0.dev20241126+cpu
-torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl
-jaxlib==0.4.36.dev20241122
-jax==0.4.36.dev20241122
+torch==2.6.0.dev20241216+cpu
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
--- a/setup.py
+++ b/setup.py
+import ctypes
 import importlib.util
 import logging
 import os
@@ -13,7 +14,7 @@ from packaging.version import Version, parse
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from setuptools_scm import get_version
-from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME

 from typing import Optional, Union
 import subprocess
@@ -40,9 +41,14 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))

 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE

-if not sys.platform.startswith("linux"):
+# On macOS the target device is forced to `cpu`, whatever was requested.
+if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
    logger.warning(
-        "vLLM only supports Linux platform (including WSL). "
+        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
+    VLLM_TARGET_DEVICE = "cpu"
+elif not (sys.platform.startswith("linux")
+          or sys.platform.startswith("darwin")):
+    # Neither Linux nor macOS: warn and fall back to the "empty" target.
+    # The literals below are implicitly concatenated, so each fragment must
+    # end with its own separating space.
+    logger.warning(
+        "vLLM only supports Linux platform (including WSL) and MacOS. "
        "Building on %s, "
        "so vLLM may not be able to run correctly", sys.platform)
    VLLM_TARGET_DEVICE = "empty"
@@ -229,8 +235,11 @@ class cmake_build_ext(build_ext):

            # CMake appends the extension prefix to the install path,
            # and outdir already contains that prefix, so we need to remove it.
+            # We assume only the final component of extension prefix is added by
+            # CMake, this is currently true for current extensions but may not
+            # always be the case.
            prefix = outdir
-            for i in range(ext.name.count('.')):
+            if '.' in ext.name:
                prefix = prefix.parent

            # prefix here should actually be the same for all components
@@ -258,7 +267,7 @@ class cmake_build_ext(build_ext):

 class repackage_wheel(build_ext):
    """Extracts libraries and other files from an existing wheel."""
-    default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

    def run(self) -> None:
        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
@@ -299,9 +308,11 @@ class repackage_wheel(build_ext):
            files_to_copy = [
                "vllm/_C.abi3.so",
                "vllm/_moe_C.abi3.so",
-                "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                "vllm/vllm_flash_attn/flash_attn_interface.py",
                "vllm/vllm_flash_attn/__init__.py",
+                "vllm/cumem_allocator.abi3.so",
                # "vllm/_version.py", # not available in nightly wheels yet
            ]
            file_members = filter(lambda x: x.filename in files_to_copy,
@@ -325,21 +336,26 @@ class repackage_wheel(build_ext):


 def _is_hpu() -> bool:
+    """Decide whether the build should target HPU.
+
+    If the VLLM_TARGET_DEVICE environment variable equals the resolved
+    VLLM_TARGET_DEVICE, that value decides the answer and no probing is done.
+    Otherwise the function runs `hl-smi` and, if that fails on Linux, looks
+    for the `habanalabs` module in the `lsmod` output.
+
+    Returns:
+        True if HPU was requested or detected, False otherwise.
+    """
-    is_hpu_available = True
+    # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
+    if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
+        return VLLM_TARGET_DEVICE == "hpu"
+
+    # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
+    # and if it doesn't, check if habanalabs driver is loaded
+    is_hpu_available = False
    try:
-        subprocess.run(["hl-smi"], capture_output=True, check=True)
+        out = subprocess.run(["hl-smi"], capture_output=True, check=True)
+        # check=True raises CalledProcessError on a non-zero exit status, so
+        # reaching this line already implies returncode == 0.
+        is_hpu_available = out.returncode == 0
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
-                '/dev/accel/accel_controlD0'):
-            # last resort...
+        if sys.platform.startswith("linux"):
            try:
                output = subprocess.check_output(
                    'lsmod | grep habanalabs | wc -l', shell=True)
                is_hpu_available = int(output) > 0
            except (ValueError, FileNotFoundError, PermissionError,
                    subprocess.CalledProcessError):
-                is_hpu_available = False
-    return is_hpu_available or VLLM_TARGET_DEVICE == "hpu"
+                # probing failed: keep the default of False
+                pass
+    return is_hpu_available


 def _no_device() -> bool:
@@ -386,25 +402,31 @@ def _build_custom_ops() -> bool:
    return _is_cuda() or _is_hip() or _is_cpu()


-def get_hipcc_rocm_version():
-    # Run the hipcc --version command
-    result = subprocess.run(['hipcc', '--version'],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.STDOUT,
-                            text=True)
+def get_rocm_version():
+    """Return the ROCm version as "major.minor.patch", or None.
+
+    The version is obtained by calling `getROCmVersion` from
+    ROCM_HOME/lib/librocm-core.so through ctypes. None is returned when the
+    library file does not exist, when the call returns a non-zero status, or
+    when any exception is raised along the way.
+    """
+    # Get the ROCm version from ROCM_HOME/lib/librocm-core.so
+    # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
+    try:
+        librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
+        if not librocm_core_file.is_file():
+            return None
+        librocm_core = ctypes.CDLL(librocm_core_file)
+        VerErrors = ctypes.c_uint32
+        get_rocm_core_version = librocm_core.getROCmVersion
+        get_rocm_core_version.restype = VerErrors
+        get_rocm_core_version.argtypes = [
+            ctypes.POINTER(ctypes.c_uint32),
+            ctypes.POINTER(ctypes.c_uint32),
+            ctypes.POINTER(ctypes.c_uint32),
+        ]
+        # out-parameters filled in by getROCmVersion
+        major = ctypes.c_uint32()
+        minor = ctypes.c_uint32()
+        patch = ctypes.c_uint32()

-    # Check if the command was executed successfully
-    if result.returncode != 0:
-        print("Error running 'hipcc --version'")
+        if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
+                                  ctypes.byref(patch)) == 0):
+            return f"{major.value}.{minor.value}.{patch.value}"
        return None
-
-    # Extract the version using a regular expression
-    match = re.search(r'HIP version: (\S+)', result.stdout)
-    if match:
-        # Return the version string
-        return match.group(1)
-    else:
-        print("Could not find HIP version in the output")
+    # best effort: any failure while locating, loading or calling the library
+    # is reported as "version unknown"
+    except Exception:
        return None


@@ -482,9 +504,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    
    new_version_content = f"""
 try:
-    __version__ = "0.6.6.post1"
-    __version_tuple__ = (0, 6, 6)
-    __hcu_version__ = f'0.6.6.post1+{version}' 
+    __version__ = "0.7.1"
+    __version_tuple__ = (0, 7, 1)
+    __hcu_version__ = f'0.7.1+{version}' 
    
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
@@ -527,14 +549,10 @@ def get_gaudi_sw_version():


 def get_vllm_version() -> str:
-    # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
-    try:
-        if not _is_hip():
-            version = get_version(
-                write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
-            )
-    except LookupError:
-        version = "0.0.0"
+    if not _is_hip():
+        version = get_version(
+            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+        )

        sep = "+" if "+" not in version else "."  # dev versions might contain +

@@ -552,11 +570,10 @@ def get_vllm_version() -> str:
                if "sdist" not in sys.argv:
                    version += f"{sep}cu{cuda_version_str}"
    elif _is_hip():
-        # Get the HIP version
-        # hipcc_version = get_hipcc_rocm_version()
-        # if hipcc_version != MAIN_CUDA_VERSION:
-        #     rocm_version_str = hipcc_version.replace(".", "")[:3]
-        #     version += f"{sep}rocm{rocm_version_str}"
+        # Get the Rocm Version
+        # rocm_version = get_rocm_version() or torch.version.hip
+        # if rocm_version and rocm_version != MAIN_CUDA_VERSION:
+        #     version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
        version = get_version()
    elif _is_neuron():
        # Get the Neuron version
@@ -611,7 +628,7 @@ def get_requirements() -> List[str]:
        return resolved_requirements

    if _no_device():
-        requirements = _read_requirements("requirements-cuda.txt")
+        requirements = _read_requirements("requirements-cpu.txt")
    elif _is_cuda():
        requirements = _read_requirements("requirements-cuda.txt")
        cuda_major, cuda_minor = torch.version.cuda.split(".")
@@ -654,14 +671,24 @@ if _is_cuda() or _is_hip():
 #     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))

 if _is_cuda():
-    ext_modules.append(
-        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
+    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
+    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"):
+        # FA3 requires CUDA 12.0 or later
+        ext_modules.append(
+            CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
+    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))

 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))

 package_data = {
-    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py","model_executor/layers/quantization/configs/w8a8/*.json"]
+    "vllm": [
+        "py.typed",
+        "model_executor/layers/fused_moe/configs/*.json",
+        "model_executor/layers/quantization/utils/configs/*.json",
+        "benchmarks/*.py",
+        "model_executor/layers/quantization/configs/w8a8/*.json"
+    ]
 }

 if _no_device():

--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -27,27 +27,32 @@ def _query_server_long(prompt: str) -> dict:


 @pytest.fixture
-def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
+    """Run `api_server_async_engine.py` in a subprocess for one test.
+
+    The server is started with the opt-125m model on 127.0.0.1, using the
+    given tokenizer pool size and distributed executor backend, and is
+    terminated after the test that uses this fixture has finished.
+    """
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
-        sys.executable, "-u",
-        str(script_path), "--model", os.path.join(models_path_prefix, "facebook/opt-125m"), "--host",
-        "127.0.0.1", "--tokenizer-pool-size",
-        str(tokenizer_pool_size)
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        os.path.join(models_path_prefix, "facebook/opt-125m"),
+        "--host",
+        "127.0.0.1",
+        "--tokenizer-pool-size",
+        str(tokenizer_pool_size),
+        "--distributed-executor-backend",
+        distributed_executor_backend,
    ]

-    if worker_use_ray:
-        commands.append("--worker-use-ray")
    uvicorn_process = subprocess.Popen(commands)
    yield
+    # teardown: stop the server process once the test is done
    uvicorn_process.terminate()


 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, tokenizer_pool_size: int,
-                    worker_use_ray: bool):
+                    distributed_executor_backend: str):
    """
    Run the API server and test it.


--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -46,7 +46,6 @@ def test_vllm_gc_ed():
    assert weak_llm() is None


-@pytest.mark.skip_v1
 @pytest.mark.parametrize("model", MODELS)
 # @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@@ -65,9 +64,10 @@ def test_models(
    if backend == "FLASHINFER" and current_platform.is_rocm():
        pytest.skip("Flashinfer does not support ROCm/HIP.")

-    if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
+    if backend in ("XFORMERS",
+                   "FLASHINFER") and model == "google/gemma-2-2b-it":
        pytest.skip(
-            "XFORMERS does not support gemma2 with full context length.")
+            f"{backend} does not support gemma2 with full context length.")

    os.environ["VLLM_ATTENTION_BACKEND"] = backend


--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.device_allocator.cumem import CuMemAllocator
+from vllm.utils import GiB_bytes
+
+from ..utils import fork_new_process_for_each_test
+
+
+@fork_new_process_for_each_test
+def test_basic_cumem():
+    """Tensors from the default and custom pools interoperate across sleep.
+
+    Allocates one tensor normally and two inside the CuMemAllocator memory
+    pool, checks that they can be combined, then checks that `sleep()`
+    increases the free GPU memory and that the same sum is obtained again
+    after `wake_up()`.
+    """
+    # some tensors from default memory pool
+    shape = (1024, 1024)
+    x = torch.empty(shape, device='cuda')
+    x.zero_()
+
+    # some tensors from custom memory pool
+    allocator = CuMemAllocator.get_instance()
+    with allocator.use_memory_pool():
+        # custom memory pool
+        y = torch.empty(shape, device='cuda')
+        y.zero_()
+        y += 1
+        z = torch.empty(shape, device='cuda')
+        z.zero_()
+        z += 2
+
+    # they can be used together
+    output = x + y + z
+    assert torch.allclose(output, torch.ones_like(output) * 3)
+
+    # free memory is sampled immediately before and after sleep()
+    free_bytes = torch.cuda.mem_get_info()[0]
+    allocator.sleep()
+    free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
+    assert free_bytes_after_sleep > free_bytes
+    allocator.wake_up()
+
+    # they can be used together
+    output = x + y + z
+    assert torch.allclose(output, torch.ones_like(output) * 3)
+
+
+@fork_new_process_for_each_test
+def test_cumem_with_cudagraph():
+    """A captured CUDA graph stays replayable after a sleep/wake-up cycle.
+
+    The weight is allocated in the allocator's pool without a tag and the
+    cache in a pool tagged "discard". After sleep/wake-up the graph is
+    replayed on fresh input and both the cache and the output are checked.
+    """
+    allocator = CuMemAllocator.get_instance()
+    with allocator.use_memory_pool():
+        weight = torch.eye(1024, device='cuda')
+    with allocator.use_memory_pool(tag="discard"):
+        cache = torch.empty(1024, 1024, device='cuda')
+
+    def model(x):
+        # weight is the identity matrix, so `out` equals `x`
+        out = x @ weight
+        cache[:out.size(0)].copy_(out)
+        return out + 1
+
+    x = torch.empty(128, 1024, device='cuda')
+
+    # warmup
+    model(x)
+
+    # capture cudagraph
+    model_graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(model_graph):
+        y = model(x)
+
+    free_bytes = torch.cuda.mem_get_info()[0]
+    allocator.sleep()
+    free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
+    assert free_bytes_after_sleep > free_bytes
+    allocator.wake_up()
+
+    # after waking up, the content in the weight tensor
+    # should be restored, but the content in the cache tensor
+    # should be discarded
+
+    # this operation is also compatible with cudagraph
+
+    x.random_()
+    model_graph.replay()
+
+    # cache content is as expected
+    assert torch.allclose(x, cache[:x.size(0)])
+
+    # output content is as expected
+    assert torch.allclose(y, x + 1)
+
+
+@fork_new_process_for_each_test
+def test_end_to_end():
+    """An LLM with sleep mode enabled generates the same text after waking.
+
+    Also checks that, after `sleep(level=1)`, the GPU memory used beyond the
+    baseline measured at the start of the test is below 2 GiB.
+    """
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
+    prompt = "How are you?"
+    # temperature=0 so that the two generations can be compared exactly
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
+    assert used_bytes < 2 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -32,10 +32,10 @@ def check_settings():


 @pytest.fixture
-def worker_use_ray() -> bool:
-    # When SPMD worker is used, use ray_use_worker=True
+def distributed_executor_backend() -> str:
+    """Return "ray" when VLLM_USE_RAY_SPMD_WORKER is set, otherwise "mp"."""
+    # When SPMD worker is used, use distributed_executor_backend="ray"
    # to test delta input optimization works with preemption.
-    return envs.VLLM_USE_RAY_SPMD_WORKER
+    return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"


 @pytest.mark.parametrize("model", MODELS)
@@ -50,7 +50,7 @@ def test_chunked_prefill_recompute(
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
    """Ensure that chunked prefill works with preemption."""
    max_num_seqs = min(chunked_prefill_token_size, 256)
@@ -69,7 +69,7 @@ def test_chunked_prefill_recompute(
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_seqs=max_num_seqs,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
            disable_log_stats=False,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -97,7 +97,7 @@ def test_preemption(
    model: str,
    dtype: str,
    max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
    """By default, recompute preemption is enabled"""

@@ -108,7 +108,7 @@ def test_preemption(
            model,
            dtype=dtype,
            disable_log_stats=False,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -149,7 +149,7 @@ def test_preemption_infeasible(
    model: str,
    dtype: str,
    max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
    """Verify infeasible preemption request will be ignored."""
    BLOCK_SIZE = 16
@@ -164,7 +164,7 @@ def test_preemption_infeasible(
            # ignored instead of hanging forever.
            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
        sampling_params = SamplingParams(max_tokens=max_tokens,
                                         ignore_eos=True)

--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
 initialized randomly with a fixed seed.
 """
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Any, List, Optional, Tuple

 import torch
 from torch import nn
@@ -54,6 +54,16 @@ class LlamaConfig:
    tractable_init: bool = False
    random_seed: int = 0

+    def compute_hash(self) -> str:
+        """Return an MD5 hex digest of every field except ``random_seed``."""
+        import hashlib
+
+        # Sort the (name, value) pairs so the digest does not depend on the
+        # insertion order of ``self.__dict__``.
+        factors: List[Any] = sorted((name, value)
+            for name, value in self.__dict__.items() if name != "random_seed")
+        return hashlib.md5(str(factors).encode()).hexdigest()
+
    def __post_init__(self):
        assert self.mlp_size >= self.hidden_size

@@ -263,7 +273,8 @@ def run_model(llama_config,
        compilation_config = CompilationConfig(
            level=CompilationLevel.NO_COMPILATION, )

-    vllm_config = VllmConfig(compilation_config=compilation_config)
+    vllm_config = VllmConfig(compilation_config=compilation_config,
+                             additional_config=llama_config)
    with set_current_vllm_config(vllm_config):
        model = LlamaModel(config=llama_config,
                           vllm_config=vllm_config,

--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -59,7 +59,7 @@ test_settings = [
        model_args=["--task", "embed"],
        pp_size=1,
        tp_size=1,
-        attn_backend="FLASHINFER",
+        attn_backend="FLASH_ATTN",
        method="encode",
        fullgraph=True,
    ),

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,13 +30,13 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
-                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
+                         TokensPrompt, to_enc_dec_tuple_list,
+                         zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
-from vllm.platforms import current_platform
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity)
+                        identity, is_list_of)
 from .utils import models_path_prefix

 logger = init_logger(__name__)
@@ -44,6 +44,7 @@ logger = init_logger(__name__)
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
+_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

 _M = TypeVar("_M")
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
@@ -181,6 +182,12 @@ def example_prompts() -> List[str]:
    return prompts


+@pytest.fixture
+def example_system_message() -> str:
+    with open(_SYS_MSG, encoding="utf-8") as f:  # same decoding on every OS
+        return f.read()
+
+
 class DecoderPromptType(Enum):
    """For encoder/decoder models only."""
    CUSTOM = 1
@@ -240,11 +247,13 @@ def video_assets() -> _VideoAssets:


 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
+_R = TypeVar("_R")


 class HfRunner:

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+        from vllm.platforms import current_platform
        if x is None or isinstance(x, (bool, )):
            return x

@@ -882,6 +891,12 @@ class VllmRunner:
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[List[int]], List[str]]]:
+        if is_list_of(prompts, str, check="all"):
+            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
+        else:
+            prompts = [
+                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
+            ]
        outputs = self.model.beam_search(
            prompts,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
@@ -919,6 +934,10 @@ class VllmRunner:
        req_outputs = self.model.score(text_1, text_2)
        return [req_output.outputs.score for req_output in req_outputs]

+    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
+        """Delegate ``func`` to the engine's model executor."""
+        return self.model.llm_engine.model_executor.apply_model(func)
+
    def __enter__(self):
        return self


--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@@ -796,6 +796,44 @@ class TestPrefixCachingBlockAllocator:
            block_hashes=block_hashes_seq1)
        assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks

+    # Test reset prefix cache
+    @staticmethod
+    @pytest.mark.parametrize("num_blocks", [10])
+    @pytest.mark.parametrize("block_size", [16])
+    def test_reset_prefix_cache(num_blocks: int, block_size: int):
+        """Check that the prefix cache resets only once all blocks are free.
+
+        Two chains are allocated from identical token ids; the reset is
+        attempted after freeing one chain and again after freeing both.
+        """
+        prefix_allocator = PrefixCachingBlockAllocator(
+            num_blocks=num_blocks, block_size=block_size)
+        shared_token_ids = list(range(3 * block_size))
+
+        # Build both chains from identical token ids, in allocation order.
+        chain_a, chain_b = (
+            TestPrefixCachingBlockAllocator.create_immutable_chain(
+                block_size=block_size,
+                token_ids=shared_token_ids,
+                allocator=prefix_allocator,
+            ) for _ in range(2))
+
+        # Release only the first chain; the second one is not freed yet.
+        for blk in chain_a:
+            prefix_allocator.free(blk)
+
+        # The reset must be refused while blocks remain allocated.
+        assert not prefix_allocator.reset_prefix_cache()
+        assert prefix_allocator.get_prefix_cache_hit_rate() > 0.0
+
+        # Release the second chain so no block is in use any more.
+        for blk in chain_b:
+            prefix_allocator.free(blk)
+
+        # With everything freed, the reset succeeds and clears the hit rate.
+        assert prefix_allocator.reset_prefix_cache()
+        assert prefix_allocator.get_prefix_cache_hit_rate() == 0.0
+
    @staticmethod
    def create_immutable_chain(
        block_size: int,

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):

    for sz in test_sizes:
        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-            with graph_capture() as graph_capture_context:
+            with graph_capture(device=device) as graph_capture_context:
                # use integers so result matches NCCL exactly
                inp1 = torch.randint(1,
                                     16, (sz, ),

--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -59,8 +59,7 @@ def worker_fn():
                                     device=get_world_group().device)
    tensor = torch.ones(16, 1024, 1024,
                        dtype=torch.float32).cuda(pynccl_comm.rank)
-    with pynccl_comm.change_state(enable=True):
-        tensor = pynccl_comm.all_reduce(tensor)
+    tensor = pynccl_comm.all_reduce(tensor)
    torch.cuda.synchronize()
    assert torch.all(tensor == pynccl_comm.world_size).cpu().item()

@@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn():
    group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
    pynccl_comm = PyNcclCommunicator(group=group, device=device)
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
-    with pynccl_comm.change_state(enable=True):
-        # two groups can communicate independently
-        if torch.distributed.get_rank() in [0, 1]:
-            tensor = pynccl_comm.all_reduce(tensor)
-            tensor = pynccl_comm.all_reduce(tensor)
-            torch.cuda.synchronize()
-            assert torch.all(tensor == 4).cpu().item()
-        else:
-            tensor = pynccl_comm.all_reduce(tensor)
-            torch.cuda.synchronize()
-            assert torch.all(tensor == 2).cpu().item()
+    # two groups can communicate independently
+    if torch.distributed.get_rank() in [0, 1]:
+        tensor = pynccl_comm.all_reduce(tensor)
+        tensor = pynccl_comm.all_reduce(tensor)
+        torch.cuda.synchronize()
+        assert torch.all(tensor == 4).cpu().item()
+    else:
+        tensor = pynccl_comm.all_reduce(tensor)
+        torch.cuda.synchronize()
+        assert torch.all(tensor == 2).cpu().item()


 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -107,7 +105,7 @@ def multiple_allreduce_with_vllm_worker_fn():
    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
    ensure_model_parallel_initialized(2, 2)
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
-    with graph_capture():
+    with graph_capture(device=device):
        # two tp groups can communicate independently
        if torch.distributed.get_rank() in [0, 1]:
            tensor = tensor_model_parallel_all_reduce(tensor)
@@ -137,9 +135,7 @@ def worker_fn_with_cudagraph():
        # run something in the default stream to initialize torch engine
        a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
        torch.cuda.synchronize()
-        with torch.cuda.graph(
-                graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
-                    enable=True):
+        with torch.cuda.graph(graph):
            a_out = pynccl_comm.all_reduce(a)
        torch.cuda.synchronize()
        graph.replay()
@@ -168,8 +164,7 @@ def all_gather_worker_fn():
        for r in range(world_size)
    ]).to(device)

-    with pynccl_comm.change_state(enable=True):
-        pynccl_comm.all_gather(result, tensor)
+    pynccl_comm.all_gather(result, tensor)
    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)

@@ -206,8 +201,7 @@ def reduce_scatter_worker_fn():
    expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size]
                   for tensor in all_tensors).to(device)

-    with pynccl_comm.change_state(enable=True):
-        pynccl_comm.reduce_scatter(result, tensor)
+    pynccl_comm.reduce_scatter(result, tensor)
    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)

@@ -234,15 +228,13 @@ def send_recv_worker_fn():
    else:
        tensor = torch.empty(16, 1024, 1024,
                             dtype=torch.float32).cuda(pynccl_comm.rank)
-    with pynccl_comm.change_state(enable=True):
-        if pynccl_comm.rank == 0:
-            pynccl_comm.send(tensor,
-                             dst=(pynccl_comm.rank + 1) %
-                             pynccl_comm.world_size)
-        else:
-            pynccl_comm.recv(tensor,
-                             src=(pynccl_comm.rank - 1) %
-                             pynccl_comm.world_size)
+
+    if pynccl_comm.rank == 0:
+        pynccl_comm.send(tensor,
+                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+    else:
+        pynccl_comm.recv(tensor,
+                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
    torch.cuda.synchronize()
    assert torch.all(tensor == 1).cpu().item()

@@ -273,15 +265,12 @@ def multiple_send_recv_worker_fn():
                             1024,
                             dtype=torch.float32,
                             device=device)
-    with pynccl_comm.change_state(enable=True):
-        if torch.distributed.get_rank() in [0, 1]:
-            pynccl_comm.send(tensor,
-                             dst=(pynccl_comm.rank + 1) %
-                             pynccl_comm.world_size)
-        else:
-            pynccl_comm.recv(tensor,
-                             src=(pynccl_comm.rank - 1) %
-                             pynccl_comm.world_size)
+    if torch.distributed.get_rank() in [0, 1]:
+        pynccl_comm.send(tensor,
+                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+    else:
+        pynccl_comm.recv(tensor,
+                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
    torch.cuda.synchronize()
    if torch.distributed.get_rank() in [0, 2]:
        assert torch.all(tensor == 1).cpu().item()