[Misc] Restrict ray version dependency and update PP feature warning in V1 (#15556)

df8d3d12 · Rui Qiao · GitHub · 619d3de8 · df8d3d12 · df8d3d12
Unverified commit df8d3d12, authored Mar 26, 2025 by Rui Qiao; committed by GitHub on Mar 27, 2025.
Showing 4 changed files with 8 additions and 5 deletions.

requirements/cuda.txt +1 -1

requirements/test.in +1 -1

vllm/config.py +1 -1

vllm/engine/arg_utils.py +5 -2

No files found.
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -4,7 +4,7 @@
 numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding

 # Dependencies for NVIDIA GPUs
-ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
+ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
 torch==2.6.0
 torchaudio==2.6.0
 # These must be updated alongside torch

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
 peft
 pqdm
-ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests
+ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -313,7 +313,7 @@ class ModelConfig:
            raise ValueError(
                "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
                "module was not found."
-                "See https://github.com/vllm-project/vllm/blob/main/Dockerfile"
+                "See https://github.com/vllm-project/vllm/blob/main/Dockerfile "
                "for instructions on how to install it.")

        # The tokenizer version is consistent with the model version by default.

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1686,8 +1686,11 @@ class EngineArgs:
        if self.enable_lora and _warn_or_fallback("LORA"):
            return False

-        # PP is supported on V1, but off by default for now.
-        if self.pipeline_parallel_size > 1 and _warn_or_fallback("PP"):
+        # PP is supported on V1 with Ray distributed executor,
+        # but off for MP distributed executor for now.
+        if (self.pipeline_parallel_size > 1
+                and self.distributed_executor_backend == "mp"
+                and _warn_or_fallback("PP (MP distributed executor)")):
            return False

        # ngram is supported on V1, but off by default for now.