[Misc] Deprecation Warning when setting --engine-use-ray (#7424)

Signed-off-by: Wallas Santos <wallashss@ibm.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> Co-authored-by: youkaichao <youkaichao@126.com>

[Misc] Deprecation Warning when setting --engine-use-ray (#7424)
Signed-off-by: Wallas Santos <wallashss@ibm.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> Co-authored-by: youkaichao <youkaichao@126.com>
70b746ef · Wallas Henrique · GitHub · 67d115db · 70b746ef · 70b746ef
Unverified Commit 70b746ef authored Aug 14, 2024 by Wallas Henrique Committed by GitHub Aug 14, 2024
7 changed files
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
+import os
 import subprocess
 import sys
 import time
@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
        "127.0.0.1", "--tokenizer-pool-size",
        str(tokenizer_pool_size)
    ]
+    # Copy the environment variables and add `VLLM_ALLOW_ENGINE_USE_RAY=1`
+    # to prevent `--engine-use-ray` from raising a deprecation exception
+    env_vars = os.environ.copy()
+    env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
    if engine_use_ray:
        commands.append("--engine-use-ray")
    if worker_use_ray:
        commands.append("--worker-use-ray")
-    uvicorn_process = subprocess.Popen(commands)
+    uvicorn_process = subprocess.Popen(commands, env=env_vars)
    yield
    uvicorn_process.terminate()

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
 import asyncio
+import os
 from dataclasses import dataclass
 import pytest
@@ -106,11 +107,16 @@ async def test_new_requests_event():
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
+    # Allow the deprecated engine_use_ray so it does not raise an exception
+    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None
+    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
 def test_asyncio_run():
    wait_for_gpu_memory_to_clear(

--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -23,7 +23,11 @@ def server():
        str(chatml_jinja_path),
    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # Allow `--engine-use-ray`; otherwise launching the server throws
+    # an error because it tries to use a deprecated feature
+    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
+    with RemoteOpenAIServer(MODEL_NAME, args,
+                            env_dict=env_dict) as remote_server:
        yield remote_server

--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
 import asyncio
+import os
 from itertools import cycle
 from typing import Dict, List, Optional, Sequence, Tuple, Union
@@ -56,6 +57,11 @@ class AsyncLLM:
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
+        # Needed so that engine_use_ray works as a deprecated feature;
+        # otherwise the following constructor will raise an exception
+        os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
        engine_args = AsyncEngineArgs(
            model=model,
            tokenizer=tokenizer,

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -923,7 +923,13 @@ class AsyncEngineArgs(EngineArgs):
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='Use Ray to start the LLM engine in a '
-                            'separate process as the server process.')
+                            'separate process as the server process. '
+                            '(DEPRECATED. This argument is deprecated '
+                            'and will be removed in a future update. '
+                            'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
+                            'use it. See '
+                            'https://github.com/vllm-project/vllm/issues/7045.'
+                            ')')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='Disable logging requests.')

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -29,6 +29,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.usage.usage_lib import UsageContext
+from vllm.utils import print_warning_once
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -510,6 +511,20 @@ class AsyncLLMEngine:
        self.log_requests = log_requests
        self.engine = self._init_engine(*args, **kwargs)
+        if self.engine_use_ray:
+            print_warning_once(
+                "DEPRECATED. `--engine-use-ray` is deprecated and will "
+                "be removed in a future update. "
+                "See https://github.com/vllm-project/vllm/issues/7045.")
+            if envs.VLLM_ALLOW_ENGINE_USE_RAY:
+                print_warning_once(
+                    "VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
+            else:
+                raise ValueError("`--engine-use-ray` is deprecated. "
+                                 "Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
+                                 "force use it")
        self.background_loop: Optional[asyncio.Future] = None
        # We need to keep a reference to unshielded
        # task as well to prevent it from being garbage

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -55,6 +55,7 @@ if TYPE_CHECKING:
    VERBOSE: bool = False
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
    VLLM_TEST_FORCE_FP8_MARLIN: bool = False
+    VLLM_ALLOW_ENGINE_USE_RAY: bool = False
    VLLM_PLUGINS: Optional[List[str]] = None
@@ -364,6 +365,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
     ("1", "true")),
+    # If set, allow running the engine as a separate ray actor,
+    # which is a deprecated feature soon to be removed.
+    # See https://github.com/vllm-project/vllm/issues/7045
+    "VLLM_ALLOW_ENGINE_USE_RAY":
+    lambda:
+    (os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
+     ("1", "true")),
    # a list of plugin names to load, separated by commas.
    # if this is not set, it means all plugins will be loaded
    # if this is set to an empty string, no plugins will be loaded