"vscode:/vscode.git/clone" did not exist on "92c35abb242babbf592390960fb5a4155261e017"
Unverified Commit 70b746ef authored by Wallas Henrique, committed by GitHub
Browse files

[Misc] Deprecation Warning when setting --engine-use-ray (#7424)


Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: youkaichao <youkaichao@126.com>
parent 67d115db
import os
import subprocess import subprocess
import sys import sys
import time import time
...@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, ...@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
"127.0.0.1", "--tokenizer-pool-size", "127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size) str(tokenizer_pool_size)
] ]
# Copy the environment variables and set `VLLM_ALLOW_ENGINE_USE_RAY=1`
# so that the deprecated `--engine-use-ray` flag does not raise an exception
env_vars = os.environ.copy()
env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
if engine_use_ray: if engine_use_ray:
commands.append("--engine-use-ray") commands.append("--engine-use-ray")
if worker_use_ray: if worker_use_ray:
commands.append("--worker-use-ray") commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands) uvicorn_process = subprocess.Popen(commands, env=env_vars)
yield yield
uvicorn_process.terminate() uvicorn_process.terminate()
......
import asyncio import asyncio
import os
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
...@@ -106,11 +107,16 @@ async def test_new_requests_event(): ...@@ -106,11 +107,16 @@ async def test_new_requests_event():
assert engine.engine.add_request_calls == 3 assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1 assert engine.engine.step_calls == old_step_calls + 1
# Allow deprecated engine_use_ray to not raise exception
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
assert engine.get_model_config() is not None assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None assert engine.get_decoding_config() is not None
os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
def test_asyncio_run(): def test_asyncio_run():
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
......
...@@ -23,7 +23,11 @@ def server(): ...@@ -23,7 +23,11 @@ def server():
str(chatml_jinja_path), str(chatml_jinja_path),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: # Allow `--engine-use-ray`; otherwise launching the server throws
# an error because it tries to use a deprecated feature
env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
with RemoteOpenAIServer(MODEL_NAME, args,
env_dict=env_dict) as remote_server:
yield remote_server yield remote_server
......
import asyncio import asyncio
import os
from itertools import cycle from itertools import cycle
from typing import Dict, List, Optional, Sequence, Tuple, Union from typing import Dict, List, Optional, Sequence, Tuple, Union
...@@ -56,6 +57,11 @@ class AsyncLLM: ...@@ -56,6 +57,11 @@ class AsyncLLM:
) -> None: ) -> None:
if "disable_log_stats" not in kwargs: if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True kwargs["disable_log_stats"] = True
# Needed so that engine_use_ray keeps working as a deprecated feature;
# otherwise the following constructor will raise an exception
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, model=model,
tokenizer=tokenizer, tokenizer=tokenizer,
......
...@@ -923,7 +923,13 @@ class AsyncEngineArgs(EngineArgs): ...@@ -923,7 +923,13 @@ class AsyncEngineArgs(EngineArgs):
parser.add_argument('--engine-use-ray', parser.add_argument('--engine-use-ray',
action='store_true', action='store_true',
help='Use Ray to start the LLM engine in a ' help='Use Ray to start the LLM engine in a '
'separate process as the server process.') 'separate process as the server process.'
'(DEPRECATED. This argument is deprecated '
'and will be removed in a future update. '
'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
'use it. See '
'https://github.com/vllm-project/vllm/issues/7045.'
')')
parser.add_argument('--disable-log-requests', parser.add_argument('--disable-log-requests',
action='store_true', action='store_true',
help='Disable logging requests.') help='Disable logging requests.')
......
...@@ -29,6 +29,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest ...@@ -29,6 +29,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import print_warning_once
logger = init_logger(__name__) logger = init_logger(__name__)
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
...@@ -510,6 +511,20 @@ class AsyncLLMEngine: ...@@ -510,6 +511,20 @@ class AsyncLLMEngine:
self.log_requests = log_requests self.log_requests = log_requests
self.engine = self._init_engine(*args, **kwargs) self.engine = self._init_engine(*args, **kwargs)
if self.engine_use_ray:
print_warning_once(
"DEPRECATED. `--engine-use-ray` is deprecated and will "
"be removed in a future update. "
"See https://github.com/vllm-project/vllm/issues/7045.")
if envs.VLLM_ALLOW_ENGINE_USE_RAY:
print_warning_once(
"VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
else:
raise ValueError("`--engine-use-ray` is deprecated. "
"Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
"force use it")
self.background_loop: Optional[asyncio.Future] = None self.background_loop: Optional[asyncio.Future] = None
# We need to keep a reference to unshielded # We need to keep a reference to unshielded
# task as well to prevent it from being garbage # task as well to prevent it from being garbage
......
...@@ -55,6 +55,7 @@ if TYPE_CHECKING: ...@@ -55,6 +55,7 @@ if TYPE_CHECKING:
VERBOSE: bool = False VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_TEST_FORCE_FP8_MARLIN: bool = False VLLM_TEST_FORCE_FP8_MARLIN: bool = False
VLLM_ALLOW_ENGINE_USE_RAY: bool = False
VLLM_PLUGINS: Optional[List[str]] = None VLLM_PLUGINS: Optional[List[str]] = None
...@@ -364,6 +365,14 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -364,6 +365,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
(os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
("1", "true")), ("1", "true")),
# If set, allow running the engine as a separate ray actor,
# which is a deprecated feature soon to be removed.
# See https://github.com/vllm-project/vllm/issues/7045
"VLLM_ALLOW_ENGINE_USE_RAY":
lambda:
(os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
("1", "true")),
# a list of plugin names to load, separated by commas. # a list of plugin names to load, separated by commas.
# if this is not set, it means all plugins will be loaded # if this is not set, it means all plugins will be loaded
# if this is set to an empty string, no plugins will be loaded # if this is set to an empty string, no plugins will be loaded
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment