Unverified commit 9040151f, authored by Flora Feng and committed by GitHub
Browse files

[V0 Deprecation] Deprecate --disable-frontend-multiprocessing (#37612)


Signed-off-by: sfeng33 <4florafeng@gmail.com>
parent 8fbe3f30
......@@ -319,9 +319,6 @@ def _compare_tp(
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
elif distributed_backend == "mp":
pp_env = None
else:
......
......@@ -28,7 +28,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> @pytest.mark.parametrize(
>>> "server_args",
>>> [
>>> ["--disable-frontend-multiprocessing"],
>>> ["--max-model-len", "10100"],
>>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice",
......@@ -40,7 +40,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> ...
This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing`
- `--max-model-len 10100`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
"""
......@@ -79,17 +79,6 @@ async def client(server):
yield async_client
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
response = requests.get(server.url_for("version"))
......@@ -98,17 +87,6 @@ async def test_show_version(server: RemoteOpenAIServer):
assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
response = requests.get(server.url_for("health"))
......@@ -119,13 +97,7 @@ async def test_check_health(server: RemoteOpenAIServer):
@pytest.mark.parametrize(
"server_args",
[
pytest.param(
["--max-model-len", "10100"], id="default-frontend-multiprocessing"
),
pytest.param(
["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
id="disable-frontend-multiprocessing",
),
pytest.param(["--max-model-len", "10100"]),
],
indirect=True,
)
......
......@@ -50,7 +50,6 @@ def default_server_args():
params=[
"",
"--enable-chunked-prefill",
"--disable-frontend-multiprocessing",
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
],
)
......
......@@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner):
return [_encode_embeds(item) for item in example_embeddings]
@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request):
if request.param:
default_server_args.append(request.param)
@pytest.fixture(scope="module")
def server_with_prompt_embeds(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
......
......@@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure():
"0.05",
"--max-num-seqs",
"2",
"--disable-frontend-multiprocessing",
],
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
# stdout/stderr pipes are enabled during ROCm GPU initialization.
......
......@@ -26,19 +26,12 @@ def default_server_args():
"128",
"--enforce-eager",
"--enable-prompt-tokens-details",
"--no-enable-prefix-caching",
]
@pytest.fixture(
scope="module",
params=[
["--no-enable-prefix-caching"],
["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
],
)
def server(default_server_args, request):
if request.param:
default_server_args = default_server_args + request.param
@pytest.fixture(scope="module")
def server(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
......
......@@ -181,7 +181,6 @@ async def run_vllm_async(
n: int,
engine_args: AsyncEngineArgs,
do_profile: bool,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False,
) -> float:
from vllm import SamplingParams
......@@ -191,7 +190,6 @@ async def run_vllm_async(
async with build_async_engine_client_from_engine_args(
engine_args,
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
) as llm:
model_config = llm.model_config
assert all(
......@@ -757,12 +755,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=False,
help="Use vLLM async engine rather than LLM class.",
)
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
......@@ -880,7 +872,6 @@ def main(args: argparse.Namespace):
requests,
args.n,
AsyncEngineArgs.from_cli_args(args),
disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
disable_detokenize=args.disable_detokenize,
do_profile=args.profile,
)
......
......@@ -79,7 +79,6 @@ async def build_async_engine_client(
args: Namespace,
*,
usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
disable_frontend_multiprocessing: bool | None = None,
client_config: dict[str, Any] | None = None,
) -> AsyncIterator[EngineClient]:
if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver":
......@@ -98,13 +97,9 @@ async def build_async_engine_client(
engine_args._api_process_count = client_config.get("client_count", 1)
engine_args._api_process_rank = client_config.get("client_index", 0)
if disable_frontend_multiprocessing is None:
disable_frontend_multiprocessing = bool(args.disable_frontend_multiprocessing)
async with build_async_engine_client_from_engine_args(
engine_args,
usage_context=usage_context,
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
client_config=client_config,
) as engine:
yield engine
......@@ -115,7 +110,6 @@ async def build_async_engine_client_from_engine_args(
engine_args: AsyncEngineArgs,
*,
usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
disable_frontend_multiprocessing: bool = False,
client_config: dict[str, Any] | None = None,
) -> AsyncIterator[EngineClient]:
"""
......@@ -129,9 +123,6 @@ async def build_async_engine_client_from_engine_args(
# Create the EngineConfig (determines if we can use V1).
vllm_config = engine_args.create_engine_config(usage_context=usage_context)
if disable_frontend_multiprocessing:
logger.warning("V1 is enabled, but got --disable-frontend-multiprocessing.")
from vllm.v1.engine.async_llm import AsyncLLM
async_llm: AsyncLLM | None = None
......
......@@ -105,9 +105,6 @@ class BaseFrontendArgs:
"""When `--max-logprobs` is specified, represents single tokens as
strings of the form 'token_id:{token_id}' so that tokens that are not
JSON-encodable can be identified."""
disable_frontend_multiprocessing: bool = False
"""If specified, will run the OpenAI frontend server in the same process as
the model serving engine."""
enable_auto_tool_choice: bool = False
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
to specify which parser to use."""
......
......@@ -823,7 +823,6 @@ async def main(args: Namespace):
async with build_async_engine_client(
args,
usage_context=UsageContext.OPENAI_BATCH_RUNNER,
disable_frontend_multiprocessing=False,
) as engine_client:
await run_batch(engine_client, args)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment