Unverified commit 9040151f, authored by Flora Feng and committed by GitHub
Browse files

[V0 Deprecation] Deprecate --disable-frontend-multiprocessing (#37612)


Signed-off-by: sfeng33 <4florafeng@gmail.com>
parent 8fbe3f30
...@@ -319,9 +319,6 @@ def _compare_tp( ...@@ -319,9 +319,6 @@ def _compare_tp(
pp_env = { pp_env = {
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
} }
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
elif distributed_backend == "mp": elif distributed_backend == "mp":
pp_env = None pp_env = None
else: else:
......
...@@ -28,7 +28,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]: ...@@ -28,7 +28,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> @pytest.mark.parametrize( >>> @pytest.mark.parametrize(
>>> "server_args", >>> "server_args",
>>> [ >>> [
>>> ["--disable-frontend-multiprocessing"], >>> ["--max-model-len", "10100"],
>>> [ >>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B", >>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice", >>> "--enable-auto-tool-choice",
...@@ -40,7 +40,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]: ...@@ -40,7 +40,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> ... >>> ...
This will run `test_foo` twice with servers with: This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing` - `--max-model-len 10100`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`. - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
""" """
...@@ -79,17 +79,6 @@ async def client(server): ...@@ -79,17 +79,6 @@ async def client(server):
yield async_client yield async_client
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer): async def test_show_version(server: RemoteOpenAIServer):
response = requests.get(server.url_for("version")) response = requests.get(server.url_for("version"))
...@@ -98,17 +87,6 @@ async def test_show_version(server: RemoteOpenAIServer): ...@@ -98,17 +87,6 @@ async def test_show_version(server: RemoteOpenAIServer):
assert response.json() == {"version": VLLM_VERSION} assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer): async def test_check_health(server: RemoteOpenAIServer):
response = requests.get(server.url_for("health")) response = requests.get(server.url_for("health"))
...@@ -119,13 +97,7 @@ async def test_check_health(server: RemoteOpenAIServer): ...@@ -119,13 +97,7 @@ async def test_check_health(server: RemoteOpenAIServer):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"server_args", "server_args",
[ [
pytest.param( pytest.param(["--max-model-len", "10100"]),
["--max-model-len", "10100"], id="default-frontend-multiprocessing"
),
pytest.param(
["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
id="disable-frontend-multiprocessing",
),
], ],
indirect=True, indirect=True,
) )
......
...@@ -50,7 +50,6 @@ def default_server_args(): ...@@ -50,7 +50,6 @@ def default_server_args():
params=[ params=[
"", "",
"--enable-chunked-prefill", "--enable-chunked-prefill",
"--disable-frontend-multiprocessing",
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
], ],
) )
......
...@@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner): ...@@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner):
return [_encode_embeds(item) for item in example_embeddings] return [_encode_embeds(item) for item in example_embeddings]
@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"]) @pytest.fixture(scope="module")
def server_with_prompt_embeds(default_server_args, request): def server_with_prompt_embeds(default_server_args):
if request.param:
default_server_args.append(request.param)
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server yield remote_server
......
...@@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure(): ...@@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure():
"0.05", "0.05",
"--max-num-seqs", "--max-num-seqs",
"2", "2",
"--disable-frontend-multiprocessing",
], ],
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
# stdout/stderr pipes are enabled during ROCm GPU initialization. # stdout/stderr pipes are enabled during ROCm GPU initialization.
......
...@@ -26,19 +26,12 @@ def default_server_args(): ...@@ -26,19 +26,12 @@ def default_server_args():
"128", "128",
"--enforce-eager", "--enforce-eager",
"--enable-prompt-tokens-details", "--enable-prompt-tokens-details",
"--no-enable-prefix-caching",
] ]
@pytest.fixture( @pytest.fixture(scope="module")
scope="module", def server(default_server_args):
params=[
["--no-enable-prefix-caching"],
["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
],
)
def server(default_server_args, request):
if request.param:
default_server_args = default_server_args + request.param
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server yield remote_server
......
...@@ -181,7 +181,6 @@ async def run_vllm_async( ...@@ -181,7 +181,6 @@ async def run_vllm_async(
n: int, n: int,
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
do_profile: bool, do_profile: bool,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False, disable_detokenize: bool = False,
) -> float: ) -> float:
from vllm import SamplingParams from vllm import SamplingParams
...@@ -191,7 +190,6 @@ async def run_vllm_async( ...@@ -191,7 +190,6 @@ async def run_vllm_async(
async with build_async_engine_client_from_engine_args( async with build_async_engine_client_from_engine_args(
engine_args, engine_args,
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
) as llm: ) as llm:
model_config = llm.model_config model_config = llm.model_config
assert all( assert all(
...@@ -757,12 +755,6 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -757,12 +755,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=False, default=False,
help="Use vLLM async engine rather than LLM class.", help="Use vLLM async engine rather than LLM class.",
) )
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument( parser.add_argument(
"--disable-detokenize", "--disable-detokenize",
action="store_true", action="store_true",
...@@ -880,7 +872,6 @@ def main(args: argparse.Namespace): ...@@ -880,7 +872,6 @@ def main(args: argparse.Namespace):
requests, requests,
args.n, args.n,
AsyncEngineArgs.from_cli_args(args), AsyncEngineArgs.from_cli_args(args),
disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
disable_detokenize=args.disable_detokenize, disable_detokenize=args.disable_detokenize,
do_profile=args.profile, do_profile=args.profile,
) )
......
...@@ -79,7 +79,6 @@ async def build_async_engine_client( ...@@ -79,7 +79,6 @@ async def build_async_engine_client(
args: Namespace, args: Namespace,
*, *,
usage_context: UsageContext = UsageContext.OPENAI_API_SERVER, usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
disable_frontend_multiprocessing: bool | None = None,
client_config: dict[str, Any] | None = None, client_config: dict[str, Any] | None = None,
) -> AsyncIterator[EngineClient]: ) -> AsyncIterator[EngineClient]:
if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver": if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver":
...@@ -98,13 +97,9 @@ async def build_async_engine_client( ...@@ -98,13 +97,9 @@ async def build_async_engine_client(
engine_args._api_process_count = client_config.get("client_count", 1) engine_args._api_process_count = client_config.get("client_count", 1)
engine_args._api_process_rank = client_config.get("client_index", 0) engine_args._api_process_rank = client_config.get("client_index", 0)
if disable_frontend_multiprocessing is None:
disable_frontend_multiprocessing = bool(args.disable_frontend_multiprocessing)
async with build_async_engine_client_from_engine_args( async with build_async_engine_client_from_engine_args(
engine_args, engine_args,
usage_context=usage_context, usage_context=usage_context,
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
client_config=client_config, client_config=client_config,
) as engine: ) as engine:
yield engine yield engine
...@@ -115,7 +110,6 @@ async def build_async_engine_client_from_engine_args( ...@@ -115,7 +110,6 @@ async def build_async_engine_client_from_engine_args(
engine_args: AsyncEngineArgs, engine_args: AsyncEngineArgs,
*, *,
usage_context: UsageContext = UsageContext.OPENAI_API_SERVER, usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
disable_frontend_multiprocessing: bool = False,
client_config: dict[str, Any] | None = None, client_config: dict[str, Any] | None = None,
) -> AsyncIterator[EngineClient]: ) -> AsyncIterator[EngineClient]:
""" """
...@@ -129,9 +123,6 @@ async def build_async_engine_client_from_engine_args( ...@@ -129,9 +123,6 @@ async def build_async_engine_client_from_engine_args(
# Create the EngineConfig (determines if we can use V1). # Create the EngineConfig (determines if we can use V1).
vllm_config = engine_args.create_engine_config(usage_context=usage_context) vllm_config = engine_args.create_engine_config(usage_context=usage_context)
if disable_frontend_multiprocessing:
logger.warning("V1 is enabled, but got --disable-frontend-multiprocessing.")
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
async_llm: AsyncLLM | None = None async_llm: AsyncLLM | None = None
......
...@@ -105,9 +105,6 @@ class BaseFrontendArgs: ...@@ -105,9 +105,6 @@ class BaseFrontendArgs:
"""When `--max-logprobs` is specified, represents single tokens as """When `--max-logprobs` is specified, represents single tokens as
strings of the form 'token_id:{token_id}' so that tokens that are not strings of the form 'token_id:{token_id}' so that tokens that are not
JSON-encodable can be identified.""" JSON-encodable can be identified."""
disable_frontend_multiprocessing: bool = False
"""If specified, will run the OpenAI frontend server in the same process as
the model serving engine."""
enable_auto_tool_choice: bool = False enable_auto_tool_choice: bool = False
"""Enable auto tool choice for supported models. Use `--tool-call-parser` """Enable auto tool choice for supported models. Use `--tool-call-parser`
to specify which parser to use.""" to specify which parser to use."""
......
...@@ -823,7 +823,6 @@ async def main(args: Namespace): ...@@ -823,7 +823,6 @@ async def main(args: Namespace):
async with build_async_engine_client( async with build_async_engine_client(
args, args,
usage_context=UsageContext.OPENAI_BATCH_RUNNER, usage_context=UsageContext.OPENAI_BATCH_RUNNER,
disable_frontend_multiprocessing=False,
) as engine_client: ) as engine_client:
await run_batch(engine_client, args) await run_batch(engine_client, args)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment