[V0 Deprecation] Remove AsyncLLMEngine (#25025)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V0 Deprecation] Remove AsyncLLMEngine (#25025)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
e19bce40 · Woosuk Kwon · GitHub · 505805b6 · e19bce40 · 505805b6
Unverified commit e19bce40, authored Sep 18, 2025 by Woosuk Kwon; committed by GitHub on Sep 18, 2025.
11 changed files
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -28,11 +28,9 @@ def monkeypatch_module():
    mpatch.undo()
-@pytest.fixture(scope="module", params=[False, True])
+@pytest.fixture(scope="module")
-def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
+def server(monkeypatch_module, zephyr_lora_files):  #noqa: F811
+    monkeypatch_module.setenv('VLLM_USE_V1', '1')
-    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
    args = [
        # use half precision for speed and memory savings in CI environment
@@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
        yield remote_server
-@pytest.fixture
-def is_v1_server(server):
-    import os
-    assert os.environ['VLLM_USE_V1'] in ['0', '1']
-    return os.environ['VLLM_USE_V1'] == '1'
 @pytest_asyncio.fixture
 async def client(server):
    async with server.get_async_client() as async_client:
@@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 async def test_structured_outputs_choice_chat(
-        client: openai.AsyncOpenAI, sample_structured_outputs_choices,
+    client: openai.AsyncOpenAI,
-        is_v1_server: bool):
+    sample_structured_outputs_choices,
-    if not is_v1_server:
+):
-        pytest.skip("Structured outputs is only supported in v1 engine")
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat(
 @pytest.mark.asyncio
-async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
+async def test_structured_outputs_json_chat(
+    client: openai.AsyncOpenAI,
    sample_json_schema,
-                                            is_v1_server: bool):
+):
-    if not is_v1_server:
-        pytest.skip("Structured outputs is only supported in v1 engine")
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI,
+async def test_structured_outputs_regex_chat(
-                                             sample_regex, is_v1_server: bool):
+    client: openai.AsyncOpenAI,
-    if not is_v1_server:
+    sample_regex,
-        pytest.skip("Structured outputs is only supported in v1 engine")
+):
    messages = [{
        "role": "system",
@@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs(
 @pytest.mark.asyncio
-async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
+async def test_named_tool_use(
-                              is_v1_server: bool):
+    client: openai.AsyncOpenAI,
-    if not is_v1_server:
+    sample_json_schema,
-        pytest.skip("Tool use is only supported in v1 engine")
+):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_response_format_json_schema(client: openai.AsyncOpenAI,
+async def test_response_format_json_schema(client: openai.AsyncOpenAI):
-                                           is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip(
-            "JSON schema response format is only supported in v1 engine")
    prompt = 'what is 1+1? The format is "result": 2'
    # Check that this prompt cannot lead to a valid JSON without json_schema
    for _ in range(2):

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -14,6 +14,9 @@ from transformers import AutoConfig
 from ...utils import RemoteOpenAIServer
+pytest.skip("Skipping prompt_embeds test until V1 supports it.",
+            allow_module_level=True)
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -53,12 +53,13 @@ def monkeypatch_module():
    mpatch.undo()
-@pytest.fixture(scope="module", params=[False, True])
+@pytest.fixture(scope="module", params=[True])
 def server_with_lora_modules_json(request, monkeypatch_module,
                                  zephyr_lora_files):
    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+    assert use_v1
+    monkeypatch_module.setenv('VLLM_USE_V1', '1')
    # Define the json format LoRA module configurations
    lora_module_1 = {

--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()
-@pytest.fixture(scope="module", params=[True, False])
+@pytest.fixture(scope="module", params=[True])
 def use_v1(request):
    # Module-scoped variant of run_with_both_engines
    #

--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -10,8 +10,30 @@ import pytest
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
-from .test_completion import default_server_args  # noqa: F401
-from .test_completion import MODEL_NAME
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.fixture(scope="module")
+def default_server_args(zephyr_lora_files):
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+    ]
 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
 @pytest.fixture(scope="module")
 def server():
    args = [

--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -7,7 +7,6 @@ import pytest
 import vllm.envs as envs
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
@@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch):
        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")
-def test_reject_using_constructor_directly(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-        # Sets VLLM_USE_V1=1.
-        vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()
-        # This uses the V0 constructor directly.
-        with pytest.raises(ValueError):
-            AsyncLLMEngine(vllm_config,
-                           AsyncLLMEngine._get_executor_cls(vllm_config),
-                           log_stats=True)
-        m.delenv("VLLM_USE_V1")
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -11,7 +11,6 @@ import uvicorn
 from fastapi import FastAPI, Request, Response
 from vllm import envs
-from vllm.engine.async_llm_engine import AsyncEngineDeadError
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
                                        H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
@@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
    """
    @app.exception_handler(RuntimeError)
-    @app.exception_handler(AsyncEngineDeadError)
    @app.exception_handler(EngineDeadError)
    @app.exception_handler(EngineGenerateError)
    async def runtime_exception_handler(request: Request, __):

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -38,7 +38,6 @@ from typing_extensions import assert_never
 import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (load_chat_template,
                                         resolve_hf_chat_template,
@@ -201,7 +200,8 @@ async def build_async_engine_client_from_engine_args(
    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
    # V1 AsyncLLM.
-    if envs.VLLM_USE_V1:
+    assert envs.VLLM_USE_V1
    if disable_frontend_multiprocessing:
        logger.warning(
            "V1 is enabled, but got --disable-frontend-multiprocessing. "
@@ -209,10 +209,8 @@ async def build_async_engine_client_from_engine_args(
    from vllm.v1.engine.async_llm import AsyncLLM
    async_llm: Optional[AsyncLLM] = None
-        client_count = client_config.pop(
+    client_count = client_config.pop("client_count") if client_config else 1
-            "client_count") if client_config else 1
+    client_index = client_config.pop("client_index") if client_config else 0
-        client_index = client_config.pop(
-            "client_index") if client_config else 0
    try:
        async_llm = AsyncLLM.from_vllm_config(
            vllm_config=vllm_config,
@@ -231,21 +229,6 @@ async def build_async_engine_client_from_engine_args(
        if async_llm:
            async_llm.shutdown()
-    # V0 AsyncLLM.
-    else:
-        engine_client: Optional[EngineClient] = None
-        try:
-            engine_client = AsyncLLMEngine.from_vllm_config(
-                vllm_config=vllm_config,
-                usage_context=usage_context,
-                enable_log_requests=engine_args.enable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats)
-            yield engine_client
-        finally:
-            if engine_client and hasattr(engine_client, "shutdown"):
-                engine_client.shutdown()
 async def validate_json_request(raw_request: Request):
    content_type = raw_request.headers.get("content-type", "").lower()