Commit e19bce40 (unverified), authored by Woosuk Kwon, committed by GitHub
Browse files

[V0 Deprecation] Remove AsyncLLMEngine (#25025)


Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent 505805b6
...@@ -28,11 +28,9 @@ def monkeypatch_module(): ...@@ -28,11 +28,9 @@ def monkeypatch_module():
mpatch.undo() mpatch.undo()
@pytest.fixture(scope="module", params=[False, True]) @pytest.fixture(scope="module")
def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 def server(monkeypatch_module, zephyr_lora_files): #noqa: F811
monkeypatch_module.setenv('VLLM_USE_V1', '1')
use_v1 = request.param
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
...@@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 ...@@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811
yield remote_server yield remote_server
@pytest.fixture
def is_v1_server(server):
import os
assert os.environ['VLLM_USE_V1'] in ['0', '1']
return os.environ['VLLM_USE_V1'] == '1'
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def client(server): async def client(server):
async with server.get_async_client() as async_client: async with server.get_async_client() as async_client:
...@@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_structured_outputs_choice_chat( async def test_structured_outputs_choice_chat(
client: openai.AsyncOpenAI, sample_structured_outputs_choices, client: openai.AsyncOpenAI,
is_v1_server: bool): sample_structured_outputs_choices,
if not is_v1_server: ):
pytest.skip("Structured outputs is only supported in v1 engine")
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat( ...@@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat(
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, async def test_structured_outputs_json_chat(
client: openai.AsyncOpenAI,
sample_json_schema, sample_json_schema,
is_v1_server: bool): ):
if not is_v1_server:
pytest.skip("Structured outputs is only supported in v1 engine")
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, ...@@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, async def test_structured_outputs_regex_chat(
sample_regex, is_v1_server: bool): client: openai.AsyncOpenAI,
if not is_v1_server: sample_regex,
pytest.skip("Structured outputs is only supported in v1 engine") ):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs( ...@@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs(
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, async def test_named_tool_use(
is_v1_server: bool): client: openai.AsyncOpenAI,
if not is_v1_server: sample_json_schema,
pytest.skip("Tool use is only supported in v1 engine") ):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): ...@@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI, async def test_response_format_json_schema(client: openai.AsyncOpenAI):
is_v1_server: bool):
if not is_v1_server:
pytest.skip(
"JSON schema response format is only supported in v1 engine")
prompt = 'what is 1+1? The format is "result": 2' prompt = 'what is 1+1? The format is "result": 2'
# Check that this prompt cannot lead to a valid JSON without json_schema # Check that this prompt cannot lead to a valid JSON without json_schema
for _ in range(2): for _ in range(2):
......
This diff is collapsed.
...@@ -14,6 +14,9 @@ from transformers import AutoConfig ...@@ -14,6 +14,9 @@ from transformers import AutoConfig
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
pytest.skip("Skipping prompt_embeds test until V1 supports it.",
allow_module_level=True)
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
......
...@@ -53,12 +53,13 @@ def monkeypatch_module(): ...@@ -53,12 +53,13 @@ def monkeypatch_module():
mpatch.undo() mpatch.undo()
@pytest.fixture(scope="module", params=[False, True]) @pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, monkeypatch_module, def server_with_lora_modules_json(request, monkeypatch_module,
zephyr_lora_files): zephyr_lora_files):
use_v1 = request.param use_v1 = request.param
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') assert use_v1
monkeypatch_module.setenv('VLLM_USE_V1', '1')
# Define the json format LoRA module configurations # Define the json format LoRA module configurations
lora_module_1 = { lora_module_1 = {
......
...@@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ...@@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
PREV_MINOR_VERSION = version._prev_minor_version() PREV_MINOR_VERSION = version._prev_minor_version()
@pytest.fixture(scope="module", params=[True, False]) @pytest.fixture(scope="module", params=[True])
def use_v1(request): def use_v1(request):
# Module-scoped variant of run_with_both_engines # Module-scoped variant of run_with_both_engines
# #
......
...@@ -10,8 +10,30 @@ import pytest ...@@ -10,8 +10,30 @@ import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .test_completion import default_server_args # noqa: F401
from .test_completion import MODEL_NAME MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--max-num-seqs",
"128",
"--enforce-eager",
# lora config
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
......
...@@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" ...@@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
DTYPE = "float16" DTYPE = "float16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
......
...@@ -7,7 +7,6 @@ import pytest ...@@ -7,7 +7,6 @@ import pytest
import vllm.envs as envs import vllm.envs as envs
from vllm import LLM from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = "meta-llama/Llama-3.2-1B-Instruct"
...@@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch): ...@@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch):
_ = AsyncEngineArgs(model=MODEL).create_engine_config() _ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert envs.VLLM_USE_V1 assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1") m.delenv("VLLM_USE_V1")
def test_reject_using_constructor_directly(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
# Sets VLLM_USE_V1=1.
vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()
# This uses the V0 constructor directly.
with pytest.raises(ValueError):
AsyncLLMEngine(vllm_config,
AsyncLLMEngine._get_executor_cls(vllm_config),
log_stats=True)
m.delenv("VLLM_USE_V1")
This diff is collapsed.
...@@ -11,7 +11,6 @@ import uvicorn ...@@ -11,7 +11,6 @@ import uvicorn
from fastapi import FastAPI, Request, Response from fastapi import FastAPI, Request, Response
from vllm import envs from vllm import envs
from vllm.engine.async_llm_engine import AsyncEngineDeadError
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
...@@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: ...@@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
""" """
@app.exception_handler(RuntimeError) @app.exception_handler(RuntimeError)
@app.exception_handler(AsyncEngineDeadError)
@app.exception_handler(EngineDeadError) @app.exception_handler(EngineDeadError)
@app.exception_handler(EngineGenerateError) @app.exception_handler(EngineGenerateError)
async def runtime_exception_handler(request: Request, __): async def runtime_exception_handler(request: Request, __):
......
...@@ -38,7 +38,6 @@ from typing_extensions import assert_never ...@@ -38,7 +38,6 @@ from typing_extensions import assert_never
import vllm.envs as envs import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (load_chat_template, from vllm.entrypoints.chat_utils import (load_chat_template,
resolve_hf_chat_template, resolve_hf_chat_template,
...@@ -201,7 +200,8 @@ async def build_async_engine_client_from_engine_args( ...@@ -201,7 +200,8 @@ async def build_async_engine_client_from_engine_args(
vllm_config = engine_args.create_engine_config(usage_context=usage_context) vllm_config = engine_args.create_engine_config(usage_context=usage_context)
# V1 AsyncLLM. # V1 AsyncLLM.
if envs.VLLM_USE_V1: assert envs.VLLM_USE_V1
if disable_frontend_multiprocessing: if disable_frontend_multiprocessing:
logger.warning( logger.warning(
"V1 is enabled, but got --disable-frontend-multiprocessing. " "V1 is enabled, but got --disable-frontend-multiprocessing. "
...@@ -209,10 +209,8 @@ async def build_async_engine_client_from_engine_args( ...@@ -209,10 +209,8 @@ async def build_async_engine_client_from_engine_args(
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
async_llm: Optional[AsyncLLM] = None async_llm: Optional[AsyncLLM] = None
client_count = client_config.pop( client_count = client_config.pop("client_count") if client_config else 1
"client_count") if client_config else 1 client_index = client_config.pop("client_index") if client_config else 0
client_index = client_config.pop(
"client_index") if client_config else 0
try: try:
async_llm = AsyncLLM.from_vllm_config( async_llm = AsyncLLM.from_vllm_config(
vllm_config=vllm_config, vllm_config=vllm_config,
...@@ -231,21 +229,6 @@ async def build_async_engine_client_from_engine_args( ...@@ -231,21 +229,6 @@ async def build_async_engine_client_from_engine_args(
if async_llm: if async_llm:
async_llm.shutdown() async_llm.shutdown()
# V0 AsyncLLM.
else:
engine_client: Optional[EngineClient] = None
try:
engine_client = AsyncLLMEngine.from_vllm_config(
vllm_config=vllm_config,
usage_context=usage_context,
enable_log_requests=engine_args.enable_log_requests,
disable_log_stats=engine_args.disable_log_stats)
yield engine_client
finally:
if engine_client and hasattr(engine_client, "shutdown"):
engine_client.shutdown()
async def validate_json_request(raw_request: Request): async def validate_json_request(raw_request: Request):
content_type = raw_request.headers.get("content-type", "").lower() content_type = raw_request.headers.get("content-type", "").lower()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment