Unverified Commit e19bce40 authored by Woosuk Kwon's avatar Woosuk Kwon Committed by GitHub
Browse files

[V0 Deprecation] Remove AsyncLLMEngine (#25025)


Signed-off-by: default avatarWoosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: default avatarWoosuk Kwon <woosuk.kwon@berkeley.edu>
parent 505805b6
......@@ -28,11 +28,9 @@ def monkeypatch_module():
mpatch.undo()
@pytest.fixture(scope="module", params=[False, True])
def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811
use_v1 = request.param
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
@pytest.fixture(scope="module")
def server(monkeypatch_module, zephyr_lora_files): #noqa: F811
monkeypatch_module.setenv('VLLM_USE_V1', '1')
args = [
# use half precision for speed and memory savings in CI environment
......@@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811
yield remote_server
@pytest.fixture
def is_v1_server(server):
import os
assert os.environ['VLLM_USE_V1'] in ['0', '1']
return os.environ['VLLM_USE_V1'] == '1'
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
......@@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_structured_outputs_choice_chat(
client: openai.AsyncOpenAI, sample_structured_outputs_choices,
is_v1_server: bool):
if not is_v1_server:
pytest.skip("Structured outputs is only supported in v1 engine")
client: openai.AsyncOpenAI,
sample_structured_outputs_choices,
):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat(
@pytest.mark.asyncio
async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
async def test_structured_outputs_json_chat(
client: openai.AsyncOpenAI,
sample_json_schema,
is_v1_server: bool):
if not is_v1_server:
pytest.skip("Structured outputs is only supported in v1 engine")
):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI,
sample_regex, is_v1_server: bool):
if not is_v1_server:
pytest.skip("Structured outputs is only supported in v1 engine")
async def test_structured_outputs_regex_chat(
client: openai.AsyncOpenAI,
sample_regex,
):
messages = [{
"role": "system",
......@@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs(
@pytest.mark.asyncio
async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
is_v1_server: bool):
if not is_v1_server:
pytest.skip("Tool use is only supported in v1 engine")
async def test_named_tool_use(
client: openai.AsyncOpenAI,
sample_json_schema,
):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
@pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI,
is_v1_server: bool):
if not is_v1_server:
pytest.skip(
"JSON schema response format is only supported in v1 engine")
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
prompt = 'what is 1+1? The format is "result": 2'
# Check that this prompt cannot lead to a valid JSON without json_schema
for _ in range(2):
......
This diff is collapsed.
......@@ -14,6 +14,9 @@ from transformers import AutoConfig
from ...utils import RemoteOpenAIServer
pytest.skip("Skipping prompt_embeds test until V1 supports it.",
allow_module_level=True)
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
......
......@@ -53,12 +53,13 @@ def monkeypatch_module():
mpatch.undo()
@pytest.fixture(scope="module", params=[False, True])
@pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, monkeypatch_module,
zephyr_lora_files):
use_v1 = request.param
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
assert use_v1
monkeypatch_module.setenv('VLLM_USE_V1', '1')
# Define the json format LoRA module configurations
lora_module_1 = {
......
......@@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
PREV_MINOR_VERSION = version._prev_minor_version()
@pytest.fixture(scope="module", params=[True, False])
@pytest.fixture(scope="module", params=[True])
def use_v1(request):
# Module-scoped variant of run_with_both_engines
#
......
......@@ -10,8 +10,30 @@ import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
from .test_completion import default_server_args # noqa: F401
from .test_completion import MODEL_NAME
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--max-num-seqs",
"128",
"--enforce-eager",
# lora config
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
]
@pytest.fixture(scope="module")
......
......@@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
DTYPE = "float16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def server():
args = [
......
......@@ -7,7 +7,6 @@ import pytest
import vllm.envs as envs
from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
......@@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch):
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
def test_reject_using_constructor_directly(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
# Sets VLLM_USE_V1=1.
vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()
# This uses the V0 constructor directly.
with pytest.raises(ValueError):
AsyncLLMEngine(vllm_config,
AsyncLLMEngine._get_executor_cls(vllm_config),
log_stats=True)
m.delenv("VLLM_USE_V1")
This diff is collapsed.
......@@ -11,7 +11,6 @@ import uvicorn
from fastapi import FastAPI, Request, Response
from vllm import envs
from vllm.engine.async_llm_engine import AsyncEngineDeadError
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
......@@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
"""
@app.exception_handler(RuntimeError)
@app.exception_handler(AsyncEngineDeadError)
@app.exception_handler(EngineDeadError)
@app.exception_handler(EngineGenerateError)
async def runtime_exception_handler(request: Request, __):
......
......@@ -38,7 +38,6 @@ from typing_extensions import assert_never
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (load_chat_template,
resolve_hf_chat_template,
......@@ -201,7 +200,8 @@ async def build_async_engine_client_from_engine_args(
vllm_config = engine_args.create_engine_config(usage_context=usage_context)
# V1 AsyncLLM.
if envs.VLLM_USE_V1:
assert envs.VLLM_USE_V1
if disable_frontend_multiprocessing:
logger.warning(
"V1 is enabled, but got --disable-frontend-multiprocessing. "
......@@ -209,10 +209,8 @@ async def build_async_engine_client_from_engine_args(
from vllm.v1.engine.async_llm import AsyncLLM
async_llm: Optional[AsyncLLM] = None
client_count = client_config.pop(
"client_count") if client_config else 1
client_index = client_config.pop(
"client_index") if client_config else 0
client_count = client_config.pop("client_count") if client_config else 1
client_index = client_config.pop("client_index") if client_config else 0
try:
async_llm = AsyncLLM.from_vllm_config(
vllm_config=vllm_config,
......@@ -231,21 +229,6 @@ async def build_async_engine_client_from_engine_args(
if async_llm:
async_llm.shutdown()
# V0 AsyncLLM.
else:
engine_client: Optional[EngineClient] = None
try:
engine_client = AsyncLLMEngine.from_vllm_config(
vllm_config=vllm_config,
usage_context=usage_context,
enable_log_requests=engine_args.enable_log_requests,
disable_log_stats=engine_args.disable_log_stats)
yield engine_client
finally:
if engine_client and hasattr(engine_client, "shutdown"):
engine_client.shutdown()
async def validate_json_request(raw_request: Request):
content_type = raw_request.headers.get("content-type", "").lower()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment