Unverified Commit 52c2a8d4 authored by Woosuk Kwon's avatar Woosuk Kwon Committed by GitHub
Browse files

[V0 Deprecation] Remove LLMEngine (#25033)


Signed-off-by: default avatarWoosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: default avatarWoosuk Kwon <woosuk.kwon@berkeley.edu>
parent 367a480b
......@@ -3,47 +3,18 @@
import pytest
from vllm.core.scheduler import Scheduler
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.engine.llm_engine import LLMEngine
class DummyV0Scheduler(Scheduler):
def schedule(self):
raise Exception("Exception raised by DummyV0Scheduler")
class DummyV1Scheduler(V1Scheduler):
class DummyV1Scheduler(Scheduler):
def schedule(self):
raise Exception("Exception raised by DummyV1Scheduler")
def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs(
model="facebook/opt-125m",
enforce_eager=True, # reduce test time
scheduler_cls=DummyV0Scheduler,
)
engine = LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params)
engine.step()
assert str(
exception_info.value) == "Exception raised by DummyV0Scheduler"
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
......@@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
scheduler_cls=DummyV1Scheduler,
)
engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
engine = LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params)
......
......@@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM
from vllm.assets.audio import AudioAsset
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
# FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256.
# 2. Increase beam_width to 8.
......
......@@ -9,13 +9,6 @@ import pytest
from vllm import SamplingParams
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
......
......@@ -8,12 +8,6 @@ from vllm import SamplingParams
MODELS = ["distilbert/distilgpt2"]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
......
......@@ -352,58 +352,3 @@ def test_decode_prompt_logprobs(complete_sequence: str,
logprobs[token_id + 1].decoded_token
for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
])
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
def test_decode_prompt_logprobs_chunked_prefill(
vllm_runner,
model,
chunked_prefill_token_size: int,
example_prompts,
monkeypatch,
):
# VLLM V1 does not use incremental detokenization for
# prompt logprobs, so this test strategy is irrelevant.
monkeypatch.setenv("VLLM_USE_V1", "0")
max_num_seqs = 256
enable_chunked_prefill = False
max_num_batched_tokens = None
if chunked_prefill_token_size != -1:
enable_chunked_prefill = True
max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
max_num_batched_tokens = chunked_prefill_token_size
with vllm_runner(model,
dtype="half",
max_logprobs=5,
gpu_memory_utilization=0.5,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_seqs) as vllm_model:
vllm_sampling_params = SamplingParams(max_tokens=10,
logprobs=5,
prompt_logprobs=5,
temperature=0.0)
vllm_results = vllm_model.llm.generate(
example_prompts, sampling_params=vllm_sampling_params)
for idx, result in enumerate(vllm_results):
assert result.prompt_logprobs is not None
assert result.prompt_logprobs[0] is None
# Compared detokenized prompts ids to original prompt.
generated_string = ""
for (prompt_token,
prompt_logprobs) in zip(result.prompt_token_ids[1:],
result.prompt_logprobs[1:]):
# prompt_logprobs is a dict of the token_id: logprob
# We select the token_id corresponding to the actual prompt
# Decoded token in the detokenized string corresponding to this
# prompt token.
generated_string += prompt_logprobs[prompt_token].decoded_token
assert generated_string == example_prompts[idx], (
"Detokenized prompt logprobs do not match original prompt")
......@@ -1508,14 +1508,6 @@ class EngineArgs:
recommend_to_remove=True)
return False
if self.kv_cache_dtype != "auto":
supported = current_platform.is_kv_cache_dtype_supported(
self.kv_cache_dtype, model_config)
if not supported:
_raise_or_fallback(feature_name="--kv-cache-dtype",
recommend_to_remove=False)
return False
# No Mamba or Encoder-Decoder so far.
if not model_config.is_v1_compatible:
_raise_or_fallback(feature_name=model_config.architectures,
......
This diff is collapsed.
......@@ -11,7 +11,6 @@ from pydantic import ValidationError
from tqdm.auto import tqdm
from typing_extensions import TypeVar
import vllm.envs as envs
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
BeamSearchSequence,
create_sort_beams_key_function)
......@@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType,
StructuredOutputsConfig, TokenizerMode, is_init_field)
from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
PoolerConfig, RunnerOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
apply_hf_chat_template,
......@@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device, as_iter, is_list_of
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.sample.logits_processor import LogitsProcessor
if TYPE_CHECKING:
......@@ -309,11 +308,7 @@ class LLM:
self.request_counter = Counter()
self.default_sampling_params: Union[dict[str, Any], None] = None
if envs.VLLM_USE_V1:
supported_tasks = self.llm_engine \
.get_supported_tasks() # type: ignore
else:
supported_tasks = self.llm_engine.model_config.supported_tasks
supported_tasks = self.llm_engine.get_supported_tasks() # type: ignore
logger.info("Supported_tasks: %s", supported_tasks)
......@@ -1473,8 +1468,6 @@ class LLM:
Note:
This method is only available with the V1 LLM engine.
"""
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
assert isinstance(self.llm_engine, V1LLMEngine)
return self.llm_engine.get_metrics()
def _validate_and_add_requests(
......
......@@ -672,21 +672,15 @@ def tensorize_vllm_model(engine_args: "EngineArgs",
) as stream:
stream.write(encryption_params.key)
from vllm import LLMEngine
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
if not envs.VLLM_USE_V1:
engine = LLMEngine.from_engine_args(engine_args)
engine.model_executor.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
else:
engine = V1LLMEngine.from_vllm_config(engine_config)
engine.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
assert envs.VLLM_USE_V1
from vllm.v1.engine.llm_engine import LLMEngine
engine = LLMEngine.from_vllm_config(engine_config)
engine.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
def tensorize_lora_adapter(lora_path: str,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment