Unverified Commit d4d93db2 authored by Robert Shaw's avatar Robert Shaw Committed by GitHub
Browse files

[V1] V1 Enablement Oracle (#13726)


Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
parent 8c0d15d5
......@@ -6,11 +6,18 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
"""
from typing import Optional
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Opt every test in this file into running with both engines.

    Requesting the ``run_with_both_engines`` fixture is the only purpose of
    this autouse fixture; there is nothing else to set up.
    """
def _generate(
model: LLM,
prompt: str,
......
......@@ -7,6 +7,12 @@ from vllm import SamplingParams
MODELS = ["distilbert/distilgpt2"]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Apply ``run_with_both_engines`` to all tests in this module.

    The fixture body is intentionally empty: depending on
    ``run_with_both_engines`` is what makes both engines get exercised.
    """
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
......
......@@ -8,6 +8,15 @@ import torch.nn.functional as F
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this file to the V0 engine.

    The file tests V0 internals, so VLLM_USE_V1 is set to 0 for the
    duration of each test (function scope is pytest's default).
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
......
......@@ -18,6 +18,14 @@ from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import Counter, is_pin_memory_available
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Run each test here with VLLM_USE_V1=0.

    This file exercises V0 internals; monkeypatch restores the previous
    environment once the test finishes.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
class MockLogitsSampler(Sampler):
def __init__(self, fake_logits: torch.Tensor):
......
......@@ -17,7 +17,9 @@ RANDOM_SEEDS = list(range(5))
@pytest.fixture
def vllm_model(vllm_runner):
def vllm_model(vllm_runner, monkeypatch):
# This file relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(MODEL, dtype="half") as vllm_model:
yield vllm_model
......
......@@ -11,6 +11,14 @@ from vllm.model_executor.utils import set_random_seed
CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Force VLLM_USE_V1=0 around every test in this file.

    The tests target V0 internals, so the V1 engine must not be selected.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
"""
Generates a fake temperature zero probability distribution.
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Set VLLM_USE_V1=0 for all tests in this module.

    The module is V0 only, so the environment variable is patched for each
    test function.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
......@@ -12,6 +12,14 @@ from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Keep the Tensorizer tests on the V0 engine.

    Tensorizer has only been tested on V0 so far, so VLLM_USE_V1 is set to 0
    for each test.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
@pytest.fixture(autouse=True)
def cleanup():
    """Invoke ``cleanup_dist_env_and_memory`` (with Ray shutdown requested)
    during the setup of every test."""
    cleanup_dist_env_and_memory(shutdown_ray=True)
......
......@@ -7,11 +7,13 @@ will never happen again.
"""
import gc
import pytest
import torch
from vllm import LLM, SamplingParams
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
def test_duplicated_ignored_sequence_group():
"""https://github.com/vllm-project/vllm/issues/1655"""
......
......@@ -366,7 +366,10 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder():
def test_bind_kv_cache_encoder_decoder(monkeypatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
monkeypatch.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType
# example from bart
......
......@@ -279,7 +279,12 @@ def test_decode_prompt_logprobs_chunked_prefill(
model,
chunked_prefill_token_size: int,
example_prompts,
monkeypatch,
):
# VLLM V1 does not use incremental detokenization for
# prompt logprobs, so this test strategy is irrelevant.
monkeypatch.setenv("VLLM_USE_V1", "0")
max_num_seqs = 256
enable_chunked_prefill = False
max_num_batched_tokens = None
......
......@@ -91,20 +91,22 @@ CONFIGS: dict[str, ServerConfig] = {
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally."
},
"granite20b": {
"model":
"mbayser/granite-20b-functioncalling-FP8-KV",
"arguments": [
"--tool-call-parser", "granite-20b-fc", "--chat-template",
str(VLLM_PATH /
"examples/tool_chat_template_granite_20b_fc.jinja"),
"--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
],
"supports_parallel":
False,
"supports_rocm":
False,
},
# V1 Test: Passing locally but failing in CI. This runs the
# V0 Engine because of CPU offloading. Need to debug why.
# "granite20b": {
# "model":
# "mbayser/granite-20b-functioncalling-FP8-KV",
# "arguments": [
# "--tool-call-parser", "granite-20b-fc", "--chat-template",
# str(VLLM_PATH /
# "examples/tool_chat_template_granite_20b_fc.jinja"),
# "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
# ],
# "supports_parallel":
# False,
# "supports_rocm":
# False,
# },
"granite-3.0-8b": {
"model":
"ibm-granite/granite-3.0-8b-instruct",
......
......@@ -19,6 +19,16 @@ from opentelemetry.sdk.environment_variables import (
from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Patch VLLM_USE_V1 to 0 for each test in this module.

    The module is V0 only, so none of its tests may run on the V1 engine.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
......
......@@ -18,19 +18,19 @@ if not envs.VLLM_USE_V1:
def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([])
engine_args = EngineArgs.from_cli_args(args=args)
assert (engine_args.enable_prefix_caching
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert (vllm_config.cache_config.enable_prefix_caching
), "V1 turns on prefix caching by default."
# Turning it off is possible with the flag.
args = parser.parse_args(["--no-enable-prefix-caching"])
engine_args = EngineArgs.from_cli_args(args=args)
assert not engine_args.enable_prefix_caching
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert not vllm_config.cache_config.enable_prefix_caching
# Turn it on with flag.
args = parser.parse_args(["--enable-prefix-caching"])
engine_args = EngineArgs.from_cli_args(args=args)
assert engine_args.enable_prefix_caching
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert vllm_config.cache_config.enable_prefix_caching
def test_defaults_with_usage_context():
......@@ -38,11 +38,21 @@ def test_defaults_with_usage_context():
vllm_config: VllmConfig = engine_args.create_engine_config(
UsageContext.LLM_CLASS)
from vllm.platforms import current_platform
device_name = current_platform.get_device_name().lower()
if "h100" in device_name or "h200" in device_name:
# For H100 and H200, we use larger default values.
default_llm_tokens = 16384
default_server_tokens = 8192
else:
default_llm_tokens = 8192
default_server_tokens = 2048
assert vllm_config.scheduler_config.max_num_seqs == 1024
assert vllm_config.scheduler_config.max_num_batched_tokens == 8192
assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens # noqa: E501
engine_args = EngineArgs(model="facebook/opt-125m")
vllm_config = engine_args.create_engine_config(
UsageContext.OPENAI_API_SERVER)
assert vllm_config.scheduler_config.max_num_seqs == 1024
assert vllm_config.scheduler_config.max_num_batched_tokens == 2048
assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens # noqa: E501
......@@ -6,7 +6,6 @@ from collections.abc import Generator
import pytest
import torch
from tests.kernels.utils import override_backend_env_variable
from tests.v1.sample.utils import (
BatchLogprobsComposition, BatchLogprobsSpecType,
assert_incr_detok_str_matches_non_incr_detok_str,
......@@ -334,7 +333,7 @@ def test_get_logprobs_and_prompt_logprobs(
do_apc=do_apc)
def test_max_logprobs(monkeypatch):
def test_max_logprobs():
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
......@@ -344,7 +343,6 @@ def test_max_logprobs(monkeypatch):
Args:
monkeypatch
"""
override_backend_env_variable(monkeypatch, "FLASH_ATTN")
runner = VllmRunner("facebook/opt-125m",
max_logprobs=1,
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
import vllm.envs as envs
from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1", # mamba
"ibm-ai-platform/Bamba-9B", # hybrid
"BAAI/bge-m3", # embedding
]
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
def test_reject_unsupported_models(monkeypatch, model):
    """With VLLM_USE_V1=1 forced, building the engine config for any model in
    UNSUPPORTED_MODELS_V1 must raise NotImplementedError."""
    with monkeypatch.context() as env_patch:
        env_patch.setenv("VLLM_USE_V1", "1")
        engine_args = AsyncEngineArgs(model=model)

        with pytest.raises(NotImplementedError):
            engine_args.create_engine_config()

        env_patch.delenv("VLLM_USE_V1")
def test_reject_bad_config(monkeypatch):
    """Set VLLM_USE_V1=0 inside a scoped monkeypatch context.

    NOTE(review): this test contains no assertions; it looks like a
    placeholder -- confirm the intended checks.
    """
    with monkeypatch.context() as env_patch:
        env_patch.setenv("VLLM_USE_V1", "0")
def test_unsupported_configs(monkeypatch):
    """With VLLM_USE_V1=1 forced, each engine argument listed below must make
    ``create_engine_config`` raise NotImplementedError."""
    # One keyword override per case, checked in this order.
    rejected_overrides = (
        {"kv_cache_dtype": "fp8"},
        {"speculative_model": MODEL},
        {"guided_decoding_backend": "lm-format-enforcer:no-fallback"},
        {"preemption_mode": "swap"},
        {"disable_async_output_proc": True},
        {"scheduling_policy": "priority"},
        {"num_scheduler_steps": 5},
        {"scheduler_delay_factor": 1.2},
    )

    with monkeypatch.context() as env_patch:
        env_patch.setenv("VLLM_USE_V1", "1")

        for override in rejected_overrides:
            with pytest.raises(NotImplementedError):
                AsyncEngineArgs(model=MODEL,
                                **override).create_engine_config()
def test_enable_by_default_fallback(monkeypatch):
    """Check engine auto-selection when the user leaves VLLM_USE_V1 unset.

    Each scenario builds an engine config and then inspects
    ``envs.VLLM_USE_V1``. The variable is deleted again between scenarios so
    that every scenario starts from "unspecified by the user".
    """
    with monkeypatch.context() as m:
        # Start from "unset" no matter what the caller's environment holds.
        # raising=False also covers a variable that is present but empty,
        # which a truthiness check on os.getenv() would leave in place.
        m.delenv("VLLM_USE_V1", raising=False)

        # Should default to V1 for supported config.
        _ = AsyncEngineArgs(
            model=MODEL,
            enforce_eager=True,
        ).create_engine_config()
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")

        # Should fall back to V0 for experimental config.
        _ = AsyncEngineArgs(
            model=MODEL,
            enable_lora=True,
        ).create_engine_config()
        assert not envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")

        # Should fall back to V0 for unsupported model.
        _ = AsyncEngineArgs(
            model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
        assert not envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")
def test_v1_llm_by_default(monkeypatch):
    """With VLLM_USE_V1 unset, constructing ``LLM`` for a supported config
    should yield an engine that exposes an ``engine_core`` attribute.

    NOTE(review): ``engine_core`` is presumably only present on the V1
    engine -- confirm against the engine implementations.
    """
    with monkeypatch.context() as m:
        # Start from "unset"; raising=False also removes a variable that is
        # present but empty, which an os.getenv() truthiness check misses.
        m.delenv("VLLM_USE_V1", raising=False)

        # Should default to V1 for supported config.
        model = LLM(MODEL, enforce_eager=True)
        print(model.generate("Hello my name is"))
        assert hasattr(model.llm_engine, "engine_core")
        m.delenv("VLLM_USE_V1")
def test_v1_attn_backend(monkeypatch):
    """Check how VLLM_ATTENTION_BACKEND interacts with engine selection.

    * XFORMERS with VLLM_USE_V1 unset: falls back to V0.
    * XFORMERS with VLLM_USE_V1=1: rejected with NotImplementedError.
    * FLASHMLA with VLLM_USE_V1 unset: V1 is selected.
    """
    with monkeypatch.context() as m:
        # Start from "unset"; raising=False also removes a variable that is
        # present but empty, which an os.getenv() truthiness check misses.
        m.delenv("VLLM_USE_V1", raising=False)
        m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")

        # Fall back to V0.
        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
        assert not envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")

        # Reject if V1.
        m.setenv("VLLM_USE_V1", "1")
        with pytest.raises(NotImplementedError):
            AsyncEngineArgs(model=MODEL).create_engine_config()
        m.delenv("VLLM_USE_V1")

        # A backend that V1 accepts keeps V1 selected.
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHMLA")
        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")
def test_reject_using_constructor_directly(monkeypatch):
    """Calling the V0 ``AsyncLLMEngine`` constructor with a config that was
    created while V1 got auto-selected must raise ValueError."""
    with monkeypatch.context() as m:
        # Start from "unset"; raising=False also removes a variable that is
        # present but empty, which an os.getenv() truthiness check misses.
        m.delenv("VLLM_USE_V1", raising=False)

        # Sets VLLM_USE_V1=1.
        vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()

        # This uses the V0 constructor directly.
        with pytest.raises(ValueError):
            AsyncLLMEngine(vllm_config,
                           AsyncLLMEngine._get_executor_cls(vllm_config),
                           log_stats=True)

        m.delenv("VLLM_USE_V1")
......@@ -15,6 +15,9 @@ QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "80")
@pytest.mark.skipif(
MODEL_NAME == "casperhansen/deepseek-coder-v2-instruct-awq",
reason="OOM in the CI")
@pytest.mark.skipif(
not current_platform.has_device_capability(int(MIN_CAPABILITY)),
reason="Current system does not have minimum capability.")
......@@ -22,10 +25,14 @@ def test_weight_loading(vllm_runner):
"""
Test parameter weight loading with tp>1.
"""
# MoE models need fp16.
NEEDS_FP16 = (QUANTIZATION == "gptq" or MODEL_NAME
== "nm-testing/test-w4a16-mixtral-actorder-group")
with vllm_runner(
model_name=MODEL_NAME,
revision=REVISION,
dtype=torch.half if QUANTIZATION == "gptq" else "auto",
dtype=torch.half if NEEDS_FP16 else "auto",
quantization=None if QUANTIZATION == "None" else QUANTIZATION,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=2) as model:
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Select the V0 engine for every test in this module.

    The module tests V0 internals, so VLLM_USE_V1 is patched to 0 per test.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
\ No newline at end of file
......@@ -1140,6 +1140,10 @@ class CacheConfig:
if self.cache_dtype == "auto":
pass
elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
if envs.VLLM_USE_V1:
raise NotImplementedError(
"V1 does not yet support fp8 KV cache. "
"Set VLLM_USE_V1=0 to enable fp8 kv cache.")
logger.info(
"Using fp8 data type to store kv cache. It reduces the GPU "
"memory footprint and boosts the performance. "
......@@ -3142,16 +3146,7 @@ class CompilationConfig(BaseModel):
self.inductor_compile_config[KEY] = False
if self.splitting_ops is None:
if envs.VLLM_USE_V1:
# v1 must split the graph on attention ops
# for piecewise cudagraph
self.splitting_ops = [
"vllm.unified_attention",
"vllm.unified_attention_with_output",
]
else:
# v0 uses full graph compilation
self.splitting_ops = []
self.splitting_ops = []
for k, v in self.inductor_passes.items():
if not isinstance(v, str):
......@@ -3246,6 +3241,15 @@ class CompilationConfig(BaseModel):
self.bs_to_padded_graph_size[
self.max_capture_size] = self.max_capture_size
def set_splitting_ops_for_v1(self):
    """Fill in the V1 splitting ops when none are configured.

    If ``splitting_ops`` is still empty (or None), it is replaced with the
    two unified-attention ops so the graph is split for piecewise cudagraph
    on V1. A non-empty value is left untouched.

    NOTE(review): the original note here was truncated ("this function
    needs to be called"); presumably the caller must invoke this explicitly
    when configuring V1 -- confirm.
    """
    if self.splitting_ops:
        # Something was configured explicitly; keep it.
        return

    self.splitting_ops = [
        "vllm.unified_attention",
        "vllm.unified_attention_with_output",
    ]
@dataclass
class VllmConfig:
......@@ -3297,6 +3301,7 @@ class VllmConfig:
vllm_factors: list[Any] = []
from vllm import __version__
vllm_factors.append(__version__)
vllm_factors.append(envs.VLLM_USE_V1)
if self.model_config:
vllm_factors.append(self.model_config.compute_hash())
else:
......@@ -3460,6 +3465,7 @@ class VllmConfig:
# CUDA graphs do not work properly with the custom CUDA kernels.
# FIXME(woosuk): Disable inductor to reduce the compilation time
# and avoid any potential issues with the inductor.
# FIXME(rob): Add function to set all of these.
self.compilation_config.custom_ops = ["none"]
self.compilation_config.use_cudagraph = True
self.compilation_config.use_inductor = True
......@@ -3467,6 +3473,7 @@ class VllmConfig:
self.compilation_config.pass_config.enable_fusion = False
self.compilation_config.pass_config.enable_noop = False
self.compilation_config.level = CompilationLevel.PIECEWISE
self.compilation_config.set_splitting_ops_for_v1()
self._set_cudagraph_sizes()
......
......@@ -223,15 +223,6 @@ class EngineArgs:
if not self.tokenizer:
self.tokenizer = self.model
# Override the default value of enable_prefix_caching if it's not set
# by user.
if self.enable_prefix_caching is None:
self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
# Override max_num_seqs if it's not set by user.
if self.max_num_seqs is None:
self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
# support `EngineArgs(compilation_config={...})`
# without having to manually construct a
# CompilationConfig object
......@@ -246,7 +237,6 @@ class EngineArgs:
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Shared CLI arguments for vLLM engine."""
# Model arguments
parser.add_argument(
'--model',
......@@ -1191,24 +1181,51 @@ class EngineArgs:
use_tqdm_on_load=self.use_tqdm_on_load,
)
def create_engine_config(self,
usage_context: Optional[UsageContext] = None
) -> VllmConfig:
def create_engine_config(
self,
usage_context: Optional[UsageContext] = None,
) -> VllmConfig:
"""
Create the VllmConfig.
NOTE: for autoselection of V0 vs V1 engine, we need to
create the ModelConfig first, since ModelConfig's attrs
(e.g. the model arch) are needed to make the decision.
This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
left unspecified by the user.
If VLLM_USE_V1 is specified by the user but the VllmConfig
is incompatible, we raise an error.
"""
from vllm.platforms import current_platform
current_platform.pre_register_and_update()
if envs.VLLM_USE_V1:
self._override_v1_engine_args(usage_context)
device_config = DeviceConfig(device=self.device)
model_config = self.create_model_config()
if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
and self.enable_prefix_caching):
logger.warning("--enable-prefix-caching is currently not "
"supported for multimodal models in v0 and "
"has been disabled.")
self.enable_prefix_caching = False
# * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
# and fall back to V0 for experimental or unsupported features.
# * If VLLM_USE_V1=1, we enable V1 for supported + experimental
# features and raise error for unsupported features.
# * If VLLM_USE_V1=0, we disable V1.
use_v1 = False
try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
if try_v1 and self._is_v1_supported_oracle(model_config):
use_v1 = True
# If user explicitly set VLLM_USE_V1, sanity check we respect it.
if envs.is_set("VLLM_USE_V1"):
assert use_v1 == envs.VLLM_USE_V1
# Otherwise, set the VLLM_USE_V1 variable globally.
else:
envs.set_vllm_use_v1(use_v1)
# Set default arguments for V0 or V1 Engine.
if use_v1:
self._set_default_args_v1(usage_context)
else:
self._set_default_args_v0(model_config)
cache_config = CacheConfig(
block_size=self.block_size,
......@@ -1239,50 +1256,6 @@ class EngineArgs:
worker_extension_cls=self.worker_extension_cls,
)
max_model_len = model_config.max_model_len
use_long_context = max_model_len > 32768
if self.enable_chunked_prefill is None:
# If not explicitly set, enable chunked prefill by default for
# long context (> 32K) models. This is to avoid OOM errors in the
# initial memory profiling phase.
# For multimodal models and models with MLA, chunked prefill is
# disabled by default in V0, but enabled by design in V1
if model_config.is_multimodal_model or model_config.use_mla:
self.enable_chunked_prefill = bool(envs.VLLM_USE_V1)
elif use_long_context:
is_gpu = device_config.device_type == "cuda"
use_sliding_window = (model_config.get_sliding_window()
is not None)
use_spec_decode = self.speculative_model is not None
from vllm.platforms import current_platform
if (is_gpu and not use_sliding_window and not use_spec_decode
and not self.enable_lora
and not self.enable_prompt_adapter
and model_config.runner_type != "pooling"
and not current_platform.is_rocm()):
self.enable_chunked_prefill = True
logger.warning(
"Chunked prefill is enabled by default for models with "
"max_model_len > 32K. Currently, chunked prefill might "
"not work with some features or models. If you "
"encounter any issues, please disable chunked prefill "
"by setting --enable-chunked-prefill=False.")
if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = False
if not self.enable_chunked_prefill and use_long_context:
logger.warning(
"The model has a long context length (%s). This may cause OOM "
"errors during the initial memory profiling phase, or result "
"in low performance due to small KV cache space. Consider "
"setting --max-model-len to a smaller value.", max_model_len)
elif (self.enable_chunked_prefill
and model_config.runner_type == "pooling"):
msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg)
speculative_config = SpeculativeConfig.maybe_create_spec_config(
target_model_config=model_config,
target_parallel_config=parallel_config,
......@@ -1425,18 +1398,282 @@ class EngineArgs:
additional_config=self.additional_config,
)
if envs.VLLM_USE_V1:
self._override_v1_engine_config(config)
return config
def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
"""
Override the EngineArgs's args based on the usage context for V1.
"""
assert envs.VLLM_USE_V1, "V1 is not enabled"
def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
    """Oracle for whether to use V0 or V1 Engine by default.

    The checks run in a fixed order and stop at the first feature that V1
    does not handle:

    * Unsupported features are reported through ``_raise_or_fallback``,
      which raises when the user explicitly set VLLM_USE_V1=1 and
      otherwise logs a warning; this method then returns False.
    * Experimental features are reported through ``_warn_or_fallback``;
      they only cause a fallback (False) when the user did not explicitly
      opt in with VLLM_USE_V1=1.

    Args:
        model_config: The already-created model config; its attributes
            (dtype, task, quantization, architecture, ...) feed the
            decision.

    Returns:
        True if the V1 engine can be used, False to fall back to V0.

    Raises:
        NotImplementedError: If VLLM_USE_V1=1 is set explicitly together
            with a feature that V1 does not support.
    """

    #############################################################
    # Unsupported Feature Flags on V1.

    if self.load_format in (LoadFormat.TENSORIZER.value,
                            LoadFormat.SHARDED_STATE.value):
        _raise_or_fallback(
            feature_name=f"--load_format {self.load_format}",
            recommend_to_remove=False)
        return False

    # Flags that V1 only accepts at their EngineArgs default value, as
    # (attribute, CLI flag, recommend_to_remove). The order is significant:
    # the first offending flag is the one that gets reported.
    default_only_flags = (
        ("logits_processor_pattern", "--logits-processor-pattern", False),
        ("preemption_mode", "--preemption-mode", True),
        ("disable_async_output_proc", "--disable-async-output-proc", True),
        ("scheduling_policy", "--scheduling-policy", False),
        ("worker_cls", "--worker-cls", False),
        ("worker_extension_cls", "--worker-extension-cls", False),
        ("num_scheduler_steps", "--num-scheduler-steps", True),
        ("scheduler_delay_factor", "--scheduler-delay-factor", True),
        ("additional_config", "--additional-config", False),
    )
    for attr, flag, recommend_to_remove in default_only_flags:
        if getattr(self, attr) != getattr(EngineArgs, attr):
            _raise_or_fallback(feature_name=flag,
                               recommend_to_remove=recommend_to_remove)
            return False

    # Only support Xgrammar for guided decoding so far.
    SUPPORTED_GUIDED_DECODING = ["xgrammar", "xgrammar:nofallback"]
    if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING:
        _raise_or_fallback(feature_name="--guided-decoding-backend",
                           recommend_to_remove=False)
        return False

    # Need at least Ampere for now (FA support required).
    from vllm.platforms import current_platform
    if (current_platform.is_cuda()
            and current_platform.get_device_capability().major < 8):
        _raise_or_fallback(feature_name="Compute Capability < 8.0",
                           recommend_to_remove=False)
        return False

    # No Fp8 KV cache so far.
    if self.kv_cache_dtype != "auto":
        _raise_or_fallback(feature_name="--kv-cache-dtype",
                           recommend_to_remove=False)
        return False

    # No Prompt Adapter so far.
    if self.enable_prompt_adapter:
        _raise_or_fallback(feature_name="--enable-prompt-adapter",
                           recommend_to_remove=False)
        return False

    # No MistralTokenizer support so far (not compatible
    # with xgrammar)
    if model_config.tokenizer_mode == "mistral":
        _raise_or_fallback(feature_name="--tokenizer-mode mistral",
                           recommend_to_remove=False)
        return False

    # No CPU offloading yet.
    if self.cpu_offload_gb != EngineArgs.cpu_offload_gb:
        _raise_or_fallback(feature_name="--cpu-offload-gb",
                           recommend_to_remove=False)
        return False

    # Only Fp16 and Bf16 dtypes since we only support FA.
    V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
    if model_config.dtype not in V1_SUPPORTED_DTYPES:
        _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
                           recommend_to_remove=False)
        return False

    # Some quantization is not compatible with torch.compile.
    V1_UNSUPPORTED_QUANT = ["bitsandbytes", "gguf"]
    if model_config.quantization in V1_UNSUPPORTED_QUANT:
        _raise_or_fallback(
            feature_name=f"--quantization {model_config.quantization}",
            recommend_to_remove=False)
        return False

    # No Embedding Models so far.
    if model_config.task not in ["generate"]:
        _raise_or_fallback(feature_name=f"--task {model_config.task}",
                           recommend_to_remove=False)
        return False

    # No Mamba or Encoder-Decoder so far.
    if not model_config.is_v1_compatible:
        # `architectures` is not a plain string; convert it explicitly so
        # the helper receives the `str` it is annotated with. The rendered
        # message is the same as formatting the value directly.
        _raise_or_fallback(feature_name=str(model_config.architectures),
                           recommend_to_remove=False)
        return False

    # No TransformersModel support so far.
    if model_config.model_impl in (ModelImpl.TRANSFORMERS, "transformers"):
        _raise_or_fallback(
            feature_name=f"model_impl={model_config.model_impl}",
            recommend_to_remove=False)
        return False

    # No Concurrent Partial Prefills so far.
    if (self.max_num_partial_prefills
            != EngineArgs.max_num_partial_prefills
            or self.max_long_partial_prefills
            != EngineArgs.max_long_partial_prefills
            or self.long_prefill_token_threshold
            != EngineArgs.long_prefill_token_threshold):
        _raise_or_fallback(feature_name="Concurrent Partial Prefill",
                           recommend_to_remove=False)
        return False

    # No OTLP observability so far.
    if self.otlp_traces_endpoint or self.collect_detailed_traces:
        _raise_or_fallback(feature_name="--otlp-traces-endpoint",
                           recommend_to_remove=False)
        return False

    # Only Ngram speculative decoding so far. Ngram itself is supported
    # but experimental, so it is handled in the experimental section below.
    uses_spec_decode = (self.speculative_model is not None
                        or self.num_speculative_tokens is not None)
    if uses_spec_decode and self.speculative_model != "[ngram]":
        _raise_or_fallback(feature_name="Speculative Decoding",
                           recommend_to_remove=False)
        return False

    # No Disaggregated Prefill so far.
    if self.kv_transfer_config != EngineArgs.kv_transfer_config:
        _raise_or_fallback(feature_name="--kv-transfer-config",
                           recommend_to_remove=False)
        return False

    # No FlashInfer or XFormers so far.
    V1_BACKENDS = [
        "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1",
        "TRITON_MLA", "FLASHMLA"
    ]
    if (envs.is_set("VLLM_ATTENTION_BACKEND")
            and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
        name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
        _raise_or_fallback(feature_name=name, recommend_to_remove=True)
        return False

    #############################################################
    # Experimental Features - allow users to opt in.

    # MLA is supported on V1, but off by default for now.
    if model_config.use_mla and _warn_or_fallback("MLA"):
        return False

    # LoRA is supported on V1, but off by default for now.
    if self.enable_lora and _warn_or_fallback("LORA"):
        return False

    # PP is supported on V1, but off by default for now.
    if self.pipeline_parallel_size > 1 and _warn_or_fallback("PP"):
        return False

    # ngram is supported on V1, but off by default for now.
    if self.speculative_model == "[ngram]" and _warn_or_fallback("ngram"):
        return False

    # Non-CUDA is supported on V1, but off by default for now.
    not_cuda = not current_platform.is_cuda()
    if not_cuda and _warn_or_fallback(  # noqa: SIM103
            current_platform.device_type):
        return False
    #############################################################

    return True
def _set_default_args_v0(self, model_config: ModelConfig) -> None:
"""Set Default Arguments for V0 Engine."""
max_model_len = model_config.max_model_len
use_long_context = max_model_len > 32768
if self.enable_chunked_prefill is None:
# Chunked prefill not supported for Multimodal or MLA in V0.
if model_config.is_multimodal_model or model_config.use_mla:
self.enable_chunked_prefill = False
# Enable chunked prefill by default for long context (> 32K)
# models to avoid OOM errors in initial memory profiling phase.
elif use_long_context:
from vllm.platforms import current_platform
is_gpu = current_platform.is_cuda()
use_sliding_window = (model_config.get_sliding_window()
is not None)
use_spec_decode = self.speculative_model is not None
if (is_gpu and not use_sliding_window and not use_spec_decode
and not self.enable_lora
and not self.enable_prompt_adapter
and model_config.runner_type != "pooling"):
self.enable_chunked_prefill = True
logger.warning(
"Chunked prefill is enabled by default for models "
"with max_model_len > 32K. Chunked prefill might "
"not work with some features or models. If you "
"encounter any issues, please disable by launching "
"with --enable-chunked-prefill=False.")
if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = False
if not self.enable_chunked_prefill and use_long_context:
logger.warning(
"The model has a long context length (%s). This may cause"
"OOM during the initial memory profiling phase, or result "
"in low performance due to small KV cache size. Consider "
"setting --max-model-len to a smaller value.", max_model_len)
elif (self.enable_chunked_prefill
and model_config.runner_type == "pooling"):
msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg)
# Disable prefix caching for multimodal models for VLLM_V0.
if (model_config.is_multimodal_model and self.enable_prefix_caching):
logger.warning(
"--enable-prefix-caching is not supported for multimodal "
"models in V0 and has been disabled.")
self.enable_prefix_caching = False
# Set max_num_seqs to 256 for VLLM_V0.
if self.max_num_seqs is None:
self.max_num_seqs = 256
def _set_default_args_v1(self, usage_context: UsageContext) -> None:
"""Set Default Arguments for V1 Engine."""
# V1 always uses chunked prefills.
self.enable_chunked_prefill = True
# V1 enables prefix caching by default.
if self.enable_prefix_caching is None:
self.enable_prefix_caching = True
# V1 should use the new scheduler by default.
# Swap it only if this arg is set to the original V0 default
if self.scheduler_cls == EngineArgs.scheduler_cls:
......@@ -1471,19 +1708,21 @@ class EngineArgs:
UsageContext.OPENAI_API_SERVER: 2048,
}
use_context_value = usage_context.value if usage_context else None
if (self.max_num_batched_tokens is None
and usage_context in default_max_num_batched_tokens):
self.max_num_batched_tokens = default_max_num_batched_tokens[
usage_context]
logger.warning(
logger.debug(
"Setting max_num_batched_tokens to %d for %s usage context.",
self.max_num_batched_tokens, usage_context.value)
self.max_num_batched_tokens, use_context_value)
def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
"""
Override the EngineConfig's configs based on the usage context for V1.
"""
assert envs.VLLM_USE_V1, "V1 is not enabled"
default_max_num_seqs = 1024
if self.max_num_seqs is None:
self.max_num_seqs = default_max_num_seqs
logger.debug("Setting max_num_seqs to %d for %s usage context.",
self.max_num_seqs, use_context_value)
@dataclass
......@@ -1508,6 +1747,33 @@ class AsyncEngineArgs(EngineArgs):
return parser
def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
    """Report a feature that the V1 engine does not support.

    Args:
        feature_name: Name of the offending flag or feature, used verbatim
            in the message.
        recommend_to_remove: Whether the warning should also advise the
            user to drop the feature from their config.

    Raises:
        NotImplementedError: If VLLM_USE_V1 is set explicitly and enabled.
            Otherwise only a fallback warning is logged.
    """
    v1_forced = envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1
    if v1_forced:
        raise NotImplementedError(
            f"VLLM_USE_V1=1 is not supported with {feature_name}.")

    # Assemble the fallback notice piece by piece.
    parts = [
        f"{feature_name} is not supported by the V1 Engine. ",
        "Falling back to V0. ",
    ]
    if recommend_to_remove:
        parts.append(
            f"We recommend to remove {feature_name} from your config ")
        parts.append("in favor of the V1 Engine.")

    logger.warning("".join(parts))
def _warn_or_fallback(feature_name: str) -> bool:
    """Report an experimental V1 feature and say whether to fall back.

    Args:
        feature_name: Name of the experimental feature, used in the log
            message.

    Returns:
        False when VLLM_USE_V1 is set explicitly and enabled (the user
        opted in, so only a warning is logged); True otherwise, meaning the
        caller should fall back to the V0 engine.
    """
    if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
        # Explicit opt-in: keep V1, but flag the feature as experimental.
        logger.warning(
            "Detected VLLM_USE_V1=1 with %s. Usage should "
            "be considered experimental. Please report any "
            "issues on Github.", feature_name)
        return False

    logger.info(
        "%s is experimental on VLLM_USE_V1=1. "
        "Falling back to V0 Engine.", feature_name)
    return True
# These functions are used by sphinx to build the documentation
def _engine_args_parser():
    """Return a fresh FlexibleArgumentParser populated with the EngineArgs
    CLI arguments."""
    parser = FlexibleArgumentParser()
    return EngineArgs.add_cli_args(parser)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment