Unverified Commit d4d93db2 authored by Robert Shaw's avatar Robert Shaw Committed by GitHub
Browse files

[V1] V1 Enablement Oracle (#13726)


Signed-off-by: default avatarrshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: default avatarrshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: default avatarNicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: default avatarTyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: default avatarMichael Goin <michael@neuralmagic.com>
parent 8c0d15d5
......@@ -6,11 +6,18 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
"""
from typing import Optional
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
def _generate(
model: LLM,
prompt: str,
......
......@@ -7,6 +7,12 @@ from vllm import SamplingParams
MODELS = ["distilbert/distilgpt2"]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
......
......@@ -8,6 +8,15 @@ import torch.nn.functional as F
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
This file tests V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
......
......@@ -18,6 +18,14 @@ from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import Counter, is_pin_memory_available
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
This file tests V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
class MockLogitsSampler(Sampler):
def __init__(self, fake_logits: torch.Tensor):
......
......@@ -17,7 +17,9 @@ RANDOM_SEEDS = list(range(5))
@pytest.fixture
def vllm_model(vllm_runner):
def vllm_model(vllm_runner, monkeypatch):
# This file relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(MODEL, dtype="half") as vllm_model:
yield vllm_model
......
......@@ -11,6 +11,14 @@ from vllm.model_executor.utils import set_random_seed
CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
This file tests V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
"""
Generates a fake temperature zero probability distribution.
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
......@@ -12,6 +12,14 @@ from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Tensorizer only tested on V0 so far.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(autouse=True)
def cleanup():
cleanup_dist_env_and_memory(shutdown_ray=True)
......
......@@ -7,11 +7,13 @@ will never happen again.
"""
import gc
import pytest
import torch
from vllm import LLM, SamplingParams
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
def test_duplicated_ignored_sequence_group():
"""https://github.com/vllm-project/vllm/issues/1655"""
......
......@@ -366,7 +366,10 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder():
def test_bind_kv_cache_encoder_decoder(monkeypatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
monkeypatch.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType
# example from bart
......
......@@ -279,7 +279,12 @@ def test_decode_prompt_logprobs_chunked_prefill(
model,
chunked_prefill_token_size: int,
example_prompts,
monkeypatch,
):
# VLLM V1 does not use incremental detokenization for
# prompt logprobs, so this test strategy is irrelevant.
monkeypatch.setenv("VLLM_USE_V1", "0")
max_num_seqs = 256
enable_chunked_prefill = False
max_num_batched_tokens = None
......
......@@ -91,20 +91,22 @@ CONFIGS: dict[str, ServerConfig] = {
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally."
},
"granite20b": {
"model":
"mbayser/granite-20b-functioncalling-FP8-KV",
"arguments": [
"--tool-call-parser", "granite-20b-fc", "--chat-template",
str(VLLM_PATH /
"examples/tool_chat_template_granite_20b_fc.jinja"),
"--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
],
"supports_parallel":
False,
"supports_rocm":
False,
},
# V1 Test: Passing locally but failing in CI. This runs the
# V0 Engine because of CPU offloading. Need to debug why.
# "granite20b": {
# "model":
# "mbayser/granite-20b-functioncalling-FP8-KV",
# "arguments": [
# "--tool-call-parser", "granite-20b-fc", "--chat-template",
# str(VLLM_PATH /
# "examples/tool_chat_template_granite_20b_fc.jinja"),
# "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
# ],
# "supports_parallel":
# False,
# "supports_rocm":
# False,
# },
"granite-3.0-8b": {
"model":
"ibm-granite/granite-3.0-8b-instruct",
......
......@@ -19,6 +19,16 @@ from opentelemetry.sdk.environment_variables import (
from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
......
......@@ -18,19 +18,19 @@ if not envs.VLLM_USE_V1:
def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([])
engine_args = EngineArgs.from_cli_args(args=args)
assert (engine_args.enable_prefix_caching
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert (vllm_config.cache_config.enable_prefix_caching
), "V1 turns on prefix caching by default."
# Turn it off possible with flag.
args = parser.parse_args(["--no-enable-prefix-caching"])
engine_args = EngineArgs.from_cli_args(args=args)
assert not engine_args.enable_prefix_caching
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert not vllm_config.cache_config.enable_prefix_caching
# Turn it on with flag.
args = parser.parse_args(["--enable-prefix-caching"])
engine_args = EngineArgs.from_cli_args(args=args)
assert engine_args.enable_prefix_caching
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert vllm_config.cache_config.enable_prefix_caching
def test_defaults_with_usage_context():
......@@ -38,11 +38,21 @@ def test_defaults_with_usage_context():
vllm_config: VllmConfig = engine_args.create_engine_config(
UsageContext.LLM_CLASS)
from vllm.platforms import current_platform
device_name = current_platform.get_device_name().lower()
if "h100" in device_name or "h200" in device_name:
# For H100 and H200, we use larger default values.
default_llm_tokens = 16384
default_server_tokens = 8192
else:
default_llm_tokens = 8192
default_server_tokens = 2048
assert vllm_config.scheduler_config.max_num_seqs == 1024
assert vllm_config.scheduler_config.max_num_batched_tokens == 8192
assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens # noqa: E501
engine_args = EngineArgs(model="facebook/opt-125m")
vllm_config = engine_args.create_engine_config(
UsageContext.OPENAI_API_SERVER)
assert vllm_config.scheduler_config.max_num_seqs == 1024
assert vllm_config.scheduler_config.max_num_batched_tokens == 2048
assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens # noqa: E501
......@@ -6,7 +6,6 @@ from collections.abc import Generator
import pytest
import torch
from tests.kernels.utils import override_backend_env_variable
from tests.v1.sample.utils import (
BatchLogprobsComposition, BatchLogprobsSpecType,
assert_incr_detok_str_matches_non_incr_detok_str,
......@@ -334,7 +333,7 @@ def test_get_logprobs_and_prompt_logprobs(
do_apc=do_apc)
def test_max_logprobs(monkeypatch):
def test_max_logprobs():
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
......@@ -344,7 +343,6 @@ def test_max_logprobs(monkeypatch):
Args:
monkeypatch
"""
override_backend_env_variable(monkeypatch, "FLASH_ATTN")
runner = VllmRunner("facebook/opt-125m",
max_logprobs=1,
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
import vllm.envs as envs
from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1", # mamba
"ibm-ai-platform/Bamba-9B", # hybrid
"BAAI/bge-m3", # embedding
]
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
def test_reject_unsupported_models(monkeypatch, model):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
args = AsyncEngineArgs(model=model)
with pytest.raises(NotImplementedError):
_ = args.create_engine_config()
m.delenv("VLLM_USE_V1")
def test_reject_bad_config(monkeypatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
def test_unsupported_configs(monkeypatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
kv_cache_dtype="fp8",
).create_engine_config()
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
speculative_model=MODEL,
).create_engine_config()
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
guided_decoding_backend="lm-format-enforcer:no-fallback",
).create_engine_config()
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
preemption_mode="swap",
).create_engine_config()
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
disable_async_output_proc=True,
).create_engine_config()
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
scheduling_policy="priority",
).create_engine_config()
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
num_scheduler_steps=5,
).create_engine_config()
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
scheduler_delay_factor=1.2,
).create_engine_config()
def test_enable_by_default_fallback(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
# Should default to V1 for supported config.
_ = AsyncEngineArgs(
model=MODEL,
enforce_eager=True,
).create_engine_config()
assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
# Should fall back to V0 for experimental config.
_ = AsyncEngineArgs(
model=MODEL,
enable_lora=True,
).create_engine_config()
assert not envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
# Should fall back to V0 for supported model.
_ = AsyncEngineArgs(
model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
assert not envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
def test_v1_llm_by_default(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
# Should default to V1 for supported config.
model = LLM(MODEL, enforce_eager=True)
print(model.generate("Hello my name is"))
assert hasattr(model.llm_engine, "engine_core")
m.delenv("VLLM_USE_V1")
def test_v1_attn_backend(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
# Fall back to V0.
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert not envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
# Reject if V1.
m.setenv("VLLM_USE_V1", "1")
with pytest.raises(NotImplementedError):
AsyncEngineArgs(model=MODEL).create_engine_config()
m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHMLA")
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
def test_reject_using_constructor_directly(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
# Sets VLLM_USE_V1=1.
vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()
# This uses the V0 constructor directly.
with pytest.raises(ValueError):
AsyncLLMEngine(vllm_config,
AsyncLLMEngine._get_executor_cls(vllm_config),
log_stats=True)
m.delenv("VLLM_USE_V1")
......@@ -15,6 +15,9 @@ QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "80")
@pytest.mark.skipif(
MODEL_NAME == "casperhansen/deepseek-coder-v2-instruct-awq",
reason="OOM in the CI")
@pytest.mark.skipif(
not current_platform.has_device_capability(int(MIN_CAPABILITY)),
reason="Current system does not have minimum capability.")
......@@ -22,10 +25,14 @@ def test_weight_loading(vllm_runner):
"""
Test parameter weight loading with tp>1.
"""
# MoE models need fp16.
NEEDS_FP16 = (QUANTIZATION == "gptq" or MODEL_NAME
== "nm-testing/test-w4a16-mixtral-actorder-group")
with vllm_runner(
model_name=MODEL_NAME,
revision=REVISION,
dtype=torch.half if QUANTIZATION == "gptq" else "auto",
dtype=torch.half if NEEDS_FP16 else "auto",
quantization=None if QUANTIZATION == "None" else QUANTIZATION,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=2) as model:
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
This module tests V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
\ No newline at end of file
......@@ -1140,6 +1140,10 @@ class CacheConfig:
if self.cache_dtype == "auto":
pass
elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
if envs.VLLM_USE_V1:
raise NotImplementedError(
"V1 does not yet support fp8 KV cache. "
"Set VLLM_USE_V1=0 to enable fp8 kv cache.")
logger.info(
"Using fp8 data type to store kv cache. It reduces the GPU "
"memory footprint and boosts the performance. "
......@@ -3142,15 +3146,6 @@ class CompilationConfig(BaseModel):
self.inductor_compile_config[KEY] = False
if self.splitting_ops is None:
if envs.VLLM_USE_V1:
# v1 must split the graph on attention ops
# for piecewise cudagraph
self.splitting_ops = [
"vllm.unified_attention",
"vllm.unified_attention_with_output",
]
else:
# v0 uses full graph compilation
self.splitting_ops = []
for k, v in self.inductor_passes.items():
......@@ -3246,6 +3241,15 @@ class CompilationConfig(BaseModel):
self.bs_to_padded_graph_size[
self.max_capture_size] = self.max_capture_size
def set_splitting_ops_for_v1(self):
# If default, override splitting ops for piecewise cudagraph on V1.
# NOTE: this function needs to be called
if not self.splitting_ops:
self.splitting_ops = [
"vllm.unified_attention",
"vllm.unified_attention_with_output",
]
@dataclass
class VllmConfig:
......@@ -3297,6 +3301,7 @@ class VllmConfig:
vllm_factors: list[Any] = []
from vllm import __version__
vllm_factors.append(__version__)
vllm_factors.append(envs.VLLM_USE_V1)
if self.model_config:
vllm_factors.append(self.model_config.compute_hash())
else:
......@@ -3460,6 +3465,7 @@ class VllmConfig:
# CUDA graphs do not work properly with the custom CUDA kernels.
# FIXME(woosuk): Disable inductor to reduce the compilation time
# and avoid any potential issues with the inductor.
# FIXME(rob): Add function to set all of these.
self.compilation_config.custom_ops = ["none"]
self.compilation_config.use_cudagraph = True
self.compilation_config.use_inductor = True
......@@ -3467,6 +3473,7 @@ class VllmConfig:
self.compilation_config.pass_config.enable_fusion = False
self.compilation_config.pass_config.enable_noop = False
self.compilation_config.level = CompilationLevel.PIECEWISE
self.compilation_config.set_splitting_ops_for_v1()
self._set_cudagraph_sizes()
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment