Unverified Commit d4d93db2 authored by Robert Shaw, committed by GitHub
Browse files

[V1] V1 Enablement Oracle (#13726)


Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
parent 8c0d15d5
...@@ -35,13 +35,6 @@ def test_classification_models( ...@@ -35,13 +35,6 @@ def test_classification_models(
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts) vllm_outputs = vllm_model.classify(example_prompts)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
with hf_runner(model, with hf_runner(model,
dtype=dtype, dtype=dtype,
auto_cls=AutoModelForSequenceClassification) as hf_model: auto_cls=AutoModelForSequenceClassification) as hf_model:
......
...@@ -73,13 +73,6 @@ def test_models( ...@@ -73,13 +73,6 @@ def test_models(
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.encode(example_prompts)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
check_embeddings_close( check_embeddings_close(
embeddings_0_lst=hf_outputs, embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs, embeddings_1_lst=vllm_outputs,
......
...@@ -256,7 +256,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -256,7 +256,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501 extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
"mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501
"LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501
"LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501
"LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
...@@ -274,8 +275,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -274,8 +275,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True), trust_remote_code=True),
"PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501
extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-vision-instruct", "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True), trust_remote_code=True,
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501),
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True), trust_remote_code=True),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
......
...@@ -6,6 +6,8 @@ import pytest ...@@ -6,6 +6,8 @@ import pytest
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm import LLM from vllm import LLM
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.v1.engine.core import EngineCore as V1EngineCore
from .registry import HF_EXAMPLE_MODELS from .registry import HF_EXAMPLE_MODELS
...@@ -36,12 +38,18 @@ def test_can_initialize(model_arch): ...@@ -36,12 +38,18 @@ def test_can_initialize(model_arch):
return hf_config return hf_config
# Avoid calling model.forward() # Avoid calling model.forward()
def _initialize_kv_caches(self) -> None: def _initialize_kv_caches_v0(self) -> None:
self.cache_config.num_gpu_blocks = 0 self.cache_config.num_gpu_blocks = 0
self.cache_config.num_cpu_blocks = 0 self.cache_config.num_cpu_blocks = 0
with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", def _initalize_kv_caches_v1(self, vllm_config):
_initialize_kv_caches): # gpu_blocks (> 0), cpu_blocks
return 1, 0
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
_initialize_kv_caches_v0),
patch.object(V1EngineCore, "_initialize_kv_caches",
_initalize_kv_caches_v1)):
LLM( LLM(
model_info.default, model_info.default,
tokenizer=model_info.tokenizer, tokenizer=model_info.tokenizer,
......
...@@ -11,12 +11,14 @@ from ..utils import fork_new_process_for_each_test ...@@ -11,12 +11,14 @@ from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_plugin(dummy_opt_path): def test_plugin(dummy_opt_path, monkeypatch):
# V1 shuts down rather than raising an error here.
monkeypatch.setenv("VLLM_USE_V1", "0")
os.environ["VLLM_PLUGINS"] = "" os.environ["VLLM_PLUGINS"] = ""
with pytest.raises(Exception) as excinfo: with pytest.raises(Exception) as excinfo:
LLM(model=dummy_opt_path, load_format="dummy") LLM(model=dummy_opt_path, load_format="dummy")
error_msg = "has no vLLM implementation and " \ error_msg = "has no vLLM implementation and " \
"the Transformers implementation is not compatible with vLLM." "the Transformers implementation is not compatible with vLLM"
assert (error_msg in str(excinfo.value)) assert (error_msg in str(excinfo.value))
...@@ -51,7 +53,7 @@ image = ImageAsset("cherry_blossom").pil_image.convert("RGB") ...@@ -51,7 +53,7 @@ image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration_multimodal(dummy_llava_path): def test_oot_registration_multimodal(dummy_llava_path, monkeypatch):
os.environ["VLLM_PLUGINS"] = "register_dummy_model" os.environ["VLLM_PLUGINS"] = "register_dummy_model"
prompts = [{ prompts = [{
"prompt": "What's in the image?<image>", "prompt": "What's in the image?<image>",
......
# SPDX-License-Identifier: Apache-2.0
import pytest


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Force the V0 engine for every test in this module.

    Since this module is V0 only, ``VLLM_USE_V1`` is set to ``"0"`` before
    each test. The change goes through ``monkeypatch`` so that it is scoped
    to the individual test rather than leaking into the process environment.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
# SPDX-License-Identifier: Apache-2.0
import pytest


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Force the V0 engine for every test in this module.

    Since this module is V0 only, ``VLLM_USE_V1`` is set to ``"0"`` before
    each test. The change goes through ``monkeypatch`` so that it is scoped
    to the individual test rather than leaking into the process environment.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
\ No newline at end of file
...@@ -34,7 +34,10 @@ def test_disable_sliding_window(model_len_len, ): ...@@ -34,7 +34,10 @@ def test_disable_sliding_window(model_len_len, ):
del vllm_disabled_model del vllm_disabled_model
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
vllm_enabled_model = LLM(model, disable_sliding_window=False) vllm_enabled_model = LLM(model,
enforce_eager=True,
disable_sliding_window=False,
enable_prefix_caching=False)
vllm_enabled_model.generate("Hi my name is") vllm_enabled_model.generate("Hi my name is")
model_config = vllm_enabled_model.llm_engine.model_config model_config = vllm_enabled_model.llm_engine.model_config
assert model_config.max_model_len == full_len, ( assert model_config.max_model_len == full_len, (
......
...@@ -16,6 +16,15 @@ from vllm.platforms import current_platform ...@@ -16,6 +16,15 @@ from vllm.platforms import current_platform
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    This module relies on V0 internals, so ``VLLM_USE_V1`` is set to ``"0"``
    (via ``monkeypatch``, i.e. per test) before each test runs.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
MODELS = [ MODELS = [
"distilbert/distilgpt2", "distilbert/distilgpt2",
] ]
......
...@@ -21,6 +21,14 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( ...@@ -21,6 +21,14 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    This module relies on V0 internals, so ``VLLM_USE_V1`` is set to ``"0"``
    (via ``monkeypatch``, i.e. per test) before each test runs.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_args", "model_args",
[ [
......
...@@ -10,6 +10,13 @@ from tests.quantization.utils import is_quant_method_supported ...@@ -10,6 +10,13 @@ from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings from ..utils import compare_two_settings
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set ``VLLM_USE_V1=0`` for every test in this module.

    vLLM falls back to V0 when CPU offloading is enabled. The fixture is
    required so that the baseline run (without offloading) uses V0 as well,
    keeping both sides of the comparison on the same engine.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8(): def test_cpu_offload_fp8():
......
...@@ -47,7 +47,9 @@ KV_CACHE_MODELS = [ ...@@ -47,7 +47,9 @@ KV_CACHE_MODELS = [
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS) @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
def check_model(model): def check_model(model):
...@@ -86,6 +88,9 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): ...@@ -86,6 +88,9 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
@pytest.mark.parametrize("force_marlin", [False, True]) @pytest.mark.parametrize("force_marlin", [False, True])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
monkeypatch) -> None: monkeypatch) -> None:
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
if force_marlin: if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
......
...@@ -28,8 +28,10 @@ MODEL_QUANT = [ ...@@ -28,8 +28,10 @@ MODEL_QUANT = [
@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
def test_gptq_with_dynamic(vllm_runner, model_id: str, def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
use_marlin_kernel: bool): monkeypatch):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048)
......
...@@ -29,7 +29,10 @@ def test_lm_head( ...@@ -29,7 +29,10 @@ def test_lm_head(
vllm_runner, vllm_runner,
model_id: str, model_id: str,
lm_head_quantized: bool, lm_head_quantized: bool,
monkeypatch,
) -> None: ) -> None:
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_id, dtype=torch.float16, with vllm_runner(model_id, dtype=torch.float16,
max_model_len=2048) as vllm_model: max_model_len=2048) as vllm_model:
......
...@@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 ...@@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
QuarkLinearMethod, QuarkW8A8Fp8) QuarkLinearMethod, QuarkW8A8Fp8)
def test_quark_fp8(vllm_runner): def test_quark_fp8(vllm_runner, monkeypatch):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
with vllm_runner(model_path) as llm: with vllm_runner(model_path) as llm:
......
...@@ -101,8 +101,10 @@ def test_register_quantization_config(): ...@@ -101,8 +101,10 @@ def test_register_quantization_config():
argvalues=[ argvalues=[
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
]) ])
def test_custom_quant(vllm_runner, model): def test_custom_quant(vllm_runner, model, monkeypatch):
"""Test infer with the custom quantization method.""" """Test infer with the custom quantization method."""
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_name=model, with vllm_runner(model_name=model,
quantization="custom_quant", quantization="custom_quant",
enforce_eager=True) as llm: enforce_eager=True) as llm:
......
...@@ -6,6 +6,13 @@ Run `pytest tests/samplers/test_beam_search.py`. ...@@ -6,6 +6,13 @@ Run `pytest tests/samplers/test_beam_search.py`.
import pytest import pytest
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Apply ``run_with_both_engines`` to every test in this module.

    We can run both engines for this test. The fixture has no body of its
    own; requesting ``run_with_both_engines`` (defined outside this file) is
    its only effect.
    """
# FIXME(zhuohan): The test can not pass if we: # FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256. # 1. Increase max_tokens to 256.
# 2. Increase beam_width to 8. # 2. Increase beam_width to 8.
...@@ -15,6 +22,7 @@ BEAM_WIDTHS = [4] ...@@ -15,6 +22,7 @@ BEAM_WIDTHS = [4]
MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"] MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]
@pytest.mark.skip_v1 # FIXME: This fails on V1 right now.
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", MAX_TOKENS) @pytest.mark.parametrize("max_tokens", MAX_TOKENS)
......
...@@ -8,6 +8,13 @@ import pytest ...@@ -8,6 +8,13 @@ import pytest
from vllm import SamplingParams from vllm import SamplingParams
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Apply ``run_with_both_engines`` to every test in this module.

    We can run both engines for this test. The fixture has no body of its
    own; requesting ``run_with_both_engines`` (defined outside this file) is
    its only effect.
    """
# We also test with llama because it has generation_config to specify EOS # We also test with llama because it has generation_config to specify EOS
# (past regression). # (past regression).
MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"] MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
......
...@@ -8,6 +8,14 @@ from vllm import SamplingParams ...@@ -8,6 +8,14 @@ from vllm import SamplingParams
MODELS = ["distilbert/distilgpt2"] MODELS = ["distilbert/distilgpt2"]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this file to the V0 engine.

    This file tests V0 internals, so ``VLLM_USE_V1`` is set to ``"0"``
    (via ``monkeypatch``, i.e. per test) before each test runs.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_logits_processor_force_generate( def test_logits_processor_force_generate(
......
...@@ -10,6 +10,15 @@ from ..conftest import VllmRunner ...@@ -10,6 +10,15 @@ from ..conftest import VllmRunner
MODELS = ["distilbert/distilgpt2"] MODELS = ["distilbert/distilgpt2"]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    This module is V0 only since it uses dtype=float, so ``VLLM_USE_V1`` is
    set to ``"0"`` (via ``monkeypatch``, i.e. per test) for all tests in the
    module.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", @pytest.mark.parametrize("dtype",
["float"]) # needed for comparing logprobs with HF ["float"]) # needed for comparing logprobs with HF
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment