Unverified Commit d4d93db2 authored by Robert Shaw's avatar Robert Shaw Committed by GitHub
Browse files

[V1] V1 Enablement Oracle (#13726)


Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
parent 8c0d15d5
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    The module is V0 only, so VLLM_USE_V1 is set to "0" in the
    environment before each test function runs.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
...@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, ...@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter from vllm.utils import Counter
from ...core.utils import create_seq_group from ..core.utils import create_seq_group
@pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("seq_output_len", [128])
......
...@@ -3,12 +3,21 @@ ...@@ -3,12 +3,21 @@
import sys import sys
from contextlib import nullcontext from contextlib import nullcontext
import pytest
from vllm_test_utils import BlameResult, blame from vllm_test_utils import BlameResult, blame
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    V1 only supports xgrammar, so what this module checks is
    irrelevant there; VLLM_USE_V1 is set to "0" before each test.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
def run_normal_opt125m(): def run_normal_opt125m():
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
......
...@@ -10,7 +10,6 @@ from ...utils import RemoteOpenAIServer ...@@ -10,7 +10,6 @@ from ...utils import RemoteOpenAIServer
# # any model with a chat template should work here # # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -22,8 +21,6 @@ def server(): ...@@ -22,8 +21,6 @@ def server():
"--enforce-eager", "--enforce-eager",
"--max-model-len", "--max-model-len",
"4080", "4080",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
...@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer ...@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer
# # any model with a chat template should work here # # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
API_KEY = "abc-123" API_KEY = "abc-123"
ERROR_API_KEY = "abc" ERROR_API_KEY = "abc"
ROOT_PATH = "llm" ROOT_PATH = "llm"
...@@ -28,8 +27,6 @@ def server(): ...@@ -28,8 +27,6 @@ def server():
"4080", "4080",
"--root-path", # use --root-path=/llm for testing "--root-path", # use --root-path=/llm for testing
"/" + ROOT_PATH, "/" + ROOT_PATH,
"--chat-template",
DUMMY_CHAT_TEMPLATE,
] ]
envs = os.environ.copy() envs = os.environ.copy()
......
...@@ -23,12 +23,14 @@ def clear_cache(): ...@@ -23,12 +23,14 @@ def clear_cache():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, device: str, monkeypatch): def test_env(name: str, use_v1: bool, device: str, monkeypatch):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
""" """
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
override_backend_env_variable(monkeypatch, name) override_backend_env_variable(monkeypatch, name)
if device == "cpu": if device == "cpu":
...@@ -40,7 +42,8 @@ def test_env(name: str, device: str, monkeypatch): ...@@ -40,7 +42,8 @@ def test_env(name: str, device: str, monkeypatch):
with patch("vllm.attention.selector.current_platform", RocmPlatform()): with patch("vllm.attention.selector.current_platform", RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16, backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False) False)
assert backend.get_name() == "ROCM_FLASH" EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED
elif device == "openvino": elif device == "openvino":
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules', OpenVinoPlatform()), patch.dict('sys.modules',
...@@ -54,7 +57,8 @@ def test_env(name: str, device: str, monkeypatch): ...@@ -54,7 +57,8 @@ def test_env(name: str, device: str, monkeypatch):
CudaPlatform()): CudaPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, backend = get_attn_backend(16, torch.float16, torch.float16,
16, False) 16, False)
assert backend.get_name() == name EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == EXPECTED
def test_flash_attn(monkeypatch): def test_flash_attn(monkeypatch):
...@@ -95,13 +99,23 @@ def test_flash_attn(monkeypatch): ...@@ -95,13 +99,23 @@ def test_flash_attn(monkeypatch):
assert backend.get_name() != STR_FLASH_ATTN_VAL assert backend.get_name() != STR_FLASH_ATTN_VAL
def test_invalid_env(monkeypatch): @pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch):
"""Ignore the invalid env variable if it is set.""" """Ignore the invalid env variable if it is set."""
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
with patch("vllm.attention.selector.current_platform", CudaPlatform()): with patch("vllm.attention.selector.current_platform", CudaPlatform()):
backend = get_attn_backend(32, torch.float16, None, 16, False) backend = get_attn_backend(32, torch.float16, None, 16, False)
assert backend.get_name() == "FLASH_ATTN" EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
assert backend.get_name() == EXPECTED
# when block size == 16, backend will fall back to XFORMERS # when block size == 16, backend will fall back to XFORMERS
backend = get_attn_backend(16, torch.float16, None, 16, False) # this behavior is not yet supported on V1.
assert backend.get_name() == "XFORMERS" if use_v1:
# TODO: support fallback on V1!
# https://github.com/vllm-project/vllm/issues/14524
pass
else:
backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() == "XFORMERS"
...@@ -22,6 +22,16 @@ from vllm.config import VllmConfig, set_current_vllm_config ...@@ -22,6 +22,16 @@ from vllm.config import VllmConfig, set_current_vllm_config
from vllm.forward_context import set_forward_context from vllm.forward_context import set_forward_context
from vllm.platforms import current_platform from vllm.platforms import current_platform
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    Encoder-decoder is only supported on V0, so VLLM_USE_V1 is set
    to "0" in the environment before each test function runs.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
# List of supported backends for encoder/decoder models # List of supported backends for encoder/decoder models
LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
HEAD_SIZES = [64, 256] HEAD_SIZES = [64, 256]
......
...@@ -24,7 +24,8 @@ def test_selector(monkeypatch): ...@@ -24,7 +24,8 @@ def test_selector(monkeypatch):
with patch("vllm.attention.selector.current_platform", RocmPlatform()): with patch("vllm.attention.selector.current_platform", RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "ROCM_FLASH" assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
# mla test for deepseek related # mla test for deepseek related
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True) False, True)
......
...@@ -80,6 +80,8 @@ def v1(run_with_both_engines_lora): ...@@ -80,6 +80,8 @@ def v1(run_with_both_engines_lora):
pass pass
# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_llama_lora(sql_lora_files): def test_llama_lora(sql_lora_files):
...@@ -123,6 +125,8 @@ def test_llama_lora_warmup(sql_lora_files): ...@@ -123,6 +125,8 @@ def test_llama_lora_warmup(sql_lora_files):
"less when using lora than when not using lora") "less when using lora than when not using lora")
# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_llama_lora_tp4(sql_lora_files): def test_llama_lora_tp4(sql_lora_files):
......
...@@ -8,7 +8,7 @@ import os ...@@ -8,7 +8,7 @@ import os
import pytest import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.llm import LLM from vllm.engine.llm_engine import LLMEngine
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
MODEL_PATH = "meta-llama/Llama-2-7b-hf" MODEL_PATH = "meta-llama/Llama-2-7b-hf"
...@@ -43,7 +43,7 @@ def test_lora_functions_sync(): ...@@ -43,7 +43,7 @@ def test_lora_functions_sync():
gpu_memory_utilization=0.8, gpu_memory_utilization=0.8,
enforce_eager=True) enforce_eager=True)
llm = LLM.get_engine_class().from_engine_args(engine_args) llm = LLMEngine.from_engine_args(engine_args)
def run_check(fn, args, expected: list): def run_check(fn, args, expected: list):
fn(args) fn(args)
......
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
from safetensors.torch import load_file from safetensors.torch import load_file
from torch import nn from torch import nn
from vllm import envs
from vllm.config import LoRAConfig from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA, from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA,
...@@ -410,6 +411,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): ...@@ -410,6 +411,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.device == device assert manager.device == device
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device): sql_lora_files, device):
...@@ -489,6 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, ...@@ -489,6 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device) device)
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device): sql_lora_files, device):
......
...@@ -15,6 +15,15 @@ from vllm.engine.metrics import RayPrometheusStatLogger ...@@ -15,6 +15,15 @@ from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    This module tests V0 internals, so VLLM_USE_V1 is set to "0"
    in the environment before each test function runs.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
MODELS = [ MODELS = [
"distilbert/distilgpt2", "distilbert/distilgpt2",
] ]
......
...@@ -110,16 +110,6 @@ def test_models( ...@@ -110,16 +110,6 @@ def test_models(
example_prompts = tokenizer.apply_chat_template( example_prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True) messages, tokenize=False, add_generation_prompt=True)
# Run unquantized model.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
# Run gguf model. # Run gguf model.
with vllm_runner(model_name=model.gguf_model, with vllm_runner(model_name=model.gguf_model,
enforce_eager=True, enforce_eager=True,
...@@ -130,6 +120,16 @@ def test_models( ...@@ -130,6 +120,16 @@ def test_models(
gguf_outputs = gguf_model.generate_greedy_logprobs( gguf_outputs = gguf_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs) example_prompts[:-1], max_tokens, num_logprobs)
# Run unquantized model.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=original_outputs, outputs_0_lst=original_outputs,
outputs_1_lst=gguf_outputs, outputs_1_lst=gguf_outputs,
......
...@@ -9,7 +9,9 @@ from vllm.sampling_params import SamplingParams ...@@ -9,7 +9,9 @@ from vllm.sampling_params import SamplingParams
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
# This test is for the hybrid models # This test is for the hybrid models
MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"] MODELS = ["ai21labs/Jamba-tiny-dev"]
# Bamba at Fp32 is too big for the CI (L4 GPU).
# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -41,13 +43,6 @@ def test_models( ...@@ -41,13 +43,6 @@ def test_models(
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i]
...@@ -192,6 +187,7 @@ def test_parallel_sampling( ...@@ -192,6 +187,7 @@ def test_parallel_sampling(
) )
@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20]) @pytest.mark.parametrize("max_tokens", [20])
...@@ -293,6 +289,7 @@ def test_state_cleanup( ...@@ -293,6 +289,7 @@ def test_state_cleanup(
"could be related to finished_requests_ids") "could be related to finished_requests_ids")
@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
def test_multistep( def test_multistep(
...@@ -308,6 +305,7 @@ def test_multistep( ...@@ -308,6 +305,7 @@ def test_multistep(
vllm_model.generate_greedy([example_prompts[0]] * 10, 1) vllm_model.generate_greedy([example_prompts[0]] * 10, 1)
@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
......
...@@ -68,13 +68,6 @@ def test_models( ...@@ -68,13 +68,6 @@ def test_models(
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i]
......
...@@ -213,16 +213,6 @@ def test_mistral_format( ...@@ -213,16 +213,6 @@ def test_mistral_format(
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
) -> None: ) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
...@@ -233,6 +223,16 @@ def test_mistral_format( ...@@ -233,6 +223,16 @@ def test_mistral_format(
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs( mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_format_outputs, outputs_0_lst=hf_format_outputs,
outputs_1_lst=mistral_format_outputs, outputs_1_lst=mistral_format_outputs,
...@@ -261,6 +261,7 @@ def test_mistral_symbolic_languages( ...@@ -261,6 +261,7 @@ def test_mistral_symbolic_languages(
assert "�" not in outputs[0].outputs[0].text.strip() assert "�" not in outputs[0].outputs[0].text.strip()
@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("model", @pytest.mark.parametrize("model",
MISTRAL_FORMAT_MODELS) # v1 can't do func calling MISTRAL_FORMAT_MODELS) # v1 can't do func calling
......
...@@ -7,6 +7,12 @@ import pytest ...@@ -7,6 +7,12 @@ import pytest
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
# These have unsupported head_dim for FA. We do
# not have a clean way to fall back, so we fail with
# a clear msg when it happens.
# https://github.com/vllm-project/vllm/issues/14524
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", "model",
...@@ -71,7 +77,10 @@ def test_models( ...@@ -71,7 +77,10 @@ def test_models(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
monkeypatch,
) -> None: ) -> None:
if model in REQUIRES_V0:
monkeypatch.setenv("VLLM_USE_V1", "0")
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
if model.startswith("THUDM/chatglm3"): if model.startswith("THUDM/chatglm3"):
...@@ -85,13 +94,6 @@ def test_models( ...@@ -85,13 +94,6 @@ def test_models(
vllm_outputs = vllm_model.generate_greedy_logprobs( vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
......
...@@ -108,7 +108,12 @@ def run_awq_test( ...@@ -108,7 +108,12 @@ def run_awq_test(
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode() @torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, source_model, quant_model, def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None: size_factors, dtype, max_tokens, num_logprobs,
monkeypatch) -> None:
# Test V1: this test hangs during setup on single-scale input.
# TODO: figure out why and re-enable this on V1.
monkeypatch.setenv("VLLM_USE_V1", "0")
run_awq_test( run_awq_test(
vllm_runner, vllm_runner,
image_assets, image_assets,
......
...@@ -9,8 +9,7 @@ from pathlib import PosixPath ...@@ -9,8 +9,7 @@ from pathlib import PosixPath
import pytest import pytest
from packaging.version import Version from packaging.version import Version
from transformers import (AutoModelForImageTextToText, AutoModelForPreTraining, from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
AutoModelForVision2Seq)
from transformers import __version__ as TRANSFORMERS_VERSION from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -33,6 +32,16 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs, ...@@ -33,6 +32,16 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
if current_platform.is_rocm(): if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl",
"h2ovl",
"blip2",
# V1 Test: not enough KV cache space in CI.
"fuyu",
]
# yapf: disable # yapf: disable
COMMON_BROADCAST_SETTINGS = { COMMON_BROADCAST_SETTINGS = {
"test_type": VLMTestType.IMAGE, "test_type": VLMTestType.IMAGE,
...@@ -157,25 +166,25 @@ VLM_TEST_SETTINGS = { ...@@ -157,25 +166,25 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( # "aria": VLMTestInfo(
models=["rhymes-ai/Aria"], # models=["rhymes-ai/Aria"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), # test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 # prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n", # img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
max_model_len=4096, # max_model_len=4096,
max_num_seqs=2, # max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, # auto_cls=AutoModelForImageTextToText,
single_image_prompts=IMAGE_ASSETS.prompts({ # single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<vlm_image>Please describe the image shortly.", # "stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
}), # }),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501 # multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
stop_str=["<|im_end|>"], # stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)], # image_size_factors=[(0.10, 0.15)],
max_tokens=64, # max_tokens=64,
marks=[large_gpu_mark(min_gb=64)], # marks=[large_gpu_mark(min_gb=64)],
), # ),
"blip2": VLMTestInfo( "blip2": VLMTestInfo(
models=["Salesforce/blip2-opt-2.7b"], models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
...@@ -589,7 +598,9 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ...@@ -589,7 +598,9 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test( runners.run_single_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -612,7 +623,9 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ...@@ -612,7 +623,9 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test( runners.run_multi_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -635,7 +648,9 @@ def test_image_embedding_models(model_type: str, ...@@ -635,7 +648,9 @@ def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test( runners.run_embedding_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -655,7 +670,9 @@ def test_image_embedding_models(model_type: str, ...@@ -655,7 +670,9 @@ def test_image_embedding_models(model_type: str,
)) ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
video_assets: _VideoAssets): video_assets: _VideoAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test( runners.run_video_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -678,7 +695,10 @@ def test_custom_inputs_models( ...@@ -678,7 +695,10 @@ def test_custom_inputs_models(
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch,
): ):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test( runners.run_custom_inputs_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -701,7 +721,9 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -701,7 +721,9 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test( runners.run_single_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -725,7 +747,9 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -725,7 +747,9 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test( runners.run_multi_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -749,7 +773,9 @@ def test_image_embedding_models_heavy(model_type: str, ...@@ -749,7 +773,9 @@ def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test( runners.run_embedding_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -770,7 +796,9 @@ def test_image_embedding_models_heavy(model_type: str, ...@@ -770,7 +796,9 @@ def test_image_embedding_models_heavy(model_type: str,
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
video_assets: _VideoAssets): video_assets: _VideoAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test( runners.run_video_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -794,7 +822,10 @@ def test_custom_inputs_models_heavy( ...@@ -794,7 +822,10 @@ def test_custom_inputs_models_heavy(
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch,
): ):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test( runners.run_custom_inputs_test(
model_test_info=model_test_info, model_test_info=model_test_info,
......
...@@ -14,6 +14,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, ...@@ -14,6 +14,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
PromptVideoInput, VllmRunner) PromptVideoInput, VllmRunner)
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    V1 Test: the batch_make_xxxxx_embeddings helpers call a V0
    internal, so VLLM_USE_V1 is set to "0" before each test.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
models = ["Qwen/Qwen2-VL-2B-Instruct"] models = ["Qwen/Qwen2-VL-2B-Instruct"]
target_dtype = "half" target_dtype = "half"
...@@ -118,6 +127,7 @@ def batch_make_image_embeddings( ...@@ -118,6 +127,7 @@ def batch_make_image_embeddings(
return visual(pixel_values_on_device, return visual(pixel_values_on_device,
grid_thw=image_grid_thw_on_device) grid_thw=image_grid_thw_on_device)
# V1 Test: this calls a V0 internal.
image_embeds = torch.concat(llm.apply_model(get_image_embeds)) image_embeds = torch.concat(llm.apply_model(get_image_embeds))
# split into original batches # split into original batches
...@@ -201,6 +211,7 @@ def batch_make_video_embeddings( ...@@ -201,6 +211,7 @@ def batch_make_video_embeddings(
return visual(pixel_values_on_device, return visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device) grid_thw=video_grid_thw_on_device)
# V1 Test: this calls a V0 internal.
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) video_embeds = torch.concat(llm.apply_model(get_image_embeds))
# split into original batches # split into original batches
...@@ -253,7 +264,6 @@ def run_embedding_input_test( ...@@ -253,7 +264,6 @@ def run_embedding_input_test(
processor = AutoProcessor.from_pretrained(model) processor = AutoProcessor.from_pretrained(model)
# NOTE:
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
task="generate", task="generate",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment