Commit 469e903b authored by zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
......@@ -31,7 +31,10 @@ def test_lm_head(
vllm_runner,
model_id: str,
lm_head_quantized: bool,
monkeypatch,
) -> None:
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_id, dtype=torch.float16,
max_model_len=2048) as vllm_model:
......
......@@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
QuarkLinearMethod, QuarkW8A8Fp8)
def test_quark_fp8(vllm_runner):
def test_quark_fp8(vllm_runner, monkeypatch):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
with vllm_runner(model_path) as llm:
......
......@@ -5,7 +5,7 @@ See https://github.com/vllm-project/vllm/issues/11926 for more details.
Run `pytest tests/quantization/test_register_quantization_config.py`.
"""
from typing import Any, Dict, List, Optional
from typing import Any, Optional
import pytest
import torch
......@@ -58,7 +58,7 @@ class CustomQuantConfig(QuantizationConfig):
"""Name of the quantization method."""
return "custom_quant"
def get_supported_act_dtypes(self) -> List["torch.dtype"]:
def get_supported_act_dtypes(self) -> list["torch.dtype"]:
    """Return the activation dtypes this config accepts: fp16 and bf16."""
    supported_dtypes = [torch.float16, torch.bfloat16]
    return supported_dtypes
......@@ -68,12 +68,12 @@ class CustomQuantConfig(QuantizationConfig):
return -1
@staticmethod
def get_config_filenames() -> List[str]:
def get_config_filenames() -> list[str]:
    """Return the config filenames to look for in the model directory.

    This custom config does not look for any file, so the list is empty.
    """
    filenames: list[str] = []
    return filenames
@classmethod
def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig":
def from_config(cls, config: dict[str, Any]) -> "CustomQuantConfig":
    """Build a ``CustomQuantConfig`` from a model's quantization config.

    ``num_bits`` is read from ``config`` and falls back to 8 when absent.
    """
    num_bits = config.get("num_bits", 8)
    return CustomQuantConfig(num_bits=num_bits)
......@@ -101,8 +101,10 @@ def test_register_quantization_config():
argvalues=[
"meta-llama/Llama-3.2-1B-Instruct",
])
def test_custom_quant(vllm_runner, model):
def test_custom_quant(vllm_runner, model, monkeypatch):
"""Test infer with the custom quantization method."""
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_name=model,
quantization="custom_quant",
enforce_eager=True) as llm:
......
......@@ -8,6 +8,13 @@ import pytest
import os
from ..utils import models_path_prefix
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Request ``run_with_both_engines`` for every test in this module.

    These tests are expected to work on both engines; the fixture body
    itself does nothing beyond pulling in that dependency.
    """
# FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256.
# 2. Increase beam_width to 8.
......@@ -17,6 +24,7 @@ BEAM_WIDTHS = [4]
MODELS = [os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")]
@pytest.mark.skip_v1 # FIXME: This fails on V1 right now.
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", MAX_TOKENS)
......
......@@ -10,6 +10,13 @@ import os
from vllm import SamplingParams
from ..utils import models_path_prefix
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse hook that pulls in ``run_with_both_engines``.

    The tests in this module can run on both engines, so no further setup
    is done here.
    """
# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
......
......@@ -10,6 +10,14 @@ from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this file to V0 by setting ``VLLM_USE_V1=0``.

    The file exercises V0 internals; ``monkeypatch`` undoes the environment
    change after each test.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_logits_processor_force_generate(
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest
import torch
import os
......@@ -14,6 +12,15 @@ from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set ``VLLM_USE_V1=0`` for every test in this module.

    The module is V0 only because it uses dtype=float; ``monkeypatch``
    undoes the environment change after each test.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype",
["half"]) # needed for comparing logprobs with HF
......@@ -72,7 +79,7 @@ def test_get_prompt_logprobs(
assert (len(logprobs) == num_top_logprobs
or len(logprobs) == num_top_logprobs + 1)
output_text = result.outputs[0].text
output_string_from_most_likely_tokens_lst: List[str] = []
output_string_from_most_likely_tokens_lst: list[str] = []
for top_logprobs in result.outputs[0].logprobs:
top_logprob = next(iter(top_logprobs.values()))
output_string_from_most_likely_tokens_lst.append(
......
......@@ -5,20 +5,27 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
"""
import os
from typing import List, Optional
from typing import Optional
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from ..utils import models_path_prefix
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Apply ``run_with_both_engines`` to all bad-words tests in this file.

    Nothing else is configured; the tests are meant to pass on both engines.
    """
def _generate(
model: LLM,
prompt: str,
num_prompt_tokens: int,
temperature: float = 0,
bad_words: Optional[List[str]] = None,
) -> List[int]:
bad_words: Optional[list[str]] = None,
) -> list[int]:
sampling_params = SamplingParams(
temperature=temperature,
bad_words=bad_words,
......@@ -60,7 +67,7 @@ class TestOneTokenBadWord:
def _generate(self,
model: LLM,
bad_words: Optional[List[str]] = None) -> List[int]:
bad_words: Optional[list[str]] = None) -> list[int]:
return _generate(
model=model,
prompt=self.PROMPT,
......@@ -70,7 +77,7 @@ class TestOneTokenBadWord:
def _encode(self,
prompt: str,
add_special_tokens: bool = True) -> List[int]:
add_special_tokens: bool = True) -> list[int]:
return self.tokenizer(prompt,
add_special_tokens=add_special_tokens).input_ids
......@@ -150,7 +157,7 @@ class TestTwoTokenBadWord:
def _generate(self,
model: LLM,
bad_words: Optional[List[str]] = None) -> List[int]:
bad_words: Optional[list[str]] = None) -> list[int]:
return _generate(
model=model,
prompt=self.PROMPT,
......@@ -159,7 +166,7 @@ class TestTwoTokenBadWord:
)
@staticmethod
def _contains(sequence: List[int], subsequence: List[int]) -> bool:
def _contains(sequence: list[int], subsequence: list[int]) -> bool:
searched = False
for start in range(len(sequence)):
......@@ -182,6 +189,6 @@ class TestTwoTokenBadWord:
def _encode(self,
prompt: str,
add_special_tokens: bool = True) -> List[int]:
add_special_tokens: bool = True) -> list[int]:
return self.tokenizer(prompt,
add_special_tokens=add_special_tokens).input_ids
......@@ -9,6 +9,12 @@ from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Depend on ``run_with_both_engines`` for each test in this module.

    The tests here can run on both engines, so the fixture has no body.
    """
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
......
# SPDX-License-Identifier: Apache-2.0
"""Tests for rejection sampling."""
from typing import List, Tuple
import pytest
import torch
......@@ -8,7 +7,16 @@ import torch.nn.functional as F
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
from vllm.utils import is_hip
from vllm.platforms import current_platform
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Run each rejection-sampler test with ``VLLM_USE_V1=0``.

    This file tests V0 internals; ``monkeypatch`` reverts the variable when
    the test finishes.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
......@@ -46,7 +54,7 @@ def mock_causal_accepted_tensor(
"which_tokens_accepted",
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not current_platform.is_rocm() else [False])
@torch.inference_mode()
def test_correct_output_format(which_tokens_accepted: str, seed: int,
device: str, use_flashinfer: bool):
......@@ -130,7 +138,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", list(range(1, 32)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not current_platform.is_rocm() else [False])
@torch.inference_mode()
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
device: str, use_flashinfer: bool):
......@@ -162,7 +170,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not current_platform.is_rocm() else [False])
@torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
frac_seeded: float, n_rep: int, device: str,
......@@ -203,7 +211,7 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
assert torch.equal(results[j][i], results[0][i])
@pytest.mark.skipif(is_hip(),
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Consistent with NV.")
@pytest.mark.parametrize("k", [1, 3, 6])
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
......@@ -305,7 +313,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
for i in range(batch_size)
}
for use_flashinfer in [True, False] if not is_hip() else [False]:
for use_flashinfer in [True, False] if not current_platform.is_rocm() else [False]:
rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
rejection_sampler.init_gpu_tensors(device=device)
# We use seeded sequences to ensure the same tokens are accepted
......@@ -326,7 +334,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
@pytest.mark.parametrize("which_token_ids",
["bonus_token_ids", "draft_token_ids"])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not current_platform.is_rocm() else [False])
@torch.inference_mode()
def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
which_token_ids: str, device: str,
......@@ -378,7 +386,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@pytest.mark.parametrize("seed", list(range(5)))
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not current_platform.is_rocm() else [False])
@torch.inference_mode()
def test_rejection_sampling_approximates_target_distribution(
seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool):
......@@ -419,8 +427,8 @@ def test_rejection_sampling_approximates_target_distribution(
draft_and_target_probs_equal)
sample_sizes = [10, 100, 1_000, 10_000, 100_000]
distance_wrt_reference: List[float] = []
distance_wrt_target: List[float] = []
distance_wrt_reference: list[float] = []
distance_wrt_target: list[float] = []
for num_samples in sample_sizes:
(reference_vs_rejsample_dist,
......@@ -455,7 +463,7 @@ def test_rejection_sampling_approximates_target_distribution(
expected_improvement_multiplier)
def get_ratio_first_to_last(elements: List[float]) -> float:
def get_ratio_first_to_last(elements: list[float]) -> float:
    """Return the first element of ``elements`` divided by the last one."""
    first = elements[0]
    last = elements[-1]
    return first / last
......@@ -480,7 +488,7 @@ class _CorrectnessTestHelper:
def generate_probs_for_test(
self, draft_and_target_probs_equal: bool
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
draft_probs, target_probs = (F.softmax(
torch.rand(self.vocab_size, dtype=torch.float32),
dim=-1,
......@@ -502,7 +510,7 @@ class _CorrectnessTestHelper:
def run_and_compare_distributions(self, draft_probs: torch.Tensor,
target_probs: torch.Tensor,
reference_probs: torch.Tensor,
num_samples: int) -> Tuple[float, float]:
num_samples: int) -> tuple[float, float]:
# Sample using rejection sampling.
rej_sample_probs = self._estimate_rejection_sampling_pdf(
draft_probs, target_probs, num_samples)
......
......@@ -3,7 +3,7 @@
import itertools
import random
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from typing import Optional
from unittest.mock import Mock, patch
import pytest
......@@ -18,6 +18,14 @@ from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import Counter, is_pin_memory_available
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Force ``VLLM_USE_V1=0`` around each sampler test.

    This file tests V0 internals; ``monkeypatch`` reverts the variable once
    the test is over.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
class MockLogitsSampler(Sampler):
def __init__(self, fake_logits: torch.Tensor):
......@@ -30,7 +38,7 @@ class MockLogitsSampler(Sampler):
def _prepare_test(
batch_size: int
) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]:
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]:
input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
fake_logits = torch.full((batch_size, VOCAB_SIZE),
1e-2,
......@@ -53,8 +61,8 @@ def _do_sample(
sampling_params: SamplingParams,
device: str,
):
seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
seq_group_metadata_list: list[SequenceGroupMetadata] = []
seq_lens: list[int] = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
......@@ -171,7 +179,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
def create_sampling_params(min_tokens,
eos_token_id=0,
*,
stop_token_ids: Optional[List[int]] = None,
stop_token_ids: Optional[list[int]] = None,
prompt_logprobs: Optional[int] = None):
sampling_params = SamplingParams(
min_tokens=min_tokens,
......@@ -196,7 +204,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
batch_size = random.randint(1, 128)
expected_penalization = []
sequence_metadata_list: List[SequenceGroupMetadata] = []
sequence_metadata_list: list[SequenceGroupMetadata] = []
# 20% chance to generate seq group metadata list with all prompts
is_prompt = random.random() < 0.2
while batch_size > 0:
......@@ -216,8 +224,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids)
seq_data: Dict[int, SequenceData] = {}
seq_group_penalization: List[bool] = []
seq_data: dict[int, SequenceData] = {}
seq_group_penalization: list[bool] = []
for _ in range(num_seqs):
num_input = random.randint(1, 100)
num_generated = 0 if is_prompt else random.randint(1, 100)
......@@ -376,16 +384,16 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
else:
test_cases = [generate_test_case()]
def run_test_case(*, expected_penalization: List[bool],
seq_group_metadata_list: List[SequenceGroupMetadata]):
def run_test_case(*, expected_penalization: list[bool],
seq_group_metadata_list: list[SequenceGroupMetadata]):
assert expected_penalization, \
"Invalid test case, need expected_penalization"
assert seq_group_metadata_list, \
"Invalid test case, need seq_group_metadata_list"
batch_size = 0
seq_lens: List[int] = []
sampling_params_per_row: List[SamplingParams] = []
seq_lens: list[int] = []
sampling_params_per_row: list[SamplingParams] = []
for sgm in seq_group_metadata_list:
sampling_params = sgm.sampling_params
......@@ -456,11 +464,11 @@ def test_sampler_mixed(seed: int, device: str):
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler = _prepare_test(batch_size)
seq_group_metadata_list: List[SequenceGroupMetadata] = []
expected_tokens: List[Optional[List[int]]] = []
seq_lens: List[int] = []
seq_group_metadata_list: list[SequenceGroupMetadata] = []
expected_tokens: list[Optional[list[int]]] = []
seq_lens: list[int] = []
for i in range(batch_size):
expected: Optional[List[int]] = None
expected: Optional[list[int]] = None
sampling_type = random.randint(0, 2)
if sampling_type == 0:
sampling_params = SamplingParams(temperature=0)
......@@ -492,7 +500,7 @@ def test_sampler_mixed(seed: int, device: str):
))
seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
generators: Dict[str, torch.Generator] = {}
generators: dict[str, torch.Generator] = {}
def test_sampling():
sampling_metadata = SamplingMetadata.prepare(
......@@ -587,8 +595,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
device=device)
assert len(processors) == 2 # top_p and top_k
seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
seq_group_metadata_list: list[SequenceGroupMetadata] = []
seq_lens: list[int] = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
......@@ -669,10 +677,10 @@ def test_sampler_repetition_penalty_mixed(device: str):
vocab_size = 8
def test_sampling_params(sampling_params: List[SamplingParams]):
def test_sampling_params(sampling_params: list[SamplingParams]):
seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
seq_group_metadata_list: list[SequenceGroupMetadata] = []
seq_lens: list[int] = []
for i in range(2):
seq_group_metadata_list.append(
SequenceGroupMetadata(
......
......@@ -19,7 +19,9 @@ RANDOM_SEEDS = list(range(5))
@pytest.fixture
def vllm_model(vllm_runner):
def vllm_model(vllm_runner, monkeypatch):
    """Yield a half-precision runner for ``MODEL`` with V1 disabled.

    ``VLLM_USE_V1`` is set to "0" before the runner is created because this
    file relies on V0 internals.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(MODEL, dtype="half") as runner:
        yield runner
......
......@@ -11,6 +11,14 @@ from vllm.model_executor.utils import set_random_seed
CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Export ``VLLM_USE_V1=0`` for the duration of each test in this file.

    The file tests V0 internals; ``monkeypatch`` restores the environment
    afterwards.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
"""
Generates a fake temperature zero probability distribution.
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set ``VLLM_USE_V1=0`` for all tests in this V0-only module.

    ``monkeypatch`` removes the override again after each test.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Sequence
from itertools import cycle
from typing import List, Optional, Sequence, Tuple, Union
from typing import Optional, Union
import pytest
import torch
......@@ -55,7 +56,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
def maybe_assert_ngram_worker(llm):
# Verify the proposer worker is ngram if ngram is specified.
if (llm.llm_engine.speculative_config is not None
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
and llm.llm_engine.speculative_config.method == "ngram"):
from vllm.spec_decode.ngram_worker import NGramWorker
assert isinstance(
llm.llm_engine.model_executor.driver_worker.proposer_worker,
......@@ -64,9 +65,9 @@ def maybe_assert_ngram_worker(llm):
def get_output_from_llm_generator(
llm_generator, prompts,
sampling_params) -> Tuple[List[str], List[List[int]], float]:
tokens: List[str] = []
token_ids: List[List[int]] = []
sampling_params) -> tuple[list[str], list[list[int]], float]:
tokens: list[str] = []
token_ids: list[list[int]] = []
acceptance_rate: float = -1.0
for llm in llm_generator():
maybe_assert_ngram_worker(llm)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment