Commit fcfc474d authored by zhuwenwen
Browse files

Merge tag 'v0.8.3' into v0.8.3-dev

parents bb94d2e5 296c6572
......@@ -54,8 +54,10 @@ def test_can_initialize(model_arch):
model_info.default,
tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
speculative_model=model_info.speculative_model,
num_speculative_tokens=1 if model_info.speculative_model else None,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
} if model_info.speculative_model else None,
trust_remote_code=model_info.trust_remote_code,
load_format="dummy",
hf_overrides=hf_overrides,
......
......@@ -23,6 +23,11 @@ from .registry import HF_EXAMPLE_MODELS
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
def test_registry_imports(model_arch):
# Llama4ForCausalLM does not have a standalone model
if model_arch == "Llama4ForCausalLM":
return
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_transformers_version(on_fail="skip")
......@@ -91,8 +96,11 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda):
def test_hf_registry_coverage():
untested_archs = (ModelRegistry.get_supported_archs() -
HF_EXAMPLE_MODELS.get_supported_archs())
untested_archs = set(ModelRegistry.get_supported_archs() -
HF_EXAMPLE_MODELS.get_supported_archs())
# Llama4ForCausalLM does not have a standalone model
untested_archs.discard("Llama4ForCausalLM")
assert not untested_archs, (
"Please add the following architectures to "
......
......@@ -3,8 +3,6 @@
Run `pytest tests/models/test_transformers.py`.
"""
from contextlib import nullcontext
import pytest
from ..conftest import HfRunner, VllmRunner
......@@ -42,7 +40,6 @@ def check_implementation(
"model,model_impl",
[
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
("openai-community/gpt2", "transformers"),
("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE
]) # trust_remote_code=True by default
def test_models(
......@@ -52,20 +49,11 @@ def test_models(
model: str,
model_impl: str,
) -> None:
maybe_raises = nullcontext()
if model == "openai-community/gpt2" and model_impl == "transformers":
# Model is not backend compatible
maybe_raises = pytest.raises(
ValueError,
match="The Transformers implementation.*not compatible with vLLM")
with maybe_raises:
check_implementation(hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl)
check_implementation(hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl)
@multi_gpu_test(num_gpus=2)
......@@ -84,7 +72,6 @@ def test_distributed(
"meta-llama/Llama-3.2-1B-Instruct",
{
"quantization": "bitsandbytes",
"load_format": "bitsandbytes",
},
),
])
......
# SPDX-License-Identifier: Apache-2.0
import torch
from vllm.model_executor.models.utils import AutoWeightsLoader
class ModuleWithBatchNorm(torch.nn.Module):
    """Minimal module that wraps a single ``BatchNorm1d`` over two features.

    Used by the tests in this file as a weight source/target whose state
    includes the batch-norm running statistics.
    """

    def __init__(self):
        super().__init__()
        # The tests below read running_mean, running_var and
        # num_batches_tracked from this layer.
        self.bn = torch.nn.BatchNorm1d(2)

    def forward(self, x):
        """Return ``x`` passed through the batch-norm layer."""
        return self.bn(x)
class ModuleWithNestedBatchNorm(torch.nn.Module):
    """Module whose only child is a ``ModuleWithBatchNorm``.

    The batch-norm layer therefore sits one level deeper
    (``nested_mod.bn``), which lets the tests exercise weight loading
    through a nested submodule.
    """

    def __init__(self):
        super().__init__()
        self.nested_mod = ModuleWithBatchNorm()

    def forward(self, x):
        """Return ``x`` passed through the nested module."""
        return self.nested_mod(x)
def test_module_with_batchnorm_can_load():
    """Ensure the auto weight loader can load batchnorm stats."""
    mod = ModuleWithBatchNorm()
    # Run some data through the module with batchnorm so that its running
    # statistics and batch counter move away from their initial values.
    mod(torch.Tensor([[1, 2], [3, 4]]))

    # Try to load the weights to a new instance
    def weight_generator():
        # Stream (name, tensor) pairs straight from the source module.
        yield from mod.state_dict().items()

    new_mod = ModuleWithBatchNorm()

    # Precondition: the fresh instance must differ from the source,
    # otherwise the post-load checks below would pass trivially.
    assert not torch.all(new_mod.bn.running_mean == mod.bn.running_mean)
    assert not torch.all(new_mod.bn.running_var == mod.bn.running_var)
    assert new_mod.bn.num_batches_tracked.item() == 0

    loader = AutoWeightsLoader(new_mod)
    loader.load_weights(weight_generator())

    # Ensure the stats are updated
    assert torch.all(new_mod.bn.running_mean == mod.bn.running_mean)
    assert torch.all(new_mod.bn.running_var == mod.bn.running_var)
    # Exactly one forward pass was run on `mod` above.
    assert new_mod.bn.num_batches_tracked.item() == 1
def test_module_with_child_containing_batchnorm_can_autoload():
    """Ensure the auto weight loader can load nested modules batchnorm stats."""
    mod = ModuleWithNestedBatchNorm()
    # Run some data through the module with batchnorm so that its running
    # statistics and batch counter move away from their initial values.
    mod(torch.Tensor([[1, 2], [3, 4]]))

    # Try to load the weights to a new instance
    def weight_generator():
        # Stream (name, tensor) pairs straight from the source module.
        yield from mod.state_dict().items()

    new_mod = ModuleWithNestedBatchNorm()

    # Precondition: the fresh instance must differ from the source,
    # otherwise the post-load checks below would pass trivially.
    assert not torch.all(
        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
    assert not torch.all(
        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0

    loader = AutoWeightsLoader(new_mod)
    loader.load_weights(weight_generator())

    # Ensure the stats are updated
    assert torch.all(
        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
    assert torch.all(
        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
    # Exactly one forward pass was run on `mod` above.
    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
......@@ -28,8 +28,7 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
replace_token_matches)
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config)
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import full_groupby
from .utils import random_image
......@@ -955,10 +954,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
limit_mm_per_prompt=limit_mm_per_prompt,
)
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
profiler = MultiModalProfiler(processor)
mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
......@@ -994,10 +990,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
limit_mm_per_prompt=limit_mm_per_prompt,
)
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
rng = np.random.RandomState(0)
image = random_image(rng, min_wh=128, max_wh=256)
......@@ -1066,10 +1059,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
revision=None,
)
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
orig_get_hf_processor = processor.info.get_hf_processor
def get_hf_processor(self, **kwargs):
......
......@@ -11,12 +11,10 @@ import pytest
import os
from PIL import Image, ImageChops
from transformers import AutoConfig, AutoTokenizer
from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import (MediaConnector,
merge_and_sort_multimodal_metadata,
repeat_and_pad_placeholder_tokens)
merge_and_sort_multimodal_metadata)
from ..utils import models_path_prefix, urls_port
if TYPE_CHECKING:
......@@ -139,71 +137,6 @@ async def test_fetch_image_local_files(image_url: str):
f"file://{temp_dir}/../{os.path.basename(image_url)}")
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
def test_repeat_and_pad_placeholder_tokens(model):
config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index
tokenizer = AutoTokenizer.from_pretrained(model)
test_cases = [
(
"<image>",
2,
"<image><image>",
[32000, 32000],
[{ "offset": 0, "length": 2 }],
),
(
"<image><image>",
2,
"<image><image><image>",
[32000, 32000, 32000],
[{ "offset": 0, "length": 2 }],
),
(
"<image><image>",
[3, 2],
"<image><image><image><image><image>",
[32000, 32000, 32000, 32000, 32000],
[{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }],
),
(
"Image:<image>Image:<image>!",
[3, 2],
"Image:<image><image><image>Image:<image><image>!",
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }],
),
(
"<image>",
[3, 2],
"<image><image><image>",
[32000, 32000, 32000],
[{ "offset": 0, "length": 3 }],
),
] # yapf: disable
for (
prompt,
repeat_count,
expected_prompt,
expected_token_ids,
expected_ranges,
) in test_cases:
new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer=tokenizer,
prompt=prompt,
prompt_token_ids=tokenizer.encode(prompt,
add_special_tokens=False),
placeholder_token_id=image_token_id,
repeat_count=repeat_count,
)
assert new_prompt == expected_prompt
assert new_token_ids == expected_token_ids
assert ranges == expected_ranges
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict"
......@@ -225,7 +158,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes={"image": ["hash1", "hash2"]},
expected_modalities=["image"],
expected_modalities=["image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=2),
......@@ -242,7 +175,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes=None,
expected_modalities=["image"],
expected_modalities=["image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=2),
......@@ -267,7 +200,7 @@ def test_merge_and_sort_multimodal_metadata():
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=["audio", "image"],
expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
......@@ -293,7 +226,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes=None,
expected_modalities=["audio", "image"],
expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
......@@ -324,7 +257,9 @@ def test_merge_and_sort_multimodal_metadata():
"audio": ["audio_hash1"],
"video": ["video_hash1", "video_hash2", "video_hash3"]
},
expected_modalities=["audio", "video", "image"],
expected_modalities=[
"audio", "video", "video", "video", "image", "image"
],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=4),
......@@ -370,12 +305,19 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=[],
expected_ranges=[],
expected_hashes=None,
expected_modalities=["image", "audio", "image", "audio"],
expected_ranges=[
PlaceholderRange(offset=0, length=4),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=2),
PlaceholderRange(offset=11, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "image_hash2", "audio_hash2"
],
),
# <image> <image> <video> <audio> <image>
# <image> <image> <audio> <video> <image>
TestCase(
mm_positions={
"image": [
......@@ -391,15 +333,54 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
]
},
mm_hashes=None,
expected_modalities=[],
expected_ranges=[],
expected_modalities=["image", "image", "audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=5),
PlaceholderRange(offset=20, length=4),
],
expected_hashes=None,
),
# <image> <audio> <video> <image> with hashes
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=18, length=4),
],
"audio": [
PlaceholderRange(offset=6, length=2),
],
"video": [
PlaceholderRange(offset=10, length=5),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1"],
"video": ["video_hash1"],
},
expected_modalities=["image", "audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=6, length=2),
PlaceholderRange(offset=10, length=5),
PlaceholderRange(offset=18, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "video_hash1", "image_hash2"
],
),
]
for case in test_cases:
with pytest.raises(ValueError) as ex_info:
merge_and_sort_multimodal_metadata(case.mm_positions,
case.mm_hashes)
for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
expected_hashes) in test_cases:
modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
mm_positions, mm_hashes)
assert "Interleaved mixed-modality" in str(ex_info.value)
assert modalities == expected_modalities
assert ranges == expected_ranges
assert hashes == expected_hashes
......@@ -64,9 +64,11 @@ def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
key_cache = torch.zeros_like(key_cache_cpu, device=device)
value_cache = torch.zeros_like(value_cache_cpu, device=device)
slot_mapping = slot_mapping_cpu.to(device)
kv_cache = torch.stack([key_cache, value_cache])
# Run vectorized implementation on XLA device
reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)
reshape_and_cache(key, value, kv_cache, slot_mapping)
key_cache, value_cache = torch.unbind(kv_cache, dim=0)
# Move results back to CPU for comparison
key_cache_result = key_cache.cpu()
......
......@@ -258,13 +258,13 @@ def sample_inputs(
value[start_loc:end_loc])
cur_ctx += block_size
block_id += 1
kv_cache = torch.stack([k_cache, v_cache])
return (
query,
k,
v,
k_cache,
v_cache,
kv_cache,
block_table,
key,
value,
......@@ -361,8 +361,7 @@ def test_contexted_kv_attention(
query,
k_active,
v_active,
k_cache,
v_cache,
kv_cache,
block_table,
key,
value,
......@@ -439,8 +438,7 @@ def test_contexted_kv_attention(
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()
# transform block table
active_block_table = get_active_block_tables(
......@@ -487,8 +485,7 @@ def test_contexted_kv_attention(
query.to(device=device),
k.to(device=device),
v.to(device=device),
k_cache.to(device=device),
v_cache.to(device=device),
kv_cache.to(device=device),
active_block_table.to(device=device),
attn_mask.to(device=device),
)
......
......@@ -105,8 +105,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
"--enable-prefix-caching",
"--quantization",
"bitsandbytes",
"--load-format",
"bitsandbytes",
"--gpu-memory-utilization",
"0.7",
]
......@@ -141,7 +139,6 @@ def validate_generated_texts(hf_runner,
# when using distributed inference
with vllm_runner(model_name,
quantization='bitsandbytes',
load_format='bitsandbytes',
tensor_parallel_size=vllm_tp_size,
enforce_eager=False) as llm:
vllm_outputs = llm.generate_greedy(prompts, 8)
......
......@@ -23,6 +23,23 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
from vllm.platforms import current_platform
from ..utils import models_path_prefix
# AITER only supports per-channel-per-channel INT8 GEMM
# and per-tensor-per-tensor INT8 GEMM.
# It does not support mixed-precision MM or mixed quantization schemes.
ROCM_AITER_SUPPORTED_INT8_MODEL = [
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
]
# TritonScaledMMLinearKernel only supports symmetric quantization.
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
......@@ -60,6 +77,11 @@ def use_v0_only(monkeypatch):
)
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
model_path, strategy, quant_type, shape_0, is_symmetric = model_args
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
with vllm_runner(model_path, enforce_eager=True) as llm:
def check_model(model):
......@@ -126,6 +148,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_compressed_tensors_w8a8_logprobs(
hf_runner,
vllm_runner,
......@@ -133,7 +157,21 @@ def test_compressed_tensors_w8a8_logprobs(
model_path,
max_tokens,
num_logprobs,
use_aiter,
monkeypatch,
):
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
pytest.skip(
f"Skip model {model_path} as it is not support by aiter.")
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
dtype = "bfloat16"
# skip language translation prompt for the static per tensor asym model
......@@ -157,6 +195,9 @@ def test_compressed_tensors_w8a8_logprobs(
name_1="vllm",
)
if current_platform.is_rocm():
torch.cuda.synchronize()
def test_compressed_tensors_no_enforce_eager(vllm_runner):
model_path = os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change")
......@@ -180,8 +221,27 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
),
],
)
def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
@pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_compressed_tensors_w8a8_dynamic_per_token(
vllm_runner,
model_args,
use_aiter,
monkeypatch,
):
model_path, strategy = model_args
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
pytest.skip(
f"Skip model {model_path} as it is not support by aiter.")
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
with vllm_runner(model_path, dtype=torch.float16) as llm:
def check_model(model):
......@@ -212,6 +272,8 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4),
],
)
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="The tests are skipped on non-CUDA platform.")
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
model, strategy, group, pack_factor = wNa16_args
with vllm_runner(model) as llm:
......@@ -236,8 +298,8 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
assert output
@pytest.mark.skipif(current_platform(),
reason="W4A16 MARLIN is not supported on ROCm.")
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test is skipped on non-CUDA platform.")
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
model_path = os.path.join(models_path_prefix,"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t")
with vllm_runner(model_path) as llm:
......@@ -280,7 +342,7 @@ def test_compressed_tensors_fp8(vllm_runner):
if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
assert len(qkv_proj.input_scale.shape) == 0
assert qkv_proj.weight.dtype is torch.float8_e4m3fn
assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
assert qkv_proj.weight_scale.dtype is torch.float32
assert len(qkv_proj.weight_scale.shape) == 0
......@@ -290,8 +352,8 @@ def test_compressed_tensors_fp8(vllm_runner):
assert output
@pytest.mark.skipif(current_platform(),
reason="FP8 KV cache is not supported on ROCm.")
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test is skipped on non-CUDA platform.")
def test_compressed_tensors_kv_cache(vllm_runner):
model_path = os.path.join(models_path_prefix,"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")
with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
......@@ -320,7 +382,8 @@ def _test_2of4_quant_models(qkv_proj,
@pytest.mark.skipif(
not current_platform.has_device_capability(90),
not current_platform.is_cuda()
or not current_platform.has_device_capability(90),
reason="Sparse FP8 is not yet supported on this GPU type.",
)
@pytest.mark.parametrize(
......@@ -367,7 +430,8 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
@pytest.mark.skipif(
not current_platform.has_device_capability(90),
not current_platform.is_cuda()
or not current_platform.has_device_capability(90),
reason="Sparse FP8 is not yet supported on this GPU type.",
)
@pytest.mark.parametrize(
......
......@@ -12,13 +12,6 @@ from ..utils import compare_two_settings, models_path_prefix
from vllm.platforms import current_platform
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
# Fall back to V0 if cpu offloading is enabled.
# Fixture is required to that baseline uses V0.
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
......@@ -35,7 +28,9 @@ def test_cpu_offload_fp8():
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
def test_cpu_offload_gptq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test GPTQ Marlin
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
["--cpu-offload-gb", "1"],
......@@ -49,7 +44,9 @@ def test_cpu_offload_gptq():
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(),
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
def test_cpu_offload_awq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test AWQ Marlin
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
["--cpu-offload-gb", "1"],
......@@ -63,7 +60,9 @@ def test_cpu_offload_awq():
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
def test_cpu_offload_compressed_tensors(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test wNa16
compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
["--cpu-offload-gb", "1"],
......
......@@ -25,8 +25,14 @@ MODELS = [
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
monkeypatch) -> None:
use_rocm_aiter: bool, monkeypatch) -> None:
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
......@@ -49,7 +55,13 @@ KV_CACHE_MODELS = [
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
use_rocm_aiter: bool, monkeypatch):
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
......@@ -88,8 +100,13 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
monkeypatch) -> None:
use_rocm_aiter: bool, monkeypatch) -> None:
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
......
......@@ -3,74 +3,126 @@
import pytest
from transformers import AutoTokenizer
from tests.entrypoints.openai.reasoning_parsers.utils import (
run_reasoning_extraction)
from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
ReasoningParserManager)
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Key passed to ReasoningParserManager.get_reasoning_parser() in this module.
parser_name = "deepseek_r1"
# Markers that open and close the reasoning section in the sample outputs.
start_token = "<think>"
end_token = "</think>"

# Checkpoint whose tokenizer the module-scoped fixture below loads.
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
@pytest.fixture(scope="module")
def deepseek_r1_qwen_tokenizer():
    """Return the tokenizer for ``REASONING_MODEL_NAME``.

    Module scope means the tokenizer is created once and shared by every
    test in this module that requests the fixture.
    """
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
# Expectation tables for the deepseek_r1 parser tests. Every entry maps:
#   output            -> raw model text fed to the parser
#   reasoning_content -> expected extracted reasoning
#   content           -> expected extracted answer
#   is_reasoning_end  -> expected result of the end-of-reasoning check
SIMPLE_REASONING = dict(
    output="This is a reasoning section</think>This is the rest",
    reasoning_content="This is a reasoning section",
    content="This is the rest",
    is_reasoning_end=True,
)
COMPLETE_REASONING = dict(
    output="This is a reasoning section</think>",
    reasoning_content="This is a reasoning section",
    content=None,
    is_reasoning_end=True,
)
NO_CONTENT = dict(
    output="This is content",
    reasoning_content="This is content",
    content=None,
    is_reasoning_end=False,
)
NO_REASONING_STREAMING = dict(
    output="This is a reasoning section",
    reasoning_content="This is a reasoning section",
    content=None,
    is_reasoning_end=False,
)
MULTIPLE_LINES = dict(
    output="This\nThat</think>This is the rest\nThat",
    reasoning_content="This\nThat",
    content="This is the rest\nThat",
    is_reasoning_end=True,
)
SHORTEST_REASONING_NO_STREAMING = dict(
    output="</think>This is the rest",
    reasoning_content="",
    content="This is the rest",
    is_reasoning_end=True,
)
SHORTEST_REASONING = dict(
    output="</think>This is the rest",
    reasoning_content=None,
    content="This is the rest",
    is_reasoning_end=True,
)
REASONING_WITH_THINK = dict(
    output="<think>This is a reasoning section</think>This is the rest",
    reasoning_content="This is a reasoning section",
    content="This is the rest",
    is_reasoning_end=True,
)
COMPLETE_REASONING_WITH_THINK = dict(
    output="<think>This is a reasoning section</think>",
    reasoning_content="This is a reasoning section",
    content=None,
    is_reasoning_end=True,
)
MULTIPLE_LINES_WITH_THINK = dict(
    output="<think>This\nThat</think>This is the rest\nThat",
    reasoning_content="This\nThat",
    content="This is the rest\nThat",
    is_reasoning_end=True,
)
SHORTEST_REASONING_NO_STREAMING_WITH_THINK = dict(
    output="</think>This is the rest",
    reasoning_content="",
    content="This is the rest",
    is_reasoning_end=True,
)
SHORTEST_REASONING_WITH_THINK = dict(
    output="</think>This is the rest",
    reasoning_content=None,
    content="This is the rest",
    is_reasoning_end=True,
)
THINK_NO_END = dict(
    output="<think>This is a reasoning section",
    reasoning_content="This is a reasoning section",
    content=None,
    is_reasoning_end=False,
)
EMPTY = dict(
    output="",
    reasoning_content="",
    content=None,
    is_reasoning_end=False,
)
EMPTY_STREAMING = dict(
    output="",
    reasoning_content=None,
    content=None,
    is_reasoning_end=False,
)
NEW_LINE = dict(
    output="\n<think>This is a reasoning section</think>\nThis is the rest",
    reasoning_content="This is a reasoning section",
    content="\nThis is the rest",
    is_reasoning_end=True,
)
# In streaming mode a newline ahead of <think> cannot be dropped: both
# "<think>...</think>" and a bare "...</think>" have to be supported, so
# there is no way to tell whether text preceding <think> is reasoning
# content or not. The expected reasoning therefore keeps the newline.
NEW_LINE_STREAMING = dict(
    output="\n<think>This is a reasoning section</think>\nThis is the rest",
    reasoning_content="\nThis is a reasoning section",
    content="\nThis is the rest",
    is_reasoning_end=True,
)
TEST_CASES = [
......@@ -164,25 +216,53 @@ TEST_CASES = [
SHORTEST_REASONING_WITH_THINK,
id="shortest_with_think_streaming",
),
pytest.param(
False,
THINK_NO_END,
id="think_no_end",
),
pytest.param(
True,
THINK_NO_END,
id="think_no_end_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
NEW_LINE,
id="new_line",
),
pytest.param(
True,
NEW_LINE_STREAMING,
id="new_line_streaming",
),
]
# Global tokenizer initialization to avoid repeated loading
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
tokenizer.add_tokens([start_token, end_token])
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
streaming: bool,
param_dict: dict,
deepseek_r1_qwen_tokenizer,
):
output = tokenizer.tokenize(param_dict["output"])
output = deepseek_r1_qwen_tokenizer.tokenize(param_dict["output"])
# decode everything to tokens
output_tokens: list[str] = [
tokenizer.convert_tokens_to_string([token]) for token in output
deepseek_r1_qwen_tokenizer.convert_tokens_to_string([token])
for token in output
]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
parser_name)(tokenizer)
parser_name)(deepseek_r1_qwen_tokenizer)
reasoning, content = run_reasoning_extraction(parser,
output_tokens,
......@@ -190,3 +270,17 @@ def test_reasoning(
assert reasoning == param_dict["reasoning_content"]
assert content == param_dict["content"]
# Test is_reasoning_end
output_ids = deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(output)
is_reasoning_end = parser.is_reasoning_end(output_ids)
assert is_reasoning_end == param_dict["is_reasoning_end"]
# Test extract_content
if param_dict["content"] is not None:
content = parser.extract_content_ids(output_ids)
assert content == deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(
deepseek_r1_qwen_tokenizer.tokenize(param_dict["content"]))
else:
content = parser.extract_content_ids(output)
assert content == []
# SPDX-License-Identifier: Apache-2.0
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import DeltaMessage, run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Name used to look up the reasoning parser class under test.
parser_name = "granite"

# Text that precedes the reasoning / the response in the sample outputs below.
START_REASONING = "Here is my thought process:"
START_RESPONSE = "Here is my response:"

# Each sample maps a raw model output to the reasoning text and the content
# text the tests expect to be extracted from it.

# Reasoning followed by a response.
SIMPLE_REASONING = dict(
    output=(START_REASONING + "This is a reasoning section" + START_RESPONSE +
            "This is the rest"),
    reasoning_content="This is a reasoning section",
    content="This is the rest",
)

# Reasoning followed by the response marker, but no response text.
COMPLETE_REASONING = dict(
    output=START_REASONING + "This is a reasoning section" + START_RESPONSE,
    reasoning_content="This is a reasoning section",
    content=None,
)

# Output without either marker.
NO_REASONING = dict(
    output="This is content",
    reasoning_content=None,
    content="This is content",
)

# Reasoning and response that each span a line break.
MULTIPLE_LINES = dict(
    output=(START_REASONING + "This\nThat" + START_RESPONSE +
            "This is the rest\nThat"),
    reasoning_content="This\nThat",
    content="This is the rest\nThat",
)

# The *_WITH_THINK samples hold the same values as their counterparts above;
# they are kept as separate dict objects.
REASONING_WITH_THINK = dict(SIMPLE_REASONING)
COMPLETE_REASONING_WITH_THINK = dict(COMPLETE_REASONING)
MULTIPLE_LINES_WITH_THINK = dict(MULTIPLE_LINES)
# Every sample is exercised twice: first all samples without streaming, then
# the same samples again with streaming (ids suffixed with "_streaming").
TEST_CASES = [
    pytest.param(streaming, sample, id=sample_id + id_suffix)
    for streaming, id_suffix in ((False, ""), (True, "_streaming"))
    for sample_id, sample in (
        ("simple_reasoning", SIMPLE_REASONING),
        ("complete_reasoning", COMPLETE_REASONING),
        ("no_reasoning", NO_REASONING),
        ("multiple_lines", MULTIPLE_LINES),
        ("reasoning_with_think", REASONING_WITH_THINK),
        ("complete_reasoning_with_think", COMPLETE_REASONING_WITH_THINK),
        ("multiple_lines_with_think", MULTIPLE_LINES_WITH_THINK),
    )
]
# Global tokenizer initialization to avoid repeated loading: the tokenizer is
# created once at module level and shared by the tests in this file.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
):
    """Check the reasoning/content split produced for one sample output.

    The sample's ``output`` text is tokenized, each token is turned back into
    its own string, and the resulting pieces are fed through
    ``run_reasoning_extraction`` with the requested ``streaming`` mode. The
    extracted reasoning and content must equal the sample's expectations.
    """
    raw_tokens = tokenizer.tokenize(param_dict["output"])

    # decode everything to tokens
    token_strings: list[str] = []
    for raw_token in raw_tokens:
        token_strings.append(tokenizer.convert_tokens_to_string([raw_token]))

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser,
        token_strings,
        streaming=streaming,
    )

    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]
# Additional tests for verifying the correctness of granite streaming; this
# is complicated because granite uses multiple tokens to indicate when thinking
# is starting / when it's starting its response, so skipping special tokens
# is awkward.
### Handling the start of reasoning
# Only the first word of the start-of-reasoning text has arrived; nothing is
# expected to be emitted.
STREAMING_1 = dict(
    previous_text=None,
    current_text="Here",
    delta_text="Here",
    reasoning_content=None,
    content=None,
)
# When we fail, we should give what was previously being silenced first
STREAMING_2 = dict(
    previous_text="Here is my thought",
    current_text="Here is my thought failure",
    delta_text=" failure",
    reasoning_content=None,
    content="Here is my thought failure",
)
# But then after the first one, we should only add the delta text to content.
# `current_text` has to be `previous_text` followed by `delta_text`, like in
# the other subcases; these two values used to be swapped.
STREAMING_3 = {
    "previous_text": "Here wrong",
    "current_text": "Here wrong words",
    "delta_text": " words",
    "reasoning_content": None,
    "content": " words",
}
# The start-of-reasoning sequence has just been completed and nothing follows
# it yet, so neither reasoning content nor content is expected.
STREAMING_4 = dict(
    previous_text="Here is my thought",
    current_text="Here is my thought process:",
    delta_text=" process:",
    reasoning_content=None,
    content=None,
)
# Reasoning started successfully; parse reasoning content
STREAMING_5 = dict(
    previous_text="Here is my thought process:",
    current_text="Here is my thought process: foo",
    delta_text=" foo",
    reasoning_content=" foo",
    content=None,
)
# Response special sequence has started, but not finished.
STREAMING_6 = dict(
    previous_text="Here is my thought process: foo",
    current_text="Here is my thought process: foo Here is",
    delta_text=" Here is",
    reasoning_content=" ",
    content=None,
)
# Response special sequence started, but was broken; the reasoning
# content should be the content that was previously unused.
STREAMING_7 = dict(
    previous_text="Here is my thought process: foo Here is",
    current_text="Here is my thought process: foo Here is Here",
    delta_text=" Here",
    reasoning_content="Here is ",
    content=None,
)
# The response sequence is already complete in the previous text; the delta
# is expected to be returned as content.
STREAMING_8 = dict(
    previous_text="Here is my thought process: foo Here is my response:",
    current_text="Here is my thought process: foo Here is my response: bar",
    delta_text=" bar",
    reasoning_content=None,
    content=" bar",
)
# The delta text has everything; we should be able to correctly parse both
STREAMING_9 = dict(
    previous_text=None,
    current_text="Here is my thought process: foo Here is my response: bar",
    delta_text="Here is my thought process: foo Here is my response: bar",
    reasoning_content=" foo ",
    content=" bar",
)
## The Response is ongoing, and the delta mixes reasoning content / content
STREAMING_10 = dict(
    previous_text="Here is my thought process: foo",
    current_text="Here is my thought process: foo bar Here is my response: baz",
    delta_text=" bar Here is my response: baz",
    reasoning_content=" bar ",
    content=" baz",
)
# The delta text starts a new substring that might be a response special seq
STREAMING_11 = dict(
    previous_text="Here is my thought process: This is a reasoning section ",
    current_text="Here is my thought process: This is a reasoning section Here",
    delta_text="Here",
    reasoning_content=None,
    content=None,
)
# The delta text is finishing the response special seq
STREAMING_12 = dict(
    previous_text="Here is my thought process: foo Here is my response",
    current_text="Here is my thought process: foo Here is my response:",
    delta_text=":",
    reasoning_content=None,
    content=None,
)
# The delta breaks a possible response sequence; the text held back so far is
# expected to come out as reasoning content together with the delta.
STREAMING_13 = dict(
    previous_text="Here is my thought process: foo Here",
    current_text="Here is my thought process: foo Here was",
    delta_text=" was",
    reasoning_content="Here was",
    content=None,
)
# One parametrized case per streaming scenario; the id is the human-readable
# name shown in the pytest report.
STREAMING_SUBCASES = [
    pytest.param(
        STREAMING_1,
        id="Starting reasoning special sequence",
    ),
    pytest.param(
        STREAMING_2,
        id="Unexpected start reasoning sequence",
    ),
    pytest.param(
        STREAMING_3,
        id="Continuing unexpected start reasoning sequence",
    ),
    pytest.param(
        STREAMING_4,
        id="Only start reasoning sequence and nothing else",
    ),
    pytest.param(
        STREAMING_5,
        id="Reasoning content has started",
    ),
    pytest.param(
        STREAMING_6,
        id="Response special sequence has started",
    ),
    pytest.param(
        STREAMING_7,
        id="Response special sequence reset",
    ),
    pytest.param(
        STREAMING_8,
        id="Response text has started",
    ),
    pytest.param(
        STREAMING_9,
        id="Delta contains everything",
    ),
    pytest.param(
        STREAMING_10,
        id="Delta contains some reasoning and response",
    ),
    pytest.param(
        STREAMING_11,
        id="Delta starts response sequence",
    ),
    pytest.param(
        STREAMING_12,
        id="Delta finishes response sequence",
    ),
    pytest.param(
        STREAMING_13,
        # Spelling of "response" corrected in this id.
        id="Delta breaks potential response sequence",
    ),
]
@pytest.mark.parametrize("param_dict", STREAMING_SUBCASES)
def test_streaming_subcases(param_dict):
    """Run a single streaming step through the parser and check its result.

    The previous / current / delta texts of the subcase are encoded to token
    IDs and passed, together with the texts themselves, to
    ``extract_reasoning_content_streaming``. When the subcase expects neither
    reasoning content nor content, the parser must return ``None``; otherwise
    it must return a ``DeltaMessage`` carrying the expected values.
    """
    previous_text = param_dict["previous_text"]
    current_text = param_dict["current_text"]
    delta_text = param_dict["delta_text"]

    # Get all of the token IDs; a missing previous text means no previous IDs
    if previous_text is not None:
        previous_token_ids = tokenizer.encode(previous_text)
    else:
        previous_token_ids = []
    current_token_ids = tokenizer.encode(current_text)
    delta_token_ids = tokenizer.encode(delta_text)

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    response = parser.extract_reasoning_content_streaming(
        previous_text=previous_text,
        current_text=current_text,
        delta_text=delta_text,
        previous_token_ids=previous_token_ids,
        current_token_ids=current_token_ids,
        delta_token_ids=delta_token_ids,
    )

    expected_reasoning = param_dict["reasoning_content"]
    expected_content = param_dict["content"]

    # Streaming currently expects at least one of reasoning content / content,
    # so the response should return None in that case.
    if expected_reasoning is None and expected_content is None:
        assert response is None
        return

    assert isinstance(response, DeltaMessage)
    assert expected_reasoning == response.reasoning_content
    assert expected_content == response.content
......@@ -4,7 +4,7 @@ from typing import Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser
from vllm.reasoning import ReasoningParser
class StreamingReasoningReconstructor:
......
......@@ -3,6 +3,7 @@
tensor parallelism.
"""
import json
from typing import Optional
import pytest
......@@ -30,14 +31,14 @@ from ...utils import models_path_prefix
@pytest.mark.parametrize("test_llm_kwargs", [
[
"--speculative_config",
str({
json.dumps({
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3,
}),
],
[
"--speculative_config",
str({
json.dumps({
"model": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
......@@ -90,7 +91,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"model, test_llm_kwargs",
[(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
"--speculative_config",
str({
json.dumps({
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
......@@ -98,7 +99,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
]),
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
"--speculative_config",
str({
json.dumps({
"model": os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
......@@ -149,20 +150,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
}),
])])
@pytest.mark.parametrize("logprobs", [None, 2])
@pytest.mark.parametrize("logprobs", [None])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
......@@ -173,9 +174,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
if logprobs:
test_llm_kwargs.extend(
["--disable_logprobs_during_spec_decoding", "False"])
run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0,
logprobs=logprobs)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor_parallel_size",
"2",
# precision
"--dtype",
"bfloat16",
]])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"],
[
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
"--max-num-seqs", "4"
]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
"disable_logprobs": False,
}),
])])
@pytest.mark.parametrize("logprobs", [2])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2_with_logprobs(
model, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
batch_size: int, seed: int):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
......
......@@ -3,6 +3,8 @@
tensor parallelism.
"""
import json
import openai
import pytest
import torch
......@@ -35,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config",
str({
json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
......@@ -82,7 +84,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative_config",
str({
json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"max_model_len": 32,
......
......@@ -2,19 +2,22 @@
# ruff: noqa
import asyncio
import hashlib
import pickle
import socket
from collections.abc import AsyncIterator
from unittest.mock import patch
import pytest
import torch
from vllm_test_utils import monitor
from vllm_test_utils.monitor import monitor
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
PlaceholderModule, StoreBoolean, bind_kv_cache,
deprecate_kwargs, get_open_port, memory_profiling,
merge_async_iterators, supports_kw, swap_dict_values)
merge_async_iterators, sha256, supports_kw,
swap_dict_values)
from .utils import create_new_process_for_each_test, error_on_warning
from .utils import models_path_prefix
......@@ -141,7 +144,8 @@ def parser():
def parser_with_config():
parser = FlexibleArgumentParser()
parser.add_argument('serve')
parser.add_argument('model_tag')
parser.add_argument('model_tag', nargs='?')
parser.add_argument('--model', type=str)
parser.add_argument('--served-model-name', type=str)
parser.add_argument('--config', type=str)
parser.add_argument('--port', type=int)
......@@ -198,29 +202,29 @@ def test_missing_required_argument(parser):
parser.parse_args([])
def test_cli_override_to_config(parser_with_config):
def test_cli_override_to_config(parser_with_config, cli_config_file):
args = parser_with_config.parse_args([
'serve', 'mymodel', '--config', './data/test_config.yaml',
'serve', 'mymodel', '--config', cli_config_file,
'--tensor-parallel-size', '3'
])
assert args.tensor_parallel_size == 3
args = parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
'./data/test_config.yaml'
cli_config_file
])
assert args.tensor_parallel_size == 3
assert args.port == 12312
args = parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
'./data/test_config.yaml', '--port', '666'
cli_config_file, '--port', '666'
])
assert args.tensor_parallel_size == 3
assert args.port == 666
def test_config_args(parser_with_config):
def test_config_args(parser_with_config, cli_config_file):
args = parser_with_config.parse_args(
['serve', 'mymodel', '--config', './data/test_config.yaml'])
['serve', 'mymodel', '--config', cli_config_file])
assert args.tensor_parallel_size == 2
assert args.trust_remote_code
assert not args.multi_step_stream_outputs
......@@ -242,10 +246,9 @@ def test_config_file(parser_with_config):
])
def test_no_model_tag(parser_with_config):
def test_no_model_tag(parser_with_config, cli_config_file):
with pytest.raises(ValueError):
parser_with_config.parse_args(
['serve', '--config', './data/test_config.yaml'])
parser_with_config.parse_args(['serve', '--config', cli_config_file])
# yapf: enable
......@@ -478,3 +481,63 @@ def test_swap_dict_values(obj, key1, key2):
assert obj[key1] == original_obj[key2]
else:
assert key1 not in obj
def test_model_specification(parser_with_config,
                             cli_config_file,
                             cli_config_file_with_model):
    """Check how the model is picked up from the CLI and from a config file.

    Covers, in order: a positional model given together with a config file
    that also names a model, a model coming only from the config file, no
    model anywhere, the rejected ``--model`` option, and the remaining
    config values when a positional model is given.
    """
    parse = parser_with_config.parse_args

    # Test model in CLI takes precedence over config
    cli_args = parse(
        ['serve', 'cli-model', '--config', cli_config_file_with_model])
    assert cli_args.model_tag == 'cli-model'
    assert cli_args.served_model_name == 'mymodel'

    # Test model from config file works
    config_args = parse(['serve', '--config', cli_config_file_with_model])
    assert config_args.model == 'config-model'
    assert config_args.served_model_name == 'mymodel'

    # Test no model specified anywhere raises error
    with pytest.raises(ValueError, match="No model specified!"):
        parse(['serve', '--config', cli_config_file])

    # Test using --model option raises error
    model_option_message = (
        "With `vllm serve`, you should provide the model as a positional "
        "argument or in a config file instead of via the `--model` option.")
    with pytest.raises(ValueError, match=model_option_message):
        parse(['serve', '--model', 'my-model'])

    # Test other config values are preserved
    preserved_args = parse(
        ['serve', 'cli-model', '--config', cli_config_file_with_model])
    assert preserved_args.tensor_parallel_size == 2
    assert preserved_args.trust_remote_code is True
    assert preserved_args.multi_step_stream_outputs is False
    assert preserved_args.port == 12312
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
                                   (None, bool, [1, 2, 3])])
@pytest.mark.parametrize("output", [0, 1, 2])
def test_sha256(input: tuple, output: int):
    """Check that ``sha256`` yields a stable, input-dependent integer.

    The result must be a non-zero ``int`` equal to the big-endian integer
    value of the SHA-256 digest of the pickled input. ``output`` is not used
    in the body.
    """
    digest_value = sha256(input)
    assert digest_value is not None
    assert isinstance(digest_value, int)
    assert digest_value != 0

    # Recompute the expected value by hand from the pickled input
    pickled_input = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    expected_value = int.from_bytes(hashlib.sha256(pickled_input).digest(),
                                    byteorder="big")
    assert digest_value == expected_value

    # hashing again, returns the same value
    assert digest_value == sha256(input)

    # hashing different input, returns different value
    assert digest_value != sha256(input + (1, ))
......@@ -45,7 +45,8 @@ def test_chat_completion_request_with_no_tools():
assert request.tool_choice == 'none'
def test_chat_completion_request_with_tool_choice_but_no_tools():
@pytest.mark.parametrize('tool_choice', ['auto', 'required'])
def test_chat_completion_request_with_tool_choice_but_no_tools(tool_choice):
with pytest.raises(ValueError,
match="When using `tool_choice`, `tools` must be set."):
ChatCompletionRequest.model_validate({
......@@ -56,7 +57,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
'model':
os.path.join(models_path_prefix, 'facebook/opt-125m'),
'tool_choice':
'auto'
tool_choice
})
with pytest.raises(ValueError,
......@@ -69,7 +70,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
'model':
os.path.join(models_path_prefix, 'facebook/opt-125m'),
'tool_choice':
'auto',
tool_choice,
'tools':
None
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment