"vscode:/vscode.git/clone" did not exist on "57a314d1556cdcb17d26e55e324e21b02bdd9399"
Commit fcfc474d authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.3' into v0.8.3-dev

parents bb94d2e5 296c6572
...@@ -54,8 +54,10 @@ def test_can_initialize(model_arch): ...@@ -54,8 +54,10 @@ def test_can_initialize(model_arch):
model_info.default, model_info.default,
tokenizer=model_info.tokenizer, tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
speculative_model=model_info.speculative_model, speculative_config={
num_speculative_tokens=1 if model_info.speculative_model else None, "model": model_info.speculative_model,
"num_speculative_tokens": 1,
} if model_info.speculative_model else None,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
load_format="dummy", load_format="dummy",
hf_overrides=hf_overrides, hf_overrides=hf_overrides,
......
...@@ -23,6 +23,11 @@ from .registry import HF_EXAMPLE_MODELS ...@@ -23,6 +23,11 @@ from .registry import HF_EXAMPLE_MODELS
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
def test_registry_imports(model_arch): def test_registry_imports(model_arch):
# Llama4ForCausalLM does not have a standalone model
if model_arch == "Llama4ForCausalLM":
return
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_transformers_version(on_fail="skip") model_info.check_transformers_version(on_fail="skip")
...@@ -91,8 +96,11 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda): ...@@ -91,8 +96,11 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda):
def test_hf_registry_coverage(): def test_hf_registry_coverage():
untested_archs = (ModelRegistry.get_supported_archs() - untested_archs = set(ModelRegistry.get_supported_archs() -
HF_EXAMPLE_MODELS.get_supported_archs()) HF_EXAMPLE_MODELS.get_supported_archs())
# Llama4ForCausalLM does not have a standalone model
untested_archs.discard("Llama4ForCausalLM")
assert not untested_archs, ( assert not untested_archs, (
"Please add the following architectures to " "Please add the following architectures to "
......
...@@ -3,8 +3,6 @@ ...@@ -3,8 +3,6 @@
Run `pytest tests/models/test_transformers.py`. Run `pytest tests/models/test_transformers.py`.
""" """
from contextlib import nullcontext
import pytest import pytest
from ..conftest import HfRunner, VllmRunner from ..conftest import HfRunner, VllmRunner
...@@ -42,7 +40,6 @@ def check_implementation( ...@@ -42,7 +40,6 @@ def check_implementation(
"model,model_impl", "model,model_impl",
[ [
("meta-llama/Llama-3.2-1B-Instruct", "transformers"), ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
("openai-community/gpt2", "transformers"),
("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE ("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE
]) # trust_remote_code=True by default ]) # trust_remote_code=True by default
def test_models( def test_models(
...@@ -52,20 +49,11 @@ def test_models( ...@@ -52,20 +49,11 @@ def test_models(
model: str, model: str,
model_impl: str, model_impl: str,
) -> None: ) -> None:
check_implementation(hf_runner,
maybe_raises = nullcontext() vllm_runner,
if model == "openai-community/gpt2" and model_impl == "transformers": example_prompts,
# Model is not backend compatible model,
maybe_raises = pytest.raises( model_impl=model_impl)
ValueError,
match="The Transformers implementation.*not compatible with vLLM")
with maybe_raises:
check_implementation(hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl)
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
...@@ -84,7 +72,6 @@ def test_distributed( ...@@ -84,7 +72,6 @@ def test_distributed(
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
{ {
"quantization": "bitsandbytes", "quantization": "bitsandbytes",
"load_format": "bitsandbytes",
}, },
), ),
]) ])
......
# SPDX-License-Identifier: Apache-2.0
import torch
from vllm.model_executor.models.utils import AutoWeightsLoader
class ModuleWithBatchNorm(torch.nn.Module):
    """Minimal module whose only child is a two-feature ``BatchNorm1d``.

    Used as a fixture for weight-loading tests: besides its parameters, the
    batchnorm layer carries the ``running_mean``, ``running_var`` and
    ``num_batches_tracked`` entries that the tests in this file inspect.
    """

    def __init__(self):
        super().__init__()
        self.bn = torch.nn.BatchNorm1d(2)

    def forward(self, x):
        """Apply the batchnorm layer to ``x`` and return the result."""
        return self.bn(x)
class ModuleWithNestedBatchNorm(torch.nn.Module):
    """Module that holds a ``ModuleWithBatchNorm`` as a child.

    The batchnorm layer is therefore reachable only through
    ``nested_mod.bn``, which lets the tests exercise weight loading through
    one extra level of module nesting.
    """

    def __init__(self):
        super().__init__()
        self.nested_mod = ModuleWithBatchNorm()

    def forward(self, x):
        """Delegate to the nested module and return its output."""
        return self.nested_mod(x)
def test_module_with_batchnorm_can_load():
    """Ensure the auto weight loader can load batchnorm stats."""
    mod = ModuleWithBatchNorm()
    # Run some data through the module with batchnorm so that its running
    # statistics move away from their freshly-initialised values.
    mod(torch.Tensor([[1, 2], [3, 4]]))

    # Try to load the weights to a new instance
    def weight_generator():
        # Stream (name, tensor) pairs from the source module's state dict.
        yield from mod.state_dict().items()

    new_mod = ModuleWithBatchNorm()

    # Sanity check: before loading, the fresh module must not already match
    # the source module, otherwise the assertions after loading prove nothing.
    assert not torch.all(new_mod.bn.running_mean == mod.bn.running_mean)
    assert not torch.all(new_mod.bn.running_var == mod.bn.running_var)
    assert new_mod.bn.num_batches_tracked.item() == 0

    loader = AutoWeightsLoader(new_mod)
    loader.load_weights(weight_generator())

    # Ensure the stats are updated
    assert torch.all(new_mod.bn.running_mean == mod.bn.running_mean)
    assert torch.all(new_mod.bn.running_var == mod.bn.running_var)
    assert new_mod.bn.num_batches_tracked.item() == 1
def test_module_with_child_containing_batchnorm_can_autoload():
    """Ensure the auto weight loader can load nested modules batchnorm stats."""
    mod = ModuleWithNestedBatchNorm()
    # Run some data through the module with batchnorm so that its running
    # statistics move away from their freshly-initialised values.
    mod(torch.Tensor([[1, 2], [3, 4]]))

    # Try to load the weights to a new instance
    def weight_generator():
        # Stream (name, tensor) pairs from the source module's state dict.
        yield from mod.state_dict().items()

    new_mod = ModuleWithNestedBatchNorm()

    # Sanity check: before loading, the fresh module must not already match
    # the source module, otherwise the assertions after loading prove nothing.
    assert not torch.all(
        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
    assert not torch.all(
        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0

    loader = AutoWeightsLoader(new_mod)
    loader.load_weights(weight_generator())

    # Ensure the stats are updated
    assert torch.all(
        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
    assert torch.all(
        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
...@@ -28,8 +28,7 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo, ...@@ -28,8 +28,7 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
replace_token_matches) replace_token_matches)
# yapf: enable # yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.profiling import MultiModalProfiler
from vllm.transformers_utils.tokenizer import (AnyTokenizer, from vllm.transformers_utils.tokenizer import AnyTokenizer
cached_tokenizer_from_config)
from vllm.utils import full_groupby from vllm.utils import full_groupby
from .utils import random_image from .utils import random_image
...@@ -955,10 +954,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ...@@ -955,10 +954,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(model_config)
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
profiler = MultiModalProfiler(processor) profiler = MultiModalProfiler(processor)
mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
...@@ -994,10 +990,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ...@@ -994,10 +990,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(model_config)
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
rng = np.random.RandomState(0) rng = np.random.RandomState(0)
image = random_image(rng, min_wh=128, max_wh=256) image = random_image(rng, min_wh=128, max_wh=256)
...@@ -1066,10 +1059,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): ...@@ -1066,10 +1059,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
revision=None, revision=None,
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(model_config)
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
orig_get_hf_processor = processor.info.get_hf_processor orig_get_hf_processor = processor.info.get_hf_processor
def get_hf_processor(self, **kwargs): def get_hf_processor(self, **kwargs):
......
...@@ -11,12 +11,10 @@ import pytest ...@@ -11,12 +11,10 @@ import pytest
import os import os
from PIL import Image, ImageChops from PIL import Image, ImageChops
from transformers import AutoConfig, AutoTokenizer
from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import (MediaConnector, from vllm.multimodal.utils import (MediaConnector,
merge_and_sort_multimodal_metadata, merge_and_sort_multimodal_metadata)
repeat_and_pad_placeholder_tokens)
from ..utils import models_path_prefix, urls_port from ..utils import models_path_prefix, urls_port
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -139,71 +137,6 @@ async def test_fetch_image_local_files(image_url: str): ...@@ -139,71 +137,6 @@ async def test_fetch_image_local_files(image_url: str):
f"file://{temp_dir}/../{os.path.basename(image_url)}") f"file://{temp_dir}/../{os.path.basename(image_url)}")
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
def test_repeat_and_pad_placeholder_tokens(model):
config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index
tokenizer = AutoTokenizer.from_pretrained(model)
test_cases = [
(
"<image>",
2,
"<image><image>",
[32000, 32000],
[{ "offset": 0, "length": 2 }],
),
(
"<image><image>",
2,
"<image><image><image>",
[32000, 32000, 32000],
[{ "offset": 0, "length": 2 }],
),
(
"<image><image>",
[3, 2],
"<image><image><image><image><image>",
[32000, 32000, 32000, 32000, 32000],
[{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }],
),
(
"Image:<image>Image:<image>!",
[3, 2],
"Image:<image><image><image>Image:<image><image>!",
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }],
),
(
"<image>",
[3, 2],
"<image><image><image>",
[32000, 32000, 32000],
[{ "offset": 0, "length": 3 }],
),
] # yapf: disable
for (
prompt,
repeat_count,
expected_prompt,
expected_token_ids,
expected_ranges,
) in test_cases:
new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer=tokenizer,
prompt=prompt,
prompt_token_ids=tokenizer.encode(prompt,
add_special_tokens=False),
placeholder_token_id=image_token_id,
repeat_count=repeat_count,
)
assert new_prompt == expected_prompt
assert new_token_ids == expected_token_ids
assert ranges == expected_ranges
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. # Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class TestCase(NamedTuple): class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict" mm_positions: "MultiModalPlaceholderDict"
...@@ -225,7 +158,7 @@ def test_merge_and_sort_multimodal_metadata(): ...@@ -225,7 +158,7 @@ def test_merge_and_sort_multimodal_metadata():
] ]
}, },
mm_hashes={"image": ["hash1", "hash2"]}, mm_hashes={"image": ["hash1", "hash2"]},
expected_modalities=["image"], expected_modalities=["image", "image"],
expected_ranges=[ expected_ranges=[
PlaceholderRange(offset=0, length=2), PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=2), PlaceholderRange(offset=3, length=2),
...@@ -242,7 +175,7 @@ def test_merge_and_sort_multimodal_metadata(): ...@@ -242,7 +175,7 @@ def test_merge_and_sort_multimodal_metadata():
] ]
}, },
mm_hashes=None, mm_hashes=None,
expected_modalities=["image"], expected_modalities=["image", "image"],
expected_ranges=[ expected_ranges=[
PlaceholderRange(offset=0, length=2), PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=2), PlaceholderRange(offset=2, length=2),
...@@ -267,7 +200,7 @@ def test_merge_and_sort_multimodal_metadata(): ...@@ -267,7 +200,7 @@ def test_merge_and_sort_multimodal_metadata():
"image": ["image_hash1", "image_hash2"], "image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"], "audio": ["audio_hash1", "audio_hash2"],
}, },
expected_modalities=["audio", "image"], expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[ expected_ranges=[
PlaceholderRange(offset=0, length=2), PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3), PlaceholderRange(offset=2, length=3),
...@@ -293,7 +226,7 @@ def test_merge_and_sort_multimodal_metadata(): ...@@ -293,7 +226,7 @@ def test_merge_and_sort_multimodal_metadata():
] ]
}, },
mm_hashes=None, mm_hashes=None,
expected_modalities=["audio", "image"], expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[ expected_ranges=[
PlaceholderRange(offset=0, length=2), PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3), PlaceholderRange(offset=2, length=3),
...@@ -324,7 +257,9 @@ def test_merge_and_sort_multimodal_metadata(): ...@@ -324,7 +257,9 @@ def test_merge_and_sort_multimodal_metadata():
"audio": ["audio_hash1"], "audio": ["audio_hash1"],
"video": ["video_hash1", "video_hash2", "video_hash3"] "video": ["video_hash1", "video_hash2", "video_hash3"]
}, },
expected_modalities=["audio", "video", "image"], expected_modalities=[
"audio", "video", "video", "video", "image", "image"
],
expected_ranges=[ expected_ranges=[
PlaceholderRange(offset=0, length=2), PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=4), PlaceholderRange(offset=3, length=4),
...@@ -370,12 +305,19 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving(): ...@@ -370,12 +305,19 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
"image": ["image_hash1", "image_hash2"], "image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"], "audio": ["audio_hash1", "audio_hash2"],
}, },
expected_modalities=[], expected_modalities=["image", "audio", "image", "audio"],
expected_ranges=[], expected_ranges=[
expected_hashes=None, PlaceholderRange(offset=0, length=4),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=2),
PlaceholderRange(offset=11, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "image_hash2", "audio_hash2"
],
), ),
# <image> <image> <video> <audio> <image> # <image> <image> <audio> <video> <image>
TestCase( TestCase(
mm_positions={ mm_positions={
"image": [ "image": [
...@@ -391,15 +333,54 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving(): ...@@ -391,15 +333,54 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
] ]
}, },
mm_hashes=None, mm_hashes=None,
expected_modalities=[], expected_modalities=["image", "image", "audio", "video", "image"],
expected_ranges=[], expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=5),
PlaceholderRange(offset=20, length=4),
],
expected_hashes=None, expected_hashes=None,
), ),
# <image> <audio> <video> <image> with hashes
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=18, length=4),
],
"audio": [
PlaceholderRange(offset=6, length=2),
],
"video": [
PlaceholderRange(offset=10, length=5),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1"],
"video": ["video_hash1"],
},
expected_modalities=["image", "audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=6, length=2),
PlaceholderRange(offset=10, length=5),
PlaceholderRange(offset=18, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "video_hash1", "image_hash2"
],
),
] ]
for case in test_cases: for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
with pytest.raises(ValueError) as ex_info: expected_hashes) in test_cases:
merge_and_sort_multimodal_metadata(case.mm_positions, modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
case.mm_hashes) mm_positions, mm_hashes)
assert "Interleaved mixed-modality" in str(ex_info.value) assert modalities == expected_modalities
assert ranges == expected_ranges
assert hashes == expected_hashes
...@@ -64,9 +64,11 @@ def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks, ...@@ -64,9 +64,11 @@ def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
key_cache = torch.zeros_like(key_cache_cpu, device=device) key_cache = torch.zeros_like(key_cache_cpu, device=device)
value_cache = torch.zeros_like(value_cache_cpu, device=device) value_cache = torch.zeros_like(value_cache_cpu, device=device)
slot_mapping = slot_mapping_cpu.to(device) slot_mapping = slot_mapping_cpu.to(device)
kv_cache = torch.stack([key_cache, value_cache])
# Run vectorized implementation on XLA device # Run vectorized implementation on XLA device
reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) reshape_and_cache(key, value, kv_cache, slot_mapping)
key_cache, value_cache = torch.unbind(kv_cache, dim=0)
# Move results back to CPU for comparison # Move results back to CPU for comparison
key_cache_result = key_cache.cpu() key_cache_result = key_cache.cpu()
......
...@@ -258,13 +258,13 @@ def sample_inputs( ...@@ -258,13 +258,13 @@ def sample_inputs(
value[start_loc:end_loc]) value[start_loc:end_loc])
cur_ctx += block_size cur_ctx += block_size
block_id += 1 block_id += 1
kv_cache = torch.stack([k_cache, v_cache])
return ( return (
query, query,
k, k,
v, v,
k_cache, kv_cache,
v_cache,
block_table, block_table,
key, key,
value, value,
...@@ -361,8 +361,7 @@ def test_contexted_kv_attention( ...@@ -361,8 +361,7 @@ def test_contexted_kv_attention(
query, query,
k_active, k_active,
v_active, v_active,
k_cache, kv_cache,
v_cache,
block_table, block_table,
key, key,
value, value,
...@@ -439,8 +438,7 @@ def test_contexted_kv_attention( ...@@ -439,8 +438,7 @@ def test_contexted_kv_attention(
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
k_cache = k_cache.permute(0, 2, 1, 3).contiguous() kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
# transform block table # transform block table
active_block_table = get_active_block_tables( active_block_table = get_active_block_tables(
...@@ -487,8 +485,7 @@ def test_contexted_kv_attention( ...@@ -487,8 +485,7 @@ def test_contexted_kv_attention(
query.to(device=device), query.to(device=device),
k.to(device=device), k.to(device=device),
v.to(device=device), v.to(device=device),
k_cache.to(device=device), kv_cache.to(device=device),
v_cache.to(device=device),
active_block_table.to(device=device), active_block_table.to(device=device),
attn_mask.to(device=device), attn_mask.to(device=device),
) )
......
...@@ -105,8 +105,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None: ...@@ -105,8 +105,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
"--enable-prefix-caching", "--enable-prefix-caching",
"--quantization", "--quantization",
"bitsandbytes", "bitsandbytes",
"--load-format",
"bitsandbytes",
"--gpu-memory-utilization", "--gpu-memory-utilization",
"0.7", "0.7",
] ]
...@@ -141,7 +139,6 @@ def validate_generated_texts(hf_runner, ...@@ -141,7 +139,6 @@ def validate_generated_texts(hf_runner,
# when using distributed inference # when using distributed inference
with vllm_runner(model_name, with vllm_runner(model_name,
quantization='bitsandbytes', quantization='bitsandbytes',
load_format='bitsandbytes',
tensor_parallel_size=vllm_tp_size, tensor_parallel_size=vllm_tp_size,
enforce_eager=False) as llm: enforce_eager=False) as llm:
vllm_outputs = llm.generate_greedy(prompts, 8) vllm_outputs = llm.generate_greedy(prompts, 8)
......
...@@ -23,6 +23,23 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( ...@@ -23,6 +23,23 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import models_path_prefix from ..utils import models_path_prefix
# AITER only supports per-channel-per-channel INT8 gemm
# and per-tensor-per-tensor INT8 GEMM.
# It does not support mix precision MM and mix quantization scheme.
ROCM_AITER_SUPPORTED_INT8_MODEL = [
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
]
# TritonScaledMMLinearKernel only supports symmetric quantization.
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
]
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch):
...@@ -60,6 +77,11 @@ def use_v0_only(monkeypatch): ...@@ -60,6 +77,11 @@ def use_v0_only(monkeypatch):
) )
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
model_path, strategy, quant_type, shape_0, is_symmetric = model_args model_path, strategy, quant_type, shape_0, is_symmetric = model_args
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
with vllm_runner(model_path, enforce_eager=True) as llm: with vllm_runner(model_path, enforce_eager=True) as llm:
def check_model(model): def check_model(model):
...@@ -126,6 +148,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): ...@@ -126,6 +148,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
) )
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10]) @pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_compressed_tensors_w8a8_logprobs( def test_compressed_tensors_w8a8_logprobs(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -133,7 +157,21 @@ def test_compressed_tensors_w8a8_logprobs( ...@@ -133,7 +157,21 @@ def test_compressed_tensors_w8a8_logprobs(
model_path, model_path,
max_tokens, max_tokens,
num_logprobs, num_logprobs,
use_aiter,
monkeypatch,
): ):
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
pytest.skip(
f"Skip model {model_path} as it is not support by aiter.")
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
dtype = "bfloat16" dtype = "bfloat16"
# skip language translation prompt for the static per tensor asym model # skip language translation prompt for the static per tensor asym model
...@@ -157,6 +195,9 @@ def test_compressed_tensors_w8a8_logprobs( ...@@ -157,6 +195,9 @@ def test_compressed_tensors_w8a8_logprobs(
name_1="vllm", name_1="vllm",
) )
if current_platform.is_rocm():
torch.cuda.synchronize()
def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_no_enforce_eager(vllm_runner):
model_path = os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change") model_path = os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change")
...@@ -180,8 +221,27 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): ...@@ -180,8 +221,27 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
), ),
], ],
) )
def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): @pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_compressed_tensors_w8a8_dynamic_per_token(
vllm_runner,
model_args,
use_aiter,
monkeypatch,
):
model_path, strategy = model_args model_path, strategy = model_args
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
pytest.skip(
f"Skip model {model_path} as it is not support by aiter.")
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
with vllm_runner(model_path, dtype=torch.float16) as llm: with vllm_runner(model_path, dtype=torch.float16) as llm:
def check_model(model): def check_model(model):
...@@ -212,6 +272,8 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): ...@@ -212,6 +272,8 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4), (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4),
], ],
) )
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="The tests are skipped on non-CUDA platform.")
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
model, strategy, group, pack_factor = wNa16_args model, strategy, group, pack_factor = wNa16_args
with vllm_runner(model) as llm: with vllm_runner(model) as llm:
...@@ -236,8 +298,8 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): ...@@ -236,8 +298,8 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
assert output assert output
@pytest.mark.skipif(current_platform(), @pytest.mark.skipif(not current_platform.is_cuda(),
reason="W4A16 MARLIN is not supported on ROCm.") reason="This test is skipped on non-CUDA platform.")
def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_w4a16_marlin24(vllm_runner):
model_path = os.path.join(models_path_prefix,"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t") model_path = os.path.join(models_path_prefix,"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t")
with vllm_runner(model_path) as llm: with vllm_runner(model_path) as llm:
...@@ -280,7 +342,7 @@ def test_compressed_tensors_fp8(vllm_runner): ...@@ -280,7 +342,7 @@ def test_compressed_tensors_fp8(vllm_runner):
if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
assert len(qkv_proj.input_scale.shape) == 0 assert len(qkv_proj.input_scale.shape) == 0
assert qkv_proj.weight.dtype is torch.float8_e4m3fn assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
assert qkv_proj.weight_scale.dtype is torch.float32 assert qkv_proj.weight_scale.dtype is torch.float32
assert len(qkv_proj.weight_scale.shape) == 0 assert len(qkv_proj.weight_scale.shape) == 0
...@@ -290,8 +352,8 @@ def test_compressed_tensors_fp8(vllm_runner): ...@@ -290,8 +352,8 @@ def test_compressed_tensors_fp8(vllm_runner):
assert output assert output
@pytest.mark.skipif(current_platform(), @pytest.mark.skipif(not current_platform.is_cuda(),
reason="FP8 KV cache is not supported on ROCm.") reason="This test is skipped on non-CUDA platform.")
def test_compressed_tensors_kv_cache(vllm_runner): def test_compressed_tensors_kv_cache(vllm_runner):
model_path = os.path.join(models_path_prefix,"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme") model_path = os.path.join(models_path_prefix,"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")
with vllm_runner(model_path, kv_cache_dtype="fp8") as llm: with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
...@@ -320,7 +382,8 @@ def _test_2of4_quant_models(qkv_proj, ...@@ -320,7 +382,8 @@ def _test_2of4_quant_models(qkv_proj,
@pytest.mark.skipif( @pytest.mark.skipif(
not current_platform.has_device_capability(90), not current_platform.is_cuda()
or not current_platform.has_device_capability(90),
reason="Sparse FP8 is not yet supported on this GPU type.", reason="Sparse FP8 is not yet supported on this GPU type.",
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -367,7 +430,8 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): ...@@ -367,7 +430,8 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
@pytest.mark.skipif( @pytest.mark.skipif(
not current_platform.has_device_capability(90), not current_platform.is_cuda()
or not current_platform.has_device_capability(90),
reason="Sparse FP8 is not yet supported on this GPU type.", reason="Sparse FP8 is not yet supported on this GPU type.",
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -12,13 +12,6 @@ from ..utils import compare_two_settings, models_path_prefix ...@@ -12,13 +12,6 @@ from ..utils import compare_two_settings, models_path_prefix
from vllm.platforms import current_platform from vllm.platforms import current_platform
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
# Fall back to V0 if cpu offloading is enabled.
# Fixture is required to that baseline uses V0.
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(), @pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8(): def test_cpu_offload_fp8():
...@@ -35,7 +28,9 @@ def test_cpu_offload_fp8(): ...@@ -35,7 +28,9 @@ def test_cpu_offload_fp8():
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(), @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq(): def test_cpu_offload_gptq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test GPTQ Marlin # Test GPTQ Marlin
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [], compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
["--cpu-offload-gb", "1"], ["--cpu-offload-gb", "1"],
...@@ -49,7 +44,9 @@ def test_cpu_offload_gptq(): ...@@ -49,7 +44,9 @@ def test_cpu_offload_gptq():
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(), @pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(),
reason="awq_marlin is not supported on this GPU type.") reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq(): def test_cpu_offload_awq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test AWQ Marlin # Test AWQ Marlin
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [], compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
["--cpu-offload-gb", "1"], ["--cpu-offload-gb", "1"],
...@@ -63,7 +60,9 @@ def test_cpu_offload_awq(): ...@@ -63,7 +60,9 @@ def test_cpu_offload_awq():
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(), @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors(): def test_cpu_offload_compressed_tensors(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test wNa16 # Test wNa16
compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [], compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
["--cpu-offload-gb", "1"], ["--cpu-offload-gb", "1"],
......
...@@ -25,8 +25,14 @@ MODELS = [ ...@@ -25,8 +25,14 @@ MODELS = [
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", MODELS) @pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True]) @pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
monkeypatch) -> None: use_rocm_aiter: bool, monkeypatch) -> None:
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
if force_marlin: if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
...@@ -49,7 +55,13 @@ KV_CACHE_MODELS = [ ...@@ -49,7 +55,13 @@ KV_CACHE_MODELS = [
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(), @pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS) @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch): @pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
use_rocm_aiter: bool, monkeypatch):
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# vllm_runner.apply_model() relies on V0 internals. # vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
...@@ -88,8 +100,13 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch): ...@@ -88,8 +100,13 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True]) @pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
monkeypatch) -> None: use_rocm_aiter: bool, monkeypatch) -> None:
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# vllm_runner.apply_model() relies on V0 internals. # vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
......
...@@ -3,74 +3,126 @@ ...@@ -3,74 +3,126 @@
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from tests.entrypoints.openai.reasoning_parsers.utils import ( from tests.reasoning.utils import run_reasoning_extraction
run_reasoning_extraction) from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
ReasoningParserManager)
parser_name = "deepseek_r1" parser_name = "deepseek_r1"
start_token = "<think>" start_token = "<think>"
end_token = "</think>" end_token = "</think>"
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
@pytest.fixture(scope="module")
def deepseek_r1_qwen_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
SIMPLE_REASONING = { SIMPLE_REASONING = {
"output": "This is a reasoning section</think>This is the rest", "output": "This is a reasoning section</think>This is the rest",
"reasoning_content": "This is a reasoning section", "reasoning_content": "This is a reasoning section",
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True,
} }
COMPLETE_REASONING = { COMPLETE_REASONING = {
"output": "This is a reasoning section</think>", "output": "This is a reasoning section</think>",
"reasoning_content": "This is a reasoning section", "reasoning_content": "This is a reasoning section",
"content": None, "content": None,
"is_reasoning_end": True,
} }
NO_CONTENT = { NO_CONTENT = {
"output": "This is content", "output": "This is content",
"reasoning_content": "This is content", "reasoning_content": "This is content",
"content": None, "content": None,
"is_reasoning_end": False,
} }
NO_REASONING_STREAMING = { NO_REASONING_STREAMING = {
"output": "This is a reasoning section", "output": "This is a reasoning section",
"reasoning_content": "This is a reasoning section", "reasoning_content": "This is a reasoning section",
"content": None, "content": None,
"is_reasoning_end": False,
} }
MULTIPLE_LINES = { MULTIPLE_LINES = {
"output": "This\nThat</think>This is the rest\nThat", "output": "This\nThat</think>This is the rest\nThat",
"reasoning_content": "This\nThat", "reasoning_content": "This\nThat",
"content": "This is the rest\nThat", "content": "This is the rest\nThat",
"is_reasoning_end": True,
} }
SHORTEST_REASONING_NO_STREAMING = { SHORTEST_REASONING_NO_STREAMING = {
"output": "</think>This is the rest", "output": "</think>This is the rest",
"reasoning_content": "", "reasoning_content": "",
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True,
} }
SHORTEST_REASONING = { SHORTEST_REASONING = {
"output": "</think>This is the rest", "output": "</think>This is the rest",
"reasoning_content": None, "reasoning_content": None,
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True,
} }
REASONING_WITH_THINK = { REASONING_WITH_THINK = {
"output": "<think>This is a reasoning section</think>This is the rest", "output": "<think>This is a reasoning section</think>This is the rest",
"reasoning_content": "This is a reasoning section", "reasoning_content": "This is a reasoning section",
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True,
} }
COMPLETE_REASONING_WITH_THINK = { COMPLETE_REASONING_WITH_THINK = {
"output": "<think>This is a reasoning section</think>", "output": "<think>This is a reasoning section</think>",
"reasoning_content": "This is a reasoning section", "reasoning_content": "This is a reasoning section",
"content": None, "content": None,
"is_reasoning_end": True,
} }
MULTIPLE_LINES_WITH_THINK = { MULTIPLE_LINES_WITH_THINK = {
"output": "<think>This\nThat</think>This is the rest\nThat", "output": "<think>This\nThat</think>This is the rest\nThat",
"reasoning_content": "This\nThat", "reasoning_content": "This\nThat",
"content": "This is the rest\nThat", "content": "This is the rest\nThat",
"is_reasoning_end": True,
} }
SHORTEST_REASONING_NO_STREAMING_WITH_THINK = { SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
"output": "</think>This is the rest", "output": "</think>This is the rest",
"reasoning_content": "", "reasoning_content": "",
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True,
} }
SHORTEST_REASONING_WITH_THINK = { SHORTEST_REASONING_WITH_THINK = {
"output": "</think>This is the rest", "output": "</think>This is the rest",
"reasoning_content": None, "reasoning_content": None,
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True,
}
THINK_NO_END = {
"output": "<think>This is a reasoning section",
"reasoning_content": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
EMPTY = {
"output": "",
"reasoning_content": "",
"content": None,
"is_reasoning_end": False,
}
EMPTY_STREAMING = {
"output": "",
"reasoning_content": None,
"content": None,
"is_reasoning_end": False,
}
NEW_LINE = {
"output": "\n<think>This is a reasoning section</think>\nThis is the rest",
"reasoning_content": "This is a reasoning section",
"content": "\nThis is the rest",
"is_reasoning_end": True,
}
# Streaming cannot handle new lines at the beginning of the output
# because we need to support <think>...</think> and </think>...
# We cannot know if the text before <think> is reasoning content
# or not.
NEW_LINE_STREAMING = {
"output": "\n<think>This is a reasoning section</think>\nThis is the rest",
"reasoning_content": "\nThis is a reasoning section",
"content": "\nThis is the rest",
"is_reasoning_end": True,
} }
TEST_CASES = [ TEST_CASES = [
...@@ -164,25 +216,53 @@ TEST_CASES = [ ...@@ -164,25 +216,53 @@ TEST_CASES = [
SHORTEST_REASONING_WITH_THINK, SHORTEST_REASONING_WITH_THINK,
id="shortest_with_think_streaming", id="shortest_with_think_streaming",
), ),
pytest.param(
False,
THINK_NO_END,
id="think_no_end",
),
pytest.param(
True,
THINK_NO_END,
id="think_no_end_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
NEW_LINE,
id="new_line",
),
pytest.param(
True,
NEW_LINE_STREAMING,
id="new_line_streaming",
),
] ]
# Global tokenizer initialization to avoid repeated loading
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
tokenizer.add_tokens([start_token, end_token])
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) @pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning( def test_reasoning(
streaming: bool, streaming: bool,
param_dict: dict, param_dict: dict,
deepseek_r1_qwen_tokenizer,
): ):
output = tokenizer.tokenize(param_dict["output"]) output = deepseek_r1_qwen_tokenizer.tokenize(param_dict["output"])
# decode everything to tokens # decode everything to tokens
output_tokens: list[str] = [ output_tokens: list[str] = [
tokenizer.convert_tokens_to_string([token]) for token in output deepseek_r1_qwen_tokenizer.convert_tokens_to_string([token])
for token in output
] ]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
parser_name)(tokenizer) parser_name)(deepseek_r1_qwen_tokenizer)
reasoning, content = run_reasoning_extraction(parser, reasoning, content = run_reasoning_extraction(parser,
output_tokens, output_tokens,
...@@ -190,3 +270,17 @@ def test_reasoning( ...@@ -190,3 +270,17 @@ def test_reasoning(
assert reasoning == param_dict["reasoning_content"] assert reasoning == param_dict["reasoning_content"]
assert content == param_dict["content"] assert content == param_dict["content"]
# Test is_reasoning_end
output_ids = deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(output)
is_reasoning_end = parser.is_reasoning_end(output_ids)
assert is_reasoning_end == param_dict["is_reasoning_end"]
# Test extract_content
if param_dict["content"] is not None:
content = parser.extract_content_ids(output_ids)
assert content == deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(
deepseek_r1_qwen_tokenizer.tokenize(param_dict["content"]))
else:
content = parser.extract_content_ids(output)
assert content == []
# SPDX-License-Identifier: Apache-2.0
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import DeltaMessage, run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "granite"

# Literal marker phrases embedded in the sample outputs below: the first one
# precedes the reasoning text, the second one precedes the final response.
START_REASONING = "Here is my thought process:"
START_RESPONSE = "Here is my response:"


def _expectation(output, reasoning_content, content):
    """Pair a raw model output with the split the tests expect from it."""
    return {
        "output": output,
        "reasoning_content": reasoning_content,
        "content": content,
    }


# Reasoning section followed by a response.
SIMPLE_REASONING = _expectation(
    START_REASONING + "This is a reasoning section" + START_RESPONSE +
    "This is the rest",
    reasoning_content="This is a reasoning section",
    content="This is the rest",
)

# Reasoning section followed by the response marker and nothing after it.
COMPLETE_REASONING = _expectation(
    START_REASONING + "This is a reasoning section" + START_RESPONSE,
    reasoning_content="This is a reasoning section",
    content=None,
)

# No markers at all: the whole output is expected back as content.
NO_REASONING = _expectation(
    "This is content",
    reasoning_content=None,
    content="This is content",
)

# Newlines inside both the reasoning and the response parts.
MULTIPLE_LINES = _expectation(
    START_REASONING + "This\nThat" + START_RESPONSE + "This is the rest\nThat",
    reasoning_content="This\nThat",
    content="This is the rest\nThat",
)

# The *_WITH_THINK cases carry exactly the same payloads as their
# counterparts above; they are kept as separate dict objects.
REASONING_WITH_THINK = dict(SIMPLE_REASONING)
COMPLETE_REASONING_WITH_THINK = dict(COMPLETE_REASONING)
MULTIPLE_LINES_WITH_THINK = dict(MULTIPLE_LINES)
# (test id, expectation dict) for every scenario. Each scenario is exercised
# twice: once without streaming and once with streaming.
_SCENARIOS = (
    ("simple_reasoning", SIMPLE_REASONING),
    ("complete_reasoning", COMPLETE_REASONING),
    ("no_reasoning", NO_REASONING),
    ("multiple_lines", MULTIPLE_LINES),
    ("reasoning_with_think", REASONING_WITH_THINK),
    ("complete_reasoning_with_think", COMPLETE_REASONING_WITH_THINK),
    ("multiple_lines_with_think", MULTIPLE_LINES_WITH_THINK),
)

# All non-streaming cases come first, followed by the streaming variants,
# whose ids carry a "_streaming" suffix.
TEST_CASES = [
    pytest.param(
        streaming,
        scenario,
        id=f"{name}_streaming" if streaming else name,
    )
    for streaming in (False, True)
    for name, scenario in _SCENARIOS
]
# Load the tokenizer once at import time and share it between the tests in
# this module rather than reloading it for every test case.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")


@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
):
    """Check that the granite parser splits an output as expected.

    The sample output is tokenized, every token is converted back to its own
    text piece, and the pieces are passed to ``run_reasoning_extraction``
    together with the ``streaming`` flag. The returned reasoning/content pair
    must equal the values recorded in ``param_dict``.
    """
    raw_tokens = tokenizer.tokenize(param_dict["output"])

    # Convert each token back into a standalone piece of text.
    text_pieces: list[str] = []
    for raw_token in raw_tokens:
        text_pieces.append(tokenizer.convert_tokens_to_string([raw_token]))

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser,
        text_pieces,
        streaming=streaming,
    )

    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]
# Additional tests for verifying the correctness of granite streaming; this
# is complicated because granite uses multiple tokens to indicate when thinking
# is starting / when it's starting its response, so skipping special tokens
# is awkward.


def _streaming_case(previous_text, delta_text, *, reasoning_content, content):
    """Build the input/expectation dict for one streaming step.

    Args:
        previous_text: Text generated before this step, or ``None`` when
            nothing has been generated yet.
        delta_text: Text added by this step.
        reasoning_content: Expected reasoning content of the delta message.
        content: Expected response content of the delta message.

    Returns:
        A dict with the keys ``previous_text``, ``current_text``,
        ``delta_text``, ``reasoning_content`` and ``content``.
        ``current_text`` is always ``previous_text + delta_text`` (a ``None``
        previous text counts as empty), so the three text fields cannot
        drift out of sync with each other.
    """
    return {
        "previous_text": previous_text,
        "current_text": (previous_text or "") + delta_text,
        "delta_text": delta_text,
        "reasoning_content": reasoning_content,
        "content": content,
    }


### Handling the start of reasoning
STREAMING_1 = _streaming_case(
    None,
    "Here",
    reasoning_content=None,
    content=None,
)

# When we fail, we should give what was previously being silenced first
STREAMING_2 = _streaming_case(
    "Here is my thought",
    " failure",
    reasoning_content=None,
    content="Here is my thought failure",
)

# But then after the first one, we should only add the delta text to content.
# NOTE(review): this case used to carry current_text=" words" and
# delta_text=" Here wrong words", which did not satisfy
# current_text == previous_text + delta_text; confirm the parser still
# produces the expected content with the consistent inputs.
STREAMING_3 = _streaming_case(
    "Here wrong",
    " words",
    reasoning_content=None,
    content=" words",
)

# The delta completes the start-of-reasoning sequence and adds nothing else,
# so neither reasoning content nor content is expected yet.
STREAMING_4 = _streaming_case(
    "Here is my thought",
    " process:",
    reasoning_content=None,
    content=None,
)

# Reasoning started successfully; parse reasoning content
STREAMING_5 = _streaming_case(
    "Here is my thought process:",
    " foo",
    reasoning_content=" foo",
    content=None,
)

# Response special sequence has started, but not finished.
STREAMING_6 = _streaming_case(
    "Here is my thought process: foo",
    " Here is",
    reasoning_content=" ",
    content=None,
)

# Response special sequence started, but was broken; the reasoning
# content should be the content that was previously unused.
STREAMING_7 = _streaming_case(
    "Here is my thought process: foo Here is",
    " Here",
    reasoning_content="Here is ",
    content=None,
)

# The response special sequence is already complete in the previous text;
# the delta is plain response content.
STREAMING_8 = _streaming_case(
    "Here is my thought process: foo Here is my response:",
    " bar",
    reasoning_content=None,
    content=" bar",
)

# The delta text has everything; we should be able to correctly parse both
STREAMING_9 = _streaming_case(
    None,
    "Here is my thought process: foo Here is my response: bar",
    reasoning_content=" foo ",
    content=" bar",
)

# The response is ongoing, and the delta mixes reasoning content / content
STREAMING_10 = _streaming_case(
    "Here is my thought process: foo",
    " bar Here is my response: baz",
    reasoning_content=" bar ",
    content=" baz",
)

# The delta text starts a new substring that might be a response special seq
STREAMING_11 = _streaming_case(
    "Here is my thought process: This is a reasoning section ",
    "Here",
    reasoning_content=None,
    content=None,
)

# The delta text is finishing the response special seq
STREAMING_12 = _streaming_case(
    "Here is my thought process: foo Here is my response",
    ":",
    reasoning_content=None,
    content=None,
)

# The delta text breaks a potential response special seq; the text that was
# held back is expected to come out as reasoning content.
STREAMING_13 = _streaming_case(
    "Here is my thought process: foo Here",
    " was",
    reasoning_content="Here was",
    content=None,
)
# One parametrized entry per streaming step defined above; the id is a short
# human-readable description of the situation that step represents.
STREAMING_SUBCASES = [
    pytest.param(
        STREAMING_1,
        id="Starting reasoning special sequence",
    ),
    pytest.param(
        STREAMING_2,
        id="Unexpected start reasoning sequence",
    ),
    pytest.param(
        STREAMING_3,
        id="Continuing unexpected start reasoning sequence",
    ),
    pytest.param(
        STREAMING_4,
        id="Only start reasoning sequence and nothing else",
    ),
    pytest.param(
        STREAMING_5,
        id="Reasoning content has started",
    ),
    pytest.param(
        STREAMING_6,
        id="Response special sequence has started",
    ),
    pytest.param(
        STREAMING_7,
        id="Response special sequence reset",
    ),
    pytest.param(
        STREAMING_8,
        id="Response text has started",
    ),
    pytest.param(
        STREAMING_9,
        id="Delta contains everything",
    ),
    pytest.param(
        STREAMING_10,
        id="Delta contains some reasoning and response",
    ),
    pytest.param(
        STREAMING_11,
        id="Delta starts response sequence",
    ),
    pytest.param(
        STREAMING_12,
        id="Delta finishes response sequence",
    ),
    pytest.param(
        STREAMING_13,
        id="Delta breaks potential response sequence",
    ),
]
@pytest.mark.parametrize("param_dict", STREAMING_SUBCASES)
def test_streaming_subcases(param_dict):
    """Run a single streaming step through the granite parser.

    The previous/current/delta texts from ``param_dict`` are encoded to token
    ids and handed, together with the texts themselves, to
    ``extract_reasoning_content_streaming``. The result must be ``None`` when
    the case expects neither reasoning content nor content; otherwise it must
    be a ``DeltaMessage`` carrying the expected values.
    """
    previous_text = param_dict["previous_text"]
    current_text = param_dict["current_text"]
    delta_text = param_dict["delta_text"]
    expected_reasoning = param_dict["reasoning_content"]
    expected_content = param_dict["content"]

    # Get all of the token IDs; a missing previous text maps to no tokens.
    if previous_text is None:
        previous_token_ids = []
    else:
        previous_token_ids = tokenizer.encode(previous_text)
    current_token_ids = tokenizer.encode(current_text)
    delta_token_ids = tokenizer.encode(delta_text)

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    response = parser.extract_reasoning_content_streaming(
        previous_text=previous_text,
        current_text=current_text,
        delta_text=delta_text,
        previous_token_ids=previous_token_ids,
        current_token_ids=current_token_ids,
        delta_token_ids=delta_token_ids,
    )

    # Streaming currently expects at least one of reasoning content / content,
    # so the response should be None when the case expects neither.
    if expected_reasoning is None and expected_content is None:
        assert response is None
        return

    assert isinstance(response, DeltaMessage)
    assert expected_reasoning == response.reasoning_content
    assert expected_content == response.content
...@@ -4,7 +4,7 @@ from typing import Optional, Union ...@@ -4,7 +4,7 @@ from typing import Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage) DeltaMessage)
from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser from vllm.reasoning import ReasoningParser
class StreamingReasoningReconstructor: class StreamingReasoningReconstructor:
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
tensor parallelism. tensor parallelism.
""" """
import json
from typing import Optional from typing import Optional
import pytest import pytest
...@@ -30,14 +31,14 @@ from ...utils import models_path_prefix ...@@ -30,14 +31,14 @@ from ...utils import models_path_prefix
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
[ [
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"), "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
}), }),
], ],
[ [
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": "ngram", "model": "ngram",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"prompt_lookup_max": 3, "prompt_lookup_max": 3,
...@@ -90,7 +91,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -90,7 +91,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"model, test_llm_kwargs", "model, test_llm_kwargs",
[(os.path.join(models_path_prefix, "JackFram/llama-68m"), [ [(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"), "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1, "draft_tensor_parallel_size": 1,
...@@ -98,7 +99,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -98,7 +99,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
]), ]),
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [ (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), "model": os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1, "draft_tensor_parallel_size": 1,
...@@ -149,20 +150,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, ...@@ -149,20 +150,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@pytest.mark.parametrize("model, test_llm_kwargs", @pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [ [("JackFram/llama-68m", [
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": "JackFram/llama-68m", "model": "JackFram/llama-68m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
}), }),
]), ]),
("JackFram/llama-68m", [ ("JackFram/llama-68m", [
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": "JackFram/llama-68m", "model": "JackFram/llama-68m",
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1, "draft_tensor_parallel_size": 1,
}), }),
])]) ])])
@pytest.mark.parametrize("logprobs", [None, 2]) @pytest.mark.parametrize("logprobs", [None])
@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
...@@ -173,9 +174,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, ...@@ -173,9 +174,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for """Verify spec decode works well with same and different TP size for
the draft model with chunked prefill. the draft model with chunked prefill.
""" """
if logprobs: run_equality_correctness_test_tp(model,
test_llm_kwargs.extend( common_llm_kwargs,
["--disable_logprobs_during_spec_decoding", "False"]) per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0,
logprobs=logprobs)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor_parallel_size",
"2",
# precision
"--dtype",
"bfloat16",
]])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"],
[
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
"--max-num-seqs", "4"
]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
"disable_logprobs": False,
}),
])])
@pytest.mark.parametrize("logprobs", [2])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2_with_logprobs(
model, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
batch_size: int, seed: int):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp(model, run_equality_correctness_test_tp(model,
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
tensor parallelism. tensor parallelism.
""" """
import json
import openai import openai
import pytest import pytest
import torch import torch
...@@ -35,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m") ...@@ -35,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
#TODO(wooyeon): add spec_draft_dp=2 case #TODO(wooyeon): add spec_draft_dp=2 case
[ [
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": f"{SPEC_MODEL}", "model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1, "draft_tensor_parallel_size": 1,
...@@ -82,7 +84,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, ...@@ -82,7 +84,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Artificially limit the draft model max model len; this forces vLLM # Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens. # to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative_config", "--speculative_config",
str({ json.dumps({
"model": f"{SPEC_MODEL}", "model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 32, "max_model_len": 32,
......
...@@ -2,19 +2,22 @@ ...@@ -2,19 +2,22 @@
# ruff: noqa # ruff: noqa
import asyncio import asyncio
import hashlib
import pickle
import socket import socket
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
import torch import torch
from vllm_test_utils import monitor from vllm_test_utils.monitor import monitor
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
PlaceholderModule, StoreBoolean, bind_kv_cache, PlaceholderModule, StoreBoolean, bind_kv_cache,
deprecate_kwargs, get_open_port, memory_profiling, deprecate_kwargs, get_open_port, memory_profiling,
merge_async_iterators, supports_kw, swap_dict_values) merge_async_iterators, sha256, supports_kw,
swap_dict_values)
from .utils import create_new_process_for_each_test, error_on_warning from .utils import create_new_process_for_each_test, error_on_warning
from .utils import models_path_prefix from .utils import models_path_prefix
...@@ -141,7 +144,8 @@ def parser(): ...@@ -141,7 +144,8 @@ def parser():
def parser_with_config(): def parser_with_config():
parser = FlexibleArgumentParser() parser = FlexibleArgumentParser()
parser.add_argument('serve') parser.add_argument('serve')
parser.add_argument('model_tag') parser.add_argument('model_tag', nargs='?')
parser.add_argument('--model', type=str)
parser.add_argument('--served-model-name', type=str) parser.add_argument('--served-model-name', type=str)
parser.add_argument('--config', type=str) parser.add_argument('--config', type=str)
parser.add_argument('--port', type=int) parser.add_argument('--port', type=int)
...@@ -198,29 +202,29 @@ def test_missing_required_argument(parser): ...@@ -198,29 +202,29 @@ def test_missing_required_argument(parser):
parser.parse_args([]) parser.parse_args([])
def test_cli_override_to_config(parser_with_config, cli_config_file):
    """Check that explicit CLI options win over values from ``--config``.

    The same ``--tensor-parallel-size 3`` option is passed both after and
    before ``--config`` to show that its position relative to the config
    file does not matter.

    Args:
        parser_with_config: parser fixture that understands ``--config``.
        cli_config_file: path to the config file fixture.
    """
    # CLI option placed after --config.
    args = parser_with_config.parse_args([
        'serve', 'mymodel', '--config', cli_config_file,
        '--tensor-parallel-size', '3'
    ])
    assert args.tensor_parallel_size == 3

    # CLI option placed before --config: the CLI value must still be kept.
    args = parser_with_config.parse_args([
        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
        cli_config_file
    ])
    assert args.tensor_parallel_size == 3
    # --port is not given on the command line here; 12312 is presumably the
    # value stored in the config file -- confirm against the fixture.
    assert args.port == 12312

    # An explicit --port on the command line replaces that value.
    args = parser_with_config.parse_args([
        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
        cli_config_file, '--port', '666'
    ])
    assert args.tensor_parallel_size == 3
    assert args.port == 666
def test_config_args(parser_with_config): def test_config_args(parser_with_config, cli_config_file):
args = parser_with_config.parse_args( args = parser_with_config.parse_args(
['serve', 'mymodel', '--config', './data/test_config.yaml']) ['serve', 'mymodel', '--config', cli_config_file])
assert args.tensor_parallel_size == 2 assert args.tensor_parallel_size == 2
assert args.trust_remote_code assert args.trust_remote_code
assert not args.multi_step_stream_outputs assert not args.multi_step_stream_outputs
...@@ -242,10 +246,9 @@ def test_config_file(parser_with_config): ...@@ -242,10 +246,9 @@ def test_config_file(parser_with_config):
]) ])
def test_no_model_tag(parser_with_config, cli_config_file):
    """Expect ``ValueError`` when ``serve`` gets ``--config`` but no model.

    No positional model tag is passed on the command line in this call.

    Args:
        parser_with_config: parser fixture that understands ``--config``.
        cli_config_file: path to the config file fixture.
    """
    with pytest.raises(ValueError):
        parser_with_config.parse_args(['serve', '--config', cli_config_file])
# yapf: enable # yapf: enable
...@@ -478,3 +481,63 @@ def test_swap_dict_values(obj, key1, key2): ...@@ -478,3 +481,63 @@ def test_swap_dict_values(obj, key1, key2):
assert obj[key1] == original_obj[key2] assert obj[key1] == original_obj[key2]
else: else:
assert key1 not in obj assert key1 not in obj
def test_model_specification(parser_with_config, cli_config_file,
                             cli_config_file_with_model):
    """Exercise the ways a model can be named for the ``serve`` subcommand.

    Covers a positional model together with a config file, a config file
    alone, no model at all, and the ``--model`` option.

    Args:
        parser_with_config: parser fixture that understands ``--config``.
        cli_config_file: path to the config file fixture used for the
            "no model" case.
        cli_config_file_with_model: path to the config file fixture used
            for the cases that expect ``config-model``.
    """
    parse = parser_with_config.parse_args

    # Positional model plus config file: the positional value is kept.
    from_cli = parse(
        ['serve', 'cli-model', '--config', cli_config_file_with_model])
    assert from_cli.model_tag == 'cli-model'
    assert from_cli.served_model_name == 'mymodel'

    # Config file only: the model is read into ``args.model``.
    from_config = parse(['serve', '--config', cli_config_file_with_model])
    assert from_config.model == 'config-model'
    assert from_config.served_model_name == 'mymodel'

    # Neither a positional model nor (presumably) one in this config file.
    with pytest.raises(ValueError, match="No model specified!"):
        parse(['serve', '--config', cli_config_file])

    # ``--model`` is rejected for ``serve``.
    model_option_error = (
        "With `vllm serve`, you should provide the model as a positional "
        "argument or in a config file instead of via the `--model` option.")
    with pytest.raises(ValueError, match=model_option_error):
        parse(['serve', '--model', 'my-model'])

    # The remaining options are still populated when a positional model is
    # given alongside the config file.
    preserved = parse(
        ['serve', 'cli-model', '--config', cli_config_file_with_model])
    assert preserved.tensor_parallel_size == 2
    assert preserved.trust_remote_code is True
    assert preserved.multi_step_stream_outputs is False
    assert preserved.port == 12312
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
                                   (None, bool, [1, 2, 3])])
@pytest.mark.parametrize("output", [0, 1, 2])
def test_sha256(input: tuple, output: int):
    """Check ``sha256`` against a reference built from pickle and hashlib.

    Args:
        input: tuple to hash.
        output: parametrized value that the body does not read.
    """
    digest = sha256(input)
    assert digest is not None
    assert isinstance(digest, int)
    assert digest != 0

    # Reference value: SHA-256 of the pickled tuple, read as a big-endian int.
    pickled = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    expected = int.from_bytes(hashlib.sha256(pickled).digest(),
                              byteorder="big")
    assert digest == expected

    # Hashing the same input again gives the same value.
    assert digest == sha256(input)

    # Hashing a different input gives a different value.
    assert digest != sha256(input + (1, ))
...@@ -45,7 +45,8 @@ def test_chat_completion_request_with_no_tools(): ...@@ -45,7 +45,8 @@ def test_chat_completion_request_with_no_tools():
assert request.tool_choice == 'none' assert request.tool_choice == 'none'
def test_chat_completion_request_with_tool_choice_but_no_tools(): @pytest.mark.parametrize('tool_choice', ['auto', 'required'])
def test_chat_completion_request_with_tool_choice_but_no_tools(tool_choice):
with pytest.raises(ValueError, with pytest.raises(ValueError,
match="When using `tool_choice`, `tools` must be set."): match="When using `tool_choice`, `tools` must be set."):
ChatCompletionRequest.model_validate({ ChatCompletionRequest.model_validate({
...@@ -56,7 +57,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools(): ...@@ -56,7 +57,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
'model': 'model':
os.path.join(models_path_prefix, 'facebook/opt-125m'), os.path.join(models_path_prefix, 'facebook/opt-125m'),
'tool_choice': 'tool_choice':
'auto' tool_choice
}) })
with pytest.raises(ValueError, with pytest.raises(ValueError,
...@@ -69,7 +70,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools(): ...@@ -69,7 +70,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
'model': 'model':
os.path.join(models_path_prefix, 'facebook/opt-125m'), os.path.join(models_path_prefix, 'facebook/opt-125m'),
'tool_choice': 'tool_choice':
'auto', tool_choice,
'tools': 'tools':
None None
}) })
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment