"tests/vscode:/vscode.git/clone" did not exist on "a6760f6456b714409685e23301c820a85da856ca"
Commit 8d75f22e authored by zhuwenwen
Browse files

Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

parents ce888aa4 7d80c73d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import sys
import pytest
import torch
def run_python_script(script_name, timeout):
    """Run a kv_transfer script as two ranks and fail the test on any error.

    Launches ``kv_transfer/<script_name>`` twice with the current interpreter,
    once with ``RANK=0`` and once with ``RANK=1``, forwarding the children's
    output to this process's stdout/stderr.

    Args:
        script_name: File name of the script inside the ``kv_transfer``
            directory (the path is relative to the current working directory).
        timeout: Maximum number of seconds to wait for each rank to exit.

    The calling test is failed through ``pytest.fail`` when a rank exits with
    a non-zero status, when waiting for a rank times out, or when launching a
    rank raises an exception.
    """
    # Imported locally so this function does not depend on a module-level
    # ``import os`` being present.
    import os

    script_name = f"kv_transfer/{script_name}"
    processes = []
    try:
        # Start both ranks before waiting on either so they run concurrently.
        for rank in ("0", "1"):
            processes.append(
                subprocess.Popen(
                    [sys.executable, script_name],
                    # Extend the parent's environment instead of replacing it:
                    # passing only {"RANK": ...} would strip PATH, library
                    # search paths and device-selection variables from the
                    # child process.
                    env={**os.environ, "RANK": rank},
                    stdout=sys.stdout,  # Pipe stdout to current stdout
                    stderr=sys.stderr,  # Pipe stderr to current stderr
                )
            )

        # Wait for both processes to complete, with a timeout
        for process in processes:
            process.wait(timeout=timeout)

        # Check the return status of both processes
        for rank, process in enumerate(processes):
            if process.returncode != 0:
                pytest.fail(
                    f"Test {script_name} failed for RANK={rank}, {process.returncode}"
                )
    except subprocess.TimeoutExpired:
        # Any still-running rank is stopped in the ``finally`` clause below.
        pytest.fail(f"Test {script_name} timed out")
    except Exception as e:
        pytest.fail(f"Test {script_name} failed with error: {str(e)}")
    finally:
        # Stop and reap whatever is still alive, regardless of how we got
        # here, so no child outlives the test or lingers as a zombie.
        for process in processes:
            if process.poll() is None:
                process.terminate()
                try:
                    process.wait(timeout=10)
                except subprocess.TimeoutExpired:
                    # The child ignored SIGTERM; force it down.
                    process.kill()
                    process.wait()
# Each case pairs a kv_transfer script with the timeout it is allowed.
@pytest.mark.parametrize(
    "script_name,timeout",
    [
        ("test_lookup_buffer.py", 60),  # lookup-buffer script, 60-second timeout
        ("test_send_recv.py", 120),  # send/recv script, 120-second timeout
    ],
)
def test_run_python_script(script_name, timeout):
    """Run one kv_transfer script as two ranks; skip when fewer than 2 GPUs."""
    gpu_count = torch.cuda.device_count()
    if gpu_count >= 2:
        # Enough devices for both ranks: launch the script pair.
        run_python_script(script_name, timeout)
        return
    pytest.skip(f"Skipping test {script_name} because <2 GPUs are available")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time
import torch
from tqdm import tqdm
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
def test_run(my_rank, pipe):
    """Exchange two small tensors in both directions and verify them.

    Rank 0 sends ``x`` and ``y`` first and then receives two tensors; any
    other rank receives first and then sends. Each side asserts that the
    tensors it received are close to the copies it built locally.

    Args:
        my_rank: Rank of this process; only the value 0 is treated specially.
        pipe: Object providing ``device``, ``send_tensor`` and ``recv_tensor``.
    """
    print(f"rank {my_rank} test_run starts....")

    # Payloads are built identically on every rank so they can be compared
    # with whatever arrives from the peer.
    x = torch.tensor([1]).to(pipe.device)
    y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device)

    def send_both():
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
        print(f"rank {my_rank} sent tensor y")

    def receive_both():
        first = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", first)
        second = pipe.recv_tensor()
        print(f"rank {my_rank} received y2 = ", second)
        return first, second

    if my_rank == 0:
        send_both()
        x2, y2 = receive_both()
    else:
        x2, y2 = receive_both()
        send_both()

    assert torch.allclose(x, x2)
    assert torch.allclose(y, y2)

    print(f"rank {my_rank} test_run passed!")
def stress_test(my_rank, pipe):
    """Send 500 randomly generated tensor triples back and forth over ``pipe``.

    Every rank seeds torch's RNG with 0 and builds its own list of triples
    ``(x, mean(x), std(x))``, or three ``None`` entries with 5% probability.
    For each round one rank sends its triple and the other receives it and
    checks it against its own locally generated copy.

    Args:
        my_rank: Rank of this process (compared against 0/1 sender ids).
        pipe: Object providing ``device``, ``send_tensor`` and ``recv_tensor``.
    """
    print(f"rank {my_rank} stress_test starts....")

    rounds = 500
    tensors: list[torch.Tensor] = []

    torch.distributed.barrier()
    torch.manual_seed(0)

    for _ in tqdm(range(rounds)):
        # NOTE: the order of the RNG draws below is significant; the peer
        # performs the same draws after seeding with the same value.
        mean = torch.rand(1).item() * 100
        std = torch.rand(1).item() * 100
        size = torch.randint(900, 1000, (2,))
        x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device)

        # 5% probability of sending a None
        if torch.rand(1).item() < 0.05:
            tensors.extend([None, None, None])
        else:
            tensors.extend([x, x.mean().unsqueeze(0), x.std().unsqueeze(0)])

    torch.distributed.barrier()

    for i in tqdm(range(rounds)):
        base = 3 * i
        # Rounds with i % 10 in 0..3 are sent by rank 0, the rest by rank 1.
        sender_rank = int((i % 10) > 3)

        if my_rank == sender_rank:
            for payload in tensors[base : base + 3]:
                pipe.send_tensor(payload)
        else:
            x = pipe.recv_tensor()
            mean = pipe.recv_tensor()
            std = pipe.recv_tensor()

            if x is not None:
                assert torch.allclose(x, tensors[base])
                assert x.mean() == mean[0]
                assert x.std() == std[0]
            else:
                assert mean is None
                assert std is None

        torch.distributed.barrier()
def latency_test(my_rank, pipe, nelement, ntensor):
    """Measure how long it takes to push a batch of tensors through ``pipe``.

    In each of 500 rounds rank 0 sends ``ntensor`` random tensors of
    ``nelement`` elements followed by a tensor holding its ``time.time()``
    taken just before sending. The other rank receives everything and records
    the difference between its own ``time.time()`` and that timestamp. The
    mean of the recorded values is printed in milliseconds.

    Args:
        my_rank: Rank of this process; rank 0 sends, any other rank receives.
        pipe: Object providing ``device``, ``send_tensor`` and ``recv_tensor``.
        nelement: Number of elements in each payload tensor.
        ntensor: Number of payload tensors sent per round.
    """
    latencies = []
    is_sender = my_rank == 0

    torch.distributed.barrier()

    for _round in tqdm(range(500)):
        # Only the sender builds payloads; this happens before the barrier.
        if is_sender:
            tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)]
        else:
            tensors = []

        torch.distributed.barrier()

        if not is_sender:
            for _ in range(ntensor):
                pipe.recv_tensor()
            sent_at = pipe.recv_tensor()
            latencies.append(time.time() - sent_at.item())
            continue

        # Timestamp is taken right before the first send and shipped last.
        t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device)
        for tensor in tensors:
            pipe.send_tensor(tensor)
        pipe.send_tensor(t)

    torch.distributed.barrier()

    print("Latency test passed.")
    print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms")
if __name__ == "__main__":
    # The rank of this process is read from the RANK environment variable.
    my_rank = int(os.environ["RANK"])

    # Two-process gloo group; the tests call torch.distributed.barrier() on it.
    process_group_options = dict(
        backend="gloo",
        init_method="tcp://localhost:12398",
        world_size=2,
        rank=my_rank,
    )
    torch.distributed.init_process_group(**process_group_options)

    kv_transfer_options = dict(
        kv_connector="P2pNcclConnector",
        kv_buffer_device="cuda",
        kv_buffer_size=1e9,
        kv_rank=my_rank,
        kv_role="kv_both",  # this arg doesn't matter in this test
        kv_parallel_size=2,
        kv_ip="127.0.0.1",
        kv_port=12345,
    )
    config = KVTransferConfig(**kv_transfer_options)

    pipe = PyNcclPipe(local_rank=my_rank, config=config)

    test_run(my_rank, pipe)
    stress_test(my_rank, pipe)

    # Use this function if you want to test the latency of pipe impl.
    # latency_test(my_rank, pipe, 1024 * 8 * 128, 80)
#!/bin/bash
# Launch both ranks of test_send_recv.py in the background, wait for them,
# and exit non-zero if either rank failed.

RANK=0 python3 test_send_recv.py &
PID0=$!
RANK=1 python3 test_send_recv.py &
PID1=$!

# `wait <pid>` returns that process's exit status. Record each one so that a
# failure of rank 0 is not masked by a successful rank 1 (the script's exit
# status would otherwise be that of the last `wait` only).
STATUS0=0
STATUS1=0
wait "$PID0" || STATUS0=$?
wait "$PID1" || STATUS1=$?

if [ "$STATUS0" -ne 0 ]; then
    exit "$STATUS0"
fi
exit "$STATUS1"
...@@ -13,6 +13,7 @@ from huggingface_hub import snapshot_download ...@@ -13,6 +13,7 @@ from huggingface_hub import snapshot_download
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ..conftest import AudioTestAssets, VllmRunner from ..conftest import AudioTestAssets, VllmRunner
from ..utils import create_new_process_for_each_test
MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct") MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora") AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
...@@ -60,6 +61,7 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs) ...@@ -60,6 +61,7 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs)
assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix) assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix)
@create_new_process_for_each_test()
def test_active_default_mm_lora( def test_active_default_mm_lora(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets, audio_assets: AudioTestAssets,
...@@ -74,6 +76,7 @@ def test_active_default_mm_lora( ...@@ -74,6 +76,7 @@ def test_active_default_mm_lora(
) )
@create_new_process_for_each_test()
def test_inactive_default_mm_lora( def test_inactive_default_mm_lora(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets, audio_assets: AudioTestAssets,
...@@ -89,6 +92,7 @@ def test_inactive_default_mm_lora( ...@@ -89,6 +92,7 @@ def test_inactive_default_mm_lora(
) )
@create_new_process_for_each_test()
def test_default_mm_lora_succeeds_with_redundant_lora_request( def test_default_mm_lora_succeeds_with_redundant_lora_request(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets, audio_assets: AudioTestAssets,
...@@ -103,6 +107,7 @@ def test_default_mm_lora_succeeds_with_redundant_lora_request( ...@@ -103,6 +107,7 @@ def test_default_mm_lora_succeeds_with_redundant_lora_request(
) )
@create_new_process_for_each_test()
def test_default_mm_lora_fails_with_overridden_lora_request( def test_default_mm_lora_fails_with_overridden_lora_request(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets, audio_assets: AudioTestAssets,
...@@ -118,6 +123,7 @@ def test_default_mm_lora_fails_with_overridden_lora_request( ...@@ -118,6 +123,7 @@ def test_default_mm_lora_fails_with_overridden_lora_request(
) )
@create_new_process_for_each_test()
def test_default_mm_lora_does_not_expand_string_reqs(vllm_runner): def test_default_mm_lora_does_not_expand_string_reqs(vllm_runner):
class MockEngineException(Exception): class MockEngineException(Exception):
pass pass
......
...@@ -28,7 +28,7 @@ from vllm.lora.layers import ( ...@@ -28,7 +28,7 @@ from vllm.lora.layers import (
RowParallelLinearWithShardedLoRA, RowParallelLinearWithShardedLoRA,
VocabParallelEmbeddingWithLoRA, VocabParallelEmbeddingWithLoRA,
) )
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.punica_wrapper import get_punica_wrapper
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
ColumnParallelLinear, ColumnParallelLinear,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from vllm.lora.models import LoRAModel from vllm.lora.lora_model import LoRAModel
from vllm.lora.peft_helper import PEFTHelper from vllm.lora.peft_helper import PEFTHelper
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.models.utils import WeightsMapper
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from vllm.lora.models import LoRAModel from vllm.lora.lora_model import LoRAModel
from vllm.lora.peft_helper import PEFTHelper from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import get_adapter_absolute_path from vllm.lora.utils import get_adapter_absolute_path
from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
......
...@@ -15,10 +15,10 @@ from vllm.lora.layers import ( ...@@ -15,10 +15,10 @@ from vllm.lora.layers import (
MergedColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA, RowParallelLinearWithLoRA,
) )
from vllm.lora.lora_model import LoRAModel
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.models import ( from vllm.lora.model_manager import (
LoRAMapping, LoRAMapping,
LoRAModel,
LoRAModelManager, LoRAModelManager,
LRUCacheLoRAModelManager, LRUCacheLoRAModelManager,
) )
......
...@@ -32,7 +32,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num): ...@@ -32,7 +32,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num):
@pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920 @pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920
@pytest.mark.parametrize("topk_num", [6]) @pytest.mark.parametrize("topk_num", [6])
@pytest.mark.parametrize("num_experts", [64, 128]) @pytest.mark.parametrize("num_experts", [64, 128, 256, 512])
@pytest.mark.parametrize("max_loras", [2, 32]) @pytest.mark.parametrize("max_loras", [2, 32])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [16])
def test_moe_lora_align_block_size( def test_moe_lora_align_block_size(
......
...@@ -16,7 +16,7 @@ from vllm.config import ( ...@@ -16,7 +16,7 @@ from vllm.config import (
) )
from vllm.config.load import LoadConfig from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.lora.models import LoRAMapping from vllm.lora.model_manager import LoRAMapping
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker from vllm.v1.worker.gpu_worker import Worker
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.executor import UniProcExecutor
from vllm.v1.worker.worker_base import WorkerWrapperBase
# This is a dummy executor for patching in test_runai_model_streamer_s3.py.
# We cannot use vllm_runner fixture here, because it spawns worker process.
# The worker process reimports the patched entities, and the patch is not applied.
class RunaiDummyExecutor(UniProcExecutor):
    """Dummy executor used for patching in test_runai_model_streamer_s3.py.

    The ``vllm_runner`` fixture cannot be used there because it spawns a
    worker process, which reimports the patched entities without the patch.
    """

    def _init_executor(self) -> None:
        """Create the driver worker wrapper and initialise worker and device."""
        distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())

        # The device's string form is split on ":"; when a second component
        # exists it is used as the local rank, otherwise the rank stays 0.
        device_parts = str(self.vllm_config.device_config.device).split(":")
        local_rank = int(device_parts[1]) if len(device_parts) > 1 else 0

        worker_rpc_kwargs = {
            "vllm_config": self.vllm_config,
            "local_rank": local_rank,
            "rank": 0,
            "distributed_init_method": distributed_init_method,
            "is_driver_worker": True,
        }

        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config)
        self.collective_rpc("init_worker", args=([worker_rpc_kwargs],))
        self.collective_rpc("init_device")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from huggingface_hub import snapshot_download
from runai_model_streamer.safetensors_streamer.streamer_mock import StreamerPatcher
from vllm.engine.arg_utils import EngineArgs
from .conftest import RunaiDummyExecutor
load_format = "runai_streamer"
test_model = "openai-community/gpt2"


def test_runai_model_loader_download_files_s3_mocked_with_patch(
    vllm_runner,
    tmp_path: Path,
    monkeypatch,
):
    """Load a model through the runai_streamer format from a mocked S3 URL.

    The model is downloaded from Hugging Face into ``tmp_path``, the Run:ai
    listing/pulling/streaming entry points are replaced by the
    ``StreamerPatcher`` shims, and the model is then loaded through
    ``RunaiDummyExecutor``.
    """
    patcher = StreamerPatcher(str(tmp_path))
    test_mock_s3_model = "s3://my-mock-bucket/gpt2/"

    # Download model from HF
    mock_model_dir = f"{tmp_path}/gpt2"
    snapshot_download(repo_id=test_model, local_dir=mock_model_dir)

    # (dotted target, replacement) pairs, applied in this order.
    replacements = (
        (
            "vllm.transformers_utils.runai_utils.runai_list_safetensors",
            patcher.shim_list_safetensors,
        ),
        (
            "vllm.transformers_utils.runai_utils.runai_pull_files",
            patcher.shim_pull_files,
        ),
        (
            "vllm.model_executor.model_loader.weight_utils.SafetensorsStreamer",
            patcher.create_mock_streamer,
        ),
    )
    for target, replacement in replacements:
        monkeypatch.setattr(target, replacement)

    engine_args = EngineArgs(
        model=test_mock_s3_model,
        load_format=load_format,
        tensor_parallel_size=1,
    )
    vllm_config = engine_args.create_engine_config()

    executor = RunaiDummyExecutor(vllm_config)
    executor.driver_worker.load_model()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModel
from tests.models.utils import check_embeddings_close
from vllm import TokensPrompt
@pytest.mark.parametrize(
    "model",
    ["Qwen/Qwen3-Embedding-0.6B"],
)
@torch.inference_mode
def test_embed_models(hf_runner, vllm_runner, model: str):
    """Compare vLLM token embeddings under chunked prefill with HF outputs.

    Prompts of 55, 56 and 57 token ids are embedded by vLLM with
    ``max_num_batched_tokens`` set to 10 and chunked prefill enabled, and by
    the Hugging Face model one prompt at a time. Each pair of results must
    agree within ``tol=1e-2`` according to ``check_embeddings_close``.
    """
    chunk_size = 10
    n_prompt_tokens = [55, 56, 57]
    # Consecutive token ids starting at 1024, one list per prompt length.
    token_prompts = [list(range(1024, 1024 + n)) for n in n_prompt_tokens]

    vllm_kwargs = dict(
        runner="pooling",
        max_model_len=128,
        max_num_batched_tokens=chunk_size,
        enforce_eager=True,
        # `enable_chunked_prefill`: Set to `False` instead of `None` in VllmRunner
        enable_chunked_prefill=True,
        enable_prefix_caching=True,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        prompts = [TokensPrompt(prompt_token_ids=t) for t in token_prompts]
        vllm_outputs = vllm_model.token_embed(prompts)

    hf_outputs = []
    with hf_runner(model, auto_cls=AutoModel) as hf_model:
        for token_prompt in token_prompts:
            batch = torch.tensor([token_prompt])
            input_ids = hf_model.wrap_device({"input_ids": batch})["input_ids"]
            hidden = hf_model.model(input_ids).last_hidden_state
            # Keep the single sequence of the batch as a CPU float tensor.
            hf_outputs.append(hidden.cpu().float()[0])

    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        check_embeddings_close(
            embeddings_0_lst=hf_output,
            embeddings_1_lst=vllm_output,
            name_0="hf",
            name_1="vllm",
            tol=1e-2,
        )
...@@ -20,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str): ...@@ -20,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
max_model_len=128, max_model_len=128,
enforce_eager=True, enforce_eager=True,
runner="pooling", runner="pooling",
enable_chunked_prefill=False,
enable_prefix_caching=True, enable_prefix_caching=True,
) as vllm_model: ) as vllm_model:
pooling_outputs = vllm_model.llm.encode( pooling_outputs = vllm_model.llm.encode(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_configure(config):
    """Force the math SDP backend on ROCm for most test invocations.

    Does nothing on non-ROCm platforms, or when any command-line argument in
    ``config.args`` contains ``test_granite_speech.py``. Otherwise flash and
    memory-efficient SDP are disabled, math SDP is enabled, and a
    ``UserWarning`` announcing the change is emitted.
    """
    on_rocm = current_platform.is_rocm()
    if not on_rocm:
        return

    # Skip disabling SDP for Granite Speech tests on ROCm
    skip_patterns = ["test_granite_speech.py"]
    for arg in config.args:
        arg_text = str(arg)
        for pattern in skip_patterns:
            if pattern in arg_text:
                return

    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
    # accuracy issues
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    sdp = torch.backends.cuda
    sdp.enable_flash_sdp(False)
    sdp.enable_mem_efficient_sdp(False)
    sdp.enable_math_sdp(True)

    message = (
        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
        "to avoid HuggingFace Transformers accuracy issues"
    )
    warnings.warn(message, UserWarning, stacklevel=1)
...@@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = { ...@@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
"qwen2_5_omni": VLMTestInfo( "qwen2_5_omni": VLMTestInfo(
...@@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = { ...@@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForTextToWaveform, auto_cls=AutoModelForTextToWaveform,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner, patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
"qwen3_vl": VLMTestInfo( "qwen3_vl": VLMTestInfo(
...@@ -173,7 +173,7 @@ VLM_TEST_SETTINGS = { ...@@ -173,7 +173,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner, patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[ marks=[
pytest.mark.core_model, pytest.mark.core_model,
], ],
...@@ -278,7 +278,7 @@ VLM_TEST_SETTINGS = { ...@@ -278,7 +278,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=64)], marks=[large_gpu_mark(min_gb=64)],
), ),
"aya_vision": VLMTestInfo( "aya_vision": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"], models=["CohereLabs/aya-vision-8b"],
test_type=(VLMTestType.IMAGE), test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
...@@ -294,7 +294,7 @@ VLM_TEST_SETTINGS = { ...@@ -294,7 +294,7 @@ VLM_TEST_SETTINGS = {
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
), ),
"aya_vision-multi_image": VLMTestInfo( "aya_vision-multi_image": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"], models=["CohereLabs/aya-vision-8b"],
test_type=(VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
...@@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = { ...@@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
), ),
"fuyu": VLMTestInfo( "fuyu": VLMTestInfo(
models=["adept/fuyu-8b"], models=["adept/fuyu-8b"],
...@@ -382,7 +382,6 @@ VLM_TEST_SETTINGS = { ...@@ -382,7 +382,6 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
num_logprobs=10,
), ),
"glm4v": VLMTestInfo( "glm4v": VLMTestInfo(
models=["zai-org/glm-4v-9b"], models=["zai-org/glm-4v-9b"],
...@@ -403,12 +402,13 @@ VLM_TEST_SETTINGS = { ...@@ -403,12 +402,13 @@ VLM_TEST_SETTINGS = {
# So, we need to reduce the number of tokens for the test to pass. # So, we need to reduce the number of tokens for the test to pass.
max_tokens=8, max_tokens=8,
num_logprobs=10, num_logprobs=10,
auto_cls=AutoModelForCausalLM,
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"glm4_1v": VLMTestInfo( "glm4_1v": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"], models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048, max_model_len=2048,
...@@ -423,6 +423,7 @@ VLM_TEST_SETTINGS = { ...@@ -423,6 +423,7 @@ VLM_TEST_SETTINGS = {
models=["zai-org/GLM-4.1V-9B-Thinking"], models=["zai-org/GLM-4.1V-9B-Thinking"],
# GLM4.1V require include video metadata for input # GLM4.1V require include video metadata for input
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
...@@ -707,7 +708,7 @@ VLM_TEST_SETTINGS = { ...@@ -707,7 +708,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForCausalLM, auto_cls=AutoModelForCausalLM,
image_size_factors=[(), (0.25,)], image_size_factors=[(0.25,)],
marks=[ marks=[
pytest.mark.skipif( pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) == Version("4.57.3"), Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
...@@ -737,7 +738,13 @@ VLM_TEST_SETTINGS = { ...@@ -737,7 +738,13 @@ VLM_TEST_SETTINGS = {
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=48)], marks=[
large_gpu_mark(min_gb=48),
pytest.mark.skipif(
current_platform.is_rocm(),
reason="Model produces a vector of <UNK> output in HF on ROCm",
),
],
), ),
"qwen_vl": VLMTestInfo( "qwen_vl": VLMTestInfo(
models=["Qwen/Qwen-VL"], models=["Qwen/Qwen-VL"],
...@@ -760,7 +767,7 @@ VLM_TEST_SETTINGS = { ...@@ -760,7 +767,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.cpu_model], marks=[pytest.mark.cpu_model],
), ),
"skywork_r1v": VLMTestInfo( "skywork_r1v": VLMTestInfo(
...@@ -812,7 +819,7 @@ VLM_TEST_SETTINGS = { ...@@ -812,7 +819,7 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skip("Model initialization hangs")], marks=[pytest.mark.skip("Model initialization hangs")],
), ),
### Tensor parallel / multi-gpu broadcast tests ### Tensor parallel / multi-gpu broadcast tests
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment