Commit 96ae75ad authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

parents f9f4a735 2339d59f
...@@ -10,39 +10,42 @@ from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe ...@@ -10,39 +10,42 @@ from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
def test_run(my_rank, pipe):
    """Exchange two small tensors over ``pipe`` and check they arrive intact.

    Rank 0 sends ``x`` then ``y`` and afterwards receives two tensors; every
    other rank receives first and then sends. Both sides finally assert that
    the received tensors match the locally built ones.

    Args:
        my_rank: Rank of the calling process; only ``0`` takes the
            send-first branch.
        pipe: Object exposing ``device``, ``send_tensor`` and ``recv_tensor``.
    """
    print(f"rank {my_rank} test_run starts....")

    # test run
    x = torch.tensor([1]).to(pipe.device)
    y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
    if my_rank == 0:
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
        print(f"rank {my_rank} sent tensor y")
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
        # Log the tensor that was just received (y2), not x2 again.
        print(f"rank {my_rank} received y2 = ", y2)
    else:
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
        # Log the tensor that was just received (y2), not x2 again.
        print(f"rank {my_rank} received y2 = ", y2)
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
        print(f"rank {my_rank} sent tensor y")

    assert torch.allclose(x, x2)
    assert torch.allclose(y, y2)

    print(f"rank {my_rank} test_run passed!")
def stress_test(my_rank, pipe):
torch.distributed.barrier() def stress_test(my_rank, pipe):
print(f"rank {my_rank} stress_test starts....")
tensors: List[torch.Tensor] = [] tensors: List[torch.Tensor] = []
torch.distributed.barrier()
torch.manual_seed(0) torch.manual_seed(0)
for i in tqdm(range(500)): for i in tqdm(range(500)):
...@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe): ...@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):
def latency_test(my_rank, pipe, nelement, ntensor): def latency_test(my_rank, pipe, nelement, ntensor):
latencies = [] latencies = []
torch.distributed.barrier() torch.distributed.barrier()
...@@ -149,6 +151,7 @@ if __name__ == "__main__": ...@@ -149,6 +151,7 @@ if __name__ == "__main__":
) )
test_run(my_rank, pipe) test_run(my_rank, pipe)
stress_test(my_rank, pipe) stress_test(my_rank, pipe)
# Use this function if you want to test the latency of pipe impl. # Use this function if you want to test the latency of pipe impl.
......
#!/bin/bash
# Launch both ranks of test_send_recv.py in the background, then wait for
# each of them so the script does not return while a rank is still running.

RANK=0 python3 test_send_recv.py &
PID0=$!
RANK=1 python3 test_send_recv.py &
PID1=$!

# Record a non-zero status from either rank; a bare pair of `wait` calls
# would report only the status of the last one and hide a rank 0 failure.
STATUS=0
wait "$PID0" || STATUS=$?
wait "$PID1" || STATUS=$?
exit "$STATUS"
...@@ -208,6 +208,11 @@ def minicpmv_lora_files(): ...@@ -208,6 +208,11 @@ def minicpmv_lora_files():
return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon") return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")
@pytest.fixture(scope="session")
def qwen2vl_lora_files():
    """Session-scoped fixture returning the result of ``snapshot_download``
    for the Qwen2-VL LoRA repository."""
    repo_id = "jeeejeee/qwen2-vl-lora-pokemon"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def tinyllama_lora_files(): def tinyllama_lora_files():
# return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") # return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
......
...@@ -4,6 +4,7 @@ import pytest ...@@ -4,6 +4,7 @@ import pytest
from vllm.lora.models import LoRAModel from vllm.lora.models import LoRAModel
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.utils import WeightsMapper
lora_lst = [ lora_lst = [
"baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b" "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
...@@ -71,3 +72,37 @@ def test_load_checkpoints( ...@@ -71,3 +72,37 @@ def test_load_checkpoints(
device="cpu", device="cpu",
embedding_modules=embedding_modules, embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules) embedding_padding_modules=embed_padding_modules)
def test_lora_weights_mapping(baichuan_lora_files):
    """Check that a ``WeightsMapper`` renames LoRA modules while loading.

    The checkpoint is loaded with a mapper that rewrites the ``model.`` prefix
    to ``language_model.model.`` and the ``.layers.`` substring to
    ``.baichuan_layers.``. Every name obtained by iterating
    ``lora_model.loras`` must then show both rewrites.
    """
    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules

    # A packed module is replaced by the entries it maps to; anything else is
    # kept under its own name.
    expected_lora_modules: List[str] = []
    for module in supported_lora_modules:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:
            expected_lora_modules.append(module)

    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.": "language_model.model.",
        },
        orig_to_new_substr={
            ".layers.": ".baichuan_layers.",
        },
    )
    lora_model = LoRAModel.from_local_checkpoint(
        baichuan_lora_files,
        expected_lora_modules,
        lora_model_id=1,
        device="cpu",
        embedding_modules=embedding_modules,
        embedding_padding_modules=embed_padding_modules,
        weights_mapper=hf_to_vllm_mapper,
    )

    # Guard against a vacuous pass: with nothing loaded, the loop below would
    # never execute a single assertion.
    assert lora_model.loras, "no LoRA modules were loaded from the checkpoint"
    for name in lora_model.loras:
        assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
        assert ".baichuan_layers." in name
...@@ -69,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files): ...@@ -69,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
max_loras=4, max_loras=4,
max_lora_rank=64, max_lora_rank=64,
trust_remote_code=True, trust_remote_code=True,
gpu_memory_utilization=0.97, # This model is pretty big for CI gpus
enable_chunked_prefill=True, enable_chunked_prefill=True,
) )
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
......
...@@ -64,8 +64,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): ...@@ -64,8 +64,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
@pytest.mark.parametrize("tp_size", [4]) @pytest.mark.parametrize("tp_size", [4])
@pytest.mark.parametrize("fully_shard", [True, False])
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules, def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
tp_size): tp_size, fully_shard):
"""This LoRA model has all supported Mixtral target modules""" """This LoRA model has all supported Mixtral target modules"""
if torch.cuda.device_count() < tp_size: if torch.cuda.device_count() < tp_size:
...@@ -84,6 +85,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules, ...@@ -84,6 +85,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
max_loras=4, max_loras=4,
distributed_executor_backend="ray", distributed_executor_backend="ray",
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
fully_sharded_loras=fully_shard,
max_lora_rank=32, max_lora_rank=32,
) )
......
from typing import List
import pytest
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
PROMPT_TEMPLATE = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n")
IMAGE_ASSETS = [
ImageAsset("stop_sign"),
ImageAsset("cherry_blossom"),
]
# After fine-tuning with LoRA, all generated content should begin with `A`.
EXPECTED_OUTPUT = [
"A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    """Generate one completion per entry of ``IMAGE_ASSETS`` and return the
    stripped texts.

    Sampling uses temperature 0 and at most 5 new tokens. A ``LoRARequest``
    built from ``lora_id`` and ``lora_path`` is attached only when ``lora_id``
    is truthy; otherwise ``None`` is passed as the LoRA request.
    """
    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)

    # One request per image asset, all sharing the same prompt template.
    requests = []
    for asset in IMAGE_ASSETS:
        requests.append({
            "prompt": PROMPT_TEMPLATE,
            "multi_modal_data": {
                "image": asset.pil_image
            },
        })

    params = vllm.SamplingParams(temperature=0, max_tokens=5)
    results = llm.generate(requests, params, lora_request=lora_request)

    # Echo every prompt/completion pair while collecting the completions.
    texts: List[str] = []
    for result in results:
        text = result.outputs[0].text.strip()
        texts.append(text)
        print(f"Prompt: {result.prompt!r}, Generated text: {text!r}")
    return texts
@pytest.mark.xfail(current_platform.is_rocm(),
                   reason="Qwen2-VL dependency xformers incompatible with ROCm"
                   )
def test_qwen2vl_lora(qwen2vl_lora_files):
    """Sample from ``MODEL_PATH`` with the LoRA adapter under ids 1 and 2.

    For each id, every generated string must be a non-empty prefix of the
    ``EXPECTED_OUTPUT`` entry at the same position.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        max_num_seqs=2,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=16,
        trust_remote_code=True,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        max_model_len=4096,
    )

    # The same adapter files are loaded under two different LoRA ids.
    for lora_id in (1, 2):
        generated = do_sample(llm, qwen2vl_lora_files, lora_id=lora_id)
        assert len(generated) == len(EXPECTED_OUTPUT)
        for expected, actual in zip(EXPECTED_OUTPUT, generated):
            # "" is a prefix of every string, so an empty generation would
            # otherwise slip through the startswith check below.
            assert actual, f"empty generation for lora_id={lora_id}"
            assert expected.startswith(actual)
import pickle
import pytest import pytest
import os import os
import torch import torch
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.config import ModelConfig
from vllm.model_executor.guided_decoding import ( from vllm.model_executor.guided_decoding import (
get_guided_decoding_logits_processor) get_guided_decoding_logits_processor,
get_local_guided_decoding_logits_processor)
from vllm.model_executor.guided_decoding.outlines_logits_processors import ( from vllm.model_executor.guided_decoding.outlines_logits_processors import (
JSONLogitsProcessor, RegexLogitsProcessor) JSONLogitsProcessor, RegexLogitsProcessor)
from vllm.sampling_params import GuidedDecodingParams from vllm.sampling_params import GuidedDecodingParams
from ..utils import models_path_prefix from ..utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta')
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
def test_guided_logits_processors(sample_regex, sample_json_schema): def test_guided_logits_processors(sample_regex, sample_json_schema):
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
...@@ -39,16 +46,30 @@ def test_guided_logits_processors(sample_regex, sample_json_schema): ...@@ -39,16 +46,30 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("backend", @pytest.mark.parametrize("backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer", "xgrammar"]) @pytest.mark.parametrize("is_local", [True, False])
async def test_guided_logits_processor_black_box(backend: str, sample_regex, async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
sample_regex,
sample_json_schema): sample_json_schema):
tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta'))
config = ModelConfig(
MODEL_NAME,
task="generate",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="bfloat16",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}") f"Give an example IPv4 address with this regex: {sample_regex}")
regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend) regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
regex_lp = await get_guided_decoding_logits_processor(
regex_request, tokenizer) regex_lp = get_local_guided_decoding_logits_processor(
regex_request, tokenizer, config) if is_local else \
await get_guided_decoding_logits_processor(
regex_request, tokenizer, config)
assert regex_lp is not None assert regex_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
...@@ -62,7 +83,7 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex, ...@@ -62,7 +83,7 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex,
json_request = GuidedDecodingParams(json=sample_json_schema, json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend) backend=backend)
json_lp = await get_guided_decoding_logits_processor( json_lp = await get_guided_decoding_logits_processor(
json_request, tokenizer) json_request, tokenizer, config)
assert json_lp is not None assert json_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
...@@ -87,3 +108,24 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex): ...@@ -87,3 +108,24 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
with pytest.raises(ValueError, with pytest.raises(ValueError,
match="You can only use one kind of guided"): match="You can only use one kind of guided"):
GuidedDecodingParams(json=sample_json_schema, grammar="test grammar") GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")
def test_pickle_xgrammar_tokenizer_data():
    """Round-trip a ``TokenizerData`` through pickle and check that its
    ``vocab_type`` is unchanged. Skipped when xgrammar cannot be imported."""
    # TODO: move to another test file for xgrammar
    try:
        import xgrammar as xgr
    except ImportError:
        pytest.skip("Could not import xgrammar to run test")

    from vllm.model_executor.guided_decoding.xgrammar_decoding import (
        TokenizerData)

    raw_vocab = xgr.VocabType.RAW
    payload = pickle.dumps(TokenizerData(vocab_type=raw_vocab))
    assert payload is not None

    restored: TokenizerData = pickle.loads(payload)
    assert restored is not None
    assert restored.vocab_type == xgr.VocabType.RAW
...@@ -7,6 +7,7 @@ import os ...@@ -7,6 +7,7 @@ import os
import pytest_asyncio import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.multimodal.audio import resample_audio
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
...@@ -133,16 +134,14 @@ def run_test( ...@@ -133,16 +134,14 @@ def run_test(
dtype=dtype, dtype=dtype,
postprocess_inputs=process, postprocess_inputs=process,
auto_cls=AutoModel) as hf_model: auto_cls=AutoModel) as hf_model:
import librosa
hf_outputs_per_audio = [ hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit( hf_model.generate_greedy_logprobs_limit(
[hf_prompt], [hf_prompt],
max_tokens, max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
audios=[(librosa.resample(audio[0], audios=[(resample_audio(audio[0],
orig_sr=audio[1], orig_sr=audio[1],
target_sr=16000), 16000)]) target_sr=16000), 16000)])
for _, hf_prompt, audio in prompts_and_audios for _, hf_prompt, audio in prompts_and_audios
] ]
......
...@@ -3,19 +3,22 @@ ...@@ -3,19 +3,22 @@
Run `pytest tests/models/test_mistral.py`. Run `pytest tests/models/test_mistral.py`.
""" """
import copy import copy
import json
import jsonschema
import jsonschema.exceptions
import pytest import pytest
import os import os
from vllm import SamplingParams
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa
MistralToolParser) MistralToolParser)
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix from ....utils import models_path_prefix
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"), os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
] ]
MISTRAL_FORMAT_MODELS = [ MISTRAL_FORMAT_MODELS = [
...@@ -128,6 +131,45 @@ MSGS = [ ...@@ -128,6 +131,45 @@ MSGS = [
} }
] ]
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
},
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work_history"]
}
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
...@@ -253,3 +295,43 @@ def test_mistral_function_calling( ...@@ -253,3 +295,43 @@ def test_mistral_function_calling(
assert parsed_message.tool_calls[ assert parsed_message.tool_calls[
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa 0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
assert parsed_message.content is None assert parsed_message.content is None
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("guided_backend",
                         ["outlines", "lm-format-enforcer", "xgrammar"])
def test_mistral_guided_decoding(
    vllm_runner,
    model: str,
    guided_backend: str,
) -> None:
    """Chat with JSON-guided decoding and validate the reply against
    ``SAMPLE_JSON_SCHEMA``.

    The model is run with ``tokenizer_mode="mistral"`` for each guided
    decoding backend. The first completion must parse as JSON and satisfy the
    schema.
    """
    with vllm_runner(model, dtype='bfloat16',
                     tokenizer_mode="mistral") as vllm_model:

        guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA,
                                               backend=guided_backend)
        params = SamplingParams(max_tokens=512,
                                temperature=0.7,
                                guided_decoding=guided_decoding)

        messages = [{
            "role": "system",
            "content": "you are a helpful assistant"
        }, {
            "role":
            "user",
            "content":
            "Give an example JSON for an employee profile that "
            f"fits this schema: {SAMPLE_JSON_SCHEMA}"
        }]

        outputs = vllm_model.model.chat(messages, sampling_params=params)

    # Check for None before indexing; asserting afterwards could never fail.
    assert outputs is not None
    generated_text = outputs[0].outputs[0].text
    json_response = json.loads(generated_text)

    try:
        jsonschema.validate(instance=json_response,
                            schema=SAMPLE_JSON_SCHEMA)
    except jsonschema.exceptions.ValidationError as err:
        # Surface the validator's message so a failure says what was wrong.
        pytest.fail("Generated response is not valid with JSON schema: "
                    f"{err.message}")
...@@ -60,16 +60,14 @@ def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, ...@@ -60,16 +60,14 @@ def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"num_crops,expected_toks_per_img,num_imgs", "num_crops,expected_toks_per_img",
[ [
(4, 757, 1), (4, 757),
(4, 757, 2), (16, 1921),
(16, 1921, 1),
(16, 1921, 2),
# the default num_crops of phi-3.5-vision is 4 # the default num_crops of phi-3.5-vision is 4
(None, 757, 2), (None, 757),
(None, 757, 2),
]) ])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
model: str, num_crops: Optional[int], model: str, num_crops: Optional[int],
expected_toks_per_img: int, num_imgs: int): expected_toks_per_img: int, num_imgs: int):
......
...@@ -2,12 +2,9 @@ from typing import Any, Dict, Tuple ...@@ -2,12 +2,9 @@ from typing import Any, Dict, Tuple
import os import os
import pytest import pytest
import torch
from PIL.Image import Image
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.inputs import InputContext, token_inputs from vllm.inputs import InputContext, InputProcessingContext
from vllm.multimodal import MultiModalRegistry
from .....conftest import _ImageAssets from .....conftest import _ImageAssets
from ....utils import build_model_context from ....utils import build_model_context
...@@ -22,22 +19,9 @@ MAX_PIXELS = "max_pixels" ...@@ -22,22 +19,9 @@ MAX_PIXELS = "max_pixels"
# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple # NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
# input mappers. # input mappers.
@pytest.fixture() @pytest.fixture()
def image_input_mapper_for_qwen2_vl(): def processor_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import ( from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
image_input_mapper_for_qwen2_vl) return Qwen2VLMultiModalProcessor
return image_input_mapper_for_qwen2_vl
@pytest.fixture()
def input_processor_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import (
input_processor_for_qwen2_vl)
return input_processor_for_qwen2_vl
@pytest.fixture()
def qwen2_vl_context() -> InputContext:
return build_model_context(model_name=MODEL)
@pytest.fixture() @pytest.fixture()
...@@ -47,12 +31,6 @@ def get_max_qwen2_vl_image_tokens(): ...@@ -47,12 +31,6 @@ def get_max_qwen2_vl_image_tokens():
return get_max_qwen2_vl_image_tokens return get_max_qwen2_vl_image_tokens
@pytest.fixture()
def dummy_data_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl
return dummy_data_for_qwen2_vl
@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
({}, 1225), ({}, 1225),
({ ({
...@@ -60,110 +38,70 @@ def dummy_data_for_qwen2_vl(): ...@@ -60,110 +38,70 @@ def dummy_data_for_qwen2_vl():
MAX_PIXELS: 512**2 MAX_PIXELS: 512**2
}, 324), }, 324),
]) ])
def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens, @pytest.mark.parametrize("model", [MODEL])
qwen2_vl_context: InputContext, def test_qwen2_vl_max_image_tokens(
mm_processor_kwargs: Dict[str, Any], get_max_qwen2_vl_image_tokens,
expected_max_tokens: int): model: str,
mm_processor_kwargs: Dict[str, Any],
expected_max_tokens: int,
):
"""Ensure that the max token calc handles min/max pixels properly.""" """Ensure that the max token calc handles min/max pixels properly."""
actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context, ctx = build_model_context(
**mm_processor_kwargs) model_name=model,
assert actual_max_tokens == expected_max_tokens tokenizer_name=model,
mm_processor_kwargs=None,
@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
[{}, 1225, (980, 980)],
[{
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, 324, (504, 504)],
])
def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
qwen2_vl_context: InputContext,
mm_processor_kwargs: Dict[str, Any],
token_count: int, img_size: Tuple[int, int]):
"""Ensure that the dummy data handles min/max pixels properly."""
seq_len = 3000
hf_config = qwen2_vl_context.get_hf_config()
image_token_id = hf_config.image_token_id
# NOTE: video value is required, but isn't actually used
# when making the dummy data except for error handling currently
dummy_data = dummy_data_for_qwen2_vl(
ctx=qwen2_vl_context,
seq_len=seq_len,
mm_counts={
"image": 1,
"video": 0
},
**mm_processor_kwargs,
) )
seq_data = dummy_data.seq_data
mm_data = dummy_data.multi_modal_data
# Ensure we have the right number of placeholders for min/max pixel values
assert seq_data.get_token_ids().count(image_token_id) == token_count
# Ensure the images were resized correctly actual_max_tokens = get_max_qwen2_vl_image_tokens(
image = mm_data["image"] InputContext(ctx.model_config), **mm_processor_kwargs)
assert isinstance(image, Image) assert actual_max_tokens == expected_max_tokens
assert image.size == img_size
@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [ @pytest.mark.parametrize(
({}, 1426), "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
({ ({}, 1426, (5704, 1176)),
MIN_PIXELS: 64**2, ({
MAX_PIXELS: 512**2 MIN_PIXELS: 64**2,
}, 330), MAX_PIXELS: 512**2
]) }, 330, (1320, 1176)),
def test_input_processor(input_processor_for_qwen2_vl, ])
qwen2_vl_context: InputContext, @pytest.mark.parametrize("model", [MODEL])
image_assets: _ImageAssets, num_placeholders: int, @pytest.mark.parametrize("num_imgs", [1, 2])
mm_processor_kwargs: Dict[str, Any]): def test_processor_override(
"""Ensure that the image processor handles min/max pixels properly.""" processor_for_qwen2_vl,
tokenizer = AutoTokenizer.from_pretrained(MODEL) image_assets: _ImageAssets,
prompt = "<|vision_start|><|image_pad|><|vision_end|>" model: str,
mm_processor_kwargs: Dict[str, Any],
image = image_assets[0].pil_image expected_toks_per_img: int,
hf_config = qwen2_vl_context.get_hf_config() expected_pixels_shape: Tuple[int, int],
image_token_id = hf_config.image_token_id num_imgs: int,
):
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
prompt=prompt, # Same as the previous test - don't initialize mm_processor_kwargs
multi_modal_data={"image": [image]}) # in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs, ctx = build_model_context(
**mm_processor_kwargs) model_name=model,
assert processed_inputs["prompt_token_ids"].count( tokenizer_name=model,
image_token_id) == num_placeholders mm_processor_kwargs=None,
assert len(processed_inputs["multi_modal_data"]["image"]) == 1
@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
({}, [5704, 1176]),
({
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, [1320, 1176]),
])
def test_image_mapper_override(qwen2_vl_context: InputContext,
image_assets: _ImageAssets,
mm_processor_kwargs: Dict[str, Any],
pixels_shape: Tuple[int, int]):
"""Ensure that the image mapper handles min/max pixels properly."""
mm_registry = MultiModalRegistry()
mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
image = image_assets[0].pil_image
mapped_output = mm_registry.map_input(
qwen2_vl_context.model_config,
{"image": image},
mm_processor_kwargs=mm_processor_kwargs,
) )
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
# Dimension 0 of pixel values should match the product of image_grid_thw ctx = InputProcessingContext(ctx.model_config, tokenizer)
actual_pixels_shape = mapped_output["pixel_values"].shape # Build the image str / prompt based on the number of images we pass
assert list(actual_pixels_shape) == pixels_shape prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
assert actual_pixels_shape[0] == torch.prod( images = [image_assets[0].pil_image] * num_imgs
mapped_output["image_grid_thw"])
mm_data = {"image": images}
processor = processor_for_qwen2_vl(ctx)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
assert pixel_shape[1] == expected_pixels_shape[1]
...@@ -4,7 +4,7 @@ import os ...@@ -4,7 +4,7 @@ import os
import pytest import pytest
import torch import torch
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.image import rescale_image_size
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
......
...@@ -9,7 +9,7 @@ from transformers import AutoConfig ...@@ -9,7 +9,7 @@ from transformers import AutoConfig
# Import the functions to test # Import the functions to test
from vllm.model_executor.models.h2ovl import (calculate_num_blocks, from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
image_to_pixel_values_wrapper) image_to_pixel_values_wrapper)
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.image import rescale_image_size
from ....utils import models_path_prefix from ....utils import models_path_prefix
models = [ models = [
......
...@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Type ...@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Type
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
......
...@@ -7,8 +7,8 @@ import torch ...@@ -7,8 +7,8 @@ import torch
from PIL import Image from PIL import Image
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, from vllm.multimodal.image import rescale_image_size
sample_frames_from_video) from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
PromptVideoInput, VllmRunner) PromptVideoInput, VllmRunner)
......
...@@ -5,8 +5,9 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union ...@@ -5,8 +5,9 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union
import torch import torch
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, from vllm.multimodal.image import rescale_image_size
resize_video, sample_frames_from_video) from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from .....conftest import _ImageAssets, _VideoAssets from .....conftest import _ImageAssets, _VideoAssets
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
......
"""Custom input builders for edge-cases in different models.""" """Custom input builders for edge-cases in different models."""
from typing import Callable from typing import Callable
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, from vllm.multimodal.image import rescale_image_size
resize_video, sample_frames_from_video) from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs from .builders import build_multi_image_inputs, build_single_image_inputs
......
"""Compare the outputs of HF and vLLM when using greedy sampling. """Compare the classification outputs of HF and vLLM models.
This test only tests small models. Big models such as 7B should be tested from
test_big_models.py because it could use a larger instance to run tests.
Run `pytest tests/models/test_cls_models.py`. Run `pytest tests/models/test_cls_models.py`.
""" """
......
"""Compare the embedding outputs of HF and vLLM models. """Compare the scoring outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_embedding.py`. Run `pytest tests/models/embedding/language/test_scoring.py`.
""" """
import math import math
import os import os
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment