Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
......@@ -4,7 +4,7 @@ import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import _ImageAssets
from ....conftest import ImageTestAssets
from ...utils import build_model_context
......@@ -19,7 +19,7 @@ from ...utils import build_model_context
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
......
......@@ -5,7 +5,7 @@ from transformers import SmolVLMConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from ....conftest import _ImageAssets
from ....conftest import ImageTestAssets
from ...utils import build_model_context
......@@ -21,7 +21,7 @@ from ...utils import build_model_context
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
image_assets: ImageTestAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of a AQLM model between vLLM and HF Transformers
Run `pytest tests/models/test_aqlm.py`.
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform
# These ground truth generations were generated using `transformers==4.38.1
# aqlm==1.1.0 torch==2.2.0`
......@@ -39,8 +35,9 @@ ground_truth_generations = [
]
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
@pytest.mark.skipif(not is_quant_method_supported("aqlm")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
@pytest.mark.parametrize("dtype", ["half"])
......
......@@ -7,8 +7,8 @@ import torch
from vllm.multimodal.image import rescale_image_size
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
from ..utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
......@@ -20,7 +20,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
def run_awq_test(
vllm_runner: type[VllmRunner],
image_assets: _ImageAssets,
image_assets: ImageTestAssets,
source_model: str,
quant_model: str,
*,
......@@ -85,7 +85,6 @@ def run_awq_test(
)
@pytest.mark.quant_model
@pytest.mark.parametrize(
("source_model", "quant_model"),
[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
......
......@@ -8,14 +8,12 @@ bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
Run `pytest tests/models/test_bitblas.py`.
"""
from dataclasses import dataclass
import pytest
from .utils import check_logprobs_close
from ..utils import check_logprobs_close
@dataclass
......
......@@ -4,20 +4,15 @@
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import os
from typing import Optional
import pytest
from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ...utils import check_logprobs_close
from ..utils import check_logprobs_close
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize(
......@@ -60,6 +55,14 @@ def test_models(
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
pytest.skip(
f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv(STR_BACKEND_ENV_VAR, backend)
......
......@@ -14,9 +14,9 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from ....conftest import VllmRunner
from ....utils import multi_gpu_test
from ...utils import check_logprobs_close
from ...conftest import VllmRunner
from ...utils import multi_gpu_test
from ..utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true"
......@@ -38,7 +38,6 @@ LLAMA_CONFIG = GGUFTestConfig(
original_model="meta-llama/Llama-3.2-1B-Instruct",
gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
marks=[pytest.mark.quant_model],
)
QWEN2_CONFIG = GGUFTestConfig(
......
......@@ -8,14 +8,12 @@ bitblas/GPTQ models are in the top 3 selections of each other.
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.
Run `pytest tests/models/test_bitblas.py`.
"""
from dataclasses import dataclass
import pytest
from .utils import check_logprobs_close
from ..utils import check_logprobs_close
@dataclass
......
# SPDX-License-Identifier: Apache-2.0
"""Compares the outputs of gptq vs gptq_marlin
"""Compares the outputs of gptq vs gptq_marlin.
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 5 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
Run `pytest tests/models/test_gptq_marlin.py`.
"""
import os
......@@ -15,8 +14,9 @@ import pytest
from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
from vllm.platforms import current_platform
from ...utils import check_logprobs_close
from ..utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true"
......@@ -34,9 +34,10 @@ MODELS = [
]
@pytest.mark.quant_model
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
......
......@@ -4,16 +4,15 @@
Note: GPTQ and Marlin_24 do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
Run `pytest tests/models/test_marlin_24.py`.
"""
from dataclasses import dataclass
import pytest
from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform
from ...utils import check_logprobs_close
from ..utils import check_logprobs_close
@dataclass
......@@ -39,9 +38,10 @@ model_pairs = [
]
@pytest.mark.quant_model
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
or current_platform.is_rocm()
or not current_platform.is_cuda(),
reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
......
......@@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
......
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
"""Tests Quark mxfp4 models against ground truth generation
"""
import pytest
from vllm import LLM, SamplingParams
MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]
EXPECTED_STRS_MAP = {
"amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
'\n### Key Features\n\n* **High-throughput Inference**: vLL',
'\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
'\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
'\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
" everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
]
}
@pytest.mark.skip(reason="Model to be released in the future")
@pytest.mark.quant_model
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
sampling_params = SamplingParams(max_tokens=20, temperature=0)
llm = LLM(
model=model_name,
kv_cache_dtype="fp8",
quantization="quark",
)
outputs = llm.generate(example_prompts, sampling_params)
for i, output in enumerate(outputs):
output_str = output.outputs[0].text
expected_str = EXPECTED_STRS_MAP[model_name][i]
assert expected_str == output_str, (
f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
......@@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
reason=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
reason="nvfp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
......
......@@ -72,12 +72,15 @@ class _HfExamplesInfo:
return
current_version = TRANSFORMERS_VERSION
cur_base_version = Version(current_version).base_version
min_version = self.min_transformers_version
max_version = self.max_transformers_version
msg = f"`transformers=={current_version}` installed, but `transformers"
if min_version and Version(current_version) < Version(min_version):
# Only check the base version for the min/max version, otherwise preview
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
if min_version and Version(cur_base_version) < Version(min_version):
msg += f">={min_version}` is required to run this model."
elif max_version and Version(current_version) > Version(max_version):
elif max_version and Version(cur_base_version) > Version(max_version):
msg += f"<={max_version}` is required to run this model."
else:
return
......@@ -120,7 +123,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
trust_remote_code=True),
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"),
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B",
extras={"tiny": "hmellor/bamba-tiny-random"}), # noqa: E501
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
{"1b": "bigscience/bloomz-1b1"}),
"ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
......@@ -162,6 +166,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{"1b": "EleutherAI/pythia-1.4b"}),
"GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
"GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
"GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
min_transformers_version="4.52.0"), # noqa: E501
"GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"), # noqa: E501
"Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
trust_remote_code=True),
......@@ -176,7 +182,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
extras={"guard": "meta-llama/Llama-Guard-3-1B", # noqa: E501
"hermes": "NousResearch/Hermes-3-Llama-3.1-8B"}), # noqa: E501
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
......@@ -191,13 +199,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
"MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", # noqa: E501
{"falcon3": "ehristoforu/Falcon3-MoE-2x7B-Insruct"}), # noqa: E501
{"tiny": "TitanML/tiny-mixtral"}), # noqa: E501
"QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501
"MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
"NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
"OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
"Olmo2ForCausalLM": _HfExamplesInfo("shanearora/OLMo-7B-1124-hf"),
"Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"),
"OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
"OPTForCausalLM": _HfExamplesInfo("facebook/opt-125m",
{"1b": "facebook/opt-iml-max-1.3b"}),
......@@ -217,16 +225,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-0.5B-Instruct",
extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"Qwen3ForCausalLM": _HfExamplesInfo(
"Qwen/Qwen3-8B",
is_available_online=False,
min_transformers_version="4.51"
),
"Qwen3MoeForCausalLM": _HfExamplesInfo(
"Qwen/Qwen3-MoE-15B-A2B",
is_available_online=False,
min_transformers_version="4.51"
),
"Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
is_available_online=False),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
......@@ -242,6 +242,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
is_available_online=False,
trust_remote_code=True),
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
trust_remote_code=True),
# [Encoder-decoder]
"BartModel": _HfExamplesInfo("facebook/bart-base"),
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
......@@ -254,11 +256,17 @@ _EMBEDDING_EXAMPLE_MODELS = {
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
"GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
trust_remote_code=True),
"GteNewModel": _HfExamplesInfo("Alibaba-NLP/gte-base-en-v1.5",
trust_remote_code=True,
hf_overrides={"architectures":
["GteNewModel"]}),
"InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
trust_remote_code=True),
"JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), # noqa: E501
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
"ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base",
trust_remote_code=True),
"NomicBertModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-long", # noqa: E501
trust_remote_code=True),
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
......@@ -337,6 +345,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501
trust_remote_code=True),
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
trust_remote_code=True),
"Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503", # noqa: E501
extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}), # noqa: E501
"MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
......@@ -353,6 +363,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version="4.48",
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501
"Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
......@@ -364,8 +377,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501
"Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B", # noqa: E501
min_transformers_version="4.52"), # noqa: E501
"Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B",
min_transformers_version="4.52"),
"SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
"SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
......@@ -375,7 +388,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
tokenizer="Isotr0py/Florence-2-tokenizer",
trust_remote_code=True), # noqa: E501
trust_remote_code=True,), # noqa: E501
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
......@@ -399,6 +412,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code=True,
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
"MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
trust_remote_code=True,
speculative_model="XiaomiMiMo/MiMo-7B-RL")
}
_TRANSFORMERS_MODELS = {
......
# SPDX-License-Identifier: Apache-2.0
"""Test the functionality of the Transformers backend.
Run `pytest tests/models/test_transformers.py`.
"""
"""Test the functionality of the Transformers backend."""
import pytest
from vllm.platforms import current_platform
from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test
from .utils import check_logprobs_close
......@@ -36,6 +35,9 @@ def check_implementation(
)
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
@pytest.mark.parametrize(
"model,model_impl",
[
......@@ -67,6 +69,9 @@ def test_distributed(
"meta-llama/Llama-3.2-1B-Instruct", **kwargs)
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="bitsandbytes quantization is currently not supported in rocm.")
@pytest.mark.parametrize("model, quantization_kwargs", [
(
"meta-llama/Llama-3.2-1B-Instruct",
......
......@@ -2,9 +2,10 @@
import warnings
from collections.abc import Sequence
from typing import Any, Optional, Union
from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
import torch
import torch.nn.functional as F
from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
......@@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from .registry import HF_EXAMPLE_MODELS
if TYPE_CHECKING:
from ..conftest import HfRunner
TokensText = tuple[list[int], str]
......@@ -291,3 +295,64 @@ def build_model_context(
**model_config_kwargs,
)
return InputContext(model_config)
def check_embeddings_close(
*,
embeddings_0_lst: Sequence[list[float]],
embeddings_1_lst: Sequence[list[float]],
name_0: str,
name_1: str,
tol: float = 1e-3,
) -> None:
assert len(embeddings_0_lst) == len(embeddings_1_lst)
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
dim=0)
fail_msg = (f"Test{prompt_idx}:"
f"\n{name_0}:\t{embeddings_0[:16]!r}"
f"\n{name_1}:\t{embeddings_1[:16]!r}")
assert sim >= 1 - tol, fail_msg
def matryoshka_fy(tensor: torch.Tensor, dimensions: int):
tensor = torch.tensor(tensor)
tensor = tensor[..., :dimensions]
tensor = F.normalize(tensor, p=2, dim=1)
return tensor
class EmbedModelInfo(NamedTuple):
name: str
is_matryoshka: bool = False
matryoshka_dimensions: Optional[list[int]] = None
architecture: str = ""
dtype: str = "auto"
enable_test: bool = True
def run_embedding_correctness_test(
hf_model: "HfRunner",
inputs: list[str],
vllm_outputs: Sequence[list[float]],
dimensions: Optional[int] = None,
):
hf_outputs = hf_model.encode(inputs)
if dimensions:
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import numpy as np
import pytest
import torch
from PIL import Image, ImageDraw
from vllm.multimodal.hasher import MultiModalHasher
ASSETS_DIR = Path(__file__).parent / "assets"
assert ASSETS_DIR.exists()
# NOTE: Images that are the same visually are allowed to have the same hash
@pytest.mark.parametrize("mode_pair", [("1", "L"), ("RGBA", "CMYK")])
def test_hash_collision_image_mode(mode_pair):
mode1, mode2 = mode_pair
image1 = Image.new(mode1, size=(10, 10), color=1)
image2 = Image.new(mode2, size=(10, 10), color=1)
hasher = MultiModalHasher
assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2)
def test_hash_collision_image_palette():
# These images differ only in Image.palette._palette
image1 = Image.open(ASSETS_DIR / "image1.png")
image2 = Image.open(ASSETS_DIR / "image2.png")
hasher = MultiModalHasher
assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2)
def test_hash_collision_image_transpose():
image1 = Image.new("1", size=(10, 20))
ImageDraw.Draw(image1).line([(0, 0), (10, 0)])
image2 = Image.new("1", size=(20, 10))
ImageDraw.Draw(image2).line([(0, 0), (0, 10)])
hasher = MultiModalHasher
assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2)
def test_hash_collision_tensor_shape():
# The hash should be different though the data is the same when flattened
arr1 = torch.zeros((5, 10, 20, 3))
arr2 = torch.zeros((10, 20, 5, 3))
hasher = MultiModalHasher
assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2)
def test_hash_collision_array_shape():
# The hash should be different though the data is the same when flattened
arr1 = np.zeros((5, 10, 20, 3))
arr2 = np.zeros((10, 20, 5, 3))
hasher = MultiModalHasher
assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment