Commit f48954a4 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.5.0

parents 1dba29d3 8f89d720
......@@ -102,22 +102,21 @@ def batched_generate(
return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]
@pytest.fixture
@pytest.fixture(scope="module")
def lora_llm(long_context_infos):
scaling_factors = [
context_len_to_scaling_factor[info["context_length"]]
for info in long_context_infos.values()
]
llm = vllm.LLM(
"meta-llama/Llama-2-13b-chat-hf",
enable_lora=True,
max_num_seqs=16,
max_loras=2,
long_lora_scaling_factors=tuple(scaling_factors),
max_num_batched_tokens=4096 * 8,
tensor_parallel_size=4,
)
llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
enable_lora=True,
max_num_seqs=16,
max_loras=2,
long_lora_scaling_factors=tuple(scaling_factors),
max_num_batched_tokens=4096 * 8,
tensor_parallel_size=4,
distributed_executor_backend="mp")
yield llm
del llm
......@@ -154,6 +153,7 @@ def test_rotary_emb_replaced(dist_init):
assert rotary_emb_count == 32
@pytest.mark.skip_global_cleanup
def test_batched_rope_kernel(lora_llm, long_context_infos):
"""We test the batched kernel by comparing the results of batched an
non-batched generation.
......@@ -188,6 +188,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
f"same:\n{batched}\n{non_batched}")
@pytest.mark.skip_global_cleanup
def test_self_consistency(lora_llm, long_context_infos):
"""We test consistency of the batched kernel by permuting batched
inputs and comparing the results to the non-permuted batched results.
......@@ -227,6 +228,7 @@ def test_self_consistency(lora_llm, long_context_infos):
f"\n{permutated_batched_results[permutation[i]]}")
@pytest.mark.skip_global_cleanup
def test_quality(lora_llm, long_context_infos):
"""We test the quality of the answers given by the LoRA model by
comparing the generated text to the merged model's outputs.
......@@ -257,6 +259,7 @@ def test_quality(lora_llm, long_context_infos):
assert np.mean(scores) > 0.5
@pytest.mark.skip_global_cleanup
def test_max_len(lora_llm, long_context_infos):
"""Test that we raise an ValueError when the input of a given LoRA
model exceeds the maximum length."""
......
from collections import OrderedDict
import pytest
from torch import nn
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
from vllm.utils import LRUCache
def test_parse_fine_tuned_lora_name():
def test_parse_fine_tuned_lora_name_valid():
fixture = {
("base_model.model.lm_head.lora_A.weight", "lm_head", True),
("base_model.model.lm_head.lora_B.weight", "lm_head", False),
......@@ -35,6 +36,17 @@ def test_parse_fine_tuned_lora_name():
assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name)
def test_parse_fine_tuned_lora_name_invalid():
fixture = {
"weight",
"base_model.weight",
"base_model.model.weight",
}
for name in fixture:
with pytest.raises(ValueError, match="unsupported LoRA weight"):
parse_fine_tuned_lora_name(name)
def test_replace_submodule():
model = nn.Sequential(
OrderedDict([
......
......@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
dtype: str,
max_tokens: int,
) -> None:
vllm_model = vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4)
tokenizer = vllm_model.model.get_tokenizer()
prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
# This test needs at least 2 prompts in a batch of different lengths to
# verify their token count is correct despite padding.
assert len(example_prompts) > 1, "at least 2 prompts are required"
assert prompt_token_counts[0] != prompt_token_counts[1], (
"prompts of different lengths are required")
vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get()
with vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
prompt_token_counts = [
len(tokenizer.encode(p)) for p in example_prompts
]
# This test needs at least 2 prompts in a batch of different lengths to
# verify their token count is correct despite padding.
assert len(example_prompts) > 1, "at least 2 prompts are required"
assert prompt_token_counts[0] != prompt_token_counts[1], (
"prompts of different lengths are required")
vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get()
assert vllm_prompt_token_count == metric_count, (
f"prompt token count: {vllm_prompt_token_count!r}\n"
......@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
dtype: str,
max_tokens: int,
) -> None:
vllm_model = vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
for i in range(len(example_prompts)):
vllm_output_ids, vllm_output_str = vllm_outputs[i]
prompt_ids = tokenizer.encode(example_prompts[i])
# vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
with vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_logger
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
for i in range(len(example_prompts)):
vllm_output_ids, vllm_output_str = vllm_outputs[i]
prompt_ids = tokenizer.encode(example_prompts[i])
# vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
assert vllm_generation_count == metric_count, (
f"generation token count: {vllm_generation_count!r}\n"
......@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
served_model_name: List[str]) -> None:
vllm_model = vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.3,
served_model_name=served_model_name)
stat_logger = vllm_model.model.llm_engine.stat_logger
metrics_tag_content = stat_logger.labels["model_name"]
del vllm_model
with vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.3,
served_model_name=served_model_name) as vllm_model:
stat_logger = vllm_model.model.llm_engine.stat_logger
metrics_tag_content = stat_logger.labels["model_name"]
if served_model_name is None or served_model_name == []:
assert metrics_tag_content == model, (
......
......@@ -8,10 +8,13 @@ import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
aqlm_not_supported = (capability <
QUANTIZATION_METHODS["aqlm"].get_min_capability())
aqlm_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
aqlm_not_supported = (capability <
QUANTIZATION_METHODS["aqlm"].get_min_capability())
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
......@@ -79,10 +82,9 @@ def test_models(
num_logprobs: int,
) -> None:
vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
max_tokens,
num_logprobs)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
# loop through the prompts to compare against the ground truth generations
for prompt_idx in range(len(example_prompts)):
......
......@@ -5,6 +5,7 @@ This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import pytest
import torch
MODELS = [
"meta-llama/Llama-2-7b-hf",
......@@ -16,9 +17,14 @@ MODELS = [
# "Qwen/Qwen1.5-0.5B" # Broken,
]
#TODO: remove this after CPU float16 support ready
target_dtype = "float"
if torch.cuda.is_available():
target_dtype = "half"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
hf_runner,
......@@ -28,13 +34,11 @@ def test_models(
dtype: str,
max_tokens: int,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
......@@ -46,15 +50,14 @@ def test_models(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dtype", [target_dtype])
def test_model_print(
vllm_runner,
model: str,
dtype: str,
) -> None:
vllm_model = vllm_runner(model, dtype=dtype)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
del vllm_model
with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
......@@ -28,13 +28,11 @@ def test_models(
model: str,
dtype: str,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.encode(example_prompts)
del hf_model
with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.encode(example_prompts)
del vllm_model
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
similarities = compare_embeddings(hf_outputs, vllm_outputs)
all_similarities = torch.stack(similarities)
......
......@@ -67,10 +67,13 @@ EXPECTED_STRS_MAP = {
},
}
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
fp8_not_supported = (capability <
QUANTIZATION_METHODS["fp8"].get_min_capability())
fp8_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
fp8_not_supported = (capability <
QUANTIZATION_METHODS["fp8"].get_min_capability())
@pytest.mark.skipif(fp8_not_supported,
......
......@@ -22,10 +22,13 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
gptq_marlin_not_supported = (
capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
gptq_marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
gptq_marlin_not_supported = (
capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
MODELS = [
# act_order==False, group_size=channelwise
......@@ -67,32 +70,29 @@ def test_models(
model_name, revision = model
# Run marlin.
gptq_marlin_model = vllm_runner(model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1)
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
del gptq_marlin_model
with vllm_runner(model_name=model_name,
revision=revision,
dtype=dtype,
quantization="marlin",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_marlin_model:
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
_ROPE_DICT.clear() # clear rope cache to avoid rope dtype error
# Run gptq.
# The naive gptq kernel doesn't support bf16 yet.
# Here we always compare fp16/bf16 gpt marlin kernel
# to fp16 gptq kernel.
gptq_model = vllm_runner(model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1)
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1],
max_tokens,
num_logprobs)
del gptq_model
with vllm_runner(model_name=model_name,
revision=revision,
dtype="half",
quantization="gptq",
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=1) as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=gptq_outputs,
......
......@@ -14,10 +14,13 @@ import torch
from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (capability <
QUANTIZATION_METHODS["marlin"].get_min_capability())
marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
@dataclass
......@@ -58,20 +61,16 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
marlin_24_model = vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24")
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
del marlin_24_model
with vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24") as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
gptq_model = vllm_runner(model_pair.model_gptq,
dtype=dtype,
quantization="gptq")
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
max_tokens,
num_logprobs)
del gptq_model
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=gptq_outputs,
......
import gc
from dataclasses import fields
from enum import Enum
from typing import Dict, List, Tuple
from typing import List, Tuple
import pytest
import torch
from transformers import AutoTokenizer
from vllm.config import VisionLanguageConfig
model_and_vl_config = [
("llava-hf/llava-1.5-7b-hf",
VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
image_feature_size=576,
image_token_id=32000,
image_input_shape=(1, 3, 336, 336))),
("llava-hf/llava-1.5-7b-hf",
VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
image_feature_size=576,
image_token_id=32000,
image_input_shape=(1, 576, 1024)))
from ..conftest import IMAGE_FILES
pytestmark = pytest.mark.llava
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = [
"<image>\nUSER: What's the content of the image?\nASSISTANT:",
"<image>\nUSER: What is the season?\nASSISTANT:",
]
assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
def as_dict(vision_language_config: VisionLanguageConfig) -> Dict:
"""Flatten vision language config to pure args.
Compatible with what llm entrypoint expects.
"""
result = {}
for field in fields(vision_language_config):
value = getattr(vision_language_config, field.name)
if isinstance(value, Enum):
result[field.name] = value.name.lower()
elif isinstance(value, tuple):
result[field.name] = ",".join([str(item) for item in value])
else:
result[field.name] = value
return result
def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
vision_language_config: VisionLanguageConfig,
model_id: str):
def iter_llava_configs(model_name: str):
image_hw_to_feature_size = {
(336, 336): 576,
}
for (h, w), f in image_hw_to_feature_size.items():
for input_type, input_shape in [
(VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
(VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
]:
yield (model_name,
VisionLanguageConfig(image_input_type=input_type,
image_feature_size=f,
image_token_id=32000,
image_input_shape=input_shape,
image_processor=model_name,
image_processor_revision=None))
model_and_vl_config = [
*iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
vlm_config: VisionLanguageConfig, model_id: str):
"""Sanitize vllm output to be comparable with hf output.
The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
tokenizer = AutoTokenizer.from_pretrained(model_id)
image_token_str = tokenizer.decode(vision_language_config.image_token_id)
image_token_str_len = len(image_token_str)
input_ids, output_str = vllm_output
sanitized_input_ids = input_ids[0:2] + input_ids[2 + vision_language_config
.image_feature_size - 1:]
sanitzied_output_str = output_str[vision_language_config.
image_feature_size *
image_token_str_len:]
return sanitized_input_ids, sanitzied_output_str
image_token_id = vlm_config.image_token_id
tokenizer = AutoTokenizer.from_pretrained(model_id)
image_token_str = tokenizer.decode(image_token_id)
hf_input_ids = [
input_id for idx, input_id in enumerate(input_ids)
if input_id != image_token_id or input_ids[idx - 1] != image_token_id
]
hf_output_str = output_str \
.replace(image_token_str * vlm_config.image_feature_size, "")
@pytest.mark.parametrize("worker_use_ray", [False])
return hf_input_ids, hf_output_str
# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
vllm_image_prompts, vllm_images, model_and_config: tuple,
dtype: str, max_tokens: int, worker_use_ray: bool) -> None:
def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
model_and_config, dtype: str, max_tokens: int) -> None:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the raw images as input.
For vllm runner, we provide image tensors and corresponding
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalData objects and corresponding
vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
model_id, vision_language_config = model_and_config
hf_model = hf_runner(model_id, dtype=dtype)
hf_outputs = hf_model.generate_greedy(hf_image_prompts,
max_tokens,
images=hf_images)
del hf_model
vllm_model = vllm_runner(model_id,
dtype=dtype,
worker_use_ray=worker_use_ray,
**as_dict(vision_language_config))
vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
model_id, vlm_config = model_and_config
with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
max_tokens,
images=vllm_images)
del vllm_model
images=hf_images)
vllm_image_prompts = [
p.replace("<image>", "<image>" * vlm_config.image_feature_size)
for p in HF_IMAGE_PROMPTS
]
gc.collect()
torch.cuda.empty_cache()
with vllm_runner(model_id,
dtype=dtype,
enforce_eager=True,
**vlm_config.as_cli_args_dict()) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
max_tokens,
images=vllm_images)
for i in range(len(hf_image_prompts)):
for i in range(len(HF_IMAGE_PROMPTS)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = sanitize_vllm_output(
vllm_outputs[i], vision_language_config, model_id)
vllm_output_ids, vllm_output_str = vllm_to_hf_output(
vllm_outputs[i], vlm_config, model_id)
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
......
from typing import List, Tuple
import pytest
from transformers import AutoTokenizer
from vllm.config import VisionLanguageConfig
from ..conftest import IMAGE_FILES
pytestmark = pytest.mark.llava
_PREFACE = (
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's "
"questions.")
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = [
f"{_PREFACE} <image>\nUSER: What's the content of the image? ASSISTANT:",
f"{_PREFACE} <image>\nUSER: What is the season? ASSISTANT:",
]
assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
def iter_llava_next_configs(model_name: str):
image_hw_to_feature_size = {
(336, 336): 1176,
(672, 672): 2928,
(1344, 336): 1944,
(336, 1344): 1890,
}
for (h, w), f in image_hw_to_feature_size.items():
for input_type, input_shape in [
(VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
]:
yield (model_name,
VisionLanguageConfig(image_input_type=input_type,
image_feature_size=f,
image_token_id=32000,
image_input_shape=input_shape,
image_processor=model_name,
image_processor_revision=None))
model_and_vl_config = [
*iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"),
]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
vlm_config: VisionLanguageConfig, model_id: str):
"""Sanitize vllm output to be comparable with hf output.
The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
input_ids, output_str = vllm_output
image_token_id = vlm_config.image_token_id
tokenizer = AutoTokenizer.from_pretrained(model_id)
image_token_str = tokenizer.decode(image_token_id)
hf_input_ids = [
input_id for idx, input_id in enumerate(input_ids)
if input_id != image_token_id or input_ids[idx - 1] != image_token_id
]
hf_output_str = output_str \
.replace(image_token_str * vlm_config.image_feature_size, " ")
return hf_input_ids, hf_output_str
@pytest.mark.xfail(
reason="Inconsistent image processor being used due to lack "
"of support for dynamic image token replacement")
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
model_and_config, dtype: str, max_tokens: int) -> None:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalData objects and corresponding
vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
model_id, vlm_config = model_and_config
with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
max_tokens,
images=hf_images)
vllm_image_prompts = [
p.replace("<image>", "<image>" * vlm_config.image_feature_size)
for p in HF_IMAGE_PROMPTS
]
with vllm_runner(
model_id,
dtype=dtype,
# should be greater than image_feature_size
max_model_len=4096,
enforce_eager=True,
**vlm_config.as_cli_args_dict(),
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
max_tokens,
images=vllm_images)
for i in range(len(HF_IMAGE_PROMPTS)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_to_hf_output(
vllm_outputs[i], vlm_config, model_id)
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
......@@ -19,10 +19,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from .utils import check_logprobs_close
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (capability <
QUANTIZATION_METHODS["marlin"].get_min_capability())
marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
@dataclass
......@@ -56,20 +59,16 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
marlin_model = vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="marlin")
marlin_outputs = marlin_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
del marlin_model
gptq_model = vllm_runner(model_pair.model_gptq,
dtype=dtype,
quantization="gptq")
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
max_tokens,
num_logprobs)
del gptq_model
with vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="marlin") as marlin_model:
marlin_outputs = marlin_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=gptq_outputs,
......
......@@ -26,16 +26,13 @@ def test_models(
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
del hf_model
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
max_tokens,
num_logprobs)
del vllm_model
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
......
......@@ -34,13 +34,11 @@ def test_models(
# To pass the small model tests, we need full precision.
assert dtype == "float"
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
......@@ -58,9 +56,8 @@ def test_model_print(
model: str,
dtype: str,
) -> None:
vllm_model = vllm_runner(model, dtype=dtype)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
del vllm_model
with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
import numpy as np
import pytest
from transformers import CLIPImageProcessor, LlavaNextImageProcessor
from vllm.config import ModelConfig, VisionLanguageConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import ImagePixelData
from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_clip_image_processor(hf_images, dtype):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
IMAGE_HEIGHT = IMAGE_WIDTH = 560
hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
assert isinstance(hf_processor, CLIPImageProcessor)
model_config = ModelConfig(
model=MODEL_NAME,
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype=dtype,
revision=None,
)
vlm_config = VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
image_token_id=32000,
image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
image_feature_size=576,
image_processor=MODEL_NAME,
image_processor_revision=None,
)
for image in hf_images:
hf_result = hf_processor.preprocess(
image,
return_tensors="pt",
).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
vllm_result = MULTIMODAL_REGISTRY.process_input(
ImagePixelData(image),
model_config=model_config,
vlm_config=vlm_config,
)
assert hf_result.keys() == vllm_result.keys()
for key, hf_tensor in hf_result.items():
hf_arr: np.ndarray = hf_tensor.numpy()
vllm_arr: np.ndarray = vllm_result[key].numpy()
assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
@pytest.mark.xfail(
reason="Inconsistent image processor being used due to lack "
"of support for dynamic image token replacement")
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_llava_next_image_processor(hf_images, dtype):
MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
IMAGE_HEIGHT = IMAGE_WIDTH = 560
hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
assert isinstance(hf_processor, LlavaNextImageProcessor)
model_config = ModelConfig(
model=MODEL_NAME,
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype=dtype,
revision=None,
)
vlm_config = VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
image_token_id=64000,
image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
image_feature_size=2928,
image_processor=MODEL_NAME,
image_processor_revision=None,
)
for image in hf_images:
hf_result = hf_processor.preprocess(
image,
return_tensors="pt",
).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
vllm_result = MULTIMODAL_REGISTRY.process_input(
ImagePixelData(image),
model_config=model_config,
vlm_config=vlm_config,
)
assert hf_result.keys() == vllm_result.keys()
for key, hf_tensor in hf_result.items():
hf_arr: np.ndarray = hf_tensor.numpy()
vllm_arr: np.ndarray = vllm_result[key].numpy()
assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
@pytest.mark.xfail(
reason="Example image pixels were not processed using HuggingFace")
@pytest.mark.parametrize("dtype", ["float"])
def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
IMAGE_HEIGHT = IMAGE_WIDTH = 560
model_config = ModelConfig(
model=MODEL_NAME,
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype=dtype,
revision=None,
)
vlm_config = VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
image_token_id=32000,
image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
image_feature_size=576,
image_processor=MODEL_NAME,
image_processor_revision=None,
)
for image, tensor in zip(hf_images, vllm_image_tensors):
image_result = MULTIMODAL_REGISTRY.process_input(
ImagePixelData(image),
model_config=model_config,
vlm_config=vlm_config,
)
tensor_result = MULTIMODAL_REGISTRY.process_input(
ImagePixelData(tensor),
model_config=model_config,
vlm_config=vlm_config,
)
assert image_result.keys() == tensor_result.keys()
for key, image_arr in image_result.items():
tensor_arr: np.ndarray = tensor_result[key].numpy()
assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
import base64
import mimetypes
from tempfile import NamedTemporaryFile
from typing import Dict, Tuple
import numpy as np
import pytest
import pytest_asyncio
from PIL import Image
from vllm.multimodal.utils import ImageFetchAiohttp
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
@pytest_asyncio.fixture(scope="session")
async def url_images() -> Dict[str, Image.Image]:
return {
image_url: await ImageFetchAiohttp.fetch_image(image_url)
for image_url in TEST_IMAGE_URLS
}
def get_supported_suffixes() -> Tuple[str, ...]:
# We should at least test the file types mentioned in GPT-4 with Vision
OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif')
# Additional file types that are supported by us
EXTRA_SUPPORTED_SUFFIXES = ('.bmp', '.tiff')
return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES
def _image_equals(a: Image.Image, b: Image.Image) -> bool:
return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
image_url: str, suffix: str):
url_image = url_images[image_url]
try:
mime_type = Image.MIME[Image.registered_extensions()[suffix]]
except KeyError:
try:
mime_type = mimetypes.types_map[suffix]
except KeyError:
pytest.skip('No MIME type')
with NamedTemporaryFile(suffix=suffix) as f:
try:
url_image.save(f.name)
except Exception as e:
if e.args[0] == 'cannot write mode RGBA as JPEG':
pytest.skip('Conversion not supported')
raise
base64_image = base64.b64encode(f.read()).decode("utf-8")
data_url = f"data:{mime_type};base64,{base64_image}"
data_image = await ImageFetchAiohttp.fetch_image(data_url)
if _image_equals(url_image, Image.open(f)):
assert _image_equals(url_image, data_image)
else:
pass # Lossy format; only check that image can be opened
'''Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
import pytest
import torch
from vllm import SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
@pytest.mark.skipif(
capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
reason='bitsandbytes is not supported on this GPU type.')
def test_load_bnb_model(vllm_runner) -> None:
with vllm_runner('huggyllama/llama-7b',
quantization='bitsandbytes',
load_format='bitsandbytes',
enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
# check the weights in MLP & SelfAttention are quantized to torch.uint8
qweight = model.model.layers[0].mlp.gate_up_proj.qweight
assert qweight.dtype == torch.uint8, (
f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
qweight = model.model.layers[0].mlp.down_proj.qweight
assert qweight.dtype == torch.uint8, (
f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
qweight = model.model.layers[0].self_attn.o_proj.qweight
assert qweight.dtype == torch.uint8, (
f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
qweight = model.model.layers[0].self_attn.qkv_proj.qweight
assert qweight.dtype == torch.uint8, (
f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
# some weights should not be quantized
weight = model.lm_head.weight
assert weight.dtype != torch.uint8, (
'lm_head weight dtype should not be torch.uint8')
weight = model.model.embed_tokens.weight
assert weight.dtype != torch.uint8, (
'embed_tokens weight dtype should not be torch.uint8')
weight = model.model.layers[0].input_layernorm.weight
assert weight.dtype != torch.uint8, (
'input_layernorm weight dtype should not be torch.uint8')
weight = model.model.layers[0].post_attention_layernorm.weight
assert weight.dtype != torch.uint8, (
'input_layernorm weight dtype should not be torch.uint8')
# check the output of the model is expected
sampling_params = SamplingParams(temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=8)
prompts = ['That which does not kill us', 'To be or not to be,']
expected_outputs = [
'That which does not kill us makes us stronger.',
'To be or not to be, that is the question.'
]
outputs = llm.generate(prompts, sampling_params=sampling_params)
assert len(outputs) == len(prompts)
for index in range(len(outputs)):
# compare the first line of the output
actual_output = outputs[index][1][0].split('\n', 1)[0]
expected_output = expected_outputs[index].split('\n', 1)[0]
assert actual_output == expected_output, (
f'Expected: {expected_output}, but got: {actual_output}')
......@@ -5,32 +5,58 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
import torch
from vllm import SamplingParams
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor)
CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken,
CompressedTensorsW8A8StaticTensor)
def test_compressed_tensors_w8a8_static_setup(vllm_runner):
model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
assert qkv_proj.weight.dtype is torch.int8
assert o_proj.weight.dtype is torch.int8
assert gate_up_proj.weight.dtype is torch.int8
assert qkv_proj.weight_scale.shard_splitter is not None
assert qkv_proj.weight_scale.logical_widths is not None
assert qkv_proj.input_scale.dtype is torch.float32
model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
with vllm_runner(model_path, enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(gate_up_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(down_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
assert qkv_proj.weight.dtype is torch.int8
assert o_proj.weight.dtype is torch.int8
assert gate_up_proj.weight.dtype is torch.int8
assert qkv_proj.weight_scale.shard_splitter is not None
assert qkv_proj.weight_scale.logical_widths is not None
assert qkv_proj.input_scale.dtype is torch.float32
def test_compressed_tensors_no_enforce_eager(vllm_runner):
model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
with vllm_runner(model_path) as llm:
sampling_params = SamplingParams()
output = llm.generate("Hello world!", sampling_params=sampling_params)
assert output
def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
with vllm_runner(model_path, enforce_eager=True,
dtype=torch.float16) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
assert qkv_proj.weight.dtype is torch.int8
......@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
reason="FP8 is not supported on this GPU type.")
def test_load_fp16_model(vllm_runner) -> None:
llm = vllm_runner("facebook/opt-125m", quantization="fp8")
with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
fc1 = model.model.decoder.layers[0].fc1
assert isinstance(fc1.quant_method, Fp8LinearMethod)
assert fc1.weight.dtype == torch.float8_e4m3fn
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
fc1 = model.model.decoder.layers[0].fc1
assert isinstance(fc1.quant_method, Fp8LinearMethod)
assert fc1.weight.dtype == torch.float8_e4m3fn
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment