Unverified Commit 5e85f4f8 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[VLM] Use `SequenceData.from_token_counts` to create dummy data (#8687)

parent 71c60491
...@@ -125,7 +125,7 @@ class InputRegistry: ...@@ -125,7 +125,7 @@ class InputRegistry:
# Avoid circular import # Avoid circular import
from vllm.sequence import SequenceData from vllm.sequence import SequenceData
dummy_seq_data = SequenceData.from_counts({0: seq_len}) dummy_seq_data = SequenceData.from_token_counts((0, seq_len))
dummy_multi_modal_data = None dummy_multi_modal_data = None
return dummy_seq_data, dummy_multi_modal_data return dummy_seq_data, dummy_multi_modal_data
......
"""Minimal implementation of BlipVisionModel intended to be only used """Minimal implementation of BlipVisionModel intended to be only used
within a vision language model.""" within a vision language model."""
from array import array
from typing import Optional, Union from typing import Optional, Union
import torch import torch
...@@ -19,7 +18,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ...@@ -19,7 +18,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData from vllm.sequence import SequenceData
try: try:
from xformers import ops as xops from xformers import ops as xops
...@@ -53,6 +52,7 @@ def get_max_blip_image_tokens( ...@@ -53,6 +52,7 @@ def get_max_blip_image_tokens(
def dummy_seq_data_for_blip( def dummy_seq_data_for_blip(
hf_config: Union[BlipVisionConfig, Blip2VisionConfig], hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
seq_len: int, seq_len: int,
num_images: int,
*, *,
image_token_id: int, image_token_id: int,
image_feature_size_override: Optional[int] = None, image_feature_size_override: Optional[int] = None,
...@@ -62,11 +62,10 @@ def dummy_seq_data_for_blip( ...@@ -62,11 +62,10 @@ def dummy_seq_data_for_blip(
else: else:
image_feature_size = image_feature_size_override image_feature_size = image_feature_size_override
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, return SequenceData.from_token_counts(
[image_token_id]) * image_feature_size (image_token_id, image_feature_size * num_images),
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, (0, seq_len - image_feature_size * num_images),
[0]) * (seq_len - image_feature_size) )
return SequenceData(token_ids)
def dummy_image_for_blip( def dummy_image_for_blip(
......
from array import array
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
TypedDict, Union) TypedDict, Union)
...@@ -18,8 +17,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -18,8 +17,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.opt import OPTModel from vllm.model_executor.models.opt import OPTModel
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import IntermediateTensors, SequenceData
SequenceData)
from .blip import (BlipVisionModel, dummy_image_for_blip, from .blip import (BlipVisionModel, dummy_image_for_blip,
get_max_blip_image_tokens) get_max_blip_image_tokens)
...@@ -429,11 +427,10 @@ def dummy_seq_data_for_blip2( ...@@ -429,11 +427,10 @@ def dummy_seq_data_for_blip2(
else: else:
image_feature_size = image_feature_size_override image_feature_size = image_feature_size_override
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, return SequenceData.from_token_counts(
[image_token_id]) * image_feature_size * num_images (image_token_id, image_feature_size * num_images),
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, (0, seq_len - image_feature_size * num_images),
[0]) * (seq_len - image_feature_size * num_images) )
return SequenceData(token_ids)
def dummy_data_for_blip2(ctx: InputContext, seq_len: int, def dummy_data_for_blip2(ctx: InputContext, seq_len: int,
......
from array import array
from functools import cached_property from functools import cached_property
from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional,
Tuple, TypedDict) Tuple, TypedDict)
...@@ -32,8 +31,7 @@ from vllm.model_executor.utils import set_weight_attrs ...@@ -32,8 +31,7 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import IntermediateTensors, SequenceData
SequenceData)
from vllm.utils import print_warning_once from vllm.utils import print_warning_once
from .interfaces import SupportsMultiModal from .interfaces import SupportsMultiModal
...@@ -72,11 +70,10 @@ def dummy_seq_data_for_chameleon( ...@@ -72,11 +70,10 @@ def dummy_seq_data_for_chameleon(
else: else:
image_feature_size = image_feature_size_override image_feature_size = image_feature_size_override
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, return SequenceData.from_token_counts(
[image_token_id]) * image_feature_size * num_images (image_token_id, image_feature_size * num_images),
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, (0, seq_len - image_feature_size * num_images),
[0]) * (seq_len - image_feature_size * num_images) )
return SequenceData(token_ids)
def dummy_image_for_chameleon( def dummy_image_for_chameleon(
......
"""Minimal implementation of CLIPVisionModel intended to be only used """Minimal implementation of CLIPVisionModel intended to be only used
within a vision language model.""" within a vision language model."""
from array import array
from typing import Iterable, List, Optional, Tuple, Union from typing import Iterable, List, Optional, Tuple, Union
import torch import torch
...@@ -20,7 +19,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -20,7 +19,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData from vllm.sequence import SequenceData
try: try:
from xformers import ops as xops from xformers import ops as xops
...@@ -62,11 +61,10 @@ def dummy_seq_data_for_clip( ...@@ -62,11 +61,10 @@ def dummy_seq_data_for_clip(
else: else:
image_feature_size = image_feature_size_override image_feature_size = image_feature_size_override
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, return SequenceData.from_token_counts(
[image_token_id]) * image_feature_size * num_images (image_token_id, image_feature_size * num_images),
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, (0, seq_len - image_feature_size * num_images),
[0]) * (seq_len - image_feature_size * num_images) )
return SequenceData(token_ids)
def dummy_image_for_clip( def dummy_image_for_clip(
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
"""Inference-only MiniCPM-V model compatible with HuggingFace weights.""" """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
import math import math
import re import re
from array import array
from functools import partial from functools import partial
from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple, from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple,
TypedDict) TypedDict)
...@@ -56,8 +55,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -56,8 +55,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import IntermediateTensors, SequenceData
SequenceData)
from .idefics2_vision_model import Idefics2VisionTransformer from .idefics2_vision_model import Idefics2VisionTransformer
...@@ -259,8 +257,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext): ...@@ -259,8 +257,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext):
def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int): def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * seq_len return SequenceData.from_token_counts((0, seq_len))
return SequenceData(token_ids)
def dummy_image_for_minicpmv(hf_config: PretrainedConfig, num_images: int): def dummy_image_for_minicpmv(hf_config: PretrainedConfig, num_images: int):
......
from array import array
from dataclasses import dataclass, fields from dataclasses import dataclass, fields
from itertools import tee from itertools import tee
from typing import Iterable, List, Mapping, Optional, Tuple, Union from typing import Iterable, List, Mapping, Optional, Tuple, Union
...@@ -24,8 +23,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -24,8 +23,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import IntermediateTensors, SequenceData
SequenceData)
from .interfaces import SupportsMultiModal from .interfaces import SupportsMultiModal
from .utils import init_vllm_registered_model from .utils import init_vllm_registered_model
...@@ -63,13 +61,11 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, ...@@ -63,13 +61,11 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
image_feature_size = (size**2) // (patch_size**2) image_feature_size = (size**2) // (patch_size**2)
num_image_tokens = image_feature_size * num_images num_image_tokens = image_feature_size * num_images
seq_data = SequenceData.from_token_counts(
(image_token_id, num_image_tokens),
(0, seq_len - num_image_tokens),
)
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
[image_token_id]) * num_image_tokens
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
[0]) * (seq_len - num_image_tokens)
seq_data = SequenceData(token_ids)
mm_data = {"image": num_images * [image]} mm_data = {"image": num_images * [image]}
return seq_data, mm_data return seq_data, mm_data
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
import math import math
import re import re
from array import array
from functools import partial from functools import partial
from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
Optional, Tuple, TypedDict, Union) Optional, Tuple, TypedDict, Union)
...@@ -45,8 +44,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -45,8 +44,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import IntermediateTensors, SequenceData
SequenceData)
from vllm.utils import is_list_of from vllm.utils import is_list_of
from .utils import flatten_bn, is_pp_missing_parameter, make_layers from .utils import flatten_bn, is_pp_missing_parameter, make_layers
...@@ -819,7 +817,7 @@ def dummy_data_for_qwen( ...@@ -819,7 +817,7 @@ def dummy_data_for_qwen(
# The presence of a visual config indicates this is a multimodal model. # The presence of a visual config indicates this is a multimodal model.
# If we don't have it, the model is considered an LLM for warmup purposes. # If we don't have it, the model is considered an LLM for warmup purposes.
if not hasattr(hf_config, "visual"): if not hasattr(hf_config, "visual"):
seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len)) seq_data = SequenceData.from_token_counts((0, seq_len))
mm_data = None mm_data = None
return seq_data, mm_data return seq_data, mm_data
...@@ -846,11 +844,13 @@ def dummy_data_for_qwen( ...@@ -846,11 +844,13 @@ def dummy_data_for_qwen(
if len(toks) < seq_len: if len(toks) < seq_len:
toks += [0] * (seq_len - len(toks)) toks += [0] * (seq_len - len(toks))
seq_data = SequenceData.from_seqs(toks)
# Build the input images; width/height doesn't actually matter here since # Build the input images; width/height doesn't actually matter here since
# the data will get resized and the # of tokens per image is constant # the data will get resized and the # of tokens per image is constant
image = Image.new("RGB", (224, 224), color=0) image = Image.new("RGB", (224, 224), color=0)
mm_data = {"image": image if num_images == 1 else [image] * num_images} mm_data = {"image": image if num_images == 1 else [image] * num_images}
return SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, toks)), mm_data return seq_data, mm_data
@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen)
......
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
from array import array
from functools import lru_cache, partial from functools import lru_cache, partial
from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict, from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict,
Union) Union)
...@@ -66,8 +65,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, ...@@ -66,8 +65,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
from vllm.multimodal.base import MultiModalData from vllm.multimodal.base import MultiModalData
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import IntermediateTensors, SequenceData
SequenceData)
from vllm.transformers_utils.processor import get_processor from vllm.transformers_utils.processor import get_processor
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -681,15 +679,14 @@ def dummy_data_for_qwen2_vl( ...@@ -681,15 +679,14 @@ def dummy_data_for_qwen2_vl(
"--limit-mm-per-prompt.") "--limit-mm-per-prompt.")
hf_config = ctx.get_hf_config(Qwen2VLConfig) hf_config = ctx.get_hf_config(Qwen2VLConfig)
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
[hf_config.vision_start_token_id]) dummy_seqdata = SequenceData.from_token_counts(
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, (hf_config.vision_start_token_id, 1),
[hf_config.image_token_id]) * max_llm_image_tokens (hf_config.image_token_id, max_llm_image_tokens),
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, (hf_config.vision_end_token_id, 1),
[hf_config.vision_end_token_id]) (0, seq_len - max_llm_image_tokens - 2),
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, )
[0]) * (seq_len - max_llm_image_tokens - 2)
dummy_seqdata = SequenceData(token_ids)
dummy_image = Image.new("RGB", (max_resized_width, max_resized_height), dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
color=0) color=0)
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
within a vision language model.""" within a vision language model."""
import math import math
from array import array
from typing import Iterable, List, Optional, Tuple, Union from typing import Iterable, List, Optional, Tuple, Union
import torch import torch
...@@ -24,7 +23,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -24,7 +23,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData from vllm.sequence import SequenceData
try: try:
from xformers import ops as xops from xformers import ops as xops
...@@ -67,11 +66,10 @@ def dummy_seq_data_for_siglip( ...@@ -67,11 +66,10 @@ def dummy_seq_data_for_siglip(
else: else:
image_feature_size = image_feature_size_override image_feature_size = image_feature_size_override
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, return SequenceData.from_token_counts(
[image_token_id]) * image_feature_size (image_token_id, image_feature_size * num_images),
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, (0, seq_len - image_feature_size * num_images),
[0]) * (seq_len - image_feature_size) )
return SequenceData(token_ids)
def dummy_image_for_siglip( def dummy_image_for_siglip(
......
...@@ -77,15 +77,11 @@ def get_ultravox_max_audio_tokens(ctx: InputContext): ...@@ -77,15 +77,11 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND)
def dummy_data_for_ultravox( def dummy_seq_data_for_ultravox(
ctx: InputContext, ctx: InputContext,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], audio_count: int,
): ):
feature_extractor = whisper_feature_extractor(ctx)
audio_count = mm_counts["audio"]
audio_placeholder = array( audio_placeholder = array(
VLLM_TOKEN_ID_ARRAY_TYPE, VLLM_TOKEN_ID_ARRAY_TYPE,
[_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx) [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx)
...@@ -96,10 +92,28 @@ def dummy_data_for_ultravox( ...@@ -96,10 +92,28 @@ def dummy_data_for_ultravox(
other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
[0]) * (seq_len - len(audio_token_ids)) [0]) * (seq_len - len(audio_token_ids))
return SequenceData(audio_token_ids + other_token_ids)
def dummy_audio_for_ultravox(
ctx: InputContext,
audio_count: int,
):
feature_extractor = whisper_feature_extractor(ctx)
audio_and_sr = (np.array([0.0] * feature_extractor.chunk_length), 1) audio_and_sr = (np.array([0.0] * feature_extractor.chunk_length), 1)
mm_dict = {"audio": [audio_and_sr] * audio_count} return {"audio": [audio_and_sr] * audio_count}
def dummy_data_for_ultravox(
ctx: InputContext,
seq_len: int,
mm_counts: Mapping[str, int],
):
audio_count = mm_counts["audio"]
seq_data = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count)
mm_dict = dummy_audio_for_ultravox(ctx, audio_count)
return (SequenceData(audio_token_ids + other_token_ids), mm_dict) return (seq_data, mm_dict)
def input_mapper_for_ultravox(ctx: InputContext, data: object): def input_mapper_for_ultravox(ctx: InputContext, data: object):
......
...@@ -171,13 +171,13 @@ class SequenceData(msgspec.Struct, ...@@ -171,13 +171,13 @@ class SequenceData(msgspec.Struct,
_mrope_position_delta: Optional[int] = None _mrope_position_delta: Optional[int] = None
@staticmethod @staticmethod
def from_counts(counts_by_token: Mapping[int, int]) -> "SequenceData": def from_token_counts(*token_counts: Tuple[int, int]) -> "SequenceData":
if len(counts_by_token) == 0: if len(token_counts) == 0:
return SequenceData.from_seqs([]) return SequenceData.from_seqs([])
arrs = [ arrs = [
array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
for token_id, count in counts_by_token.items() for token_id, count in token_counts
] ]
return SequenceData(reduce(array.__add__, arrs)) return SequenceData(reduce(array.__add__, arrs))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment