Unverified commit 99872085, authored by Benji Beck, committed by GitHub
Browse files

Migrate MiniCPMOAudioInputs to TensorSchema (#21847)


Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 0ba1b54a
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
# limitations under the License. # limitations under the License.
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" """Inference-only MiniCPM-O model compatible with HuggingFace weights."""
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Callable, Literal, Optional, TypedDict, Union from typing import Annotated, Any, Callable, Literal, Optional, Union
import torch import torch
from torch import nn from torch import nn
...@@ -49,6 +49,7 @@ from vllm.multimodal.parse import (AudioItem, AudioProcessorItems, ...@@ -49,6 +49,7 @@ from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
MultiModalDataParser) MultiModalDataParser)
from vllm.multimodal.processing import (PromptReplacement, PromptUpdate, from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
PromptUpdateDetails) PromptUpdateDetails)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6, from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
MiniCPMVDummyInputsBuilder, MiniCPMVDummyInputsBuilder,
...@@ -61,35 +62,52 @@ from .utils import (AutoWeightsLoader, cast_overflow_tensors, flatten_bn, ...@@ -61,35 +62,52 @@ from .utils import (AutoWeightsLoader, cast_overflow_tensors, flatten_bn,
CPU_DEVICE = torch.device("cpu") CPU_DEVICE = torch.device("cpu")
class MiniCPMOAudioFeatureInputs(TypedDict): class MiniCPMOAudioFeatureInputs(TensorSchema):
type: Literal["audio_features"]
audio_features: Union[torch.Tensor, list[torch.Tensor]]
""" """
Shape: `(batch_size * num_audios * num_slices, num_channels, length)` Dimensions:
Slice here means chunk. Audio that is too long will be split into slices, - bns: Batch size * number of audios * number of slices
which is the same as image. - bn: Batch size * number of audios
Padding is used therefore `audio_features` is `torch.Tensor`. - c: Number of channels
- l: Length
- s: Number of slices
""" """
type: Literal["audio_features"] = "audio_features"
audio_feature_lens: Union[torch.Tensor, list[torch.Tensor]] audio_features: Annotated[
Union[torch.Tensor, list[torch.Tensor]],
TensorShape("bns", "c", "l", dynamic_dims={"l"}),
]
"""
Slice here means chunk. Audio that is too long will be split into slices,
which is the same as image. Padding is used therefore `audio_features` is
`torch.Tensor`.
""" """
Shape: `(batch_size * num_audios, num_slices)`
audio_feature_lens: Annotated[
Union[torch.Tensor, list[torch.Tensor]],
TensorShape("bn", "s"),
]
"""
This should be feature length of each audio slice, This should be feature length of each audio slice,
which equals to `audio_features.shape[-1]` which equals to `audio_features.shape[-1]`
""" """
class MiniCPMOAudioEmbeddingInputs(TypedDict): class MiniCPMOAudioEmbeddingInputs(TensorSchema):
type: Literal["audio_embeds"]
audio_embeds: Union[torch.Tensor, list[torch.Tensor]]
""" """
Shape: `(batch_size * num_audios, num_slices, hidden_size)` Dimensions:
- bn: Batch size * number of audios
`hidden_size` must match the hidden size of language model backbone. - s: Number of slices
instead of a batched tensor. - h: Hidden size (must match language model backbone)
Length of each slice may vary, so pass it as a list. Length of each slice may vary, so pass it as a list.
""" """
type: Literal["audio_embeds"] = "audio_embeds"
audio_embeds: Annotated[
Union[torch.Tensor, list[torch.Tensor]],
TensorShape("bn", "s", "h", dynamic_dims={"s"}),
]
MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs, MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment