Commit 53076d70 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-ori

parents 322a0be6 9c5c81b0
......@@ -183,7 +183,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
image_width: int,
image_height: int,
processor: Optional[Gemma3Processor],
) -> PromptUpdateDetails:
) -> PromptUpdateDetails[str]:
if processor is None:
processor = self.get_hf_processor()
......
......@@ -249,20 +249,15 @@ class H2OVLProcessor(BaseInternVLProcessor):
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl_features(
def get_image_repl(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
return IMG_CONTEXT * feature_size
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
def get_image_repl_full(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
features = self.get_image_repl_features(feature_size, num_patches)
return IMG_START + features + IMG_END
return PromptUpdateDetails(full=repl_full, features=repl_features)
def resolve_min_max_num(
self,
......@@ -501,12 +496,7 @@ class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
if num_patches is not None:
assert isinstance(num_patches, int)
return PromptUpdateDetails(
full=hf_processor.get_image_repl_full(feature_size,
num_patches),
features=hf_processor.get_image_repl_features(
feature_size, num_patches),
)
return hf_processor.get_image_repl(feature_size, num_patches)
return [
PromptReplacement(
......
......@@ -9,14 +9,13 @@
from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from typing import (List, Literal, Optional, Set, Tuple, TypedDict, TypeVar,
Union)
from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from transformers import BatchEncoding, PretrainedConfig, TensorType
from vllm.config import VllmConfig
from vllm.model_executor.layers.quantization import QuantizationConfig
......@@ -36,10 +35,12 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import flatten_2d_lists
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
maybe_prefix, merge_multimodal_embeddings)
from .vision import scatter_patch_features, select_patch_features
IMG_START = '<img>'
IMG_END = '</img>'
......@@ -51,16 +52,26 @@ IMAGENET_STD = (0.229, 0.224, 0.225)
class InternVLImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
pixel_values_flat: torch.Tensor
"""
Shape:
`(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
"""
patches_per_image: List[int]
num_patches: torch.Tensor
"""Shape: `(batch_size * num_images)`"""
embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
"""
List of number of total patches for each image in the batch.
A boolean mask indicating which image embeddings correspond
to patch tokens.
Shape: `(batch_size, num_images, num_embeds)`
"""
num_embeds: Union[torch.Tensor, list[torch.Tensor]]
"""Shape: `(batch_size, num_images)`"""
class InternVLImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
......@@ -286,19 +297,11 @@ class BaseInternVLProcessor(ABC):
raise NotImplementedError
@abstractmethod
def get_image_repl_features(
def get_image_repl(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
raise NotImplementedError
@abstractmethod
def get_image_repl_full(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
) -> PromptUpdateDetails[str]:
raise NotImplementedError
def resolve_min_max_num(
......@@ -394,7 +397,7 @@ class BaseInternVLProcessor(ABC):
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
) -> Mapping[str, NestedTensors]:
if text is None:
text = []
if not isinstance(text, list):
......@@ -413,28 +416,41 @@ class BaseInternVLProcessor(ABC):
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": list(map(len, pixel_values_lst)),
image_inputs: dict[str, NestedTensors] = {
"pixel_values_flat":
torch.cat(pixel_values_lst),
"image_num_patches":
torch.tensor([len(item) for item in pixel_values_lst]),
}
tokenizer = self.tokenizer
image_token_id = self.image_token_id
num_embeds = list[int]()
embed_is_patch = list[torch.Tensor]()
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl_full(feature_size,
num_patches)
text = [t.replace('<image>', image_repl, 1) for t in text]
image_repl = self.get_image_repl(feature_size, num_patches)
feature_tokens = tokenizer.encode(image_repl.features,
add_special_tokens=False)
text = [t.replace('<image>', image_repl.full, 1) for t in text]
num_embeds.append(len(feature_tokens))
embed_is_patch.append(
torch.tensor(feature_tokens) == image_token_id)
image_inputs["num_embeds"] = torch.tensor(num_embeds)
image_inputs["embed_is_patch"] = embed_is_patch
text_inputs = self.tokenizer(text)
return BatchFeature(
{
**text_inputs,
**image_inputs,
},
tensor_type=return_tensors,
)
return {
**BatchEncoding(text_inputs, tensor_type=return_tensors),
**image_inputs,
}
class InternVLProcessor(BaseInternVLProcessor):
......@@ -443,20 +459,15 @@ class InternVLProcessor(BaseInternVLProcessor):
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl_features(
def get_image_repl(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
return IMG_CONTEXT * feature_size
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
def get_image_repl_full(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
features = self.get_image_repl_features(feature_size, num_patches)
return IMG_START + features + IMG_END
return PromptUpdateDetails(full=repl_full, features=repl_features)
class BaseInternVLProcessingInfo(BaseProcessingInfo):
......@@ -566,16 +577,15 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
) -> Mapping[str, NestedTensors]:
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
mm_kwargs=mm_kwargs,
)
image_token_id = self.info.get_hf_processor(**mm_kwargs).image_token_id
image_data = mm_data.get("images", [])
assert isinstance(image_data, list)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
image_token_id = hf_processor.image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
......@@ -586,7 +596,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_inputs: Mapping[str, NestedTensors],
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
......@@ -596,6 +606,8 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
"image", image_num_patches),
image_num_patches=MultiModalFieldConfig.batched("image"),
embed_is_patch=MultiModalFieldConfig.batched("image"),
num_embeds=MultiModalFieldConfig.batched("image"),
image_embeds=MultiModalFieldConfig.batched("image"),
image_token_id=MultiModalFieldConfig.shared("image", num_images),
)
......@@ -637,12 +649,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
if num_patches is not None:
assert isinstance(num_patches, int)
return PromptUpdateDetails(
full=hf_processor.get_image_repl_full(feature_size,
num_patches),
features=hf_processor.get_image_repl_features(
feature_size, num_patches),
)
return hf_processor.get_image_repl(feature_size, num_patches)
return [
PromptReplacement(
......@@ -832,6 +839,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
self, **kwargs: object) -> Optional[InternVLImageInputs]:
pixel_values_flat = kwargs.pop("pixel_values_flat", None)
image_num_patches = kwargs.pop("image_num_patches", None)
embed_is_patch = kwargs.pop("embed_is_patch", None)
num_embeds = kwargs.pop("num_embeds", None)
image_embeds = kwargs.pop("image_embeds", None)
if pixel_values_flat is None and image_embeds is None:
......@@ -858,35 +867,47 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
if not isinstance(image_num_patches, (torch.Tensor, list)):
raise ValueError("Incorrect type of image_num_patches. "
f"Got type: {type(pixel_values_flat)}")
f"Got type: {type(image_num_patches)}")
if not isinstance(embed_is_patch, (torch.Tensor, list)):
raise ValueError("Incorrect type of embed_is_patch. "
f"Got type: {type(embed_is_patch)}")
if not isinstance(num_embeds, (torch.Tensor, list)):
raise ValueError("Incorrect type of num_embeds. "
f"Got type: {type(num_embeds)}")
pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
image_num_patches = flatten_bn(image_num_patches, concat=True)
return InternVLImagePixelInputs(
type="pixel_values",
data=self._validate_pixel_values(
flatten_bn(pixel_values_flat, concat=True)),
patches_per_image=flatten_bn(image_num_patches,
concat=True).tolist())
pixel_values_flat=self._validate_pixel_values(
pixel_values_flat),
num_patches=image_num_patches,
embed_is_patch=embed_is_patch,
num_embeds=num_embeds,
)
raise AssertionError("This line should be unreachable.")
def _process_image_input(
self,
image_input: InternVLImageInputs,
) -> tuple[torch.Tensor, ...]:
) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
if image_input["type"] == "image_embeds":
return image_input["data"]
assert self.vision_model is not None
image_embeds = self.extract_feature(image_input["data"])
image_embeds = self.extract_feature(image_input["pixel_values_flat"])
patches_per_image = image_input["patches_per_image"]
num_patches = image_input["num_patches"]
# Only one image in the current batch
if len(patches_per_image) == 1:
image_embeds = image_embeds.view(
if len(num_patches) == 1:
return image_embeds.view(
-1, self.config.text_config.hidden_size).unsqueeze(0)
return image_embeds
# NOTE: Image embeddings are split into separate tensors for each image
# by the size of each embedding.
......@@ -894,10 +915,9 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
image_embeds = image_embeds.view(-1,
self.config.text_config.hidden_size)
image_feature_sizes = [
num_patches * feature_size for num_patches in patches_per_image
num_patches * feature_size for num_patches in num_patches
]
image_embeds = image_embeds.split(image_feature_sizes)
return image_embeds
return image_embeds.split(image_feature_sizes)
def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
if self.is_mono:
......@@ -911,8 +931,19 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
image_features = self._process_image_input(image_input)
if (kwargs.get("v0_path", False)
or image_input["type"] != "pixel_values"):
return image_features
return flatten_2d_lists(
scatter_patch_features(*args) for args in zip(
image_features,
image_input["num_embeds"],
image_input["embed_is_patch"],
))
def get_input_embeddings(
self,
......@@ -924,8 +955,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
assert self.img_context_token_id is not None
self._set_visual_token_mask(input_ids)
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.img_context_token_id)
input_ids,
inputs_embeds,
select_patch_features(multimodal_embeddings),
self.img_context_token_id,
)
return inputs_embeds
def forward(
......@@ -944,6 +978,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
kwargs.update({"v0_path": True})
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
......
......@@ -233,7 +233,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
class LlavaProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
hf_processor = self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
# In case patch_size is omitted from `processor_config.json`
# e.g. for E5-V: https://huggingface.co/royokong/e5-v
if hf_processor.patch_size is None:
patch_size = self.get_vision_encoder_info().get_patch_size()
hf_processor.patch_size = patch_size
return hf_processor
class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
......
......@@ -25,7 +25,6 @@ from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
from vllm.multimodal.profiling import ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of
from .clip import CLIPVisionModel
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
......@@ -44,7 +43,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict):
type: Literal["pixel_values_videos"]
pixel_values_videos: Union[torch.Tensor, list[torch.Tensor]]
"""
Shape: `(batch_size, num_videos, num_frames, num_channels, height, width)`
Shape: `(batch_size * num_videos, num_frames, num_channels, height, width)`
Note that `num_videos` may be different for each batch, and 'num_frames'
may be different for each video, in which case the data is passed as a
......@@ -580,7 +579,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
return LlavaOnevisionVideoPixelInputs(
type="pixel_values_videos",
pixel_values_videos=pixel_values_videos,
pixel_values_videos=flatten_bn(pixel_values_videos),
)
def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
......@@ -768,22 +767,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
for i, patch_features_batch in enumerate(patch_embeddings)
]
def _add_image_newline(
    self,
    video_features: torch.Tensor,
    videos: int = 1,
    frames: int = 1,
    strategy: str = "one_token",
) -> torch.Tensor:
    """Append one ``image_newline`` embedding to the end of each video.

    Args:
        video_features: Per-frame features; reshaped here to
            ``(videos, frames * video_features.shape[1], -1)``.
        videos: Number of videos the leading dimension is regrouped into.
        frames: Number of frames belonging to each video.
        strategy: Only ``"one_token"`` is implemented.

    Returns:
        The regrouped features with a single newline row concatenated
        along dimension 1 for every video.

    Raises:
        ValueError: If ``strategy`` is anything other than ``"one_token"``.
    """
    # Reject unknown strategies up front so the happy path stays flat.
    if strategy != "one_token":
        raise ValueError(f"Unexpected video newline strategy: {strategy}")

    tokens_per_frame = video_features.shape[1]
    per_video = video_features.reshape(videos, frames * tokens_per_frame, -1)

    # One copy of the newline embedding per video, moved to the device the
    # features live on so that the concatenation below is valid.
    newline = self.image_newline[None, None, :].repeat(videos, 1, 1)
    newline = newline.to(per_video.device)

    return torch.cat((per_video, newline), dim=1)
def _video_pixels_to_features(
self,
vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
......@@ -807,33 +790,43 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
video_pixels = inputs["pixel_values_videos"]
if isinstance(video_pixels, torch.Tensor):
b, num_videos, frames, c, h, w = video_pixels.shape
pixel_values = video_pixels.view(b * num_videos * frames, c, h, w)
stacked_embeddings = self._video_pixels_to_features(
self.vision_tower, pixel_values)
stacked_embeddings = self._add_image_newline(stacked_embeddings,
videos=b * num_videos,
frames=frames,
strategy="one_token")
return stacked_embeddings
elif is_list_of(video_pixels, torch.Tensor):
stacked_embeddings = []
for video_pixel in video_pixels:
num_videos, frames, c, h, w = video_pixel.shape
pixel_values = video_pixel.view(num_videos * frames, c, h, w)
embeddings = self._video_pixels_to_features(
self.vision_tower, pixel_values)
embeddings = self._add_image_newline(embeddings,
videos=num_videos,
frames=frames,
strategy="one_token")
stacked_embeddings.append(embeddings)
return stacked_embeddings
else:
raise ValueError(
f"Unsupported type of video input {type(video_pixels)}")
total_videos, frames, c, h, w = video_pixels.shape
video_pixels_flat = video_pixels.view(total_videos * frames, c, h,
w)
embeddings_flat = self._video_pixels_to_features(
self.vision_tower, video_pixels_flat)
embeddings_flat = embeddings_flat.reshape(
total_videos, frames * embeddings_flat.shape[1], -1)
image_newline = self.image_newline[None, None, :].expand(
total_videos, -1, -1)
return torch.cat((embeddings_flat, image_newline), dim=1)
frames_per_video = [len(video) for video in video_pixels]
video_pixels_flat = torch.cat(video_pixels)
embeddings_flat = self._video_pixels_to_features(
self.vision_tower, video_pixels_flat)
image_newline = self.image_newline[None, None, :]
return [
torch.cat(
(
embeds.reshape(1, num_frame * embeddings_flat.shape[1],
-1),
image_newline,
),
dim=1,
) for num_frame, embeds in zip(
frames_per_video,
torch.split(embeddings_flat, frames_per_video),
)
]
def apply_pooling(self, image_features, stride=2):
def apply_pooling(self, image_features: torch.Tensor, stride: int = 2):
vision_config = self.config.vision_config
height = width = vision_config.image_size // vision_config.patch_size
batch_frames, _, dim = image_features.shape
......
......@@ -1368,7 +1368,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
full_text_row_masked_out_mask = (
attn_metadata.encoder_seq_lens_tensor
!= 0).reshape(-1, 1).to(input_ids.device)
skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
skip_cross_attention = attn_metadata.max_encoder_seq_len == 0
# For image-present prefill.
else:
......
......@@ -36,11 +36,11 @@ class NVLMProcessor(BaseInternVLProcessor):
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_PAD]
def get_image_repl_features(
def get_image_repl(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
) -> PromptUpdateDetails[str]:
if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported")
......@@ -55,14 +55,9 @@ class NVLMProcessor(BaseInternVLProcessor):
# We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile"
return "<Image>" + features + "</Image>"
repl = "<Image>" + features + "</Image>"
def get_image_repl_full(
self,
feature_size: int,
num_patches: Optional[int],
) -> str:
return self.get_image_repl_features(feature_size, num_patches)
return PromptUpdateDetails(full=repl, features=repl)
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
......@@ -180,11 +175,11 @@ class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
if num_patches is not None:
assert isinstance(num_patches, int)
repl = hf_processor.get_image_repl(feature_size, num_patches)
return PromptUpdateDetails(
full=hf_processor.get_image_repl_full(feature_size,
num_patches) + "\n",
features=hf_processor.get_image_repl_features(
feature_size, num_patches) + "\n",
full=repl.full + "\n",
features=repl.features + "\n",
)
# See note in dummy data regarding why we have the extra newline
......
......@@ -608,6 +608,17 @@ class Qwen2_5_VisionTransformer(nn.Module):
window_index = torch.cat(window_index, dim=0)
return window_index, cu_window_seqlens
def compute_attn_mask_seqlen(
    self,
    cu_seqlens: torch.Tensor,
) -> tuple[Optional[int], Optional[list[int]]]:
    """Derive the backend-specific sequence-length argument.

    The lengths are the differences between adjacent entries of
    ``cu_seqlens`` (presumably cumulative sequence boundaries -- confirm
    with the caller).

    Args:
        cu_seqlens: Tensor whose adjacent differences give the lengths.

    Returns:
        A ``(max_seqlen, seqlens)`` pair:

        * ``FLASH_ATTN``: ``(largest length, None)``
        * ``XFORMERS``: ``(None, list of all lengths)``
        * any other backend: ``(None, None)``
    """
    backend = self.attn_backend

    if backend == _Backend.FLASH_ATTN:
        # `.item()` pulls the scalar out of the tensor as a Python number.
        longest = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
        return longest, None

    if backend == _Backend.XFORMERS:
        # `.tolist()` materialises every length as a Python int.
        lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
        return None, lengths

    # Other backends consume neither value.
    return None, None
def forward(
self,
x: torch.Tensor,
......@@ -645,23 +656,27 @@ class Qwen2_5_VisionTransformer(nn.Module):
# transformers
hidden_states = hidden_states.unsqueeze(1)
max_seqlen = None
seqlens = None
if self.attn_backend == _Backend.FLASH_ATTN:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
elif self.attn_backend == _Backend.XFORMERS:
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
# pre-compute seqlens for window/full attn to reduce cuMemcpy operations
max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(
cu_seqlens)
max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
cu_window_seqlens)
for layer_num, blk in enumerate(self.blocks):
if layer_num in self.fullatt_block_indexes:
cu_seqlens_now = cu_seqlens
max_seqlen_now = max_seqlen_full
seqlens_now = seqlens_full
else:
cu_seqlens_now = cu_window_seqlens
max_seqlen_now = max_seqlen_window
seqlens_now = seqlens_window
hidden_states = blk(
hidden_states,
cu_seqlens=cu_seqlens_now,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
seqlens=seqlens,
max_seqlen=max_seqlen_now,
seqlens=seqlens_now,
)
# For Qwen2.5-VL-3B, float16 will overflow at last block
......
......@@ -617,6 +617,16 @@ class Qwen2VisionTransformer(nn.Module):
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
return rotary_pos_emb
def compute_attn_mask_seqlen(
    self, cu_seqlens: torch.Tensor
) -> tuple[Optional[int], Optional[list[int]]]:
    """Compute the sequence-length value the active attention backend needs.

    Lengths are taken as ``cu_seqlens[1:] - cu_seqlens[:-1]``.

    Args:
        cu_seqlens: Tensor whose adjacent differences give the lengths
            (presumably cumulative boundaries -- confirm with the caller).

    Returns:
        ``(max_seqlen, seqlens)``. Only ``max_seqlen`` is set for
        ``FLASH_ATTN``, only ``seqlens`` for ``XFORMERS``, and both are
        ``None`` for every other backend.
    """
    active_backend = self.attn_backend

    if active_backend == _Backend.FLASH_ATTN:
        per_seq = cu_seqlens[1:] - cu_seqlens[:-1]
        return per_seq.max().item(), None

    if active_backend == _Backend.XFORMERS:
        per_seq = cu_seqlens[1:] - cu_seqlens[:-1]
        return None, per_seq.tolist()

    # Nothing to pre-compute for the remaining backends.
    return None, None
def forward(
self,
x: torch.Tensor,
......@@ -638,12 +648,8 @@ class Qwen2VisionTransformer(nn.Module):
# transformers
x = x.unsqueeze(1)
max_seqlen = None
seqlens = None
if self.attn_backend == _Backend.FLASH_ATTN:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
elif self.attn_backend == _Backend.XFORMERS:
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
# pre-compute seqlens for attn mask to reduce cuMemcpy operations
max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
for blk in self.blocks:
x = blk(
x,
......
......@@ -104,6 +104,7 @@ _TEXT_GENERATION_MODELS = {
"Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
"SolarForCausalLM": ("solar", "SolarForCausalLM"),
"TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
"TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
"XverseForCausalLM": ("llama", "LlamaForCausalLM"),
"Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
# [Encoder-decoder]
......@@ -418,11 +419,13 @@ class _ModelRegistry:
if not architectures:
logger.warning("No model architectures are specified")
normalized_arch = []
for model in architectures:
if model not in self.models:
model = "TransformersModel"
normalized_arch.append(model)
# keep only the architectures that are registered (supported)
normalized_arch = list(
filter(lambda model: model in self.models, architectures))
# make sure the Transformers fallback is placed last
if len(normalized_arch) != len(architectures):
normalized_arch.append("TransformersModel")
return normalized_arch
def inspect_model_cls(
......
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Type
import torch
from vllm.config import VllmConfig
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.models.llama import (LlamaDecoderLayer,
LlamaForCausalLM, LlamaModel)
class TeleFLMModel(LlamaModel):
    """Llama decoder stack with optional µP scaling of the input embeddings.

    This implementation is based on the µScaling paper presented at
    the ICLR 2025 Workshop:

        NanoLM: An Affordable LLM Study Benchmark
        via Accurate Loss Prediction across Scales
        by Yiqun Yao et al.

    Available at: https://openreview.net/forum?id=IwaPYg1SCA
    arXiv preprint: https://arxiv.org/abs/2304.06875
    """

    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
        layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer,
    ):
        """Build the base Llama model, then cache the µP settings.

        Args:
            vllm_config: Engine configuration, forwarded unchanged to
                ``LlamaModel``.
            prefix: Parameter-name prefix, forwarded unchanged.
            layer_type: Decoder layer class, forwarded unchanged.
        """
        super().__init__(vllm_config=vllm_config,
                         prefix=prefix,
                         layer_type=layer_type)
        # NOTE(review): `self.config` is not assigned in this class; it is
        # presumably populated by `LlamaModel.__init__` -- confirm.
        self.use_mup = self.config.use_mup
        if self.use_mup:
            # Only read when µP is enabled, so configs with µP disabled do
            # not need to define `input_mult`.
            self.input_mult = self.config.input_mult

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed ``input_ids``, multiplying by ``input_mult`` when µP is on.

        Args:
            input_ids: Token ids passed to ``self.embed_tokens``.

        Returns:
            The embeddings, scaled by ``self.input_mult`` if
            ``self.use_mup`` is truthy and returned as-is otherwise.
        """
        embedding = self.embed_tokens(input_ids)
        if self.use_mup:
            embedding = embedding * self.input_mult
        return embedding
class TeleFLMForCausalLM(LlamaForCausalLM):
    """Llama causal LM whose logits are rescaled when µP is enabled."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Initialise the base model and, under µP, replace the logits processor.

        Args:
            vllm_config: Engine configuration, forwarded to
                ``LlamaForCausalLM``.
            prefix: Parameter-name prefix, forwarded unchanged.
        """
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        # NOTE(review): `self.config` and `self.unpadded_vocab_size` are
        # presumably set by the base-class initialiser -- confirm.
        cfg = self.config
        self.use_mup = cfg.use_mup
        if not self.use_mup:
            # Without µP the logits processor from the base class is kept.
            return

        self.mup_scale_factor = cfg.mup_scale_factor
        self.output_mult = cfg.output_mult / self.mup_scale_factor
        # The processor receives `output_mult` as its logit scale.
        self.logits_processor = LogitsProcessor(
            self.unpadded_vocab_size,
            cfg.vocab_size,
            self.output_mult,
        )
......@@ -103,13 +103,13 @@ The token sequence or text to update.
@dataclass
class PromptUpdateDetails:
class PromptUpdateDetails(Generic[_S]):
"""Details about the token sequence or text that are part of the update."""
full: PromptSeq
full: _S
"""The full content."""
features: PromptSeq
features: _S
"""
The part of the content that corresponds to feature placeholders;
this will be replaced by the output of the vision encoder during model
......@@ -117,7 +117,7 @@ class PromptUpdateDetails:
"""
@staticmethod
def from_seq(seq: PromptSeq) -> "PromptUpdateDetails":
def from_seq(seq: _S) -> "PromptUpdateDetails[_S]":
return PromptUpdateDetails(full=seq, features=seq)
......
......@@ -223,7 +223,12 @@ class RequestOutput:
if delta:
# Slice logprobs delta if applicable
if output_logprobs:
output_logprobs = output_logprobs[-num_output_tokens:]
# num_output_tokens can be 0 when n > 1 and request finishes
# before the others
if num_output_tokens > 0:
output_logprobs = output_logprobs[-num_output_tokens:]
else:
output_logprobs = None
# Don't include prompt if this is after the first output
# containing decode token ids
if include_prompt and seq.get_output_len() > num_output_tokens:
......
......@@ -2,7 +2,6 @@
import logging
import traceback
from contextlib import suppress
from itertools import chain
from typing import TYPE_CHECKING, Optional
......@@ -191,21 +190,6 @@ def neuron_platform_plugin() -> Optional[str]:
return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
def openvino_platform_plugin() -> Optional[str]:
    """Return the OpenVINO platform class path if this build targets OpenVINO.

    Detection relies on ``vllm_version_matches_substr("openvino")``; any
    exception raised during the check is suppressed and treated as
    "not available".

    Returns:
        ``"vllm.platforms.openvino.OpenVinoPlatform"`` when the check
        succeeds, otherwise ``None``.
    """
    logger.debug("Checking if OpenVINO platform is available.")

    built_with_openvino = False
    # Best-effort probe: a failure here must not break platform discovery.
    with suppress(Exception):
        built_with_openvino = vllm_version_matches_substr("openvino")
        if built_with_openvino:
            logger.debug("Confirmed OpenVINO platform is available"
                         " because vLLM is built with OpenVINO.")

    if built_with_openvino:
        return "vllm.platforms.openvino.OpenVinoPlatform"

    logger.debug("OpenVINO platform is not available because"
                 " vLLM is not built with OpenVINO.")
    return None
builtin_platform_plugins = {
'tpu': tpu_platform_plugin,
'cuda': cuda_platform_plugin,
......@@ -214,7 +198,6 @@ builtin_platform_plugins = {
'xpu': xpu_platform_plugin,
'cpu': cpu_platform_plugin,
'neuron': neuron_platform_plugin,
'openvino': openvino_platform_plugin,
}
......
......@@ -14,6 +14,7 @@ from typing_extensions import ParamSpec
# import custom ops, trigger op registration
import vllm._C # noqa
import vllm.envs as envs
from vllm.fa_utils import get_flash_attn_version
from vllm.logger import init_logger
from vllm.utils import import_pynvml
......@@ -212,9 +213,14 @@ class CudaPlatformBase(Platform):
return ("vllm.attention.backends."
"flashmla.FlashMLABackend")
if use_v1:
logger.info_once("Using Flash Attention backend on V1 engine.")
return ("vllm.v1.attention.backends.flash_attn."
"FlashAttentionBackend")
if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
logger.info_once("Using Triton backend on V1 engine.")
return ("vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend")
if cls.has_device_capability(80):
logger.info_once("Using Flash Attention backend on V1 engine.")
return ("vllm.v1.attention.backends."
"flash_attn.FlashAttentionBackend")
if selected_backend == _Backend.FLASHINFER:
logger.info("Using FlashInfer backend.")
return "vllm.attention.backends.flashinfer.FlashInferBackend"
......@@ -240,15 +246,6 @@ class CudaPlatformBase(Platform):
"Cannot use FlashAttention-2 backend for dtype other than "
"torch.float16 or torch.bfloat16.")
target_backend = _Backend.XFORMERS
elif kv_cache_dtype is not None and \
kv_cache_dtype.startswith("fp8"):
logger.info(
"Cannot use FlashAttention-2 backend for FP8 KV cache.")
logger.warning(
"Please use FlashInfer backend with FP8 KV Cache for "
"better performance by setting environment variable "
"VLLM_ATTENTION_BACKEND=FLASHINFER")
target_backend = _Backend.XFORMERS
elif block_size % 16 != 0:
logger.info(
"Cannot use FlashAttention-2 backend for block size not "
......@@ -270,6 +267,17 @@ class CudaPlatformBase(Platform):
"Cannot use FlashAttention-2 backend for head size %d.",
head_size)
target_backend = _Backend.XFORMERS
fp8_kv_cache = (kv_cache_dtype is not None
and kv_cache_dtype.startswith("fp8"))
if (fp8_kv_cache and get_flash_attn_version() != 3):
logger.info(
"Cannot use FlashAttention-2 backend for FP8 KV cache."
)
logger.warning(
"Please use FlashInfer backend with FP8 KV Cache for "
"better performance by setting environment variable "
"VLLM_ATTENTION_BACKEND=FLASHINFER")
target_backend = _Backend.XFORMERS
except ImportError:
logger.info(
"Cannot use FlashAttention-2 backend because the "
......
......@@ -29,10 +29,10 @@ def in_wsl() -> bool:
class _Backend(enum.Enum):
FLASH_ATTN = enum.auto()
FLASH_ATTN_VLLM_V1 = enum.auto()
TRITON_ATTN_VLLM_V1 = enum.auto()
XFORMERS = enum.auto()
ROCM_FLASH = enum.auto()
TORCH_SDPA = enum.auto()
OPENVINO = enum.auto()
FLASHINFER = enum.auto()
TRITON_MLA = enum.auto() # Supported by V1
FLASHMLA = enum.auto() # Supported by V1
......@@ -52,7 +52,6 @@ class PlatformEnum(enum.Enum):
XPU = enum.auto()
CPU = enum.auto()
NEURON = enum.auto()
OPENVINO = enum.auto()
OOT = enum.auto()
UNSPECIFIED = enum.auto()
......@@ -112,6 +111,8 @@ class Platform:
supported_quantization: list[str] = []
additional_env_vars: list[str] = []
def is_cuda(self) -> bool:
    """Return True when this platform's ``_enum`` is ``PlatformEnum.CUDA``."""
    return self._enum == PlatformEnum.CUDA
......@@ -133,9 +134,6 @@ class Platform:
def is_neuron(self) -> bool:
    """Return True when this platform's ``_enum`` is ``PlatformEnum.NEURON``."""
    return self._enum == PlatformEnum.NEURON
def is_openvino(self) -> bool:
    """Return True when this platform's ``_enum`` is ``PlatformEnum.OPENVINO``."""
    return self._enum == PlatformEnum.OPENVINO
def is_out_of_tree(self) -> bool:
    """Return True when this platform's ``_enum`` is ``PlatformEnum.OOT``."""
    return self._enum == PlatformEnum.OOT
......
# SPDX-License-Identifier: Apache-2.0
from typing import TYPE_CHECKING, Optional
import torch
import vllm.envs as envs
from vllm.logger import init_logger
from .interface import Platform, PlatformEnum, _Backend
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
logger = init_logger(__name__)
# Import OpenVINO defensively: an ImportError is logged as a warning instead
# of propagating, so this module can still be imported without the package.
# NOTE: when the import fails, the names `ov` and `hints` are never bound, so
# any later code that touches them raises NameError.
try:
import openvino as ov
import openvino.properties.hint as hints
except ImportError as e:
logger.warning("Failed to import OpenVINO with %r", e)
class OpenVinoPlatform(Platform):
    """Platform description for running on the OpenVINO backend.

    All behaviour is exposed through classmethods; the target device (CPU or
    GPU) is derived from ``envs.VLLM_OPENVINO_DEVICE`` each time it is queried.
    """

    # Identity of this platform as seen by the generic `Platform` helpers.
    _enum = PlatformEnum.OPENVINO
    device_name: str = "openvino"
    device_type: str = "openvino"
    dispatch_key: str = "CPU"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Return the dotted path of the attention backend class.

        The OpenVINO backend is always returned; a different
        ``selected_backend`` is only reported in the log. The remaining
        arguments are accepted but not consulted.
        """
        if selected_backend != _Backend.OPENVINO:
            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
        logger.info("Using OpenVINO Attention backend.")
        return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the fixed device name ``"openvino"`` for any ``device_id``."""
        return "openvino"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Return False: async output is never reported as supported here."""
        return False

    @classmethod
    def inference_mode(cls):
        """Return an enabled ``torch.inference_mode`` context manager."""
        return torch.inference_mode(mode=True)

    @classmethod
    def is_openvino_cpu(cls) -> bool:
        """Return True when ``VLLM_OPENVINO_DEVICE`` contains ``"CPU"``."""
        return "CPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_openvino_gpu(cls) -> bool:
        """Return True when ``VLLM_OPENVINO_DEVICE`` contains ``"GPU"``."""
        return "GPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Log a warning and return False (pinned memory is unavailable)."""
        logger.warning("Pin memory is not supported on OpenViNO.")
        return False

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate ``vllm_config`` and adjust it in place for OpenVINO.

        The parallel, model and cache sections of the config are mutated:
        the worker class is filled in when set to ``"auto"``, the model dtype
        is forced to float32, eager mode is enforced, and the KV-cache dtype,
        block size and space are chosen from the device and environment.

        Raises:
            AssertionError: if ``world_size`` is not 1, the device type is not
                ``"openvino"``, LoRA is configured, or the device is neither
                CPU nor GPU.
            RuntimeError: if ``VLLM_OPENVINO_KVCACHE_SPACE`` is negative.
        """
        from vllm.utils import GiB_bytes

        parallel_config = vllm_config.parallel_config
        assert (parallel_config.world_size == 1
                ), "OpenVINO only supports single CPU socket currently."

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.openvino_worker.OpenVINOWorker"

        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype != torch.float32:
            logger.warning(
                "Only float32 dtype is supported on OpenVINO, "
                "casting from %s.", model_config.dtype)
            model_config.dtype = torch.float32
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on OpenVINO backend, fallback to "
                "the eager mode.")
            model_config.enforce_eager = True

        # check and update cache config
        ov_core = ov.Core()
        cache_config = vllm_config.cache_config
        # NOTE(review): only this default is guarded against a falsy
        # cache_config; every later access assumes it is set -- confirm
        # whether it can actually be None here.
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16

        # Resolve the device once via `cls` so subclasses are honoured and the
        # environment is not re-read for every branch below.
        on_cpu = cls.is_openvino_cpu()

        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
            if on_cpu:
                logger.info("KV cache type is overridden to u8 via "
                            "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
                cache_config.cache_dtype = ov.Type.u8
            else:
                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                            "ignored for GPU, f16 data type will be used.")
                cache_config.cache_dtype = ov.Type.f16
        elif on_cpu:
            # On CPU the cache follows the device's inference precision hint
            # when that is bf16, and uses f16 otherwise.
            inference_precision = ov_core.get_property(
                envs.VLLM_OPENVINO_DEVICE, hints.inference_precision)
            if inference_precision == ov.Type.bf16:
                cache_config.cache_dtype = ov.Type.bf16
            else:
                cache_config.cache_dtype = ov.Type.f16
        else:
            cache_config.cache_dtype = ov.Type.f16

        # Force the per-device block size, logging when a value is overridden.
        if on_cpu:
            device_label, optimal_block_size = "CPU", 32
        else:
            device_label, optimal_block_size = "GPU", 16
        if cache_config.block_size != optimal_block_size:
            logger.info(
                "OpenVINO %s optimal block size is %d, "
                "overriding currently set %s", device_label,
                optimal_block_size, cache_config.block_size)
            cache_config.block_size = optimal_block_size

        kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
        if kv_cache_space >= 0:
            if kv_cache_space == 0 and on_cpu:
                # Zero is treated as "not set" on CPU: fall back to 4 GiB.
                cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
                logger.warning(
                    "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
                    "for OpenVINO backend is not set, using 4 by default.")
            else:
                cache_config.openvino_kvcache_space_bytes = (  # type: ignore
                    kv_cache_space * GiB_bytes)
        else:
            raise RuntimeError(
                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a positive integer value.")

        assert vllm_config.device_config.device_type == "openvino"
        assert vllm_config.lora_config is None, \
            "OpenVINO backend doesn't support LoRA"
        assert on_cpu or \
            cls.is_openvino_gpu(), \
            "OpenVINO backend supports only CPU and GPU devices"
......@@ -120,8 +120,9 @@ class RocmPlatform(Platform):
selected_backend = (_Backend.ROCM_FLASH if selected_backend
== _Backend.FLASH_ATTN else selected_backend)
if envs.VLLM_USE_V1:
logger.info("Using ROCm Attention backend on V1 engine.")
return "vllm.v1.attention.backends.rocm_attn.ROCmAttentionBackend"
logger.info("Using Triton Attention backend on V1 engine.")
return ("vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend")
if selected_backend == _Backend.ROCM_FLASH:
if not cls.has_device_capability(90):
# not Instinct series GPUs.
......
......@@ -29,6 +29,10 @@ class TpuPlatform(Platform):
"tpu_int8", "compressed-tensors", "compressed_tensors"
]
additional_env_vars: list[str] = [
"TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
]
@classmethod
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
dtype: torch.dtype, kv_cache_dtype: Optional[str],
......
......@@ -92,22 +92,20 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
# Override draft-model specific worker args.
draft_worker_kwargs.update(
vllm_config=draft_worker_config,
ngram_prompt_lookup_max=speculative_config.ngram_prompt_lookup_max,
ngram_prompt_lookup_min=speculative_config.ngram_prompt_lookup_min,
ngram_prompt_lookup_max=speculative_config.prompt_lookup_max,
ngram_prompt_lookup_min=speculative_config.prompt_lookup_min,
)
spec_decode_worker = SpecDecodeWorker.create_worker(
scorer_worker=target_worker,
draft_worker_kwargs=draft_worker_kwargs,
disable_mqa_scorer=speculative_config.speculative_disable_mqa_scorer,
disable_by_batch_size=speculative_config.
speculative_disable_by_batch_size,
draft_token_acceptance_method=speculative_config.
draft_token_acceptance_method,
disable_mqa_scorer=speculative_config.disable_mqa_scorer,
disable_by_batch_size=speculative_config.disable_by_batch_size,
draft_token_acceptance_method=speculative_config.acceptance_method,
typical_acceptance_sampler_posterior_threshold=speculative_config.
typical_acceptance_sampler_posterior_threshold,
posterior_threshold,
typical_acceptance_sampler_posterior_alpha=speculative_config.
typical_acceptance_sampler_posterior_alpha,
posterior_alpha,
disable_logprobs=speculative_config.disable_logprobs,
disable_log_stats=speculative_config.disable_log_stats,
num_speculative_tokens=speculative_config.num_speculative_tokens,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment