Merge tag 'v0.8.2' into v0.8.2-ori

53076d70 · zhuwenwen · 322a0be6 · 9c5c81b0 · 53076d70 · 53076d70
Commit 53076d70 authored Mar 24, 2025 by zhuwenwen
20 changed files
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -183,7 +183,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
        image_width: int,
        image_height: int,
        processor: Optional[Gemma3Processor],
-    ) -> PromptUpdateDetails:
+    ) -> PromptUpdateDetails[str]:
        if processor is None:
            processor = self.get_hf_processor()


--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -249,20 +249,15 @@ class H2OVLProcessor(BaseInternVLProcessor):
    def image_token_id(self) -> int:
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

-    def get_image_repl_features(
+    def get_image_repl(
        self,
        feature_size: int,
        num_patches: Optional[int],
-    ) -> str:
-        return IMG_CONTEXT * feature_size
+    ) -> PromptUpdateDetails[str]:
+        repl_features = IMG_CONTEXT * feature_size
+        repl_full = IMG_START + repl_features + IMG_END

-    def get_image_repl_full(
-        self,
-        feature_size: int,
-        num_patches: Optional[int],
-    ) -> str:
-        features = self.get_image_repl_features(feature_size, num_patches)
-        return IMG_START + features + IMG_END
+        return PromptUpdateDetails(full=repl_full, features=repl_features)

    def resolve_min_max_num(
        self,
@@ -501,12 +496,7 @@ class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
            if num_patches is not None:
                assert isinstance(num_patches, int)

-            return PromptUpdateDetails(
-                full=hf_processor.get_image_repl_full(feature_size,
-                                                      num_patches),
-                features=hf_processor.get_image_repl_features(
-                    feature_size, num_patches),
-            )
+            return hf_processor.get_image_repl(feature_size, num_patches)

        return [
            PromptReplacement(

--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -9,14 +9,13 @@
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
-from typing import (List, Literal, Optional, Set, Tuple, TypedDict, TypeVar,
-                    Union)
+from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union

 import torch
 import torch.nn as nn
 import torchvision.transforms as T
 from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchEncoding, PretrainedConfig, TensorType

 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -36,10 +35,12 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import flatten_2d_lists

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                    maybe_prefix, merge_multimodal_embeddings)
+from .vision import scatter_patch_features, select_patch_features

 IMG_START = '<img>'
 IMG_END = '</img>'
@@ -51,16 +52,26 @@ IMAGENET_STD = (0.229, 0.224, 0.225)

 class InternVLImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
-    data: torch.Tensor
+    pixel_values_flat: torch.Tensor
    """
    Shape:
    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
    """
-    patches_per_image: List[int]
+
+    num_patches: torch.Tensor
+    """Shape: `(batch_size * num_images)`"""
+
+    embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
    """
-    List of number of total patches for each image in the batch.
+    A boolean mask indicating which image embeddings correspond
+    to patch tokens.
+
+    Shape: `(batch_size, num_images, num_embeds)`
    """

+    num_embeds: Union[torch.Tensor, list[torch.Tensor]]
+    """Shape: `(batch_size, num_images)`"""
+

 class InternVLImageEmbeddingInputs(TypedDict):
    type: Literal["image_embeds"]
@@ -286,19 +297,11 @@ class BaseInternVLProcessor(ABC):
        raise NotImplementedError

    @abstractmethod
-    def get_image_repl_features(
+    def get_image_repl(
        self,
        feature_size: int,
        num_patches: Optional[int],
-    ) -> str:
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_image_repl_full(
-        self,
-        feature_size: int,
-        num_patches: Optional[int],
-    ) -> str:
+    ) -> PromptUpdateDetails[str]:
        raise NotImplementedError

    def resolve_min_max_num(
@@ -394,7 +397,7 @@ class BaseInternVLProcessor(ABC):
        max_dynamic_patch: Optional[int] = None,
        dynamic_image_size: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
-    ) -> BatchFeature:
+    ) -> Mapping[str, NestedTensors]:
        if text is None:
            text = []
        if not isinstance(text, list):
@@ -413,28 +416,41 @@ class BaseInternVLProcessor(ABC):
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": list(map(len, pixel_values_lst)),
+            image_inputs: dict[str, NestedTensors] = {
+                "pixel_values_flat":
+                torch.cat(pixel_values_lst),
+                "image_num_patches":
+                torch.tensor([len(item) for item in pixel_values_lst]),
            }

+            tokenizer = self.tokenizer
+            image_token_id = self.image_token_id
+
+            num_embeds = list[int]()
+            embed_is_patch = list[torch.Tensor]()
+
            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token

-                image_repl = self.get_image_repl_full(feature_size,
-                                                      num_patches)
-                text = [t.replace('<image>', image_repl, 1) for t in text]
+                image_repl = self.get_image_repl(feature_size, num_patches)
+                feature_tokens = tokenizer.encode(image_repl.features,
+                                                  add_special_tokens=False)
+
+                text = [t.replace('<image>', image_repl.full, 1) for t in text]
+                num_embeds.append(len(feature_tokens))
+                embed_is_patch.append(
+                    torch.tensor(feature_tokens) == image_token_id)
+
+            image_inputs["num_embeds"] = torch.tensor(num_embeds)
+            image_inputs["embed_is_patch"] = embed_is_patch

        text_inputs = self.tokenizer(text)

-        return BatchFeature(
-            {
-                **text_inputs,
-                **image_inputs,
-            },
-            tensor_type=return_tensors,
-        )
+        return {
+            **BatchEncoding(text_inputs, tensor_type=return_tensors),
+            **image_inputs,
+        }


 class InternVLProcessor(BaseInternVLProcessor):
@@ -443,20 +459,15 @@ class InternVLProcessor(BaseInternVLProcessor):
    def image_token_id(self) -> int:
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

-    def get_image_repl_features(
+    def get_image_repl(
        self,
        feature_size: int,
        num_patches: Optional[int],
-    ) -> str:
-        return IMG_CONTEXT * feature_size
+    ) -> PromptUpdateDetails[str]:
+        repl_features = IMG_CONTEXT * feature_size
+        repl_full = IMG_START + repl_features + IMG_END

-    def get_image_repl_full(
-        self,
-        feature_size: int,
-        num_patches: Optional[int],
-    ) -> str:
-        features = self.get_image_repl_features(feature_size, num_patches)
-        return IMG_START + features + IMG_END
+        return PromptUpdateDetails(full=repl_full, features=repl_features)


 class BaseInternVLProcessingInfo(BaseProcessingInfo):
@@ -566,16 +577,15 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
+    ) -> Mapping[str, NestedTensors]:
        processed_outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
        )

-        image_token_id = self.info.get_hf_processor(**mm_kwargs).image_token_id
-        image_data = mm_data.get("images", [])
-        assert isinstance(image_data, list)
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+        image_token_id = hf_processor.image_token_id

        # Since there may be extra tokens in the feature placeholders,
        # we need to pass the image token ID to the model to select the
@@ -586,7 +596,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):

    def _get_mm_fields_config(
        self,
-        hf_inputs: BatchFeature,
+        hf_inputs: Mapping[str, NestedTensors],
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
@@ -596,6 +606,8 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
            pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
                "image", image_num_patches),
            image_num_patches=MultiModalFieldConfig.batched("image"),
+            embed_is_patch=MultiModalFieldConfig.batched("image"),
+            num_embeds=MultiModalFieldConfig.batched("image"),
            image_embeds=MultiModalFieldConfig.batched("image"),
            image_token_id=MultiModalFieldConfig.shared("image", num_images),
        )
@@ -637,12 +649,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
            if num_patches is not None:
                assert isinstance(num_patches, int)

-            return PromptUpdateDetails(
-                full=hf_processor.get_image_repl_full(feature_size,
-                                                      num_patches),
-                features=hf_processor.get_image_repl_features(
-                    feature_size, num_patches),
-            )
+            return hf_processor.get_image_repl(feature_size, num_patches)

        return [
            PromptReplacement(
@@ -832,6 +839,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
            self, **kwargs: object) -> Optional[InternVLImageInputs]:
        pixel_values_flat = kwargs.pop("pixel_values_flat", None)
        image_num_patches = kwargs.pop("image_num_patches", None)
+        embed_is_patch = kwargs.pop("embed_is_patch", None)
+        num_embeds = kwargs.pop("num_embeds", None)
        image_embeds = kwargs.pop("image_embeds", None)

        if pixel_values_flat is None and image_embeds is None:
@@ -858,35 +867,47 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):

            if not isinstance(image_num_patches, (torch.Tensor, list)):
                raise ValueError("Incorrect type of image_num_patches. "
-                                 f"Got type: {type(pixel_values_flat)}")
+                                 f"Got type: {type(image_num_patches)}")
+
+            if not isinstance(embed_is_patch, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of embed_is_patch. "
+                                 f"Got type: {type(embed_is_patch)}")
+
+            if not isinstance(num_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of num_embeds. "
+                                 f"Got type: {type(num_embeds)}")
+
+            pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
+            image_num_patches = flatten_bn(image_num_patches, concat=True)

            return InternVLImagePixelInputs(
                type="pixel_values",
-                data=self._validate_pixel_values(
-                    flatten_bn(pixel_values_flat, concat=True)),
-                patches_per_image=flatten_bn(image_num_patches,
-                                             concat=True).tolist())
+                pixel_values_flat=self._validate_pixel_values(
+                    pixel_values_flat),
+                num_patches=image_num_patches,
+                embed_is_patch=embed_is_patch,
+                num_embeds=num_embeds,
+            )

        raise AssertionError("This line should be unreachable.")

    def _process_image_input(
        self,
        image_input: InternVLImageInputs,
-    ) -> tuple[torch.Tensor, ...]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
        if image_input["type"] == "image_embeds":
            return image_input["data"]

        assert self.vision_model is not None

-        image_embeds = self.extract_feature(image_input["data"])
+        image_embeds = self.extract_feature(image_input["pixel_values_flat"])

-        patches_per_image = image_input["patches_per_image"]
+        num_patches = image_input["num_patches"]

        # Only one image in the current batch
-        if len(patches_per_image) == 1:
-            image_embeds = image_embeds.view(
+        if len(num_patches) == 1:
+            return image_embeds.view(
                -1, self.config.text_config.hidden_size).unsqueeze(0)
-            return image_embeds

        # NOTE: Image embeddings are split into separate tensors for each image
        # by the size of each embedding.
@@ -894,10 +915,9 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
        image_embeds = image_embeds.view(-1,
                                         self.config.text_config.hidden_size)
        image_feature_sizes = [
-            num_patches * feature_size for num_patches in patches_per_image
+            num_patches * feature_size for num_patches in num_patches
        ]
-        image_embeds = image_embeds.split(image_feature_sizes)
-        return image_embeds
+        return image_embeds.split(image_feature_sizes)

    def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
        if self.is_mono:
@@ -911,8 +931,19 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None
-        vision_embeddings = self._process_image_input(image_input)
-        return vision_embeddings
+
+        image_features = self._process_image_input(image_input)
+
+        if (kwargs.get("v0_path", False)
+                or image_input["type"] != "pixel_values"):
+            return image_features
+
+        return flatten_2d_lists(
+            scatter_patch_features(*args) for args in zip(
+                image_features,
+                image_input["num_embeds"],
+                image_input["embed_is_patch"],
+            ))

    def get_input_embeddings(
        self,
@@ -924,8 +955,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
            assert self.img_context_token_id is not None
            self._set_visual_token_mask(input_ids)
            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.img_context_token_id)
+                input_ids,
+                inputs_embeds,
+                select_patch_features(multimodal_embeddings),
+                self.img_context_token_id,
+            )
        return inputs_embeds

    def forward(
@@ -944,6 +978,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
+            kwargs.update({"v0_path": True})
            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      vision_embeddings)

--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -233,7 +233,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
 class LlavaProcessingInfo(BaseLlavaProcessingInfo):

    def get_hf_processor(self, **kwargs: object):
-        return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
+        hf_processor = self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
+        # In case patch_size is omitted from `processor_config.json`
+        # e.g. for E5-V: https://huggingface.co/royokong/e5-v
+        if hf_processor.patch_size is None:
+            patch_size = self.get_vision_encoder_info().get_patch_size()
+            hf_processor.patch_size = patch_size
+        return hf_processor


 class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):

--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -25,7 +25,6 @@ from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of

 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -44,7 +43,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict):
    type: Literal["pixel_values_videos"]
    pixel_values_videos: Union[torch.Tensor, list[torch.Tensor]]
    """
-    Shape: `(batch_size, num_videos, num_frames, num_channels, height, width)`
+    Shape: `(batch_size * num_videos, num_frames, num_channels, height, width)`

    Note that `num_videos` may be different for each batch, and 'num_frames'
    may be different for each video, in which case the data is passed as a
@@ -580,7 +579,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,

        return LlavaOnevisionVideoPixelInputs(
            type="pixel_values_videos",
-            pixel_values_videos=pixel_values_videos,
+            pixel_values_videos=flatten_bn(pixel_values_videos),
        )

    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -768,22 +767,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
            for i, patch_features_batch in enumerate(patch_embeddings)
        ]

-    def _add_image_newline(
-        self,
-        video_features: torch.Tensor,
-        videos: int = 1,
-        frames: int = 1,
-        strategy: str = "one_token",
-    ) -> torch.Tensor:
-        if strategy == "one_token":
-            video_features = video_features.reshape(
-                videos, frames * video_features.shape[1], -1)
-            image_newline = self.image_newline[None, None, :].repeat(
-                videos, 1, 1).to(video_features.device)
-            video_features = torch.cat((video_features, image_newline), dim=1)
-            return video_features
-        raise ValueError(f"Unexpected video newline strategy: {strategy}")
-
    def _video_pixels_to_features(
        self,
        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
@@ -807,33 +790,43 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
        video_pixels = inputs["pixel_values_videos"]

        if isinstance(video_pixels, torch.Tensor):
-            b, num_videos, frames, c, h, w = video_pixels.shape
-            pixel_values = video_pixels.view(b * num_videos * frames, c, h, w)
-            stacked_embeddings = self._video_pixels_to_features(
-                self.vision_tower, pixel_values)
-            stacked_embeddings = self._add_image_newline(stacked_embeddings,
-                                                         videos=b * num_videos,
-                                                         frames=frames,
-                                                         strategy="one_token")
-            return stacked_embeddings
-        elif is_list_of(video_pixels, torch.Tensor):
-            stacked_embeddings = []
-            for video_pixel in video_pixels:
-                num_videos, frames, c, h, w = video_pixel.shape
-                pixel_values = video_pixel.view(num_videos * frames, c, h, w)
-                embeddings = self._video_pixels_to_features(
-                    self.vision_tower, pixel_values)
-                embeddings = self._add_image_newline(embeddings,
-                                                     videos=num_videos,
-                                                     frames=frames,
-                                                     strategy="one_token")
-                stacked_embeddings.append(embeddings)
-            return stacked_embeddings
-        else:
-            raise ValueError(
-                f"Unsupported type of video input {type(video_pixels)}")
+            total_videos, frames, c, h, w = video_pixels.shape
+            video_pixels_flat = video_pixels.view(total_videos * frames, c, h,
+                                                  w)
+
+            embeddings_flat = self._video_pixels_to_features(
+                self.vision_tower, video_pixels_flat)
+
+            embeddings_flat = embeddings_flat.reshape(
+                total_videos, frames * embeddings_flat.shape[1], -1)
+
+            image_newline = self.image_newline[None, None, :].expand(
+                total_videos, -1, -1)
+            return torch.cat((embeddings_flat, image_newline), dim=1)
+
+        frames_per_video = [len(video) for video in video_pixels]
+        video_pixels_flat = torch.cat(video_pixels)
+
+        embeddings_flat = self._video_pixels_to_features(
+            self.vision_tower, video_pixels_flat)
+
+        image_newline = self.image_newline[None, None, :]
+
+        return [
+            torch.cat(
+                (
+                    embeds.reshape(1, num_frame * embeddings_flat.shape[1],
+                                   -1),
+                    image_newline,
+                ),
+                dim=1,
+            ) for num_frame, embeds in zip(
+                frames_per_video,
+                torch.split(embeddings_flat, frames_per_video),
+            )
+        ]

-    def apply_pooling(self, image_features, stride=2):
+    def apply_pooling(self, image_features: torch.Tensor, stride: int = 2):
        vision_config = self.config.vision_config
        height = width = vision_config.image_size // vision_config.patch_size
        batch_frames, _, dim = image_features.shape

--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1368,7 +1368,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
            full_text_row_masked_out_mask = (
                attn_metadata.encoder_seq_lens_tensor
                != 0).reshape(-1, 1).to(input_ids.device)
-            skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
+            skip_cross_attention = attn_metadata.max_encoder_seq_len == 0

        # For image-present prefill.
        else:

--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -36,11 +36,11 @@ class NVLMProcessor(BaseInternVLProcessor):
    def image_token_id(self) -> int:
        return self.tokenizer.get_vocab()[IMG_PAD]

-    def get_image_repl_features(
+    def get_image_repl(
        self,
        feature_size: int,
        num_patches: Optional[int],
-    ) -> str:
+    ) -> PromptUpdateDetails[str]:
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

@@ -55,14 +55,9 @@ class NVLMProcessor(BaseInternVLProcessor):
        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
-        return "<Image>" + features + "</Image>"
+        repl = "<Image>" + features + "</Image>"

-    def get_image_repl_full(
-        self,
-        feature_size: int,
-        num_patches: Optional[int],
-    ) -> str:
-        return self.get_image_repl_features(feature_size, num_patches)
+        return PromptUpdateDetails(full=repl, features=repl)


 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
@@ -180,11 +175,11 @@ class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
            if num_patches is not None:
                assert isinstance(num_patches, int)

+            repl = hf_processor.get_image_repl(feature_size, num_patches)
+
            return PromptUpdateDetails(
-                full=hf_processor.get_image_repl_full(feature_size,
-                                                      num_patches) + "\n",
-                features=hf_processor.get_image_repl_features(
-                    feature_size, num_patches) + "\n",
+                full=repl.full + "\n",
+                features=repl.features + "\n",
            )

        # See note in dummy data regarding why we have the extra newline

--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -608,6 +608,17 @@ class Qwen2_5_VisionTransformer(nn.Module):
        window_index = torch.cat(window_index, dim=0)
        return window_index, cu_window_seqlens

+    def compute_attn_mask_seqlen(
+        self,
+        cu_seqlens: torch.Tensor,
+    ) -> tuple[Optional[int], Optional[list[int]]]:
+        max_seqlen, seqlens = None, None
+        if self.attn_backend == _Backend.FLASH_ATTN:
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        elif self.attn_backend == _Backend.XFORMERS:
+            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        return max_seqlen, seqlens
+
    def forward(
        self,
        x: torch.Tensor,
@@ -645,23 +656,27 @@ class Qwen2_5_VisionTransformer(nn.Module):
        # transformers
        hidden_states = hidden_states.unsqueeze(1)

-        max_seqlen = None
-        seqlens = None
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
-            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        # pre-compute seqlens for window/full attn to reduce cuMemcpy operations
+        max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(
+            cu_seqlens)
+        max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
+            cu_window_seqlens)
        for layer_num, blk in enumerate(self.blocks):
            if layer_num in self.fullatt_block_indexes:
                cu_seqlens_now = cu_seqlens
+                max_seqlen_now = max_seqlen_full
+                seqlens_now = seqlens_full
            else:
                cu_seqlens_now = cu_window_seqlens
+                max_seqlen_now = max_seqlen_window
+                seqlens_now = seqlens_window
+
            hidden_states = blk(
                hidden_states,
                cu_seqlens=cu_seqlens_now,
                rotary_pos_emb=rotary_pos_emb,
-                max_seqlen=max_seqlen,
-                seqlens=seqlens,
+                max_seqlen=max_seqlen_now,
+                seqlens=seqlens_now,
            )

        # For Qwen2.5-VL-3B, float16 will overflow at last block

--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -617,6 +617,16 @@ class Qwen2VisionTransformer(nn.Module):
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

+    def compute_attn_mask_seqlen(
+            self, cu_seqlens: torch.Tensor
+    ) -> tuple[Optional[int], Optional[list[int]]]:
+        max_seqlen, seqlens = None, None
+        if self.attn_backend == _Backend.FLASH_ATTN:
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        elif self.attn_backend == _Backend.XFORMERS:
+            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        return max_seqlen, seqlens
+
    def forward(
        self,
        x: torch.Tensor,
@@ -638,12 +648,8 @@ class Qwen2VisionTransformer(nn.Module):
        # transformers
        x = x.unsqueeze(1)

-        max_seqlen = None
-        seqlens = None
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
-            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        # pre-compute seqlens for attn mask to reduce cuMemcpy operations
+        max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
        for blk in self.blocks:
            x = blk(
                x,

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -104,6 +104,7 @@ _TEXT_GENERATION_MODELS = {
    "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
    "SolarForCausalLM": ("solar", "SolarForCausalLM"),
    "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
+    "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
    "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
    "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
    # [Encoder-decoder]
@@ -418,11 +419,13 @@ class _ModelRegistry:
        if not architectures:
            logger.warning("No model architectures are specified")

-        normalized_arch = []
-        for model in architectures:
-            if model not in self.models:
-                model = "TransformersModel"
-            normalized_arch.append(model)
+        # keep only the supported architectures
+        normalized_arch = list(
+            filter(lambda model: model in self.models, architectures))
+
+        # make sure the Transformers fallback is placed last
+        if len(normalized_arch) != len(architectures):
+            normalized_arch.append("TransformersModel")
        return normalized_arch

    def inspect_model_cls(

--- a/vllm/model_executor/models/teleflm.py
+++ b/vllm/model_executor/models/teleflm.py
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Type
+
+import torch
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.models.llama import (LlamaDecoderLayer,
+                                              LlamaForCausalLM, LlamaModel)
+
+
+class TeleFLMModel(LlamaModel):
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer,
+    ):
+        super().__init__(vllm_config=vllm_config,
+                         prefix=prefix,
+                         layer_type=layer_type)
+        """
+        This implementation is based on the µScaling paper presented at  
+        the ICLR 2025 Workshop:  
+        NanoLM: An Affordable LLM Study Benchmark \
+        via Accurate Loss Prediction across Scales
+        by Yiqun Yao et al.  
+        Available at: https://openreview.net/forum?id=IwaPYg1SCA  
+        arXiv preprint: https://arxiv.org/abs/2304.06875
+        """
+        self.use_mup = self.config.use_mup
+        if self.use_mup:
+            self.input_mult = self.config.input_mult
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        embedding = self.embed_tokens(input_ids)
+        if self.use_mup:
+            embedding = embedding * self.input_mult
+        return embedding
+
+
+class TeleFLMForCausalLM(LlamaForCausalLM):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        # mup
+        self.use_mup = self.config.use_mup
+        if self.use_mup:
+            self.mup_scale_factor = self.config.mup_scale_factor
+            self.output_mult = self.config.output_mult / self.mup_scale_factor
+            logit_scale = self.output_mult
+            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                    self.config.vocab_size,
+                                                    logit_scale)
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -103,13 +103,13 @@ The token sequence or text to update.


 @dataclass
-class PromptUpdateDetails:
+class PromptUpdateDetails(Generic[_S]):
    """Details about the token sequence or text that are part of the update."""

-    full: PromptSeq
+    full: _S
    """The full content."""

-    features: PromptSeq
+    features: _S
    """
    The part of the content that corresponds to feature placeholders;
    this will be replaced by the output of the vision encoder during model
@@ -117,7 +117,7 @@ class PromptUpdateDetails:
    """

    @staticmethod
-    def from_seq(seq: PromptSeq) -> "PromptUpdateDetails":
+    def from_seq(seq: _S) -> "PromptUpdateDetails[_S]":
        return PromptUpdateDetails(full=seq, features=seq)



--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -223,7 +223,12 @@ class RequestOutput:
            if delta:
                # Slice logprobs delta if applicable
                if output_logprobs:
-                    output_logprobs = output_logprobs[-num_output_tokens:]
+                    # num_output_tokens can be 0 when n > 1 and a request
+                    # finishes before the others; [-0:] would keep every logprob
+                    if num_output_tokens > 0:
+                        output_logprobs = output_logprobs[-num_output_tokens:]
+                    else:
+                        output_logprobs = None
                # Don't include prompt if this is after the first output
                # containing decode token ids
                if include_prompt and seq.get_output_len() > num_output_tokens:

--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -2,7 +2,6 @@

 import logging
 import traceback
-from contextlib import suppress
 from itertools import chain
 from typing import TYPE_CHECKING, Optional

@@ -191,21 +190,6 @@ def neuron_platform_plugin() -> Optional[str]:
    return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None


-def openvino_platform_plugin() -> Optional[str]:
-    is_openvino = False
-    logger.debug("Checking if OpenVINO platform is available.")
-    with suppress(Exception):
-        is_openvino = vllm_version_matches_substr("openvino")
-        if is_openvino:
-            logger.debug("Confirmed OpenVINO platform is available"
-                         " because vLLM is built with OpenVINO.")
-    if not is_openvino:
-        logger.debug("OpenVINO platform is not available because"
-                     " vLLM is not built with OpenVINO.")
-
-    return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None
-
-
 builtin_platform_plugins = {
    'tpu': tpu_platform_plugin,
    'cuda': cuda_platform_plugin,
@@ -214,7 +198,6 @@ builtin_platform_plugins = {
    'xpu': xpu_platform_plugin,
    'cpu': cpu_platform_plugin,
    'neuron': neuron_platform_plugin,
-    'openvino': openvino_platform_plugin,
 }



--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -14,6 +14,7 @@ from typing_extensions import ParamSpec
 # import custom ops, trigger op registration
 import vllm._C  # noqa
 import vllm.envs as envs
+from vllm.fa_utils import get_flash_attn_version
 from vllm.logger import init_logger
 from vllm.utils import import_pynvml

@@ -212,9 +213,14 @@ class CudaPlatformBase(Platform):
                        return ("vllm.attention.backends."
                                "flashmla.FlashMLABackend")
        if use_v1:
-            logger.info_once("Using Flash Attention backend on V1 engine.")
-            return ("vllm.v1.attention.backends.flash_attn."
-                    "FlashAttentionBackend")
+            if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
+                logger.info_once("Using Triton backend on V1 engine.")
+                return ("vllm.v1.attention.backends."
+                        "triton_attn.TritonAttentionBackend")
+            if cls.has_device_capability(80):
+                logger.info_once("Using Flash Attention backend on V1 engine.")
+                return ("vllm.v1.attention.backends."
+                        "flash_attn.FlashAttentionBackend")
        if selected_backend == _Backend.FLASHINFER:
            logger.info("Using FlashInfer backend.")
            return "vllm.attention.backends.flashinfer.FlashInferBackend"
@@ -240,15 +246,6 @@ class CudaPlatformBase(Platform):
                "Cannot use FlashAttention-2 backend for dtype other than "
                "torch.float16 or torch.bfloat16.")
            target_backend = _Backend.XFORMERS
-        elif kv_cache_dtype is not None and \
-            kv_cache_dtype.startswith("fp8"):
-            logger.info(
-                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
-            logger.warning(
-                "Please use FlashInfer backend with FP8 KV Cache for "
-                "better performance by setting environment variable "
-                "VLLM_ATTENTION_BACKEND=FLASHINFER")
-            target_backend = _Backend.XFORMERS
        elif block_size % 16 != 0:
            logger.info(
                "Cannot use FlashAttention-2 backend for block size not "
@@ -270,6 +267,17 @@ class CudaPlatformBase(Platform):
                        "Cannot use FlashAttention-2 backend for head size %d.",
                        head_size)
                    target_backend = _Backend.XFORMERS
+                fp8_kv_cache = (kv_cache_dtype is not None
+                                and kv_cache_dtype.startswith("fp8"))
+                if (fp8_kv_cache and get_flash_attn_version() != 3):
+                    logger.info(
+                        "Cannot use FlashAttention-2 backend for FP8 KV cache."
+                    )
+                    logger.warning(
+                        "Please use FlashInfer backend with FP8 KV Cache for "
+                        "better performance by setting environment variable "
+                        "VLLM_ATTENTION_BACKEND=FLASHINFER")
+                    target_backend = _Backend.XFORMERS
            except ImportError:
                logger.info(
                    "Cannot use FlashAttention-2 backend because the "

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -29,10 +29,10 @@ def in_wsl() -> bool:
 class _Backend(enum.Enum):
    FLASH_ATTN = enum.auto()
    FLASH_ATTN_VLLM_V1 = enum.auto()
+    TRITON_ATTN_VLLM_V1 = enum.auto()
    XFORMERS = enum.auto()
    ROCM_FLASH = enum.auto()
    TORCH_SDPA = enum.auto()
-    OPENVINO = enum.auto()
    FLASHINFER = enum.auto()
    TRITON_MLA = enum.auto()  # Supported by V1
    FLASHMLA = enum.auto()  # Supported by V1
@@ -52,7 +52,6 @@ class PlatformEnum(enum.Enum):
    XPU = enum.auto()
    CPU = enum.auto()
    NEURON = enum.auto()
-    OPENVINO = enum.auto()
    OOT = enum.auto()
    UNSPECIFIED = enum.auto()

@@ -112,6 +111,8 @@ class Platform:

    supported_quantization: list[str] = []

+    additional_env_vars: list[str] = []
+
    def is_cuda(self) -> bool:
        return self._enum == PlatformEnum.CUDA

@@ -133,9 +134,6 @@ class Platform:
    def is_neuron(self) -> bool:
        return self._enum == PlatformEnum.NEURON

-    def is_openvino(self) -> bool:
-        return self._enum == PlatformEnum.OPENVINO
-
    def is_out_of_tree(self) -> bool:
        return self._enum == PlatformEnum.OOT


--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import TYPE_CHECKING, Optional
-
-import torch
-
-import vllm.envs as envs
-from vllm.logger import init_logger
-
-from .interface import Platform, PlatformEnum, _Backend
-
-if TYPE_CHECKING:
-    from vllm.config import VllmConfig
-else:
-    VllmConfig = None
-
-logger = init_logger(__name__)
-
-try:
-    import openvino as ov
-    import openvino.properties.hint as hints
-except ImportError as e:
-    logger.warning("Failed to import OpenVINO with %r", e)
-
-
-class OpenVinoPlatform(Platform):
-    _enum = PlatformEnum.OPENVINO
-    device_name: str = "openvino"
-    device_type: str = "openvino"
-    dispatch_key: str = "CPU"
-
-    @classmethod
-    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
-                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool,
-                             use_mla: bool) -> str:
-        if selected_backend != _Backend.OPENVINO:
-            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
-        logger.info("Using OpenVINO Attention backend.")
-        return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"
-
-    @classmethod
-    def get_device_name(cls, device_id: int = 0) -> str:
-        return "openvino"
-
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return False
-
-    @classmethod
-    def inference_mode(cls):
-        return torch.inference_mode(mode=True)
-
-    @classmethod
-    def is_openvino_cpu(cls) -> bool:
-        return "CPU" in envs.VLLM_OPENVINO_DEVICE
-
-    @classmethod
-    def is_openvino_gpu(cls) -> bool:
-        return "GPU" in envs.VLLM_OPENVINO_DEVICE
-
-    @classmethod
-    def is_pin_memory_available(cls) -> bool:
-        logger.warning("Pin memory is not supported on OpenViNO.")
-        return False
-
-    @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        from vllm.utils import GiB_bytes
-
-        parallel_config = vllm_config.parallel_config
-        assert (parallel_config.world_size == 1
-                ), "OpenVINO only supports single CPU socket currently."
-
-        if parallel_config.worker_cls == "auto":
-            parallel_config.worker_cls = \
-                "vllm.worker.openvino_worker.OpenVINOWorker"
-
-        # check and update model config
-        model_config = vllm_config.model_config
-        if model_config.dtype != torch.float32:
-            logger.warning(
-                f"Only float32 dtype is supported on OpenVINO, casting from {model_config.dtype}."  # noqa: G004, E501
-            )
-            model_config.dtype = torch.float32
-        if not model_config.enforce_eager:
-            logger.warning(
-                "CUDA graph is not supported on OpenVINO backend, fallback to "
-                "the eager mode.")
-            model_config.enforce_eager = True
-
-        # check and update cache config
-        ov_core = ov.Core()
-        cache_config = vllm_config.cache_config
-        if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 16
-
-        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
-            if not OpenVinoPlatform.is_openvino_cpu():
-                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
-                            "ignored for GPU, f16 data type will be used.")
-                cache_config.cache_dtype = ov.Type.f16
-            else:
-                logger.info("KV cache type is overridden to u8 via "
-                            "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
-                cache_config.cache_dtype = ov.Type.u8
-        else:
-            if OpenVinoPlatform.is_openvino_cpu():
-                ov_device = envs.VLLM_OPENVINO_DEVICE
-                inference_precision = ov_core.get_property(
-                    ov_device, hints.inference_precision)
-                if inference_precision == ov.Type.bf16:
-                    cache_config.cache_dtype = ov.Type.bf16
-                else:
-                    cache_config.cache_dtype = ov.Type.f16
-            else:
-                cache_config.cache_dtype = ov.Type.f16
-
-        if OpenVinoPlatform.is_openvino_cpu():
-            if cache_config.block_size != 32:
-                logger.info(
-                    f"OpenVINO CPU optimal block size is 32, overriding currently set {cache_config.block_size}"  # noqa: G004, E501
-                )
-                cache_config.block_size = 32
-        else:
-            if cache_config.block_size != 16:
-                logger.info(
-                    f"OpenVINO GPU optimal block size is 16, overriding currently set {cache_config.block_size}"  # noqa: G004, E501
-                )
-                cache_config.block_size = 16
-
-        kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
-        if kv_cache_space >= 0:
-            if kv_cache_space == 0 and OpenVinoPlatform.is_openvino_cpu():
-                cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
-                logger.warning(
-                    "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
-                    "for OpenVINO backend is not set, using 4 by default.")
-            else:
-                cache_config.openvino_kvcache_space_bytes = (  # type: ignore
-                    kv_cache_space * GiB_bytes)
-        else:
-            raise RuntimeError(
-                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
-                f" {kv_cache_space}, expect a positive integer value.")
-
-        assert vllm_config.device_config.device_type == "openvino"
-        assert vllm_config.lora_config is None, \
-            "OpenVINO backend doesn't support LoRA"
-        assert cls.is_openvino_cpu() or \
-            cls.is_openvino_gpu(), \
-            "OpenVINO backend supports only CPU and GPU devices"
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -120,8 +120,9 @@ class RocmPlatform(Platform):
        selected_backend = (_Backend.ROCM_FLASH if selected_backend
                            == _Backend.FLASH_ATTN else selected_backend)
        if envs.VLLM_USE_V1:
-            logger.info("Using ROCm Attention backend on V1 engine.")
-            return "vllm.v1.attention.backends.rocm_attn.ROCmAttentionBackend"
+            logger.info("Using Triton Attention backend on V1 engine.")
+            return ("vllm.v1.attention.backends."
+                    "triton_attn.TritonAttentionBackend")
        if selected_backend == _Backend.ROCM_FLASH:
            if not cls.has_device_capability(90):
                # not Instinct series GPUs.

--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -29,6 +29,10 @@ class TpuPlatform(Platform):
        "tpu_int8", "compressed-tensors", "compressed_tensors"
    ]

+    additional_env_vars: list[str] = [
+        "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
+    ]
+
    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],

--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -92,22 +92,20 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
    # Override draft-model specific worker args.
    draft_worker_kwargs.update(
        vllm_config=draft_worker_config,
-        ngram_prompt_lookup_max=speculative_config.ngram_prompt_lookup_max,
-        ngram_prompt_lookup_min=speculative_config.ngram_prompt_lookup_min,
+        ngram_prompt_lookup_max=speculative_config.prompt_lookup_max,
+        ngram_prompt_lookup_min=speculative_config.prompt_lookup_min,
    )

    spec_decode_worker = SpecDecodeWorker.create_worker(
        scorer_worker=target_worker,
        draft_worker_kwargs=draft_worker_kwargs,
-        disable_mqa_scorer=speculative_config.speculative_disable_mqa_scorer,
-        disable_by_batch_size=speculative_config.
-        speculative_disable_by_batch_size,
-        draft_token_acceptance_method=speculative_config.
-        draft_token_acceptance_method,
+        disable_mqa_scorer=speculative_config.disable_mqa_scorer,
+        disable_by_batch_size=speculative_config.disable_by_batch_size,
+        draft_token_acceptance_method=speculative_config.acceptance_method,
        typical_acceptance_sampler_posterior_threshold=speculative_config.
-        typical_acceptance_sampler_posterior_threshold,
+        posterior_threshold,
        typical_acceptance_sampler_posterior_alpha=speculative_config.
-        typical_acceptance_sampler_posterior_alpha,
+        posterior_alpha,
        disable_logprobs=speculative_config.disable_logprobs,
        disable_log_stats=speculative_config.disable_log_stats,
        num_speculative_tokens=speculative_config.num_speculative_tokens,