Merge tag 'v0.8.5' into v0.8.5-dev

dcb5624a · zhuwenwen · 55880ca2 · ba41cc90 · dcb5624a · dcb5624a
Commit dcb5624a authored Apr 29, 2025 by zhuwenwen
20 changed files
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
 # SPDX-License-Identifier: Apache-2.0
 from collections.abc import Iterable, Mapping, Sequence
-from functools import cached_property
 from typing import Literal, Optional, Set, Tuple, TypedDict, Union
 import torch
@@ -12,7 +11,6 @@ from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig,
 from vllm.config import CacheConfig, VllmConfig
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -62,6 +60,7 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
        is_cross_attention: bool = False,
+        prefix: str = "",
    ) -> None:
        super().__init__()
@@ -141,7 +140,7 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
 class Blip2QFormerSelfOutput(nn.Module):
-    def __init__(self, config: Blip2QFormerConfig) -> None:
+    def __init__(self, config: Blip2QFormerConfig, prefix: str = "") -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
@@ -169,6 +168,7 @@ class Blip2QFormerAttention(nn.Module):
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
        is_cross_attention: bool = False,
+        prefix: str = "",
    ) -> None:
        super().__init__()
@@ -177,9 +177,10 @@ class Blip2QFormerAttention(nn.Module):
            quant_config=quant_config,
            cache_config=cache_config,
            is_cross_attention=is_cross_attention,
+            prefix=f"{prefix}.attention",
        )
-        self.output = Blip2QFormerSelfOutput(config)
+        self.output = Blip2QFormerSelfOutput(config, prefix=f"{prefix}.output")
    def forward(
        self,
@@ -197,7 +198,7 @@ class Blip2QFormerAttention(nn.Module):
 class Blip2QFormerIntermediate(nn.Module):
-    def __init__(self, config: Blip2QFormerConfig) -> None:
+    def __init__(self, config: Blip2QFormerConfig, prefix: str = "") -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
@@ -211,7 +212,7 @@ class Blip2QFormerIntermediate(nn.Module):
 class Blip2QFormerOutput(nn.Module):
-    def __init__(self, config: Blip2QFormerConfig) -> None:
+    def __init__(self, config: Blip2QFormerConfig, prefix: str = "") -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
@@ -239,6 +240,7 @@ class Blip2QFormerLayer(nn.Module):
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
        layer_idx: int,
+        prefix: str = "",
    ) -> None:
        super().__init__()
@@ -246,7 +248,8 @@ class Blip2QFormerLayer(nn.Module):
        self.seq_len_dim = 1
        self.attention = Blip2QFormerAttention(config,
                                               quant_config=quant_config,
-                                               cache_config=cache_config)
+                                               cache_config=cache_config,
+                                               prefix=f"{prefix}.attention")
        self.layer_idx = layer_idx
@@ -255,13 +258,16 @@ class Blip2QFormerLayer(nn.Module):
                config,
                quant_config=quant_config,
                cache_config=cache_config,
-                is_cross_attention=True)
+                is_cross_attention=True,
+                prefix=f"{prefix}.crossattention")
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False
-        self.intermediate_query = Blip2QFormerIntermediate(config)
-        self.intermediate_query = Blip2QFormerIntermediate(config)
-        self.output_query = Blip2QFormerOutput(config)
+        self.intermediate_query = Blip2QFormerIntermediate(
+            config, prefix=f"{prefix}.intermediate_query")
+        self.output_query = Blip2QFormerOutput(config,
+                                               prefix=f"{prefix}.output_query")
    def forward(
        self,
@@ -327,6 +333,7 @@ class Blip2QFormerEncoder(nn.Module):
        *,
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
+        prefix: str = "",
    ) -> None:
        super().__init__()
@@ -336,7 +343,8 @@ class Blip2QFormerEncoder(nn.Module):
            Blip2QFormerLayer(config,
                              quant_config=quant_config,
                              cache_config=cache_config,
-                              layer_idx=layer_idx)
+                              layer_idx=layer_idx,
+                              prefix=f"{prefix}.layer.{layer_idx}")
            for layer_idx in range(config.num_hidden_layers)
        ])
@@ -367,6 +375,7 @@ class Blip2QFormerModel(nn.Module):
        *,
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
+        prefix: str = "",
    ) -> None:
        super().__init__()
@@ -378,7 +387,8 @@ class Blip2QFormerModel(nn.Module):
        self.encoder = Blip2QFormerEncoder(config,
                                           quant_config=quant_config,
-                                           cache_config=cache_config)
+                                           cache_config=cache_config,
+                                           prefix=f"{prefix}.encoder")
    def forward(
        self,
@@ -513,7 +523,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
        self.qformer = Blip2QFormerModel(config.qformer_config,
                                         cache_config=cache_config,
-                                         quant_config=quant_config)
+                                         quant_config=quant_config,
+                                         prefix=f"{prefix}.qformer")
        self.language_projection = nn.Linear(
            config.qformer_config.hidden_size,
@@ -530,13 +541,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-        return get_sampler()
    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        h = w = self.config.vision_config.image_size
        expected_dims = (3, h, w)
@@ -649,7 +653,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
-    ) -> Union[SamplerOutput, IntermediateTensors]:
+    ) -> IntermediateTensors:
        """Run forward pass for BLIP-2.
        One key thing to understand is the `input_ids` already accounts for the
@@ -707,13 +711,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(self)

--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -37,7 +37,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -308,8 +307,6 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant):
                                          self.config.hidden_size)
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.transformer.make_empty_intermediate_tensors)
@@ -345,14 +342,6 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        params_dict = dict(self.named_parameters(remove_duplicate=False))

--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -22,7 +22,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -950,7 +949,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
        logit_scale = getattr(config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size, logit_scale)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
@@ -1054,14 +1052,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        stacked_params_mapping = [

--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -23,7 +23,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -494,8 +493,6 @@ class ChatGLMBaseModel(nn.Module):
                self.transformer.embedding.weight)
        self.lm_head = self.transformer.output_layer
        self.logits_processor = LogitsProcessor(config.padded_vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.transformer.make_empty_intermediate_tensors)
@@ -508,14 +505,6 @@ class ChatGLMBaseModel(nn.Module):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -38,7 +38,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -89,6 +88,7 @@ class CohereMLP(nn.Module):
        self,
        config: CohereConfig,
        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
    ):
        super().__init__()
        self.config = config
@@ -99,12 +99,14 @@ class CohereMLP(nn.Module):
            [self.intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
        )
        self.down_proj = RowParallelLinear(
            self.intermediate_size,
            self.hidden_size,
            bias=False,
            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
        )
        self.act_fn = SiluAndMul()
@@ -158,12 +160,14 @@ class CohereAttention(nn.Module):
            self.total_num_kv_heads,
            bias=False,
            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            self.hidden_size,
            bias=False,
            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
        )
        self.rotary_emb = get_rope(
            self.head_dim,
@@ -244,7 +248,9 @@ class CohereDecoderLayer(nn.Module):
                                         quant_config=quant_config,
                                         prefix=f"{prefix}.self_attn")
-        self.mlp = CohereMLP(config, quant_config=quant_config)
+        self.mlp = CohereMLP(config,
+                             quant_config=quant_config,
+                             prefix=f"{prefix}.mlp")
        self.input_layernorm = LayerNorm(param_shape=(config.hidden_size),
                                         eps=config.layer_norm_eps)
@@ -365,7 +371,6 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
                                                scale=config.logit_scale)
        self.model = CohereModel(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
@@ -399,14 +404,6 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        stacked_params_mapping = [

--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -16,7 +16,6 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -390,7 +389,6 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
        )
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.transformer.make_empty_intermediate_tensors)
@@ -417,14 +415,6 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        expert_params_mapping = [(

--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -43,7 +43,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -453,7 +452,6 @@ class DeepseekForCausalLM(nn.Module, SupportsPP):
        if self.config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
@@ -480,14 +478,6 @@ class DeepseekForCausalLM(nn.Module, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(self)

--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -13,7 +13,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -165,10 +164,9 @@ class DeepSeekMTP(nn.Module):
        self.model = DeepSeekMultiTokenPredictor(vllm_config=vllm_config,
                                                 prefix=maybe_prefix(
                                                     prefix, "model"))
-        self.sampler = get_sampler()
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
    def forward(
        self,
        input_ids: torch.Tensor,
@@ -192,14 +190,6 @@ class DeepSeekMTP(nn.Module):
        return self.model.compute_logits(hidden_states, sampling_metadata,
                                         spec_step_idx)
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        stacked_params_mapping = [

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -47,7 +47,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -712,7 +711,6 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
        else:
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
@@ -741,14 +739,6 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def make_empty_intermediate_tensors(
            self, batch_size: int, dtype: torch.dtype,
            device: torch.device) -> IntermediateTensors:

--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -4,7 +4,6 @@
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from functools import cached_property
 from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
 import torch
@@ -16,7 +15,6 @@ from transformers import BatchFeature
 from vllm.config import VllmConfig
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -393,13 +391,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        model = model.to(dtype=torch.get_default_dtype())
        return model
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-        return get_sampler()
    def _validate_pixel_values(
        self, data: Union[torch.Tensor, List[torch.Tensor]]
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
@@ -647,13 +638,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:

--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -9,7 +9,6 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -131,10 +130,6 @@ class EAGLE(nn.Module):
        # checkpoint file has token_map tensor.
        self.token_map = None
-    @property
-    def sampler(self):
-        return self.model.sampler
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.model.get_input_embeddings(input_ids)
@@ -188,14 +183,6 @@ class EAGLE(nn.Module):
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
        # due to missing lm_head weights and its config being that of a

--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -510,8 +509,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        else:
            self.lm_head = PPMissingLayer()
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.transformer.make_empty_intermediate_tensors)
@@ -538,14 +535,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(

--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -527,7 +526,6 @@ class FalconForCausalLM(nn.Module, SupportsPP):
                quant_config=quant_config,
            )
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.transformer.make_empty_intermediate_tensors)
@@ -554,14 +552,6 @@ class FalconForCausalLM(nn.Module, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(

--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -3,7 +3,6 @@
 import math
 from collections import OrderedDict
 from collections.abc import Iterable, Mapping, Sequence
-from functools import cached_property
 from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
 import torch
@@ -14,7 +13,6 @@ from transformers import BartTokenizer, BatchFeature, PretrainedConfig
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
                                             BartParallelLMHead,
@@ -673,7 +671,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
        self.logits_processor = LogitsProcessor(self.vocab_size,
                                                config.vocab_size)
-        self.sampler = get_sampler()
    def forward(
        self,
@@ -716,11 +713,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
                                       sampling_metadata)
        return logits
-    def sample(self, logits: torch.Tensor,
-               sampling_metadata: SamplingMetadata) -> SamplerOutput:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        stacked_params_mapping = [
@@ -929,12 +921,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
            raise NotImplementedError(
                'Florence2 only supports COSINE as temporal embedding.')
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-        return get_sampler()
    def _validate_pixel_values(
        self, data: Union[torch.Tensor, List[torch.Tensor]]
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
@@ -1110,13 +1096,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> SamplerOutput:
-        return self.language_model.sample(logits, sampling_metadata)
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(self)

--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -27,7 +27,6 @@ from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor,
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ColumnParallelLinear
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -270,10 +269,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)
-    @property
-    def sampler(self):
-        return self.language_model.sampler
    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        h = w = self.config.patch_size
@@ -387,14 +382,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
            self.language_model.lm_head, hidden_states, sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.language_model.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(self)

--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -35,7 +35,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -388,7 +387,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        self.model = GemmaModel(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
@@ -415,14 +413,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(

--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -146,8 +145,8 @@ class Gemma2Attention(nn.Module):
        # reference:
        # https://github.com/huggingface/transformers/blob/54be2d7ae87e873482b984cc956e165ca4dc0ba3/src/transformers/models/gemma2/modeling_gemma2.py#L312 # noqa
        layer_idx = extract_layer_index(prefix)
-        use_sliding_window = (layer_idx % 2 == 0 and
+        use_sliding_window = (layer_idx % 2 == 0 and getattr(
-                              config.interleaved_sliding_window is not None)
+            config, "interleaved_sliding_window", None) is not None)
        sliding_window = config.interleaved_sliding_window if \
            use_sliding_window else None
        self.attn = Attention(self.num_heads,
@@ -388,7 +387,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                 prefix=maybe_prefix(prefix, "model"))
        self.logits_processor = LogitsProcessor(
            config.vocab_size, soft_cap=config.final_logit_softcapping)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
@@ -415,14 +413,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(

--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -147,7 +146,9 @@ class Gemma3Attention(nn.Module):
        # TODO(woosuk): Add reference to the original HF implementation.
        layer_idx = extract_layer_index(prefix)
-        self.is_sliding = bool((layer_idx + 1) % config.sliding_window_pattern)
+        self.is_sliding = (getattr(
+            config, "interleaved_sliding_window", None) is not None and bool(
+                (layer_idx + 1) % config.sliding_window_pattern))
        # Initialize the rotary embedding.
        if self.is_sliding:
            # Local attention. Override the values in config.json.
@@ -493,7 +494,6 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                 prefix=maybe_prefix(prefix, "model"))
        self.logits_processor = LogitsProcessor(
            config.vocab_size, soft_cap=config.final_logit_softcapping)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
@@ -521,14 +521,6 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(

--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
 # SPDX-License-Identifier: Apache-2.0
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union
+from typing import Any, Literal, Optional, Set, Tuple, TypedDict
 import torch
 from torch import nn
@@ -12,7 +12,6 @@ import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -479,7 +478,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
        self.config = config
        self.quant_config = quant_config
        self.multimodal_config = multimodal_config
-        self.sliding_window = config.text_config.interleaved_sliding_window
+        self.sliding_window = getattr(config.text_config,
+                                      "interleaved_sliding_window", None)
        self.vision_tower = SiglipVisionModel(config.vision_config,
                                              quant_config,
@@ -503,10 +503,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
    def dtype(self):
        return next(self.parameters()).dtype
-    @property
-    def sampler(self):
-        return self.language_model.sampler
    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        h = w = self.config.vision_config.image_size
        expected_dims = (3, h, w)
@@ -607,7 +603,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
                positions: torch.Tensor,
                intermediate_tensors: Optional[IntermediateTensors] = None,
                inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]:
+                **kwargs: object) -> IntermediateTensors:
        if intermediate_tensors is not None:
            inputs_embeds = None
@@ -685,13 +681,14 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
            global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask)
            global_attn_masks.append(global_attn_mask)
-            # Create a local causal mask with sliding window (1024).
+            if self.sliding_window is not None:
-            local_attn_mask = torch.ones_like(global_attn_mask)
+                # Create a local causal mask with sliding window (1024).
-            local_attn_mask = torch.tril(local_attn_mask,
+                local_attn_mask = torch.ones_like(global_attn_mask)
-                                         diagonal=-self.sliding_window)
+                local_attn_mask = torch.tril(local_attn_mask,
-            local_attn_mask = torch.where(local_attn_mask == 0,
+                                             diagonal=-self.sliding_window)
-                                          global_attn_mask, float("-inf"))
+                local_attn_mask = torch.where(local_attn_mask == 0,
-            local_attn_masks.append(local_attn_mask)
+                                              global_attn_mask, float("-inf"))
+                local_attn_masks.append(local_attn_mask)
        kwargs["global_attn_masks"] = global_attn_masks
        kwargs["local_attn_masks"] = local_attn_masks
        return kwargs
@@ -704,13 +701,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(self)

--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -37,7 +37,6 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
@@ -200,8 +199,8 @@ class Glm4DecoderLayer(nn.Module):
        hidden_states = self.post_self_attn_layernorm(hidden_states)
        # Fully Connected
-        residual = hidden_states
+        hidden_states, residual = self.post_attention_layernorm(
-        hidden_states = self.post_attention_layernorm(hidden_states)
+            hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_mlp_layernorm(hidden_states)
@@ -267,7 +266,6 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
@@ -295,14 +293,6 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                       sampling_metadata)
        return logits
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(