[Misc] Remove redundant config definitions (#21891)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Misc] Remove redundant config definitions (#21891)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2ca5f82c · Cyrus Leung · GitHub · 6f8d2618 · 2ca5f82c · 2ca5f82c
Unverified commit 2ca5f82c, authored Jul 30, 2025 by Cyrus Leung; committed by GitHub Jul 29, 2025
20 changed files
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -8,6 +8,7 @@ from typing import Optional

 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig

 from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -20,13 +21,12 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.transformers_utils.configs.ovis import AIMv2Config


 class AIMv2SwiGLUFFN(nn.Module):

-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(self, config: PretrainedConfig,
+                 quant_config: QuantizationConfig, prefix: str):
        super().__init__()
        hidden_features = config.intermediate_size
        in_features = config.hidden_size
@@ -57,7 +57,7 @@ class AIMv2SwiGLUFFN(nn.Module):

 class AIMv2PatchEmbed(nn.Module):

-    def __init__(self, config: AIMv2Config):
+    def __init__(self, config: PretrainedConfig):
        super().__init__()
        self.proj = nn.Conv2d(
            config.num_channels,
@@ -75,7 +75,7 @@ class AIMv2PatchEmbed(nn.Module):

 class AIMv2ViTPreprocessor(nn.Module):

-    def __init__(self, config: AIMv2Config):
+    def __init__(self, config: PretrainedConfig):
        super().__init__()
        num_patches = (config.image_size // config.patch_size)**2

@@ -93,8 +93,8 @@ class AIMv2ViTPreprocessor(nn.Module):

 class AIMv2Attention(nn.Module):

-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(self, config: PretrainedConfig,
+                 quant_config: QuantizationConfig, prefix: str):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
@@ -141,8 +141,8 @@ class AIMv2Attention(nn.Module):

 class AIMv2Block(nn.Module):

-    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
-                 prefix: str):
+    def __init__(self, config: PretrainedConfig,
+                 quant_config: QuantizationConfig, prefix: str):
        super().__init__()
        self.attn = AIMv2Attention(config,
                                   quant_config=quant_config,
@@ -163,7 +163,7 @@ class AIMv2Transformer(nn.Module):

    def __init__(
        self,
-        config: AIMv2Config,
+        config: PretrainedConfig,
        quant_config: QuantizationConfig,
        *,
        require_post_norm: Optional[bool] = None,
@@ -193,7 +193,7 @@ class AIMv2Transformer(nn.Module):
 class AIMv2Model(torch.nn.Module):

    def __init__(self,
-                 config: AIMv2Config,
+                 config: PretrainedConfig,
                 quant_config: QuantizationConfig,
                 *,
                 require_post_norm: Optional[bool] = None,

--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -6,6 +6,7 @@ from typing import Optional, Union

 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig

 from vllm.attention import Attention
 from vllm.config import CacheConfig, VllmConfig
@@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.dbrx import DbrxConfig

 from .interfaces import SupportsPP
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
@@ -39,7 +39,7 @@ class DbrxRouter(nn.Module):

    def __init__(
        self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
        params_dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
@@ -63,7 +63,7 @@ class DbrxExperts(FusedMoE):

    def __init__(
        self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        params_dtype: Optional[torch.dtype] = None,
        prefix: str = "",
@@ -138,7 +138,7 @@ class DbrxMoE(nn.Module):

    def __init__(
        self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        params_dtype: Optional[torch.dtype] = None,
        prefix: str = "",
@@ -169,7 +169,7 @@ class DbrxAttention(nn.Module):

    def __init__(
        self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
@@ -249,7 +249,7 @@ class DbrxFusedNormAttention(nn.Module):

    def __init__(
        self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
@@ -284,7 +284,7 @@ class DbrxBlock(nn.Module):

    def __init__(
        self,
-        config: DbrxConfig,
+        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",

--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -30,6 +30,7 @@ from typing import Any, Optional, Union

 import torch
 from torch import nn
+from transformers import PretrainedConfig

 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -49,7 +50,6 @@ from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.exaone import ExaoneConfig

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
@@ -99,7 +99,7 @@ class ExaoneAttention(nn.Module):

    def __init__(
        self,
-        config: ExaoneConfig,
+        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
@@ -194,7 +194,7 @@ class ExaoneBlockAttention(nn.Module):

    def __init__(
        self,
-        config: ExaoneConfig,
+        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
@@ -236,7 +236,7 @@ class ExaoneDecoderLayer(nn.Module):

    def __init__(
        self,
-        config: ExaoneConfig,
+        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",

--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -26,6 +26,7 @@ from typing import Any, Optional, Union

 import torch
 from torch import nn
+from transformers import PretrainedConfig

 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -45,7 +46,6 @@ from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.exaone4 import Exaone4Config

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
@@ -96,7 +96,7 @@ class Exaone4Attention(nn.Module):

    def __init__(
        self,
-        config: Exaone4Config,
+        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
@@ -224,7 +224,7 @@ class Exaone4DecoderLayer(nn.Module):

    def __init__(
        self,
-        config: Exaone4Config,
+        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",

--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -980,9 +980,6 @@ class KeyeMultiModalDataParser(MultiModalDataParser):

 class KeyeProcessingInfo(BaseProcessingInfo):

-    def get_hf_config(self):
-        return self.ctx.get_hf_config(PretrainedConfig)
-
    def get_hf_processor(
        self,
        *,

--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -5,7 +5,7 @@ from typing import Literal, Optional, TypedDict, Union, cast

 import torch
 import torch.nn as nn
-from transformers import BatchFeature
+from transformers import BatchFeature, PretrainedConfig

 from vllm.config import VllmConfig
 from vllm.jsontree import json_map_leaves
@@ -17,7 +17,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFieldConfig
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config

 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -90,8 +89,8 @@ class MiniMaxVL01DummyInputsBuilder(LlavaDummyInputsBuilder):

 class MiniMaxVL01ProcessingInfo(LlavaNextProcessingInfo):

-    def get_hf_config(self):
-        return self.ctx.get_hf_config(MiniMaxVL01Config)
+    def get_hf_config(self):  # Need to override the config type
+        return self.ctx.get_hf_config(PretrainedConfig)

    def get_hf_processor(self, **kwargs: object):
        hf_processor = self.ctx.get_hf_processor(**kwargs)

--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -8,6 +8,7 @@ from typing import Optional, Union

 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig

 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -25,7 +26,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.mpt import MPTConfig

 from .interfaces import SupportsPP
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
@@ -50,7 +50,7 @@ class MPTAttention(nn.Module):

    def __init__(
        self,
-        config: MPTConfig,
+        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
@@ -144,7 +144,7 @@ class MPTMLP(nn.Module):

    def __init__(
        self,
-        config: MPTConfig,
+        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__()
@@ -176,7 +176,7 @@ class MPTBlock(nn.Module):

    def __init__(
        self,
-        config: MPTConfig,
+        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",

--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -25,7 +25,7 @@ import torch
 import torch.nn as nn
 from torch import Tensor
 from torch.nn.functional import gumbel_softmax, pad, softmax
-from transformers import BaseImageProcessor, BatchFeature
+from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig

 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -48,8 +48,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig,
-                                                  OvisConfig)
 from vllm.transformers_utils.processors.ovis import OvisProcessor

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -83,7 +81,7 @@ class VisualTokenizer(torch.nn.Module):

    def __init__(
        self,
-        config: BaseVisualTokenizerConfig,
+        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
@@ -107,7 +105,7 @@ class VisualTokenizer(torch.nn.Module):

    def _init_backbone(
        self,
-        config: BaseVisualTokenizerConfig,
+        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> nn.Module:
@@ -247,9 +245,6 @@ class VisualEmbedding(torch.nn.Embedding):

 class OvisProcessingInfo(BaseProcessingInfo):

-    def get_hf_config(self):
-        return self.ctx.get_hf_config(OvisConfig)
-
    def get_hf_processor(self, **kwargs):
        return self.ctx.get_hf_processor(
            OvisProcessor,
@@ -417,7 +412,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config

-        self.config: OvisConfig = config
+        self.config: PretrainedConfig = config
        self.llm = init_vllm_registered_model(
            vllm_config=vllm_config.with_hf_config(config.get_text_config()),
            prefix=maybe_prefix(prefix, "llm"),

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -29,19 +29,13 @@ from vllm import envs
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
-                                             DbrxConfig, DeepseekVLV2Config,
-                                             EAGLEConfig, Exaone4Config,
-                                             ExaoneConfig, JAISConfig,
+from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config,
+                                             EAGLEConfig, JAISConfig,
                                             KimiVLConfig, MedusaConfig,
-                                             MiniMaxText01Config,
-                                             MiniMaxVL01Config, MllamaConfig,
-                                             MLPSpeculatorConfig, MPTConfig,
+                                             MllamaConfig, MLPSpeculatorConfig,
                                             Nemotron_Nano_VL_Config,
-                                             NemotronConfig, NVLM_D_Config,
-                                             OvisConfig, RWConfig,
-                                             SkyworkR1VChatConfig, SolarConfig,
-                                             Telechat2Config, UltravoxConfig)
+                                             NemotronConfig, RWConfig,
+                                             UltravoxConfig)
 # yapf: enable
 from vllm.transformers_utils.configs.mistral import adapt_config_dict
 from vllm.transformers_utils.utils import check_gguf_file
@@ -77,28 +71,16 @@ _CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = {

 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
    "chatglm": ChatGLMConfig,
-    "cohere2": Cohere2Config,
-    "dbrx": DbrxConfig,
    "deepseek_vl_v2": DeepseekVLV2Config,
    "kimi_vl": KimiVLConfig,
    "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config,
-    "mpt": MPTConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
    "jais": JAISConfig,
    "mlp_speculator": MLPSpeculatorConfig,
    "medusa": MedusaConfig,
    "eagle": EAGLEConfig,
-    "exaone": ExaoneConfig,
-    "exaone4": Exaone4Config,
-    "minimax_text_01": MiniMaxText01Config,
-    "minimax_vl_01": MiniMaxVL01Config,
    "nemotron": NemotronConfig,
-    "NVLM_D": NVLM_D_Config,
-    "ovis": OvisConfig,
-    "solar": SolarConfig,
-    "skywork_chat": SkyworkR1VChatConfig,
-    "telechat": Telechat2Config,
    "ultravox": UltravoxConfig,
    **_CONFIG_REGISTRY_OVERRIDE_HF
 }

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Model configs may be defined in this directory for the following reasons:
+
+- Neither the HF Hub nor the Transformers library defines a configuration file.
+- An existing config needs to be overridden to support vLLM.
+"""

 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
-from vllm.transformers_utils.configs.cohere2 import Cohere2Config
-from vllm.transformers_utils.configs.dbrx import DbrxConfig
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
-from vllm.transformers_utils.configs.exaone import ExaoneConfig
-from vllm.transformers_utils.configs.exaone4 import Exaone4Config
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
@@ -15,36 +17,21 @@ from vllm.transformers_utils.configs.falcon import RWConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
 from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
 from vllm.transformers_utils.configs.medusa import MedusaConfig
-from vllm.transformers_utils.configs.minimax_text_01 import MiniMaxText01Config
-from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config
 from vllm.transformers_utils.configs.mllama import MllamaConfig
 from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
-from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
 from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
-from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
-from vllm.transformers_utils.configs.ovis import OvisConfig
-from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
-from vllm.transformers_utils.configs.solar import SolarConfig
-from vllm.transformers_utils.configs.telechat2 import Telechat2Config
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig

 __all__ = [
    "ChatGLMConfig",
-    "Cohere2Config",
-    "DbrxConfig",
    "DeepseekVLV2Config",
-    "MPTConfig",
    "RWConfig",
    "JAISConfig",
    "MedusaConfig",
    "EAGLEConfig",
-    "ExaoneConfig",
-    "Exaone4Config",
-    "MiniMaxText01Config",
-    "MiniMaxVL01Config",
    "MllamaConfig",
    "MLPSpeculatorConfig",
    "MoonViTConfig",
@@ -52,10 +39,5 @@ __all__ = [
    "NemotronConfig",
    "NemotronHConfig",
    "Nemotron_Nano_VL_Config",
-    "NVLM_D_Config",
-    "OvisConfig",
-    "SkyworkR1VChatConfig",
-    "SolarConfig",
-    "Telechat2Config",
    "UltravoxConfig",
 ]
--- a/vllm/transformers_utils/configs/cohere2.py
+++ b/vllm/transformers_utils/configs/cohere2.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# ruff: noqa
-
-# Adapted from
-# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py
-from transformers import PretrainedConfig
-from transformers.modeling_rope_utils import rope_config_validation
-
-
-class Cohere2Config(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
-    model according to the specified arguments, defining the model architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 256000):
-            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`CohereModel`]
-        hidden_size (`int`, *optional*, defaults to 8192):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 22528):
-            Dimension of the MLP representations.
-        logit_scale (`float`, *optional*, defaults to 0.0625):
-            The scaling factor for the output logits.
-        num_hidden_layers (`int`, *optional*, defaults to 40):
-            Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 64):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 8192):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
-            The epsilon used by the layer normalization.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        pad_token_id (`int`, *optional*, defaults to 0):
-            Padding token id.
-        bos_token_id (`int`, *optional*, defaults to 5):
-            Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 255001):
-            End of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-            Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        rope_scaling (`dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-            accordingly.
-            Expected contents:
-                `rope_type` (`str`):
-                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                    'llama3'], with 'default' being the original RoPE implementation.
-                `factor` (`float`, *optional*):
-                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                    original maximum pre-trained length.
-                `original_max_position_embeddings` (`int`, *optional*):
-                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                    pretraining.
-                `attention_factor` (`float`, *optional*):
-                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                    computation. If unspecified, it defaults to value recommended by the implementation, using the
-                    `factor` field to infer the suggested value.
-                `beta_fast` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 32.
-                `beta_slow` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 1.
-                `short_factor` (`list[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `long_factor` (`list[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `low_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                `high_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        sliding_window (`int`, *optional*, defaults to 4096):
-            Size of the sliding window attention context.
-        sliding_window_pattern (`int`, *optional*, defaults to 4):
-            Pattern for the sliding window attention.
-        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
-
-    ```python
-    >>> from transformers import Cohere2Model, Cohere2Config
-
-    >>> # Initializing a Cohere Nextmodel configuration
-    >>> configuration = Cohere2Config()
-
-    >>> # Initializing a model from the Cohere2 configuration
-    >>> model = Cohere2Model(configuration) # doctest: +SKIP
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config # doctest: +SKIP
-    ```
-    """
-
-    model_type = "cohere2"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=256000,
-        hidden_size=8192,
-        intermediate_size=22528,
-        logit_scale=0.0625,
-        num_hidden_layers=40,
-        num_attention_heads=64,
-        num_key_value_heads=None,
-        hidden_act="silu",
-        max_position_embeddings=8192,
-        initializer_range=0.02,
-        layer_norm_eps=1e-5,
-        use_cache=True,
-        pad_token_id=0,
-        bos_token_id=5,
-        eos_token_id=255001,
-        tie_word_embeddings=True,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        sliding_window=4096,
-        sliding_window_pattern=4,
-        cache_implementation="hybrid",
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.logit_scale = logit_scale
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.sliding_window = sliding_window
-        self.sliding_window_pattern = sliding_window_pattern
-        # Need to specify head_dim in the config so it can be used in the attention forward functions
-        self.head_dim = hidden_size // num_attention_heads
-        self.cache_implementation = cache_implementation
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_config_validation(self)
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-
-__all__ = ["Cohere2Config"]
--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# yapf: disable
-# ruff: noqa: E501
-# coding=utf-8
-# Copied from
-# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
-"""Dbrx configuration."""
-
-from typing import Any, Optional
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore
-
-
-class DbrxAttentionConfig(PretrainedConfig):
-    """Configuration class for Dbrx Attention.
-
-    [`DbrxAttention`] class. It is used to instantiate attention layers
-    according to the specified arguments, defining the layers architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        attn_pdrop (`float`, *optional*, defaults to 0.0):
-            The dropout probability for the attention layers.
-        clip_qkv (`float`, *optional*, defaults to None):
-            If not `None`, clip the queries, keys, and values in the attention layer to this value.
-        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
-        rope_theta (float): The base frequency for rope.
-    """
-
-    def __init__(
-        self,
-        attn_pdrop: float = 0,
-        clip_qkv: Optional[float] = None,
-        kv_n_heads: int = 1,
-        rope_theta: float = 10000.0,
-        **kwargs: Any,
-    ):
-        super().__init__(**kwargs)
-        self.attn_pdrop = attn_pdrop
-        self.clip_qkv = clip_qkv
-        self.kv_n_heads = kv_n_heads
-        self.rope_theta = rope_theta
-
-        for k in ["model_type"]:
-            if k in kwargs:
-                kwargs.pop(k)
-        if len(kwargs) != 0:
-            raise ValueError(f"Found unknown {kwargs=}")
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path: str, **kwargs: Any
-    ) -> "PretrainedConfig":
-        cls._set_token_in_kwargs(kwargs)
-
-        config_dict, kwargs = cls.get_config_dict(
-            pretrained_model_name_or_path, **kwargs
-        )
-
-        if config_dict.get("model_type") == "dbrx":
-            config_dict = config_dict["attn_config"]
-
-        if (
-            "model_type" in config_dict
-            and hasattr(cls, "model_type")
-            and config_dict["model_type"] != cls.model_type
-        ):
-            logger.warning(
-                "You are using a model of type %s to instantiate a model of "
-                "type %s. This is not supported for all configurations of "
-                "models and can yield errors.",
-                config_dict["model_type"], cls.model_type)
-
-        return cls.from_dict(config_dict, **kwargs)
-
-
-class DbrxFFNConfig(PretrainedConfig):
-    """Configuration class for Dbrx FFN.
-
-    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
-    the specified arguments, defining the layers architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
-            The dict should have a key 'name' with the value being the name of
-            the activation function along with any additional keyword arguments.
-        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
-        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
-        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
-        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
-        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
-        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
-        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
-            This should only be used for benchmarking purposes.
-    """
-
-    def __init__(
-        self,
-        ffn_act_fn: Optional[dict] = None,
-        ffn_hidden_size: int = 3584,
-        moe_num_experts: int = 4,
-        moe_top_k: int = 1,
-        moe_jitter_eps: Optional[float] = None,
-        moe_loss_weight: float = 0.01,
-        moe_normalize_expert_weights: Optional[float] = 1,
-        uniform_expert_assignment: bool = False,
-        **kwargs: Any,
-    ):
-        super().__init__()
-        if ffn_act_fn is None:
-            ffn_act_fn = {"name": "silu"}
-        self.ffn_act_fn = ffn_act_fn
-        self.ffn_hidden_size = ffn_hidden_size
-        self.moe_num_experts = moe_num_experts
-        self.moe_top_k = moe_top_k
-        self.moe_jitter_eps = moe_jitter_eps
-        self.moe_loss_weight = moe_loss_weight
-        self.moe_normalize_expert_weights = moe_normalize_expert_weights
-        self.uniform_expert_assignment = uniform_expert_assignment
-
-        for k in ["model_type"]:
-            if k in kwargs:
-                kwargs.pop(k)
-        if len(kwargs) != 0:
-            raise ValueError(f"Found unknown {kwargs=}")
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path: str, **kwargs: Any
-    ) -> "PretrainedConfig":
-        cls._set_token_in_kwargs(kwargs)
-
-        config_dict, kwargs = cls.get_config_dict(
-            pretrained_model_name_or_path, **kwargs
-        )
-
-        if config_dict.get("model_type") == "dbrx":
-            config_dict = config_dict["ffn_config"]
-
-        if (
-            "model_type" in config_dict
-            and hasattr(cls, "model_type")
-            and config_dict["model_type"] != cls.model_type
-        ):
-            logger.warning(
-                "You are using a model of type %s to instantiate a model of "
-                "type %s. This is not supported for all "
-                "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
-
-        return cls.from_dict(config_dict, **kwargs)
-
-
-class DbrxConfig(PretrainedConfig):
-    """Configuration class for Dbrx.
-
-    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
-    specified arguments, defining the model architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        d_model (`int`, *optional*, defaults to 6144):
-            Dimensionality of the embeddings and hidden states.
-        n_heads (`int`, *optional*, defaults to 48):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        n_layers (`int`, *optional*, defaults to 40):
-            Number of hidden layers in the Transformer encoder.
-        max_seq_len (`int`, *optional*, defaults to 32768):
-            The maximum sequence length of the model.
-        vocab_size (`int`, *optional*, defaults to 100352):
-            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
-            the `inputs_ids` passed when calling [`DbrxModel`].
-        resid_pdrop (`float`, *optional*, defaults to 0.0):
-            The dropout probability applied to the attention output before combining with residual.
-        emb_pdrop (`float`, *optional*, defaults to 0.0):
-            The dropout probability for the embedding layer.
-        attn_config (`dict`, *optional*):
-            A dictionary used to configure the model's attention module.
-        ffn_config (`dict`, *optional*):
-            A dictionary used to configure the model's FFN module.
-        use_cache (`bool`, *optional*, defaults to `False`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
-        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
-            The aux loss factor for the total loss.
-
-
-    Example:
-    ```python
-    >>> from transformers import DbrxConfig, DbrxModel
-
-    >>> # Initializing a Dbrx configuration
-    >>> configuration = DbrxConfig()
-
-    >>> # Initializing a model (with random weights) from the configuration
-    >>> model = DbrxModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```
-    """
-
-    model_type = "dbrx"
-    attribute_map = {
-        "num_attention_heads": "n_heads",
-        "hidden_size": "d_model",
-        "num_hidden_layers": "n_layers",
-        "max_position_embeddings": "max_seq_len",
-    }
-
-    def __init__(
-        self,
-        d_model: int = 2048,
-        n_heads: int = 16,
-        n_layers: int = 24,
-        max_seq_len: int = 2048,
-        vocab_size: int = 32000,
-        resid_pdrop: float = 0.0,
-        emb_pdrop: float = 0.0,
-        attn_config: Optional[DbrxAttentionConfig] = None,
-        ffn_config: Optional[DbrxFFNConfig] = None,
-        use_cache: bool = True,
-        initializer_range: float = 0.02,
-        output_router_logits: bool = False,
-        router_aux_loss_coef: float = 0.05,
-        **kwargs: Any,
-    ):
-        if attn_config is None:
-            self.attn_config = DbrxAttentionConfig()
-        elif isinstance(attn_config, dict):
-            self.attn_config = DbrxAttentionConfig(**attn_config)
-        else:
-            self.attn_config = attn_config
-
-        if ffn_config is None:
-            self.ffn_config = DbrxFFNConfig()
-        elif isinstance(ffn_config, dict):
-            self.ffn_config = DbrxFFNConfig(**ffn_config)
-        else:
-            self.ffn_config = ffn_config
-
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.max_seq_len = max_seq_len
-        self.vocab_size = vocab_size
-        self.resid_pdrop = resid_pdrop
-        self.emb_pdrop = emb_pdrop
-        self.use_cache = use_cache
-        self.initializer_range = initializer_range
-        self.output_router_logits = output_router_logits
-        self.router_aux_loss_coef = router_aux_loss_coef
-
-        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            raise ValueError(
-                "tie_word_embeddings is not supported for Dbrx models."
-            )
-
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Copied from
-# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
-# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Exaone model configuration"""
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: dict[str, str] = {}
-
-
-class ExaoneConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a :class:
-    `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
-    according to the specified arguments, defining the model architecture.
-    Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the Exaone
-
-    Configuration objects inherit from {class}`~transformers.PretrainedConfig`
-    and can be used to control the model outputs. Read the documentation from :
-    class:`~transformers.PretrainedConfig` for more information.
-
-    Args:
-        vocab_size ({obj}`int`, `optional`, defaults to 50257):
-            Vocabulary size of the GPT Lingvo model. Defines the number of
-            different tokens that can be represented by the {obj}`inputs_ids`
-            passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
-            size of the model.
-            Defines the different tokens that can be represented by the
-            `inputs_ids` passed to the forward method of :class:
-            `~transformers.EXAONEModel`.
-        hidden_size ({obj}`int`, `optional`, defaults to 2048):
-            Dimensionality of the encoder layers and the pooler layer.
-        num_layers ({obj}`int`, `optional`, defaults to 24):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the
-            Transformer decoder.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to
-            implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi
-            Head Attention (MHA), if `num_key_value_heads=1 the model will use
-            Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint,
-            each group key and value head should be constructed by meanpooling
-            all the original heads within that group. For more details checkout
-            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
-            specified, will default to `num_attention_heads`.
-        rotary_pct (`float`, *optional*, defaults to 0.25):
-            percentage of hidden dimensions to allocate to rotary embeddings
-        intermediate_size ({obj}`int`, `optional`, defaults to 8192):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
-            the Transformer encoder.
-        activation_function ({obj}`str` or {obj}`function`, `optional`,
-        defaults to {obj}`"gelu_new"`):
-            The non-linear activation function (function or string) in the
-            encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
-            {obj}`"selu"` and {obj}`"gelu_new"` are supported.
-        embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
-            The dropout probabilitiy for all fully connected layers in the
-            embeddings, encoder, and pooler.
-        attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
-            The maximum sequence length that this model might ever be used with.
-            Typically set this to something large just in case
-            (e.g., 512 or 1024 or 2048).
-        type_vocab_size ({obj}`int`, `optional`, defaults to 2):
-            The vocabulary size of the {obj}`token_type_ids` passed when calling
-            {class}`~transformers.EXAONEModel`.
-        initializer_range ({obj}`float`, `optional`, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for
-            initializing all weight matrices.
-        layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
-            The epsilon used by the layer normalization layers.
-        use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
-            Whether or not the model should return the last key/values
-            attentions (not used by all models).
-            Only relevant if ``config.is_decoder=True``.
-        gradient_checkpointing ({obj}`bool`, `optional`,
-        defaults to {obj}`False`):
-            If True, use gradient checkpointing to save memory at the expense
-            of slower backward pass.
-        Example::
-
-            >>> from transformers import ExoneModel, ExaoneConfig
-
-            >>> # Initializing a EXAONE configuration
-            >>> configuration = ExaoneConfig()
-
-            >>> # Initializing a model from configuration
-            >>> model = ExoneModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
-    """
-
-    model_type = "exaone"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {"num_hidden_layers": "num_layers"}
-
-    def __init__(
-        self,
-        vocab_size=102400,
-        max_position_embeddings=2048,
-        hidden_size=2048,
-        num_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        intermediate_size=None,
-        activation_function="silu",
-        rotary_pct=0.25,
-        resid_dropout=0.0,
-        embed_dropout=0.0,
-        attention_dropout=0.0,
-        layer_norm_epsilon=1e-6,
-        initializer_range=0.02,
-        use_cache=True,
-        bos_token_id=0,
-        eos_token_id=2,
-        tie_word_embeddings=True,
-        **kwargs,
-    ):
-        super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.num_layers = num_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_hidden_layers = num_layers
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        if intermediate_size:
-            self.intermediate_size = intermediate_size
-        else:
-            self.intermediate_size = hidden_size * 4
-        self.activation_function = activation_function
-        self.resid_dropout = resid_dropout
-        self.embed_dropout = embed_dropout
-        self.attention_dropout = attention_dropout
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.use_cache = use_cache
-        self.rotary_pct = rotary_pct
-
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-
-        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
-        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
-        self.use_gated = kwargs.pop("use_gated", False)
-        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
-        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
-        self.rotary_type = kwargs.pop("rotary_type", None)
-        self.scaling_factor = kwargs.pop("scaling_factor", 1)
-        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
-        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
-        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
-        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
-        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
-        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
-                                                 (rotary_pct == 0.25))
-        if self.use_rotary_pos:
-            self.use_absolute_pos = False
--- a/vllm/transformers_utils/configs/exaone4.py
+++ b/vllm/transformers_utils/configs/exaone4.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa: E501
-
-# Copied from
-# https://github.com/lgai-exaone/transformers/blob/add-exaone4/src/transformers/models/exaone4/configuration_exaone4.py
-# Copyright 2025 The LG CNS Gen AI Solution Delivery Team.
-# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved.
-#
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from transformers.configuration_utils import (PretrainedConfig,
-                                              layer_type_validation)
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-
-def check_is_sliding(config, layer_idx):
-    """
-    Check if the current layer is a sliding window attention (local attention) layer.
-    """
-    if config.sliding_window is None:
-        return False
-    if config.layer_types is not None:
-        return config.layer_types[layer_idx] == "sliding_attention"
-    if isinstance(config.sliding_window_pattern, int):
-        return ((layer_idx + 1) % config.sliding_window_pattern) != 0
-    elif isinstance(config.sliding_window_pattern, str):
-        assert isinstance(config.sliding_window, int), (
-            f"Sliding window must be positive integer, but got {config.sliding_window}"
-        )
-        return (layer_idx != config.num_hidden_layers - 1
-                and config.sliding_window_pattern[layer_idx % len(
-                    config.sliding_window_pattern)] == "L")
-    else:
-        logger.warning_once(
-            "Sliding window is set, but none of `sliding_window_pattern` or `layer_types` is set. "
-            "Defaulting to use 'full_attention' for all layers.")
-    return False
-
-
-class Exaone4Config(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to
-    instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct)
-    NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
-    outputs. Read the documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 102400):
-            Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Exaone4Model`].
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to `hidden_size * 4`):
-            Dimensionality of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 32768 for EXAONE 3.5).
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
-            The epsilon used by the layer normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
-        bos_token_id (`int`, *optional*, defaults to 0):
-            Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 2):
-            End of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-            accordingly.
-            Expected contents:
-                `rope_type` (`str`):
-                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                    'llama3'], with 'default' being the original RoPE implementation.
-                `factor` (`float`, *optional*):
-                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                    original maximum pre-trained length.
-                `original_max_position_embeddings` (`int`, *optional*):
-                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                    pretraining.
-                `attention_factor` (`float`, *optional*):
-                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                    computation. If unspecified, it defaults to value recommended by the implementation, using the
-                    `factor` field to infer the suggested value.
-                `beta_fast` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 32.
-                `beta_slow` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 1.
-                `short_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `long_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `low_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                `high_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        sliding_window (`int`, *optional*):
-            The size of the sliding window for the sliding window attention.
-        sliding_window_pattern (`str`, *optional*):
-            The pattern to use for sliding window attention. Can be one of:
-                - `None`: No sliding window attention is used
-                - `int`: Every `sliding_window` layers, use global attention, else use local attention.
-                - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
-                  attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The
-                  final layer always uses global attention regardless of the pattern.
-            For instance, sliding_window_pattern="LLLG" same as sliding_window=4, which means:
-                - Layer 0, 1, 2: local attention,
-                - Layer 3: global attention,
-                ...(repeated)
-        layer_types (`list`, *optional*):
-            Attention pattern for each layer. Prioritized over `sliding_window_pattern`.
-
-    Example:
-
-    ```python
-    >>> from transformers import Exaone4Model, Exaone4Config
-
-    >>> # Initializing a EXAONE configuration
-    >>> configuration = Exaone4Config()
-
-    >>> # Initializing a model from configuration
-    >>> model = Exaone4Model(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "exaone4"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    # Default tensor parallel plan for base model `LlamaModel`
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }
-    base_model_pp_plan = {
-        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-        "norm": (["hidden_states"], ["hidden_states"]),
-    }
-
-    def __init__(
-        self,
-        vocab_size=102400,
-        hidden_size=4096,
-        intermediate_size=None,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        rms_norm_eps=1e-5,
-        use_cache=True,
-        bos_token_id=0,
-        eos_token_id=2,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_dropout=0.0,
-        sliding_window=None,
-        sliding_window_pattern=None,
-        layer_types=None,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        if intermediate_size:
-            self.intermediate_size = intermediate_size
-        else:
-            self.intermediate_size = hidden_size * 4
-        self.hidden_act = hidden_act
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.attention_dropout = attention_dropout
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.sliding_window = sliding_window
-        self.sliding_window_pattern = sliding_window_pattern
-
-        self.layer_types = layer_types
-        if self.layer_types is None:
-            self.layer_types = [
-                "sliding_attention"
-                if check_is_sliding(self, i) else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
-        layer_type_validation(self.layer_types)
-
-        super().__init__(bos_token_id=bos_token_id,
-                         eos_token_id=eos_token_id,
-                         tie_word_embeddings=tie_word_embeddings,
-                         **kwargs)
-
-
-__all__ = ["Exaone4Config"]
--- a/vllm/transformers_utils/configs/minimax_text_01.py
+++ b/vllm/transformers_utils/configs/minimax_text_01.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-""" MiniMaxText01 model configuration"""
-
-from transformers.configuration_utils import PretrainedConfig
-
-
-class MiniMaxText01Config(PretrainedConfig):
-    model_type = "MiniMaxText01"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=32000,
-        hidden_size=4096,
-        intermediate_size=14336,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=8,
-        hidden_act="silu",
-        max_position_embeddings=4096 * 32,
-        initializer_range=0.02,
-        rms_norm_eps=1e-5,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=None,
-        eos_token_id=None,
-        tie_word_embeddings=False,
-        rope_theta=1e6,
-        sliding_window=None,
-        attention_dropout=0.0,
-        num_experts_per_tok=2,
-        num_local_experts=8,
-        output_router_logits=False,
-        router_aux_loss_coef=0.001,
-        router_jitter_noise=0.0,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.sliding_window = sliding_window
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.attention_dropout = attention_dropout
-
-        self.num_experts_per_tok = num_experts_per_tok
-        self.num_local_experts = num_local_experts
-        self.output_router_logits = output_router_logits
-        self.router_aux_loss_coef = router_aux_loss_coef
-        self.router_jitter_noise = router_jitter_noise
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
--- a/vllm/transformers_utils/configs/minimax_vl_01.py
+++ b/vllm/transformers_utils/configs/minimax_vl_01.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""MiniMaxVL01 model configuration"""
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.models.auto import CONFIG_MAPPING
-
-from .minimax_text_01 import MiniMaxText01Config
-
-
-class MiniMaxVL01Config(PretrainedConfig):
-    model_type = "minimax_vl_01"
-
-    def __init__(
-        self,
-        vision_config=None,
-        text_config=None,
-        ignore_index=-100,
-        image_token_index=32000,
-        projector_hidden_act="gelu",
-        vision_feature_select_strategy="default",
-        vision_feature_layer=-2,
-        image_grid_pinpoints=None,
-        tie_word_embeddings=False,
-        image_seq_length=576,
-        **kwargs,
-    ):
-        self.ignore_index = ignore_index
-        self.image_token_index = image_token_index
-        self.projector_hidden_act = projector_hidden_act
-        self.image_seq_length = image_seq_length
-
-        if vision_feature_select_strategy not in ["default", "full"]:
-            raise ValueError("vision_feature_select_strategy should " +
-                             "be one of 'default', 'full'." +
-                             f"Got: {vision_feature_select_strategy}")
-
-        self.vision_feature_select_strategy = vision_feature_select_strategy
-        self.vision_feature_layer = vision_feature_layer
-        image_grid_pinpoints = (
-            image_grid_pinpoints if image_grid_pinpoints is not None else
-            [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]])
-        self.image_grid_pinpoints = image_grid_pinpoints
-
-        if isinstance(vision_config, dict):
-            if "model_type" not in vision_config:
-                vision_config["model_type"] = "clip_vision_model"
-            vision_config = CONFIG_MAPPING[vision_config["model_type"]](
-                **vision_config)
-        elif vision_config is None:
-            vision_config = CONFIG_MAPPING["clip_vision_model"](
-                intermediate_size=4096,
-                hidden_size=1024,
-                patch_size=14,
-                image_size=336,
-                num_hidden_layers=24,
-                num_attention_heads=16,
-                vocab_size=32000,
-                projection_dim=768,
-            )
-
-        self.vision_config = vision_config
-
-        if text_config is not None:
-            text_config = MiniMaxText01Config(**text_config)
-        else:
-            text_config = MiniMaxText01Config()
-
-        self.text_config = text_config
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Copied from
-# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
-"""A HuggingFace-style model configuration."""
-import warnings
-from typing import Any, Optional, Union
-
-from transformers import PretrainedConfig
-
-attn_config_defaults: dict = {
-    'attn_type': 'multihead_attention',
-    'attn_pdrop': 0.0,
-    'attn_impl': 'triton',
-    'qk_ln': False,
-    'clip_qkv': None,
-    'softmax_scale': None,
-    'prefix_lm': False,
-    'attn_uses_sequence_id': False,
-    'alibi': False,
-    'alibi_bias_max': 8
-}
-ffn_config_defaults: dict = {'ffn_type': 'mptmlp'}
-init_config_defaults: dict = {
-    'name': 'kaiming_normal_',
-    'fan_mode': 'fan_in',
-    'init_nonlinearity': 'relu',
-    'init_div_is_residual': True,
-    'emb_init_std': None,
-    'emb_init_uniform_lim': None,
-    'init_std': None,
-    'init_gain': 0.0
-}
-
-
-class MPTConfig(PretrainedConfig):
-    model_type = 'mpt'
-    attribute_map = {
-        'num_attention_heads': 'n_heads',
-        'hidden_size': 'd_model',
-        'num_hidden_layers': 'n_layers',
-    }
-
-    # pylint: disable=dangerous-default-value
-    def __init__(self,
-                 d_model: int = 2048,
-                 n_heads: int = 16,
-                 n_layers: int = 24,
-                 expansion_ratio: int = 4,
-                 max_seq_len: int = 2048,
-                 vocab_size: int = 50368,
-                 resid_pdrop: float = 0.0,
-                 emb_pdrop: float = 0.0,
-                 learned_pos_emb: bool = True,
-                 attn_config: dict = attn_config_defaults,
-                 ffn_config: dict = ffn_config_defaults,
-                 init_device: str = 'cpu',
-                 logit_scale: Optional[Union[float, str]] = None,
-                 no_bias: bool = False,
-                 embedding_fraction: float = 1.0,
-                 norm_type: str = 'low_precision_layernorm',
-                 use_cache: bool = False,
-                 init_config: dict = init_config_defaults,
-                 fc_type: str = 'torch',
-                 verbose: Optional[int] = None,
-                 **kwargs: Any):
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.expansion_ratio = expansion_ratio
-        self.max_seq_len = max_seq_len
-        self.vocab_size = vocab_size
-        self.resid_pdrop = resid_pdrop
-        self.emb_pdrop = emb_pdrop
-        self.learned_pos_emb = learned_pos_emb
-        self.attn_config = attn_config
-        self.ffn_config = ffn_config
-        self.init_device = init_device
-        self.logit_scale = logit_scale
-        self.no_bias = no_bias
-        self.embedding_fraction = embedding_fraction
-        self.norm_type = norm_type
-        self.use_cache = use_cache
-        self.init_config = init_config
-        self.fc_type = fc_type
-        if verbose is not None:
-            warnings.warn(DeprecationWarning(
-                'verbose argument for MPTConfig is now ignored and '
-                'will be removed. Use python_log_level instead.'),
-                          stacklevel=2)
-        if 'name' in kwargs:
-            del kwargs['name']
-        if 'loss_fn' in kwargs:
-            del kwargs['loss_fn']
-        if self.attn_config.get('alibi', False):
-            self.learned_pos_emb = False
-            warnings.warn(
-                f'alibi is turned on, setting `learned_pos_emb` '
-                f'to {self.learned_pos_emb}`',
-                stacklevel=2)
-        super().__init__(**kwargs)
-        self._validate_config()
-
-    def _set_config_defaults(
-            self, config: dict[str, Any],
-            config_defaults: dict[str, Any]) -> dict[str, Any]:
-        for (k, v) in config_defaults.items():
-            if k not in config:
-                config[k] = v
-        return config
-
-    def _validate_config(self) -> None:
-        self.attn_config = self._set_config_defaults(self.attn_config,
-                                                     attn_config_defaults)
-        self.ffn_config = self._set_config_defaults(self.ffn_config,
-                                                    ffn_config_defaults)
-        self.init_config = self._set_config_defaults(self.init_config,
-                                                     init_config_defaults)
-        if self.d_model % self.n_heads != 0:
-            raise ValueError('d_model must be divisible by n_heads')
-        if any(
-                prob < 0 or prob > 1 for prob in
-            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
-             ]):
-            raise ValueError(
-                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
-                "probabilities and must be between 0 and 1")
-        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
-            raise ValueError(
-                f"Unknown attn_impl={self.attn_config['attn_impl']}")
-        if self.attn_config['prefix_lm'] and self.attn_config[
-                'attn_impl'] not in ['torch', 'triton']:
-            raise NotImplementedError(
-                'prefix_lm only implemented with torch and triton attention.')
-        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
-                'torch', 'triton'
-        ]:
-            raise NotImplementedError(
-                'alibi only implemented with torch and triton attention.')
-        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
-                'attn_impl'] not in ['torch', 'triton']:
-            raise NotImplementedError(
-                'attn_uses_sequence_id only implemented with torch '
-                'and triton attention.')
-        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
-            raise ValueError(
-                'model.embedding_fraction must be between 0 (exclusive) '
-                'and 1 (inclusive)!')
-        if isinstance(self.logit_scale,
-                      str) and self.logit_scale != 'inv_sqrt_d_model':
-            raise ValueError(
-                f"self.logit_scale={self.logit_scale!r} is not recognized as "
-                "an option; use numeric value or 'inv_sqrt_d_model'.")
-        if self.init_config.get('name', None) is None:
-            raise ValueError(
-                f"self.init_config={self.init_config!r} 'name' needs to be set."
-            )
-        if not self.learned_pos_emb and (not self.attn_config['alibi']):
-            warnings.warn(
-                'Positional information not being provided to the model.',
-                stacklevel=2)
-        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
-            try:
-                # pylint: disable=import-outside-toplevel
-                import transformer_engine.pytorch as te
-                del te
-            except Exception as exc:
-                raise ImportError(
-                    'TransformerEngine import fail. `fc_type: te` requires '
-                    'TransformerEngine be installed. '
-                    'The required version of transformer_engine also requires '
-                    'FlashAttention v1.0.6 is installed:\n'
-                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
-                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
-                ) from exc
-        if self.ffn_config['ffn_type'] == 'mptmlp':
-            self.ffn_config['fc_type'] = self.fc_type
-        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
-            self.ffn_config['bias'] = not self.no_bias
--- a/vllm/transformers_utils/configs/nvlm_d.py
+++ b/vllm/transformers_utils/configs/nvlm_d.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Adapted from
-# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
-# --------------------------------------------------------
-# NVLM-D
-# Copyright (c) 2024 NVIDIA
-# Licensed under Apache 2.0 License [see LICENSE for details]
-# --------------------------------------------------------
-from transformers import Qwen2Config
-from transformers.configuration_utils import PretrainedConfig
-
-
-class NVLM_D_Config(PretrainedConfig):
-    model_type = 'NVLM_D'
-    is_composition = True
-
-    def __init__(self, vision_config=None, llm_config=None, **kwargs):
-        super().__init__(**kwargs)
-
-        # Handle vision_config initialization
-        if vision_config is None:
-            vision_config = {}
-
-        # Handle llm_config initialization
-        if llm_config is None:
-            llm_config = {}
-
-        self.vision_config = PretrainedConfig(**vision_config)
-        self.text_config = Qwen2Config(**llm_config)
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# yapf: disable
-# ruff: noqa: E501
-# copied from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
-# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
-from typing import Any, Optional, Union
-
-from transformers import AutoConfig, PretrainedConfig
-
-
-class AIMv2Config(PretrainedConfig):
-    """This is the configuration class to store the configuration of an [`AIMv2Model`].
-
-    Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
-
-    Args:
-        hidden_size: Dimension of the hidden representations.
-        intermediate_size: Dimension of the SwiGLU representations.
-        num_hidden_layers: Number of hidden layers in the Transformer.
-        num_attention_heads: Number of attention heads for each attention layer
-            in the Transformer.
-        num_channels: Number of input channels.
-        image_size: Image size.
-        patch_size: Patch size.
-        rms_norm_eps: Epsilon value used for the RMS normalization layer.
-        attention_dropout: Dropout ratio for attention probabilities.
-        projection_dropout: Dropout ratio for the projection layer after the attention.
-        qkv_bias: Whether to add a bias to the queries, keys and values.
-        use_bias: Whether to add a bias in the feed-forward and projection layers.
-        kwargs: Keyword arguments for the [`PretrainedConfig`].
-    """
-
-    model_type: str = "aimv2"
-
-    def __init__(
-        self,
-        hidden_size: int = 1024,
-        intermediate_size: int = 2816,
-        num_hidden_layers: int = 24,
-        num_attention_heads: int = 8,
-        num_channels: int = 3,
-        image_size: int = 224,
-        patch_size: int = 14,
-        rms_norm_eps: float = 1e-5,
-        attention_dropout: float = 0.0,
-        projection_dropout: float = 0.0,
-        qkv_bias: bool = False,
-        use_bias: bool = False,
-        **kwargs: Any,
-    ):
-        super().__init__(**kwargs)
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_channels = num_channels
-        self.patch_size = patch_size
-        self.image_size = image_size
-        self.attention_dropout = attention_dropout
-        self.rms_norm_eps = rms_norm_eps
-
-        self.projection_dropout = projection_dropout
-        self.qkv_bias = qkv_bias
-        self.use_bias = use_bias
-
-
-IGNORE_ID = -100
-IMAGE_TOKEN_ID = -200
-IMAGE_TOKEN = "<image>"
-IMAGE_ATOM_ID = -300
-IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305]
-
-
-# ----------------------------------------------------------------------
-#                     Visual Tokenizer Configuration
-# ----------------------------------------------------------------------
-class BaseVisualTokenizerConfig(PretrainedConfig):
-
-    def __init__(self,
-                 vocab_size=16384,
-                 tokenize_function="softmax",
-                 tau=1.0,
-                 depths=None,
-                 drop_cls_token=False,
-                 backbone_config: Optional[Union[PretrainedConfig,
-                                                 dict]] = None,
-                 hidden_stride: int = 1,
-                 **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.tokenize_function = tokenize_function
-        self.tau = tau
-        if isinstance(depths, str):
-            depths = [int(x) for x in depths.split('|')]
-        self.depths = depths
-        self.backbone_kwargs = dict[str, Any]()
-        self.drop_cls_token = drop_cls_token
-        if backbone_config is not None:
-            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
-                f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
-            if not isinstance(backbone_config, PretrainedConfig):
-                model_type = backbone_config['model_type']
-                if model_type != "aimv2":
-                    backbone_config.pop('model_type')
-                    backbone_config = AutoConfig.for_model(model_type, **backbone_config)
-                else:
-                    backbone_config = AIMv2Config(**backbone_config)
-        self.backbone_config = backbone_config
-        self.hidden_stride = hidden_stride
-
-
-class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
-    model_type = "aimv2_visual_tokenizer"
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        if self.drop_cls_token:
-            self.drop_cls_token = False
-        if self.depths:
-            assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
-
-
-class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
-    model_type = "siglip_visual_tokenizer"
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        if self.drop_cls_token:
-            self.drop_cls_token = False
-        if self.depths:
-            assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
-
-
-AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
-AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
-
-
-# ----------------------------------------------------------------------
-#                           Ovis Configuration
-# ----------------------------------------------------------------------
-class OvisConfig(PretrainedConfig):
-    model_type = "ovis"
-
-    def __init__(self,
-                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
-                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
-                                                         dict]] = None,
-                 multimodal_max_length=8192,
-                 hidden_size=None,
-                 conversation_formatter_class=None,
-                 llm_attn_implementation=None,
-                 disable_tie_weight=False,
-                 **kwargs):
-        super().__init__(**kwargs)
-        if llm_config is not None:
-            assert isinstance(llm_config, (PretrainedConfig, dict)), \
-                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
-            if not isinstance(llm_config, PretrainedConfig):
-                model_type = llm_config['model_type']
-                llm_config.pop('model_type')
-                llm_config = AutoConfig.for_model(model_type, **llm_config)
-
-        # map llm_config to text_config
-        self.text_config = llm_config
-        if visual_tokenizer_config is not None:
-            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
-                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
-            if not isinstance(visual_tokenizer_config, PretrainedConfig):
-                model_type = visual_tokenizer_config['model_type']
-                visual_tokenizer_config.pop('model_type')
-                visual_tokenizer_config = AutoConfig.for_model(
-                    model_type, **visual_tokenizer_config)
-
-        self.visual_tokenizer_config = visual_tokenizer_config
-        self.multimodal_max_length = multimodal_max_length
-        self.hidden_size = hidden_size
-        self.conversation_formatter_class = conversation_formatter_class
-        self.llm_attn_implementation = llm_attn_implementation
-        self.disable_tie_weight = disable_tie_weight
--- a/vllm/transformers_utils/configs/skyworkr1v.py
+++ b/vllm/transformers_utils/configs/skyworkr1v.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Adapted from
-# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py
-# --------------------------------------------------------
-# SkyworkR1V
-# Copyright (c) 2025 Skywork
-# Licensed under The MIT License [see LICENSE for details]
-# --------------------------------------------------------
-from transformers.configuration_utils import PretrainedConfig
-
-
-class SkyworkR1VChatConfig(PretrainedConfig):
-    model_type = 'internvl_chat'
-    is_composition = True
-
-    def __init__(self,
-                 vision_config=None,
-                 llm_config=None,
-                 use_backbone_lora=0,
-                 use_llm_lora=0,
-                 select_layer=-1,
-                 force_image_size=None,
-                 downsample_ratio=0.5,
-                 template=None,
-                 dynamic_image_size=False,
-                 use_thumbnail=False,
-                 ps_version='v1',
-                 min_dynamic_patch=1,
-                 max_dynamic_patch=6,
-                 **kwargs):
-        super().__init__(**kwargs)
-
-        if vision_config is None:
-            vision_config = {}
-
-        if llm_config is None:
-            llm_config = {}
-
-        self.vision_config = PretrainedConfig(**vision_config)
-        self.text_config = PretrainedConfig(**llm_config)
-
-        self.use_backbone_lora = use_backbone_lora
-        self.use_llm_lora = use_llm_lora
-        self.select_layer = select_layer
-        self.force_image_size = force_image_size
-        self.downsample_ratio = downsample_ratio
-        self.template = template
-        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail = use_thumbnail
-        self.ps_version = ps_version  # pixel shuffle version
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch