Unverified commit a8b70304, authored by Harry Mellor, committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
...@@ -161,7 +161,6 @@ class MixtralAttention(nn.Module): ...@@ -161,7 +161,6 @@ class MixtralAttention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
prefix: str = "", prefix: str = "",
...@@ -189,7 +188,6 @@ class MixtralAttention(nn.Module): ...@@ -189,7 +188,6 @@ class MixtralAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
...@@ -211,7 +209,7 @@ class MixtralAttention(nn.Module): ...@@ -211,7 +209,7 @@ class MixtralAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=int(self.rope_theta), rope_parameters=config.rope_parameters,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
...@@ -248,15 +246,12 @@ class MixtralDecoderLayer(nn.Module): ...@@ -248,15 +246,12 @@ class MixtralDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = MixtralAttention( self.self_attn = MixtralAttention(
config=config, config=config,
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
......
...@@ -292,13 +292,17 @@ class Llama4VisionAttention(nn.Module): ...@@ -292,13 +292,17 @@ class Llama4VisionAttention(nn.Module):
prefix=f"{prefix}.o_proj", prefix=f"{prefix}.o_proj",
) )
rope_parameters = {
"rope_type": "mllama4",
"rope_theta": config.rope_parameters["rope_theta"],
}
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
head_size=self.head_dim, head_size=self.head_dim,
rotary_dim=config.hidden_size // config.num_attention_heads // 2, rotary_dim=config.hidden_size // config.num_attention_heads // 2,
# number of image patches # number of image patches
max_position=(config.image_size // config.patch_size) ** 2, max_position=(config.image_size // config.patch_size) ** 2,
base=config.rope_theta, rope_parameters=rope_parameters,
rope_scaling={"rope_type": "mllama4"},
is_neox_style=False, is_neox_style=False,
dtype=torch.complex64, # important dtype=torch.complex64, # important
) )
......
...@@ -410,7 +410,6 @@ class MolmoAttention(nn.Module): ...@@ -410,7 +410,6 @@ class MolmoAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
# Attention input projection. Projects x -> (q, k, v) # Attention input projection. Projects x -> (q, k, v)
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -437,7 +436,7 @@ class MolmoAttention(nn.Module): ...@@ -437,7 +436,7 @@ class MolmoAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
) )
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.attn = Attention( self.attn = Attention(
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -150,8 +149,6 @@ class NemotronAttention(nn.Module): ...@@ -150,8 +149,6 @@ class NemotronAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -181,7 +178,6 @@ class NemotronAttention(nn.Module): ...@@ -181,7 +178,6 @@ class NemotronAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.partial_rotary_factor = config.partial_rotary_factor self.partial_rotary_factor = config.partial_rotary_factor
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
...@@ -206,8 +202,7 @@ class NemotronAttention(nn.Module): ...@@ -206,8 +202,7 @@ class NemotronAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
partial_rotary_factor=self.partial_rotary_factor, partial_rotary_factor=self.partial_rotary_factor,
) )
self.attn = Attention( self.attn = Attention(
...@@ -243,14 +238,6 @@ class NemotronDecoderLayer(nn.Module): ...@@ -243,14 +238,6 @@ class NemotronDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias # Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias # Support internlm/internlm-7b with bias
...@@ -264,8 +251,6 @@ class NemotronDecoderLayer(nn.Module): ...@@ -264,8 +251,6 @@ class NemotronDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -82,8 +81,6 @@ class DeciLMAttention(LlamaAttention): ...@@ -82,8 +81,6 @@ class DeciLMAttention(LlamaAttention):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -97,8 +94,6 @@ class DeciLMAttention(LlamaAttention): ...@@ -97,8 +94,6 @@ class DeciLMAttention(LlamaAttention):
hidden_size, hidden_size,
num_heads, num_heads,
num_kv_heads, num_kv_heads,
rope_theta,
rope_scaling,
max_position_embeddings, max_position_embeddings,
quant_config, quant_config,
bias, bias,
...@@ -111,7 +106,6 @@ class DeciLMAttention(LlamaAttention): ...@@ -111,7 +106,6 @@ class DeciLMAttention(LlamaAttention):
def _init_rotary_emb( def _init_rotary_emb(
self, self,
config, config,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,
) -> None: ) -> None:
# Enables YARN for Mistral and LLaMA4 derivatives. # Enables YARN for Mistral and LLaMA4 derivatives.
...@@ -126,8 +120,7 @@ class DeciLMAttention(LlamaAttention): ...@@ -126,8 +120,7 @@ class DeciLMAttention(LlamaAttention):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor, partial_rotary_factor=self.partial_rotary_factor,
) )
...@@ -148,14 +141,6 @@ class DeciLMDecoderLayer(nn.Module): ...@@ -148,14 +141,6 @@ class DeciLMDecoderLayer(nn.Module):
self._is_no_op_ffn = block_config.ffn.no_op self._is_no_op_ffn = block_config.ffn.no_op
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias # Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias # Support internlm/internlm-7b with bias
...@@ -176,8 +161,6 @@ class DeciLMDecoderLayer(nn.Module): ...@@ -176,8 +161,6 @@ class DeciLMDecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=num_kv_heads, num_kv_heads=num_kv_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -87,7 +87,6 @@ class OlmoAttention(nn.Module): ...@@ -87,7 +87,6 @@ class OlmoAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.clip_qkv = config.clip_qkv self.clip_qkv = config.clip_qkv
# Attention input projection. Projects x -> (q, k, v) # Attention input projection. Projects x -> (q, k, v)
...@@ -105,7 +104,7 @@ class OlmoAttention(nn.Module): ...@@ -105,7 +104,7 @@ class OlmoAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
) )
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.attn = Attention( self.attn = Attention(
......
...@@ -99,7 +99,6 @@ class Olmo2Attention(nn.Module): ...@@ -99,7 +99,6 @@ class Olmo2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = self.config.max_position_embeddings self.max_position_embeddings = self.config.max_position_embeddings
self.rope_theta = self.config.rope_theta
# Attention input projection. Projects x -> (q, k, v) # Attention input projection. Projects x -> (q, k, v)
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -139,15 +138,17 @@ class Olmo2Attention(nn.Module): ...@@ -139,15 +138,17 @@ class Olmo2Attention(nn.Module):
prefix=f"{prefix}.attn", prefix=f"{prefix}.attn",
) )
# Rotary embeddings. Rope scaling is only applied on full attention # Rotary embeddings. Rope scaling is only applied on full attention layers.
# layers. if sliding_window is None:
self.rope_scaling = self.config.rope_scaling if sliding_window is None else None rope_parameters = self.config.rope_parameters
else:
rope_theta = self.config.rope_parameters["rope_theta"]
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, # type: ignore rope_parameters=rope_parameters,
rope_scaling=self.rope_scaling,
) )
# Attention output projection. # Attention output projection.
......
...@@ -123,8 +123,6 @@ class OlmoeAttention(nn.Module): ...@@ -123,8 +123,6 @@ class OlmoeAttention(nn.Module):
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 4096) max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
num_heads = config.num_attention_heads num_heads = config.num_attention_heads
...@@ -148,7 +146,6 @@ class OlmoeAttention(nn.Module): ...@@ -148,7 +146,6 @@ class OlmoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -176,8 +173,7 @@ class OlmoeAttention(nn.Module): ...@@ -176,8 +173,7 @@ class OlmoeAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
......
...@@ -77,6 +77,7 @@ from vllm.model_executor.models.utils import ( ...@@ -77,6 +77,7 @@ from vllm.model_executor.models.utils import (
sequence_parallel_chunk, sequence_parallel_chunk,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
def check_ffn_act_fn(act_fn: str): def check_ffn_act_fn(act_fn: str):
...@@ -259,7 +260,6 @@ class OpenPanguMLAAttention(nn.Module): ...@@ -259,7 +260,6 @@ class OpenPanguMLAAttention(nn.Module):
v_head_dim: int, v_head_dim: int,
q_lora_rank: int | None, q_lora_rank: int | None,
kv_lora_rank: int, kv_lora_rank: int,
rope_theta: float = 10000,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -274,8 +274,6 @@ class OpenPanguMLAAttention(nn.Module): ...@@ -274,8 +274,6 @@ class OpenPanguMLAAttention(nn.Module):
self.v_head_dim = v_head_dim self.v_head_dim = v_head_dim
self.q_lora_rank = q_lora_rank self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank self.kv_lora_rank = kv_lora_rank
self.rope_theta = rope_theta
self.tp_size = get_tensor_model_parallel_world_size() self.tp_size = get_tensor_model_parallel_world_size()
if num_heads % self.tp_size != 0: if num_heads % self.tp_size != 0:
raise ValueError( raise ValueError(
...@@ -339,7 +337,9 @@ class OpenPanguMLAAttention(nn.Module): ...@@ -339,7 +337,9 @@ class OpenPanguMLAAttention(nn.Module):
) )
# TODO: remove hard coding # TODO: remove hard coding
rope_scaling = { set_default_rope_theta(config, default_theta=10000)
rope_parameters = {
"rope_theta": config.rope_parameters["rope_theta"],
"beta_fast": 32, "beta_fast": 32,
"beta_slow": 1, "beta_slow": 1,
"factor": 1, "factor": 1,
...@@ -353,8 +353,7 @@ class OpenPanguMLAAttention(nn.Module): ...@@ -353,8 +353,7 @@ class OpenPanguMLAAttention(nn.Module):
qk_rope_head_dim, qk_rope_head_dim,
rotary_dim=qk_rope_head_dim, rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=False, is_neox_style=False,
) )
...@@ -407,8 +406,6 @@ class OpenPanguEmbeddedAttention(nn.Module): ...@@ -407,8 +406,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -454,7 +451,6 @@ class OpenPanguEmbeddedAttention(nn.Module): ...@@ -454,7 +451,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -475,9 +471,7 @@ class OpenPanguEmbeddedAttention(nn.Module): ...@@ -475,9 +471,7 @@ class OpenPanguEmbeddedAttention(nn.Module):
prefix=f"{prefix}.o_proj", prefix=f"{prefix}.o_proj",
) )
self._init_rotary_emb( self._init_rotary_emb(config, quant_config=quant_config)
config, rope_scaling=rope_scaling, quant_config=quant_config
)
if hasattr(config, "interleaved_sliding_window"): if hasattr(config, "interleaved_sliding_window"):
interleaved_sliding_window = config.interleaved_sliding_window interleaved_sliding_window = config.interleaved_sliding_window
...@@ -521,7 +515,6 @@ class OpenPanguEmbeddedAttention(nn.Module): ...@@ -521,7 +515,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
def _init_rotary_emb( def _init_rotary_emb(
self, self,
config: PretrainedConfig, config: PretrainedConfig,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,
) -> None: ) -> None:
is_neox_style = True is_neox_style = True
...@@ -533,8 +526,7 @@ class OpenPanguEmbeddedAttention(nn.Module): ...@@ -533,8 +526,7 @@ class OpenPanguEmbeddedAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
) )
...@@ -555,7 +547,6 @@ class OpenPanguDecoderLayer(nn.Module): ...@@ -555,7 +547,6 @@ class OpenPanguDecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
layer_idx = int(prefix.split(sep=".")[-1]) layer_idx = int(prefix.split(sep=".")[-1])
...@@ -579,7 +570,6 @@ class OpenPanguDecoderLayer(nn.Module): ...@@ -579,7 +570,6 @@ class OpenPanguDecoderLayer(nn.Module):
config.q_lora_rank if hasattr(config, "q_lora_rank") else None config.q_lora_rank if hasattr(config, "q_lora_rank") else None
), ),
kv_lora_rank=config.kv_lora_rank, kv_lora_rank=config.kv_lora_rank,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
...@@ -607,8 +597,6 @@ class OpenPanguDecoderLayer(nn.Module): ...@@ -607,8 +597,6 @@ class OpenPanguDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=getattr(config, "rope_scaling", None),
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -88,8 +88,7 @@ class OrionAttention(nn.Module): ...@@ -88,8 +88,7 @@ class OrionAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -115,7 +114,6 @@ class OrionAttention(nn.Module): ...@@ -115,7 +114,6 @@ class OrionAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -139,8 +137,7 @@ class OrionAttention(nn.Module): ...@@ -139,8 +137,7 @@ class OrionAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -175,15 +172,12 @@ class OrionDecoderLayer(nn.Module): ...@@ -175,15 +172,12 @@ class OrionDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = OrionAttention( self.self_attn = OrionAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -112,10 +112,8 @@ class OuroAttention(nn.Module): ...@@ -112,10 +112,8 @@ class OuroAttention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "", prefix: str = "",
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None, dual_chunk_attention_config: dict[str, Any] | None = None,
...@@ -140,7 +138,6 @@ class OuroAttention(nn.Module): ...@@ -140,7 +138,6 @@ class OuroAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config self.dual_chunk_attention_config = dual_chunk_attention_config
# Get total_ut_steps from config, default to 4 if not specified # Get total_ut_steps from config, default to 4 if not specified
...@@ -170,8 +167,7 @@ class OuroAttention(nn.Module): ...@@ -170,8 +167,7 @@ class OuroAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
) )
self.attn = nn.ModuleList() self.attn = nn.ModuleList()
...@@ -226,9 +222,6 @@ class OuroDecoderLayer(nn.Module): ...@@ -226,9 +222,6 @@ class OuroDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr( dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None config, "dual_chunk_attention_config", None
) )
...@@ -244,10 +237,8 @@ class OuroDecoderLayer(nn.Module): ...@@ -244,10 +237,8 @@ class OuroDecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
attn_type=attn_type, attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
......
...@@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module): ...@@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_parallel_world_size self.num_heads = self.total_num_heads // tensor_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.partial_rotary_factor = config.partial_rotary_factor self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True self.is_causal = True
...@@ -138,7 +137,7 @@ class PersimmonAttention(nn.Module): ...@@ -138,7 +137,7 @@ class PersimmonAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor, partial_rotary_factor=self.partial_rotary_factor,
) )
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
......
...@@ -115,16 +115,12 @@ class PhiAttention(nn.Module): ...@@ -115,16 +115,12 @@ class PhiAttention(nn.Module):
) )
assert rotary_dim % 2 == 0 assert rotary_dim % 2 == 0
# pylint: disable=C0301
# Refer to:
# https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518
rope_theta = getattr(config, "rope_theta", 10000.0)
max_position_embeddings = getattr(config, "max_position_embeddings", 2048) max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_size, self.head_size,
rotary_dim=rotary_dim, rotary_dim=rotary_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
......
...@@ -86,7 +86,7 @@ class PhiMoEConfig(PretrainedConfig): ...@@ -86,7 +86,7 @@ class PhiMoEConfig(PretrainedConfig):
bos_token_id=1, bos_token_id=1,
eos_token_id=2, eos_token_id=2,
tie_word_embeddings=False, tie_word_embeddings=False,
rope_theta=1e6, rope_parameters=None,
sliding_window=None, sliding_window=None,
attention_dropout=0.0, attention_dropout=0.0,
num_experts_per_tok=2, num_experts_per_tok=2,
...@@ -119,7 +119,9 @@ class PhiMoEConfig(PretrainedConfig): ...@@ -119,7 +119,9 @@ class PhiMoEConfig(PretrainedConfig):
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache self.use_cache = use_cache
self.rope_theta = rope_theta if rope_parameters is None:
rope_theta = kwargs.pop("rope_theta", 1e6)
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.attention_dropout = attention_dropout self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok self.num_experts_per_tok = num_experts_per_tok
...@@ -302,12 +304,11 @@ class PhiMoEAttention(nn.Module): ...@@ -302,12 +304,11 @@ class PhiMoEAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_parameters: dict,
head_dim: int | None = None, head_dim: int | None = None,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
rope_scaling: dict | None = None,
prefix: str = "", prefix: str = "",
) -> None: ) -> None:
super().__init__() super().__init__()
...@@ -332,8 +333,6 @@ class PhiMoEAttention(nn.Module): ...@@ -332,8 +333,6 @@ class PhiMoEAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
...@@ -355,9 +354,8 @@ class PhiMoEAttention(nn.Module): ...@@ -355,9 +354,8 @@ class PhiMoEAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=int(self.rope_theta), rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
rope_scaling=self.rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -393,7 +391,6 @@ class PhiMoEDecoderLayer(nn.Module): ...@@ -393,7 +391,6 @@ class PhiMoEDecoderLayer(nn.Module):
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0 # Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = PhiMoEAttention( self.self_attn = PhiMoEAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
...@@ -402,10 +399,9 @@ class PhiMoEDecoderLayer(nn.Module): ...@@ -402,10 +399,9 @@ class PhiMoEDecoderLayer(nn.Module):
head_dim=getattr( head_dim=getattr(
config, "head_dim", self.hidden_size // config.num_attention_heads config, "head_dim", self.hidden_size // config.num_attention_heads
), ),
rope_theta=rope_theta,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
rope_scaling=config.rope_scaling, rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
) )
self.block_sparse_moe = PhiMoE( self.block_sparse_moe = PhiMoE(
......
...@@ -567,10 +567,6 @@ class Plamo2AttentionMixer(nn.Module): ...@@ -567,10 +567,6 @@ class Plamo2AttentionMixer(nn.Module):
prefix=f"{prefix}.o_proj", prefix=f"{prefix}.o_proj",
) )
self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000
self.rope_scaling = (
config.rope_scaling if hasattr(config, "rope_scaling") else None
)
max_position = config.max_position_embeddings max_position = config.max_position_embeddings
if hasattr(vllm_config.model_config, "max_model_len") and isinstance( if hasattr(vllm_config.model_config, "max_model_len") and isinstance(
vllm_config.model_config.max_model_len, int vllm_config.model_config.max_model_len, int
...@@ -581,8 +577,7 @@ class Plamo2AttentionMixer(nn.Module): ...@@ -581,8 +577,7 @@ class Plamo2AttentionMixer(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=self.rope_scaling,
) )
self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps) self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps)
self.q_norm.weight = torch.nn.Parameter( self.q_norm.weight = torch.nn.Parameter(
......
...@@ -83,8 +83,7 @@ class QWenAttention(nn.Module): ...@@ -83,8 +83,7 @@ class QWenAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
max_position_embeddings: int, max_position_embeddings: int,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
rope_scaling: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
prefix: str = "", prefix: str = "",
...@@ -117,8 +116,7 @@ class QWenAttention(nn.Module): ...@@ -117,8 +116,7 @@ class QWenAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -153,14 +151,11 @@ class QWenBlock(nn.Module): ...@@ -153,14 +151,11 @@ class QWenBlock(nn.Module):
super().__init__() super().__init__()
self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.attn = QWenAttention( self.attn = QWenAttention(
config.hidden_size, config.hidden_size,
config.num_attention_heads, config.num_attention_heads,
config.max_position_embeddings, config.max_position_embeddings,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.attn", prefix=f"{prefix}.attn",
......
...@@ -57,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -57,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name, maybe_remap_kv_scale_name,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .utils import ( from .utils import (
...@@ -114,11 +114,10 @@ class Qwen2Attention(nn.Module): ...@@ -114,11 +114,10 @@ class Qwen2Attention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_parameters: dict[str, Any],
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "", prefix: str = "",
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None, dual_chunk_attention_config: dict[str, Any] | None = None,
...@@ -143,7 +142,6 @@ class Qwen2Attention(nn.Module): ...@@ -143,7 +142,6 @@ class Qwen2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config self.dual_chunk_attention_config = dual_chunk_attention_config
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -167,8 +165,7 @@ class Qwen2Attention(nn.Module): ...@@ -167,8 +165,7 @@ class Qwen2Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=self.rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
) )
attn_cls = ( attn_cls = (
...@@ -216,9 +213,7 @@ class Qwen2DecoderLayer(nn.Module): ...@@ -216,9 +213,7 @@ class Qwen2DecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0 set_default_rope_theta(config, default_theta=1000000)
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr( dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None config, "dual_chunk_attention_config", None
) )
...@@ -237,10 +232,9 @@ class Qwen2DecoderLayer(nn.Module): ...@@ -237,10 +232,9 @@ class Qwen2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
rope_scaling=rope_scaling, rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
attn_type=attn_type, attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
......
...@@ -641,7 +641,6 @@ class Qwen2_5_VisionTransformer(nn.Module): ...@@ -641,7 +641,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim // 2, rotary_dim=head_dim // 2,
max_position=8192, max_position=8192,
base=10000.0,
is_neox_style=True, is_neox_style=True,
) )
......
...@@ -194,8 +194,7 @@ class Qwen2MoeAttention(nn.Module): ...@@ -194,8 +194,7 @@ class Qwen2MoeAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -222,7 +221,6 @@ class Qwen2MoeAttention(nn.Module): ...@@ -222,7 +221,6 @@ class Qwen2MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.dual_chunk_attention_config = dual_chunk_attention_config self.dual_chunk_attention_config = dual_chunk_attention_config
...@@ -248,8 +246,7 @@ class Qwen2MoeAttention(nn.Module): ...@@ -248,8 +246,7 @@ class Qwen2MoeAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
) )
self.attn = Attention( self.attn = Attention(
...@@ -291,8 +288,6 @@ class Qwen2MoeDecoderLayer(nn.Module): ...@@ -291,8 +288,6 @@ class Qwen2MoeDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr( dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None config, "dual_chunk_attention_config", None
) )
...@@ -301,8 +296,7 @@ class Qwen2MoeDecoderLayer(nn.Module): ...@@ -301,8 +296,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -643,7 +643,6 @@ class Qwen2VisionTransformer(nn.Module): ...@@ -643,7 +643,6 @@ class Qwen2VisionTransformer(nn.Module):
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim // 2, rotary_dim=head_dim // 2,
max_position=8192, max_position=8192,
base=10000.0,
is_neox_style=True, is_neox_style=True,
) )
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment