"vscode:/vscode.git/clone" did not exist on "8e22da1d7fcd43efd8fec18c0c0bf6a8e7cf61a6"
Unverified commit a8b70304, authored by Harry Mellor, committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
...@@ -67,16 +67,16 @@ class OAIAttention(nn.Module): ...@@ -67,16 +67,16 @@ class OAIAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
base=config.rope_theta,
dtype=torch.float32, dtype=torch.float32,
rope_scaling={ rope_parameters={
"rope_theta": config.rope_parameters["rope_theta"],
"rope_type": "yarn", "rope_type": "yarn",
"factor": config.rope_scaling["factor"], "factor": config.rope_parameters["factor"],
"original_max_position_embeddings": config.rope_scaling[ "original_max_position_embeddings": config.rope_parameters[
"original_max_position_embeddings" "original_max_position_embeddings"
], ],
"beta_fast": config.rope_scaling["beta_fast"], "beta_fast": config.rope_parameters["beta_fast"],
"beta_slow": config.rope_scaling["beta_slow"], "beta_slow": config.rope_parameters["beta_slow"],
}, },
is_neox_style=True, is_neox_style=True,
) )
...@@ -90,7 +90,6 @@ class OAIAttention(nn.Module): ...@@ -90,7 +90,6 @@ class OAIAttention(nn.Module):
self.q_size = self.num_attention_heads * self.head_dim // tp_size self.q_size = self.num_attention_heads * self.head_dim // tp_size
self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = config.rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -112,8 +111,6 @@ class GraniteAttention(nn.Module): ...@@ -112,8 +111,6 @@ class GraniteAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -143,7 +140,6 @@ class GraniteAttention(nn.Module): ...@@ -143,7 +140,6 @@ class GraniteAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.attention_multiplier self.scaling = config.attention_multiplier
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -167,8 +163,7 @@ class GraniteAttention(nn.Module): ...@@ -167,8 +163,7 @@ class GraniteAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -204,14 +199,6 @@ class GraniteDecoderLayer(nn.Module): ...@@ -204,14 +199,6 @@ class GraniteDecoderLayer(nn.Module):
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.residual_multiplier = config.residual_multiplier self.residual_multiplier = config.residual_multiplier
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias # Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias # Support internlm/internlm-7b with bias
...@@ -225,8 +212,6 @@ class GraniteDecoderLayer(nn.Module): ...@@ -225,8 +212,6 @@ class GraniteDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -141,8 +141,7 @@ class GraniteMoeAttention(nn.Module): ...@@ -141,8 +141,7 @@ class GraniteMoeAttention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
rope_scaling: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
attention_multiplier: float | None = None, attention_multiplier: float | None = None,
...@@ -172,7 +171,6 @@ class GraniteMoeAttention(nn.Module): ...@@ -172,7 +171,6 @@ class GraniteMoeAttention(nn.Module):
if attention_multiplier is not None if attention_multiplier is not None
else self.head_dim**-1 else self.head_dim**-1
) )
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
...@@ -194,9 +192,8 @@ class GraniteMoeAttention(nn.Module): ...@@ -194,9 +192,8 @@ class GraniteMoeAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=int(self.rope_theta), rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -235,16 +232,12 @@ class GraniteMoeDecoderLayer(nn.Module): ...@@ -235,16 +232,12 @@ class GraniteMoeDecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention( self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
......
...@@ -273,10 +273,7 @@ class GraniteMoeHybridAttention(nn.Module): ...@@ -273,10 +273,7 @@ class GraniteMoeHybridAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
base=int(config.rope_theta), rope_parameters=config.rope_parameters,
rope_scaling=config.rope_scaling
if hasattr(config, "rope_scaling") and config.rope_scaling is not None
else None,
is_neox_style=True, is_neox_style=True,
) )
else: else:
......
...@@ -84,16 +84,12 @@ class GraniteMoeSharedDecoderLayer(nn.Module): ...@@ -84,16 +84,12 @@ class GraniteMoeSharedDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention( self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
...@@ -134,7 +135,7 @@ class Grok1Attention(nn.Module): ...@@ -134,7 +135,7 @@ class Grok1Attention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
prefix: str = "", prefix: str = "",
...@@ -161,7 +162,6 @@ class Grok1Attention(nn.Module): ...@@ -161,7 +162,6 @@ class Grok1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
...@@ -183,7 +183,7 @@ class Grok1Attention(nn.Module): ...@@ -183,7 +183,7 @@ class Grok1Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=int(self.rope_theta), rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
) )
...@@ -234,15 +234,12 @@ class Grok1DecoderLayer(nn.Module): ...@@ -234,15 +234,12 @@ class Grok1DecoderLayer(nn.Module):
if not self.use_fp8 and hasattr(quant_config, "is_fp8"): if not self.use_fp8 and hasattr(quant_config, "is_fp8"):
self.use_fp8 = quant_config.is_fp8 self.use_fp8 = quant_config.is_fp8
# Requires transformers > 4.32.0
# Default rope_theta value if not in config
rope_theta = 10000
self.attn = Grok1Attention( self.attn = Grok1Attention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.attn", prefix=f"{prefix}.attn",
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
import typing import typing
from collections.abc import Callable, Iterable from collections.abc import Callable, Iterable
from itertools import islice from itertools import islice
from typing import Any
import regex as re import regex as re
import torch import torch
...@@ -142,8 +141,6 @@ class HunYuanAttention(nn.Module): ...@@ -142,8 +141,6 @@ class HunYuanAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -177,7 +174,6 @@ class HunYuanAttention(nn.Module): ...@@ -177,7 +174,6 @@ class HunYuanAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False) self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id self.layer_id = layer_id
...@@ -204,8 +200,7 @@ class HunYuanAttention(nn.Module): ...@@ -204,8 +200,7 @@ class HunYuanAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
...@@ -254,8 +249,6 @@ class HunYuanCrossAttention(nn.Module): ...@@ -254,8 +249,6 @@ class HunYuanCrossAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -289,7 +282,6 @@ class HunYuanCrossAttention(nn.Module): ...@@ -289,7 +282,6 @@ class HunYuanCrossAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False) self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id self.layer_id = layer_id
...@@ -314,8 +306,7 @@ class HunYuanCrossAttention(nn.Module): ...@@ -314,8 +306,7 @@ class HunYuanCrossAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
...@@ -494,14 +485,6 @@ class HunYuanDecoderLayer(nn.Module): ...@@ -494,14 +485,6 @@ class HunYuanDecoderLayer(nn.Module):
if isinstance(config.intermediate_size, int) if isinstance(config.intermediate_size, int)
else config.intermediate_size[layer_id] else config.intermediate_size[layer_id]
) )
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
attention_bias = getattr(config, "attention_bias", False) or getattr( attention_bias = getattr(config, "attention_bias", False) or getattr(
config, "bias", False config, "bias", False
...@@ -520,8 +503,6 @@ class HunYuanDecoderLayer(nn.Module): ...@@ -520,8 +503,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
...@@ -537,8 +518,6 @@ class HunYuanDecoderLayer(nn.Module): ...@@ -537,8 +518,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -91,8 +91,7 @@ class InternLM2Attention(nn.Module): ...@@ -91,8 +91,7 @@ class InternLM2Attention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -120,7 +119,6 @@ class InternLM2Attention(nn.Module): ...@@ -120,7 +119,6 @@ class InternLM2Attention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.key_value_groups = int(self.num_heads / self.num_kv_heads) self.key_value_groups = int(self.num_heads / self.num_kv_heads)
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.wqkv = QKVParallelLinear( self.wqkv = QKVParallelLinear(
...@@ -144,8 +142,7 @@ class InternLM2Attention(nn.Module): ...@@ -144,8 +142,7 @@ class InternLM2Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -204,15 +201,12 @@ class InternLMDecoderLayer(nn.Module): ...@@ -204,15 +201,12 @@ class InternLMDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention( self.attention = InternLM2Attention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -30,15 +30,12 @@ class InternLM2VEDecoderLayer(nn.Module): ...@@ -30,15 +30,12 @@ class InternLM2VEDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention( self.attention = InternLM2Attention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable from collections.abc import Iterable
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -190,9 +189,7 @@ class KimiMLAAttention(nn.Module): ...@@ -190,9 +189,7 @@ class KimiMLAAttention(nn.Module):
v_head_dim: int, v_head_dim: int,
q_lora_rank: int | None, q_lora_rank: int | None,
kv_lora_rank: int, kv_lora_rank: int,
rope_theta: float = 10000,
use_nope: bool = False, use_nope: bool = False,
rope_scaling: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
prefix: str = "", prefix: str = "",
...@@ -210,11 +207,9 @@ class KimiMLAAttention(nn.Module): ...@@ -210,11 +207,9 @@ class KimiMLAAttention(nn.Module):
tp_size = get_tensor_model_parallel_world_size() tp_size = get_tensor_model_parallel_world_size()
self.num_local_heads = num_heads // tp_size self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5 self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.use_nope = use_nope self.use_nope = use_nope
assert self.use_nope is True assert self.use_nope is True
assert self.q_lora_rank is None assert self.q_lora_rank is None
assert rope_scaling is None
assert num_heads % tp_size == 0 assert num_heads % tp_size == 0
self.kv_a_proj_with_mqa = ReplicatedLinear( self.kv_a_proj_with_mqa = ReplicatedLinear(
self.hidden_size, self.hidden_size,
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
import torch.nn as nn import torch.nn as nn
...@@ -96,8 +95,6 @@ class Lfm2Attention(nn.Module): ...@@ -96,8 +95,6 @@ class Lfm2Attention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -126,7 +123,6 @@ class Lfm2Attention(nn.Module): ...@@ -126,7 +123,6 @@ class Lfm2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -149,8 +145,7 @@ class Lfm2Attention(nn.Module): ...@@ -149,8 +145,7 @@ class Lfm2Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
...@@ -199,14 +194,6 @@ class Lfm2AttentionDecoderLayer(nn.Module): ...@@ -199,14 +194,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
self.config = config self.config = config
self.layer_idx = layer_idx self.layer_idx = layer_idx
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2Attention( self.self_attn = Lfm2Attention(
...@@ -215,8 +202,6 @@ class Lfm2AttentionDecoderLayer(nn.Module): ...@@ -215,8 +202,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size, hidden_size=config.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
import torch.nn as nn import torch.nn as nn
...@@ -189,8 +188,6 @@ class Lfm2MoeAttention(nn.Module): ...@@ -189,8 +188,6 @@ class Lfm2MoeAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -219,7 +216,6 @@ class Lfm2MoeAttention(nn.Module): ...@@ -219,7 +216,6 @@ class Lfm2MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -242,8 +238,7 @@ class Lfm2MoeAttention(nn.Module): ...@@ -242,8 +238,7 @@ class Lfm2MoeAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
...@@ -293,14 +288,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module): ...@@ -293,14 +288,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
self.config = config self.config = config
self.layer_idx = layer_idx self.layer_idx = layer_idx
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2MoeAttention( self.self_attn = Lfm2MoeAttention(
...@@ -309,8 +296,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module): ...@@ -309,8 +296,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size, hidden_size=config.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -120,8 +119,6 @@ class LlamaAttention(nn.Module): ...@@ -120,8 +119,6 @@ class LlamaAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -157,7 +154,6 @@ class LlamaAttention(nn.Module): ...@@ -157,7 +154,6 @@ class LlamaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
llama_4_scaling_config = getattr(config, "llama_4_scaling", None) llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
...@@ -186,9 +182,7 @@ class LlamaAttention(nn.Module): ...@@ -186,9 +182,7 @@ class LlamaAttention(nn.Module):
prefix=f"{prefix}.o_proj", prefix=f"{prefix}.o_proj",
) )
self._init_rotary_emb( self._init_rotary_emb(config, quant_config=quant_config)
config, rope_scaling=rope_scaling, quant_config=quant_config
)
sliding_window = None sliding_window = None
if layer_types := getattr(config, "layer_types", None): if layer_types := getattr(config, "layer_types", None):
...@@ -258,7 +252,6 @@ class LlamaAttention(nn.Module): ...@@ -258,7 +252,6 @@ class LlamaAttention(nn.Module):
def _init_rotary_emb( def _init_rotary_emb(
self, self,
config: LlamaConfig, config: LlamaConfig,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,
) -> None: ) -> None:
is_neox_style = True is_neox_style = True
...@@ -270,8 +263,7 @@ class LlamaAttention(nn.Module): ...@@ -270,8 +263,7 @@ class LlamaAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor, partial_rotary_factor=self.partial_rotary_factor,
) )
...@@ -291,14 +283,6 @@ class LlamaDecoderLayer(nn.Module): ...@@ -291,14 +283,6 @@ class LlamaDecoderLayer(nn.Module):
quant_config = self.get_quant_config(vllm_config) quant_config = self.get_quant_config(vllm_config)
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias # Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias # Support internlm/internlm-7b with bias
...@@ -326,8 +310,6 @@ class LlamaDecoderLayer(nn.Module): ...@@ -326,8 +310,6 @@ class LlamaDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
"""Inference-only LLaMA model compatible with HuggingFace weights.""" """Inference-only LLaMA model compatible with HuggingFace weights."""
from collections.abc import Iterable from collections.abc import Iterable
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -171,8 +170,6 @@ class Llama4Attention(nn.Module): ...@@ -171,8 +170,6 @@ class Llama4Attention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -208,7 +205,6 @@ class Llama4Attention(nn.Module): ...@@ -208,7 +205,6 @@ class Llama4Attention(nn.Module):
self.floor_scale = getattr(config, "floor_scale", 8192.0) self.floor_scale = getattr(config, "floor_scale", 8192.0)
self.attn_scale = getattr(config, "attn_scale", 0.1) self.attn_scale = getattr(config, "attn_scale", 0.1)
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.n_rep = self.num_heads // self.num_kv_heads self.n_rep = self.num_heads // self.num_kv_heads
self.qk_norm = ( self.qk_norm = (
...@@ -248,8 +244,7 @@ class Llama4Attention(nn.Module): ...@@ -248,8 +244,7 @@ class Llama4Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=int(rope_theta), rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling if rope_scaling != "default" else None,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
) )
if not self.nope if not self.nope
...@@ -331,8 +326,6 @@ class Llama4DecoderLayer(nn.Module): ...@@ -331,8 +326,6 @@ class Llama4DecoderLayer(nn.Module):
self.layer_idx = extract_layer_index(prefix) self.layer_idx = extract_layer_index(prefix)
self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.global_layer = config.no_rope_layers[self.layer_idx] == 0
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = config.rope_theta
rope_scaling = config.rope_scaling
max_position_embeddings = config.max_position_embeddings max_position_embeddings = config.max_position_embeddings
self.self_attn = Llama4Attention( self.self_attn = Llama4Attention(
...@@ -340,8 +333,6 @@ class Llama4DecoderLayer(nn.Module): ...@@ -340,8 +333,6 @@ class Llama4DecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=False, bias=False,
......
...@@ -108,8 +108,7 @@ class FlashConfig(PretrainedConfig): ...@@ -108,8 +108,7 @@ class FlashConfig(PretrainedConfig):
eos_token_id=100001, eos_token_id=100001,
pretraining_tp=1, pretraining_tp=1,
tie_word_embeddings=False, tie_word_embeddings=False,
rope_theta=1000000.0, rope_parameters=None,
rope_scaling=None,
attention_bias=False, attention_bias=False,
attention_dropout=0.0, attention_dropout=0.0,
mla_scale_q_lora=False, mla_scale_q_lora=False,
...@@ -162,8 +161,13 @@ class FlashConfig(PretrainedConfig): ...@@ -162,8 +161,13 @@ class FlashConfig(PretrainedConfig):
self.rms_norm_eps = rms_norm_eps self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp self.pretraining_tp = pretraining_tp
self.use_cache = use_cache self.use_cache = use_cache
self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
self.rope_scaling = rope_scaling rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 1000000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias self.attention_bias = attention_bias
self.attention_dropout = attention_dropout self.attention_dropout = attention_dropout
self.mla_scale_q_lora = mla_scale_q_lora self.mla_scale_q_lora = mla_scale_q_lora
...@@ -336,15 +340,7 @@ class FlashDecoderLayer(nn.Module): ...@@ -336,15 +340,7 @@ class FlashDecoderLayer(nn.Module):
super().__init__() super().__init__()
self.layer_idx = int(prefix.split(sep=".")[-1]) self.layer_idx = int(prefix.split(sep=".")[-1])
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
# Dual attention structure # Dual attention structure
self.self_attn = nn.ModuleList( self.self_attn = nn.ModuleList(
...@@ -361,8 +357,6 @@ class FlashDecoderLayer(nn.Module): ...@@ -361,8 +357,6 @@ class FlashDecoderLayer(nn.Module):
config.q_lora_rank if hasattr(config, "q_lora_rank") else None config.q_lora_rank if hasattr(config, "q_lora_rank") else None
), ),
kv_lora_rank=config.kv_lora_rank, kv_lora_rank=config.kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=None quant_config=None
......
...@@ -230,8 +230,7 @@ class MiniCPMAttention(nn.Module): ...@@ -230,8 +230,7 @@ class MiniCPMAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -257,7 +256,6 @@ class MiniCPMAttention(nn.Module): ...@@ -257,7 +256,6 @@ class MiniCPMAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -281,8 +279,7 @@ class MiniCPMAttention(nn.Module): ...@@ -281,8 +279,7 @@ class MiniCPMAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
...@@ -324,8 +321,6 @@ class MiniCPMDecoderLayer(nn.Module): ...@@ -324,8 +321,6 @@ class MiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config self.cache_config = cache_config
self.quant_config = quant_config self.quant_config = quant_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.rope_theta = getattr(config, "rope_theta", 10000)
self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix self.prefix = prefix
self._init_attn_block() self._init_attn_block()
...@@ -339,8 +334,7 @@ class MiniCPMDecoderLayer(nn.Module): ...@@ -339,8 +334,7 @@ class MiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads, num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads, num_kv_heads=self.config.num_key_value_heads,
rope_theta=self.rope_theta, rope_parameters=self.config.rope_parameters,
rope_scaling=self.rope_scaling,
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config, cache_config=self.cache_config,
quant_config=self.quant_config, quant_config=self.quant_config,
......
...@@ -25,8 +25,6 @@ ...@@ -25,8 +25,6 @@
# limitations under the License. # limitations under the License.
"""Inference-only MiniCPM3 model compatible with HuggingFace weights.""" """Inference-only MiniCPM3 model compatible with HuggingFace weights."""
from typing import Any
import torch import torch
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
...@@ -62,8 +60,6 @@ class MiniCPM3Attention(nn.Module): ...@@ -62,8 +60,6 @@ class MiniCPM3Attention(nn.Module):
v_head_dim: int, v_head_dim: int,
q_lora_rank: int, q_lora_rank: int,
kv_lora_rank: int, kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -84,7 +80,6 @@ class MiniCPM3Attention(nn.Module): ...@@ -84,7 +80,6 @@ class MiniCPM3Attention(nn.Module):
self.num_local_heads = num_heads // tp_size self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5 self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.q_a_proj = ReplicatedLinear( self.q_a_proj = ReplicatedLinear(
...@@ -127,8 +122,7 @@ class MiniCPM3Attention(nn.Module): ...@@ -127,8 +122,7 @@ class MiniCPM3Attention(nn.Module):
self.qk_rope_head_dim, self.qk_rope_head_dim,
rotary_dim=self.qk_rope_head_dim, rotary_dim=self.qk_rope_head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_local_heads, self.num_local_heads,
...@@ -204,8 +198,6 @@ class MiniCPM3DecoderLayer(MiniCPMDecoderLayer): ...@@ -204,8 +198,6 @@ class MiniCPM3DecoderLayer(MiniCPMDecoderLayer):
v_head_dim=self.config.v_head_dim, v_head_dim=self.config.v_head_dim,
q_lora_rank=self.config.q_lora_rank, q_lora_rank=self.config.q_lora_rank,
kv_lora_rank=self.config.kv_lora_rank, kv_lora_rank=self.config.kv_lora_rank,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config, cache_config=self.cache_config,
quant_config=self.quant_config, quant_config=self.quant_config,
......
...@@ -69,8 +69,6 @@ class EagleMiniCPMDecoderLayer(nn.Module): ...@@ -69,8 +69,6 @@ class EagleMiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config self.cache_config = cache_config
self.quant_config = quant_config self.quant_config = quant_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.rope_theta = getattr(config, "rope_theta", 10000)
self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix self.prefix = prefix
self._init_attn_block() self._init_attn_block()
...@@ -84,8 +82,7 @@ class EagleMiniCPMDecoderLayer(nn.Module): ...@@ -84,8 +82,7 @@ class EagleMiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads, num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads, num_kv_heads=self.config.num_key_value_heads,
rope_theta=self.rope_theta, rope_parameters=self.config.rope_parameters,
rope_scaling=self.rope_scaling,
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config, cache_config=self.cache_config,
quant_config=self.quant_config, quant_config=self.quant_config,
......
...@@ -149,8 +149,7 @@ class MiniMaxM2Attention(nn.Module): ...@@ -149,8 +149,7 @@ class MiniMaxM2Attention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rotary_dim: int, rotary_dim: int,
rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None,
rope_scaling: dict[str, Any] | None = None,
attn_window_size: int | None = None, attn_window_size: int | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
head_dim: int | None = None, head_dim: int | None = None,
...@@ -180,7 +179,6 @@ class MiniMaxM2Attention(nn.Module): ...@@ -180,7 +179,6 @@ class MiniMaxM2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -205,8 +203,7 @@ class MiniMaxM2Attention(nn.Module): ...@@ -205,8 +203,7 @@ class MiniMaxM2Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=rotary_dim, rotary_dim=rotary_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -252,8 +249,6 @@ class MiniMaxM2DecoderLayer(nn.Module): ...@@ -252,8 +249,6 @@ class MiniMaxM2DecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
max_position_embeddings = max( max_position_embeddings = max(
...@@ -269,8 +264,7 @@ class MiniMaxM2DecoderLayer(nn.Module): ...@@ -269,8 +264,7 @@ class MiniMaxM2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rotary_dim=config.rotary_dim, rotary_dim=config.rotary_dim,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps, rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False), qkv_bias=getattr(config, "attention_bias", False),
......
...@@ -188,7 +188,7 @@ class MiniMaxText01Attention(nn.Module): ...@@ -188,7 +188,7 @@ class MiniMaxText01Attention(nn.Module):
num_kv_heads: int, num_kv_heads: int,
rotary_dim: int, rotary_dim: int,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000, rope_parameters: dict | None = None,
sliding_window: int | None = None, sliding_window: int | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
layer_idx: int = None, layer_idx: int = None,
...@@ -214,7 +214,6 @@ class MiniMaxText01Attention(nn.Module): ...@@ -214,7 +214,6 @@ class MiniMaxText01Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.sliding_window = sliding_window self.sliding_window = sliding_window
self.prefix = prefix self.prefix = prefix
...@@ -247,7 +246,7 @@ class MiniMaxText01Attention(nn.Module): ...@@ -247,7 +246,7 @@ class MiniMaxText01Attention(nn.Module):
head_size=self.head_dim, head_size=self.head_dim,
rotary_dim=rotary_dim, rotary_dim=rotary_dim,
max_position=max_position, max_position=max_position,
base=int(rope_theta), rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
dtype=torch.float32, dtype=torch.float32,
) )
...@@ -287,8 +286,6 @@ class MiniMaxText01DecoderLayer(nn.Module): ...@@ -287,8 +286,6 @@ class MiniMaxText01DecoderLayer(nn.Module):
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.expert_num = expert_num self.expert_num = expert_num
rope_theta = getattr(config, "rope_theta", 10000)
head_dim = getattr(config, "head_dim", None) head_dim = getattr(config, "head_dim", None)
if head_dim is None: if head_dim is None:
head_dim = config.hidden_size // config.num_attention_heads head_dim = config.hidden_size // config.num_attention_heads
...@@ -328,7 +325,7 @@ class MiniMaxText01DecoderLayer(nn.Module): ...@@ -328,7 +325,7 @@ class MiniMaxText01DecoderLayer(nn.Module):
else head_dim, else head_dim,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
max_position=max_position_embeddings, max_position=max_position_embeddings,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
sliding_window=config.sliding_window, sliding_window=config.sliding_window,
quant_config=quant_config, quant_config=quant_config,
layer_idx=self._ilayer, layer_idx=self._ilayer,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment