Unverified commit a8b70304, authored by Harry Mellor and committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
......@@ -67,16 +67,16 @@ class OAIAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
base=config.rope_theta,
dtype=torch.float32,
rope_scaling={
rope_parameters={
"rope_theta": config.rope_parameters["rope_theta"],
"rope_type": "yarn",
"factor": config.rope_scaling["factor"],
"original_max_position_embeddings": config.rope_scaling[
"factor": config.rope_parameters["factor"],
"original_max_position_embeddings": config.rope_parameters[
"original_max_position_embeddings"
],
"beta_fast": config.rope_scaling["beta_fast"],
"beta_slow": config.rope_scaling["beta_slow"],
"beta_fast": config.rope_parameters["beta_fast"],
"beta_slow": config.rope_parameters["beta_slow"],
},
is_neox_style=True,
)
......@@ -90,7 +90,6 @@ class OAIAttention(nn.Module):
self.q_size = self.num_attention_heads * self.head_dim // tp_size
self.kv_size = self.num_key_value_heads * self.head_dim // tp_size
self.scaling = self.head_dim**-0.5
self.rope_theta = config.rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size=self.hidden_size,
......
......@@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -112,8 +111,6 @@ class GraniteAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -143,7 +140,6 @@ class GraniteAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.attention_multiplier
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -167,8 +163,7 @@ class GraniteAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
......@@ -204,14 +199,6 @@ class GraniteDecoderLayer(nn.Module):
super().__init__()
self.hidden_size = config.hidden_size
self.residual_multiplier = config.residual_multiplier
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
......@@ -225,8 +212,6 @@ class GraniteDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
......
......@@ -141,8 +141,7 @@ class GraniteMoeAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
attention_multiplier: float | None = None,
......@@ -172,7 +171,6 @@ class GraniteMoeAttention(nn.Module):
if attention_multiplier is not None
else self.head_dim**-1
)
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
......@@ -194,9 +192,8 @@ class GraniteMoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=int(self.rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
rope_scaling=rope_scaling,
)
self.attn = Attention(
self.num_heads,
......@@ -235,16 +232,12 @@ class GraniteMoeDecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
......
......@@ -273,10 +273,7 @@ class GraniteMoeHybridAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
base=int(config.rope_theta),
rope_scaling=config.rope_scaling
if hasattr(config, "rope_scaling") and config.rope_scaling is not None
else None,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
else:
......
......@@ -84,16 +84,12 @@ class GraniteMoeSharedDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
......
......@@ -25,6 +25,7 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
import torch.nn.functional as F
......@@ -134,7 +135,7 @@ class Grok1Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
......@@ -161,7 +162,6 @@ class Grok1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
......@@ -183,7 +183,7 @@ class Grok1Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=int(self.rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
)
......@@ -234,15 +234,12 @@ class Grok1DecoderLayer(nn.Module):
if not self.use_fp8 and hasattr(quant_config, "is_fp8"):
self.use_fp8 = quant_config.is_fp8
# Requires transformers > 4.32.0
# Default rope_theta value if not in config
rope_theta = 10000
self.attn = Grok1Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
......
......@@ -27,7 +27,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
from typing import Any
import regex as re
import torch
......@@ -142,8 +141,6 @@ class HunYuanAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -177,7 +174,6 @@ class HunYuanAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id
......@@ -204,8 +200,7 @@ class HunYuanAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
......@@ -254,8 +249,6 @@ class HunYuanCrossAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -289,7 +282,6 @@ class HunYuanCrossAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id
......@@ -314,8 +306,7 @@ class HunYuanCrossAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
......@@ -494,14 +485,6 @@ class HunYuanDecoderLayer(nn.Module):
if isinstance(config.intermediate_size, int)
else config.intermediate_size[layer_id]
)
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
attention_bias = getattr(config, "attention_bias", False) or getattr(
config, "bias", False
......@@ -520,8 +503,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
......@@ -537,8 +518,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
......
......@@ -91,8 +91,7 @@ class InternLM2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -120,7 +119,6 @@ class InternLM2Attention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim
self.key_value_groups = int(self.num_heads / self.num_kv_heads)
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.wqkv = QKVParallelLinear(
......@@ -144,8 +142,7 @@ class InternLM2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
......@@ -204,15 +201,12 @@ class InternLMDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
......
......@@ -30,15 +30,12 @@ class InternLM2VEDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
......
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Any
import torch
from torch import nn
......@@ -190,9 +189,7 @@ class KimiMLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
rope_theta: float = 10000,
use_nope: bool = False,
rope_scaling: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
......@@ -210,11 +207,9 @@ class KimiMLAAttention(nn.Module):
tp_size = get_tensor_model_parallel_world_size()
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.use_nope = use_nope
assert self.use_nope is True
assert self.q_lora_rank is None
assert rope_scaling is None
assert num_heads % tp_size == 0
self.kv_a_proj_with_mqa = ReplicatedLinear(
self.hidden_size,
......
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
import torch.nn as nn
......@@ -96,8 +95,6 @@ class Lfm2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -126,7 +123,6 @@ class Lfm2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -149,8 +145,7 @@ class Lfm2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
......@@ -199,14 +194,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
self.config = config
self.layer_idx = layer_idx
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2Attention(
......@@ -215,8 +202,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
......
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
import torch.nn as nn
......@@ -189,8 +188,6 @@ class Lfm2MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -219,7 +216,6 @@ class Lfm2MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -242,8 +238,7 @@ class Lfm2MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
......@@ -293,14 +288,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
self.config = config
self.layer_idx = layer_idx
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2MoeAttention(
......@@ -309,8 +296,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
......
......@@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -120,8 +119,6 @@ class LlamaAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -157,7 +154,6 @@ class LlamaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
......@@ -186,9 +182,7 @@ class LlamaAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
self._init_rotary_emb(
config, rope_scaling=rope_scaling, quant_config=quant_config
)
self._init_rotary_emb(config, quant_config=quant_config)
sliding_window = None
if layer_types := getattr(config, "layer_types", None):
......@@ -258,7 +252,6 @@ class LlamaAttention(nn.Module):
def _init_rotary_emb(
self,
config: LlamaConfig,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
is_neox_style = True
......@@ -270,8 +263,7 @@ class LlamaAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
......@@ -291,14 +283,6 @@ class LlamaDecoderLayer(nn.Module):
quant_config = self.get_quant_config(vllm_config)
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
......@@ -326,8 +310,6 @@ class LlamaDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
......
......@@ -19,7 +19,6 @@
"""Inference-only LLaMA model compatible with HuggingFace weights."""
from collections.abc import Iterable
from typing import Any
import torch
from torch import nn
......@@ -171,8 +170,6 @@ class Llama4Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -208,7 +205,6 @@ class Llama4Attention(nn.Module):
self.floor_scale = getattr(config, "floor_scale", 8192.0)
self.attn_scale = getattr(config, "attn_scale", 0.1)
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.n_rep = self.num_heads // self.num_kv_heads
self.qk_norm = (
......@@ -248,8 +244,7 @@ class Llama4Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=int(rope_theta),
rope_scaling=rope_scaling if rope_scaling != "default" else None,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
if not self.nope
......@@ -331,8 +326,6 @@ class Llama4DecoderLayer(nn.Module):
self.layer_idx = extract_layer_index(prefix)
self.global_layer = config.no_rope_layers[self.layer_idx] == 0
self.hidden_size = config.hidden_size
rope_theta = config.rope_theta
rope_scaling = config.rope_scaling
max_position_embeddings = config.max_position_embeddings
self.self_attn = Llama4Attention(
......@@ -340,8 +333,6 @@ class Llama4DecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=False,
......
......@@ -108,8 +108,7 @@ class FlashConfig(PretrainedConfig):
eos_token_id=100001,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=1000000.0,
rope_scaling=None,
rope_parameters=None,
attention_bias=False,
attention_dropout=0.0,
mla_scale_q_lora=False,
......@@ -162,8 +161,13 @@ class FlashConfig(PretrainedConfig):
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 1000000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mla_scale_q_lora = mla_scale_q_lora
......@@ -336,15 +340,7 @@ class FlashDecoderLayer(nn.Module):
super().__init__()
self.layer_idx = int(prefix.split(sep=".")[-1])
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
# Dual attention structure
self.self_attn = nn.ModuleList(
......@@ -361,8 +357,6 @@ class FlashDecoderLayer(nn.Module):
config.q_lora_rank if hasattr(config, "q_lora_rank") else None
),
kv_lora_rank=config.kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=None
......
......@@ -230,8 +230,7 @@ class MiniCPMAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -257,7 +256,6 @@ class MiniCPMAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -281,8 +279,7 @@ class MiniCPMAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
......@@ -324,8 +321,6 @@ class MiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config
self.quant_config = quant_config
self.hidden_size = config.hidden_size
self.rope_theta = getattr(config, "rope_theta", 10000)
self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix
self._init_attn_block()
......@@ -339,8 +334,7 @@ class MiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
rope_parameters=self.config.rope_parameters,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,
......
......@@ -25,8 +25,6 @@
# limitations under the License.
"""Inference-only MiniCPM3 model compatible with HuggingFace weights."""
from typing import Any
import torch
from torch import nn
from transformers import PretrainedConfig
......@@ -62,8 +60,6 @@ class MiniCPM3Attention(nn.Module):
v_head_dim: int,
q_lora_rank: int,
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -84,7 +80,6 @@ class MiniCPM3Attention(nn.Module):
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.q_a_proj = ReplicatedLinear(
......@@ -127,8 +122,7 @@ class MiniCPM3Attention(nn.Module):
self.qk_rope_head_dim,
rotary_dim=self.qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_local_heads,
......@@ -204,8 +198,6 @@ class MiniCPM3DecoderLayer(MiniCPMDecoderLayer):
v_head_dim=self.config.v_head_dim,
q_lora_rank=self.config.q_lora_rank,
kv_lora_rank=self.config.kv_lora_rank,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,
......
......@@ -69,8 +69,6 @@ class EagleMiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config
self.quant_config = quant_config
self.hidden_size = config.hidden_size
self.rope_theta = getattr(config, "rope_theta", 10000)
self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix
self._init_attn_block()
......@@ -84,8 +82,7 @@ class EagleMiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
rope_parameters=self.config.rope_parameters,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,
......
......@@ -149,8 +149,7 @@ class MiniMaxM2Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
rotary_dim: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
attn_window_size: int | None = None,
max_position_embeddings: int = 8192,
head_dim: int | None = None,
......@@ -180,7 +179,6 @@ class MiniMaxM2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -205,8 +203,7 @@ class MiniMaxM2Attention(nn.Module):
self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
......@@ -252,8 +249,6 @@ class MiniMaxM2DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
max_position_embeddings = max(
......@@ -269,8 +264,7 @@ class MiniMaxM2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rotary_dim=config.rotary_dim,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),
......
......@@ -188,7 +188,7 @@ class MiniMaxText01Attention(nn.Module):
num_kv_heads: int,
rotary_dim: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
rope_parameters: dict | None = None,
sliding_window: int | None = None,
quant_config: QuantizationConfig | None = None,
layer_idx: int = None,
......@@ -214,7 +214,6 @@ class MiniMaxText01Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.sliding_window = sliding_window
self.prefix = prefix
......@@ -247,7 +246,7 @@ class MiniMaxText01Attention(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
base=int(rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
dtype=torch.float32,
)
......@@ -287,8 +286,6 @@ class MiniMaxText01DecoderLayer(nn.Module):
self.hidden_size = config.hidden_size
self.expert_num = expert_num
rope_theta = getattr(config, "rope_theta", 10000)
head_dim = getattr(config, "head_dim", None)
if head_dim is None:
head_dim = config.hidden_size // config.num_attention_heads
......@@ -328,7 +325,7 @@ class MiniMaxText01DecoderLayer(nn.Module):
else head_dim,
num_kv_heads=config.num_key_value_heads,
max_position=max_position_embeddings,
rope_theta=rope_theta,
rope_parameters=config.rope_parameters,
sliding_window=config.sliding_window,
quant_config=quant_config,
layer_idx=self._ilayer,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment