Unverified Commit a8b70304 authored by Harry Mellor's avatar Harry Mellor Committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
......@@ -156,8 +156,6 @@ class CohereAttention(nn.Module):
self.max_position_embeddings = getattr(
config, "model_max_length", None
) or getattr(config, "max_position_embeddings", 8192)
self.rope_theta = config.rope_theta
self.rope_scaling = getattr(config, "rope_scaling", None)
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.qkv_proj = QKVParallelLinear(
self.hidden_size,
......@@ -179,8 +177,7 @@ class CohereAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=self.rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
......
......@@ -8,6 +8,7 @@ import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.transformers_utils.config import set_default_rope_theta
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
......@@ -46,8 +47,7 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings,
"base": config.rope_theta,
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}
......@@ -78,12 +78,13 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
if not model_config.enforce_eager:
max_position = round_up(max_position, 8)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": max_position,
"base": getattr(config, "rope_theta", config.rotary_emb_base),
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}
......@@ -117,18 +118,20 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
head_dim = config.hidden_size // config.num_attention_heads
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": rotary_emb_dim,
"max_position": max_trained_positions,
"base": getattr(config, "rope_theta", config.rotary_emb_base),
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}
# we ignore config.rotary_scaling_factor so that for datasets shorter
# than max_trained_positions 2048, the results are consistent
# with SentenceTransformer.
# The context extension uses vllm style rope_theta and rope_scaling.
# The context extension uses vllm style rope_theta and rope_parameters.
# See #17785 #18755
if (
not vllm_config.model_config.hf_overrides
......@@ -172,7 +175,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
if hasattr(hf_text_config, "max_model_len"):
delattr(hf_text_config, "max_model_len")
hf_text_config.max_position_embeddings = max_trained_positions
hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"]
hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"]
# The priority of sentence_bert_config.json is higher
# than max_position_embeddings
......@@ -246,8 +249,7 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings,
"base": config.rope_theta,
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}
......
......@@ -197,7 +197,10 @@ class DbrxAttention(nn.Module):
self.head_dim = self.d_model // self.total_num_heads
self.total_num_kv_heads = config.attn_config.kv_n_heads
self.clip_qkv = config.attn_config.clip_qkv
self.rope_theta = config.attn_config.rope_theta
rope_parameters = {
"rope_type": "default",
"rope_theta": int(config.attn_config.rope_theta),
}
self.max_position = config.max_seq_len
# pylint: disable=invalid-name
......@@ -221,7 +224,7 @@ class DbrxAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position,
base=int(self.rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
)
......
......@@ -27,7 +27,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -111,8 +110,6 @@ class DeepseekAttention(nn.Module):
config: DeepseekV2Config | DeepseekV3Config,
hidden_size: int,
num_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -139,7 +136,6 @@ class DeepseekAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -162,8 +158,7 @@ class DeepseekAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
......@@ -409,8 +404,6 @@ class DeepseekV2Attention(nn.Module):
v_head_dim: int,
q_lora_rank: int,
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -430,7 +423,6 @@ class DeepseekV2Attention(nn.Module):
assert num_heads % tp_size == 0
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
assert topk_indices_buffer is None, (
"topk_indices_buffer is not \
......@@ -485,21 +477,20 @@ class DeepseekV2Attention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.o_proj",
)
if rope_scaling:
rope_scaling["rope_type"] = "deepseek_yarn"
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
......@@ -903,8 +894,6 @@ class DeepseekV2MLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -927,7 +916,6 @@ class DeepseekV2MLAAttention(nn.Module):
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if self.q_lora_rank is not None:
......@@ -981,19 +969,18 @@ class DeepseekV2MLAAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
if rope_scaling:
rope_scaling["rope_type"] = "deepseek_yarn"
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
......@@ -1073,8 +1060,6 @@ class DeepseekV2DecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
moe_layer_freq = getattr(config, "moe_layer_freq", 1)
# DecoderLayers are created with `make_layers` which passes the prefix
......@@ -1107,8 +1092,6 @@ class DeepseekV2DecoderLayer(nn.Module):
v_head_dim=v_head_dim,
q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
kv_lora_rank=kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
......
......@@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -202,8 +201,6 @@ class Dots1Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
config: Dots1Config,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -229,7 +226,6 @@ class Dots1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
attention_bias = config.attention_bias
......@@ -255,8 +251,7 @@ class Dots1Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
......@@ -296,8 +291,6 @@ class Dots1DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
layer_idx = int(prefix.split(sep=".")[-1])
self.layer_idx = layer_idx
......@@ -307,8 +300,6 @@ class Dots1DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
config=config,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
......
......@@ -62,6 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
from .utils import (
......@@ -232,9 +233,8 @@ class Ernie4_5_MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict[str, Any],
head_dim: int | None = None,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05,
qkv_bias: bool = False,
......@@ -266,7 +266,6 @@ class Ernie4_5_MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -291,9 +290,8 @@ class Ernie4_5_MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=rope_parameters,
is_neox_style=False,
rope_scaling=rope_scaling,
)
self.attn = Attention(
self.num_heads,
......@@ -333,16 +331,14 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 500000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=500000)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
self.self_attn = Ernie4_5_MoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "use_bias", False),
......
......@@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .ernie45_moe import Ernie4_5_MoeMLP
from .interfaces import SupportsPP
......@@ -91,9 +92,8 @@ class Ernie4_5_VLMoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict[str, Any],
head_dim: int | None = None,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
freq_allocation: int = 20,
max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05,
......@@ -126,7 +126,6 @@ class Ernie4_5_VLMoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -155,7 +154,7 @@ class Ernie4_5_VLMoeAttention(nn.Module):
head_size=self.head_dim,
rotary_dim=self.head_dim,
max_position_embeddings=max_position_embeddings,
base=rope_theta,
base=rope_parameters["rope_theta"],
is_neox_style=False,
dtype=torch.get_default_dtype(),
mrope_section=[h_rope, w_rope, t_rope],
......@@ -413,8 +412,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 500000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=500000)
freq_allocation = getattr(config, "freq_allocation", 20)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
......@@ -423,8 +421,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
freq_allocation=freq_allocation,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
......
......@@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -113,8 +112,6 @@ class ExaoneAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -144,7 +141,6 @@ class ExaoneAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -173,8 +169,7 @@ class ExaoneAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(
......@@ -207,8 +202,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -221,8 +214,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size=hidden_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=bias,
......@@ -251,14 +242,6 @@ class ExaoneDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
......@@ -272,8 +255,6 @@ class ExaoneDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
......
......@@ -23,7 +23,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -52,6 +51,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
......@@ -110,8 +110,6 @@ class Exaone4Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 1000000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -141,7 +139,6 @@ class Exaone4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -176,12 +173,12 @@ class Exaone4Attention(nn.Module):
# apply rotary embeddings to every layer in full attention models
self.apply_rope_all_layers = "sliding_attention" not in config.layer_types
set_default_rope_theta(config, default_theta=1000000)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(
......@@ -227,14 +224,6 @@ class Exaone4DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
......@@ -249,8 +238,6 @@ class Exaone4DecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
......
......@@ -164,13 +164,12 @@ class FalconAttention(nn.Module):
)
if self.use_rotary:
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
......
......@@ -35,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import (
HasInnerState,
......@@ -214,8 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
prefix: str = "",
) -> None:
super().__init__()
rope_theta = getattr(config, "rope_theta", 1e11)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1e11)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.hidden_size = config.hidden_size
tp_size = get_tensor_model_parallel_world_size()
......@@ -240,7 +240,6 @@ class FalconH1AttentionDecoderLayer(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
......@@ -254,8 +253,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
rope_scaling=rope_scaling,
base=rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=True,
dtype=None, # see impl of get_rope
)
......
......@@ -20,6 +20,7 @@
from collections.abc import Iterable
from functools import cache
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -127,8 +128,8 @@ class GemmaAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
head_dim: int,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
......@@ -153,7 +154,6 @@ class GemmaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
......@@ -176,7 +176,7 @@ class GemmaAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=self.rope_theta,
rope_parameters=rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
......@@ -218,7 +218,7 @@ class GemmaDecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
......
......@@ -107,7 +107,6 @@ class Gemma2Attention(nn.Module):
num_kv_heads: int,
head_dim: int,
max_position_embeddings: int,
rope_theta: float,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
attn_logits_soft_cap: float | None = None,
......@@ -134,7 +133,6 @@ class Gemma2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.query_pre_attn_scalar**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
......@@ -156,7 +154,7 @@ class Gemma2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=self.rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
......@@ -206,7 +204,6 @@ class Gemma2DecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta,
cache_config=cache_config,
quant_config=quant_config,
attn_logits_soft_cap=config.attn_logit_softcapping,
......
......@@ -155,25 +155,28 @@ class Gemma3Attention(nn.Module):
self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
layer_idx = extract_layer_index(prefix)
self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
layer_type = config.layer_types[layer_idx]
self.is_sliding = layer_type == "sliding_attention"
sliding_window = config.sliding_window if self.is_sliding else None
# Initialize the rotary embedding.
if self.is_sliding:
# Local attention. Override the values in config.json.
self.rope_theta = config.rope_local_base_freq
self.rope_scaling = {"rope_type": "default"}
if layer_type in config.rope_parameters:
# Transformers v5 rope config.
rope_parameters = config.rope_parameters[layer_type]
else:
# Transformers v4 rope config.
# Global attention. Use the values in config.json.
self.rope_theta = config.rope_theta
self.rope_scaling = config.rope_scaling
rope_parameters = config.rope_parameters.copy()
# Local attention. Override the values in config.json.
if self.is_sliding:
rope_parameters["rope_theta"] = config.rope_local_base_freq
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=self.rope_theta,
rope_parameters=rope_parameters,
is_neox_style=True,
rope_scaling=self.rope_scaling,
)
if getattr(config, "is_causal", True):
......
......@@ -332,18 +332,21 @@ class Gemma3nAttention(nn.Module):
)
layer_idx = extract_layer_index(prefix)
is_sliding = config.layer_types[layer_idx] == "sliding_attention"
layer_type = config.layer_types[layer_idx]
is_sliding = layer_type == "sliding_attention"
self.sliding_window = config.sliding_window if is_sliding else None
# Initialize the rotary embedding.
if is_sliding:
# Local attention. Override the values in config.json.
rope_theta = config.rope_local_base_freq
rope_scaling = {"rope_type": "default"}
if layer_type in config.rope_parameters:
# Transformers v5 rope config.
rope_parameters = config.rope_parameters[layer_type]
else:
# Transformers v4 rope config.
# Global attention. Use the values in config.json.
rope_theta = config.rope_theta
rope_scaling = config.rope_scaling
rope_parameters = config.rope_parameters.copy()
# Local attention. Override the values in config.json.
if is_sliding:
rope_parameters["rope_theta"] = config.rope_local_base_freq
first_kv_shared_layer_idx = (
config.num_hidden_layers - config.num_kv_shared_layers
......@@ -383,9 +386,8 @@ class Gemma3nAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=rope_parameters,
is_neox_style=True,
rope_scaling=rope_scaling,
)
self.attn = Attention(
......
......@@ -57,10 +57,8 @@ class Glm4Attention(nn.Module):
max_position: int = 4096 * 32,
head_dim: int | None = None,
qkv_bias: bool = False,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
) -> None:
......@@ -86,7 +84,6 @@ class Glm4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
self.head_dim,
......@@ -107,8 +104,7 @@ class Glm4Attention(nn.Module):
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
is_neox_style=False,
)
......@@ -150,8 +146,6 @@ class Glm4DecoderLayer(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Glm4Attention(
config=config,
......@@ -159,12 +153,10 @@ class Glm4DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None),
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn",
attn_type=AttentionType.DECODER,
)
......
......@@ -703,7 +703,6 @@ class Glm4vVisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
self.blocks = nn.ModuleList(
......
......@@ -26,7 +26,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
......@@ -233,8 +232,6 @@ class Glm4MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
head_dim: int | None = None,
rms_norm_eps: float = 1e-05,
......@@ -264,7 +261,6 @@ class Glm4MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = use_qk_norm
......@@ -291,8 +287,7 @@ class Glm4MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
)
self.attn = Attention(
......@@ -341,8 +336,6 @@ class Glm4MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
# DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index.
......@@ -354,8 +347,6 @@ class Glm4MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
head_dim=config.head_dim,
rms_norm_eps=config.rms_norm_eps,
......
......@@ -95,13 +95,12 @@ class GPTJAttention(nn.Module):
scaling = self.head_size**-0.5
assert getattr(config, "rotary", True)
assert config.rotary_dim % 2 == 0
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=config.rotary_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
self.attn = Attention(
......
......@@ -92,13 +92,12 @@ class GPTNeoXAttention(nn.Module):
scaling = self.head_size**-0.5
rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment