Unverified commit a8b70304, authored by Harry Mellor, committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
...@@ -156,8 +156,6 @@ class CohereAttention(nn.Module): ...@@ -156,8 +156,6 @@ class CohereAttention(nn.Module):
self.max_position_embeddings = getattr( self.max_position_embeddings = getattr(
config, "model_max_length", None config, "model_max_length", None
) or getattr(config, "max_position_embeddings", 8192) ) or getattr(config, "max_position_embeddings", 8192)
self.rope_theta = config.rope_theta
self.rope_scaling = getattr(config, "rope_scaling", None)
self.use_qk_norm = getattr(config, "use_qk_norm", False) self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
self.hidden_size, self.hidden_size,
...@@ -179,8 +177,7 @@ class CohereAttention(nn.Module): ...@@ -179,8 +177,7 @@ class CohereAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=self.rope_scaling,
is_neox_style=False, is_neox_style=False,
) )
......
...@@ -8,6 +8,7 @@ import vllm.envs as envs ...@@ -8,6 +8,7 @@ import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.transformers_utils.config import set_default_rope_theta
from vllm.utils.math_utils import cdiv, round_up from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
...@@ -46,8 +47,7 @@ class GteNewModelConfig(VerifyAndUpdateConfig): ...@@ -46,8 +47,7 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim, "head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings, "max_position": config.max_position_embeddings,
"base": config.rope_theta, "rope_parameters": config.rope_parameters,
"rope_scaling": getattr(config, "rope_scaling", None),
} }
...@@ -78,12 +78,13 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): ...@@ -78,12 +78,13 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
if not model_config.enforce_eager: if not model_config.enforce_eager:
max_position = round_up(max_position, 8) max_position = round_up(max_position, 8)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = { config.rotary_kwargs = {
"head_size": head_dim, "head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": max_position, "max_position": max_position,
"base": getattr(config, "rope_theta", config.rotary_emb_base), "rope_parameters": config.rope_parameters,
"rope_scaling": getattr(config, "rope_scaling", None),
} }
...@@ -117,18 +118,20 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): ...@@ -117,18 +118,20 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
head_dim = config.hidden_size // config.num_attention_heads head_dim = config.hidden_size // config.num_attention_heads
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048) max_trained_positions = getattr(config, "max_trained_positions", 2048)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = { config.rotary_kwargs = {
"head_size": head_dim, "head_size": head_dim,
"rotary_dim": rotary_emb_dim, "rotary_dim": rotary_emb_dim,
"max_position": max_trained_positions, "max_position": max_trained_positions,
"base": getattr(config, "rope_theta", config.rotary_emb_base), "rope_parameters": config.rope_parameters,
"rope_scaling": getattr(config, "rope_scaling", None),
} }
# we ignore config.rotary_scaling_factor so that for datasets shorter # we ignore config.rotary_scaling_factor so that for datasets shorter
# than max_trained_positions 2048, the results are consistent # than max_trained_positions 2048, the results are consistent
# with SentenceTransformer. # with SentenceTransformer.
# The context extension uses vllm style rope_theta and rope_scaling. # The context extension uses vllm style rope_theta and rope_parameters.
# See #17785 #18755 # See #17785 #18755
if ( if (
not vllm_config.model_config.hf_overrides not vllm_config.model_config.hf_overrides
...@@ -172,7 +175,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): ...@@ -172,7 +175,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
if hasattr(hf_text_config, "max_model_len"): if hasattr(hf_text_config, "max_model_len"):
delattr(hf_text_config, "max_model_len") delattr(hf_text_config, "max_model_len")
hf_text_config.max_position_embeddings = max_trained_positions hf_text_config.max_position_embeddings = max_trained_positions
hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"]
# The priority of sentence_bert_config.json is higher # The priority of sentence_bert_config.json is higher
# than max_position_embeddings # than max_position_embeddings
...@@ -246,8 +249,7 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): ...@@ -246,8 +249,7 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim, "head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings, "max_position": config.max_position_embeddings,
"base": config.rope_theta, "rope_parameters": config.rope_parameters,
"rope_scaling": getattr(config, "rope_scaling", None),
} }
......
...@@ -197,7 +197,10 @@ class DbrxAttention(nn.Module): ...@@ -197,7 +197,10 @@ class DbrxAttention(nn.Module):
self.head_dim = self.d_model // self.total_num_heads self.head_dim = self.d_model // self.total_num_heads
self.total_num_kv_heads = config.attn_config.kv_n_heads self.total_num_kv_heads = config.attn_config.kv_n_heads
self.clip_qkv = config.attn_config.clip_qkv self.clip_qkv = config.attn_config.clip_qkv
self.rope_theta = config.attn_config.rope_theta rope_parameters = {
"rope_type": "default",
"rope_theta": int(config.attn_config.rope_theta),
}
self.max_position = config.max_seq_len self.max_position = config.max_seq_len
# pylint: disable=invalid-name # pylint: disable=invalid-name
...@@ -221,7 +224,7 @@ class DbrxAttention(nn.Module): ...@@ -221,7 +224,7 @@ class DbrxAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position, max_position=self.max_position,
base=int(self.rope_theta), rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
) )
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
import typing import typing
from collections.abc import Callable, Iterable from collections.abc import Callable, Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -111,8 +110,6 @@ class DeepseekAttention(nn.Module): ...@@ -111,8 +110,6 @@ class DeepseekAttention(nn.Module):
config: DeepseekV2Config | DeepseekV3Config, config: DeepseekV2Config | DeepseekV3Config,
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -139,7 +136,6 @@ class DeepseekAttention(nn.Module): ...@@ -139,7 +136,6 @@ class DeepseekAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -162,8 +158,7 @@ class DeepseekAttention(nn.Module): ...@@ -162,8 +158,7 @@ class DeepseekAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -409,8 +404,6 @@ class DeepseekV2Attention(nn.Module): ...@@ -409,8 +404,6 @@ class DeepseekV2Attention(nn.Module):
v_head_dim: int, v_head_dim: int,
q_lora_rank: int, q_lora_rank: int,
kv_lora_rank: int, kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -430,7 +423,6 @@ class DeepseekV2Attention(nn.Module): ...@@ -430,7 +423,6 @@ class DeepseekV2Attention(nn.Module):
assert num_heads % tp_size == 0 assert num_heads % tp_size == 0
self.num_local_heads = num_heads // tp_size self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5 self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
assert topk_indices_buffer is None, ( assert topk_indices_buffer is None, (
"topk_indices_buffer is not \ "topk_indices_buffer is not \
...@@ -485,21 +477,20 @@ class DeepseekV2Attention(nn.Module): ...@@ -485,21 +477,20 @@ class DeepseekV2Attention(nn.Module):
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.o_proj", prefix=f"{prefix}.o_proj",
) )
if rope_scaling: if config.rope_parameters["rope_type"] != "default":
rope_scaling["rope_type"] = "deepseek_yarn" config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
qk_rope_head_dim, qk_rope_head_dim,
rotary_dim=qk_rope_head_dim, rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=False, is_neox_style=False,
) )
if rope_scaling: if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = rope_scaling.get("mscale_all_dim", False) mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"] scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale self.scaling = self.scaling * mscale * mscale
...@@ -903,8 +894,6 @@ class DeepseekV2MLAAttention(nn.Module): ...@@ -903,8 +894,6 @@ class DeepseekV2MLAAttention(nn.Module):
v_head_dim: int, v_head_dim: int,
q_lora_rank: int | None, q_lora_rank: int | None,
kv_lora_rank: int, kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -927,7 +916,6 @@ class DeepseekV2MLAAttention(nn.Module): ...@@ -927,7 +916,6 @@ class DeepseekV2MLAAttention(nn.Module):
self.num_local_heads = num_heads // tp_size self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5 self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
if self.q_lora_rank is not None: if self.q_lora_rank is not None:
...@@ -981,19 +969,18 @@ class DeepseekV2MLAAttention(nn.Module): ...@@ -981,19 +969,18 @@ class DeepseekV2MLAAttention(nn.Module):
prefix=f"{prefix}.o_proj", prefix=f"{prefix}.o_proj",
) )
if rope_scaling: if config.rope_parameters["rope_type"] != "default":
rope_scaling["rope_type"] = "deepseek_yarn" config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
qk_rope_head_dim, qk_rope_head_dim,
rotary_dim=qk_rope_head_dim, rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=False, is_neox_style=False,
) )
if rope_scaling: if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = rope_scaling.get("mscale_all_dim", False) mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"] scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale self.scaling = self.scaling * mscale * mscale
...@@ -1073,8 +1060,6 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1073,8 +1060,6 @@ class DeepseekV2DecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
moe_layer_freq = getattr(config, "moe_layer_freq", 1) moe_layer_freq = getattr(config, "moe_layer_freq", 1)
# DecoderLayers are created with `make_layers` which passes the prefix # DecoderLayers are created with `make_layers` which passes the prefix
...@@ -1107,8 +1092,6 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1107,8 +1092,6 @@ class DeepseekV2DecoderLayer(nn.Module):
v_head_dim=v_head_dim, v_head_dim=v_head_dim,
q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
kv_lora_rank=kv_lora_rank, kv_lora_rank=kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -202,8 +201,6 @@ class Dots1Attention(nn.Module): ...@@ -202,8 +201,6 @@ class Dots1Attention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
config: Dots1Config, config: Dots1Config,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
...@@ -229,7 +226,6 @@ class Dots1Attention(nn.Module): ...@@ -229,7 +226,6 @@ class Dots1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
attention_bias = config.attention_bias attention_bias = config.attention_bias
...@@ -255,8 +251,7 @@ class Dots1Attention(nn.Module): ...@@ -255,8 +251,7 @@ class Dots1Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -296,8 +291,6 @@ class Dots1DecoderLayer(nn.Module): ...@@ -296,8 +291,6 @@ class Dots1DecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
layer_idx = int(prefix.split(sep=".")[-1]) layer_idx = int(prefix.split(sep=".")[-1])
self.layer_idx = layer_idx self.layer_idx = layer_idx
...@@ -307,8 +300,6 @@ class Dots1DecoderLayer(nn.Module): ...@@ -307,8 +300,6 @@ class Dots1DecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
config=config, config=config,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
......
...@@ -62,6 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -62,6 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name, maybe_remap_kv_scale_name,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
from .utils import ( from .utils import (
...@@ -232,9 +233,8 @@ class Ernie4_5_MoeAttention(nn.Module): ...@@ -232,9 +233,8 @@ class Ernie4_5_MoeAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_parameters: dict[str, Any],
head_dim: int | None = None, head_dim: int | None = None,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072, max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05, rms_norm_eps: float = 1e-05,
qkv_bias: bool = False, qkv_bias: bool = False,
...@@ -266,7 +266,6 @@ class Ernie4_5_MoeAttention(nn.Module): ...@@ -266,7 +266,6 @@ class Ernie4_5_MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -291,9 +290,8 @@ class Ernie4_5_MoeAttention(nn.Module): ...@@ -291,9 +290,8 @@ class Ernie4_5_MoeAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
is_neox_style=False, is_neox_style=False,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -333,16 +331,14 @@ class Ernie4_5_MoeDecoderLayer(nn.Module): ...@@ -333,16 +331,14 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 500000) set_default_rope_theta(config, default_theta=500000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072) max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
self.self_attn = Ernie4_5_MoeAttention( self.self_attn = Ernie4_5_MoeAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None), head_dim=getattr(config, "head_dim", None),
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps, rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "use_bias", False), qkv_bias=getattr(config, "use_bias", False),
......
...@@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name, maybe_remap_kv_scale_name,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .ernie45_moe import Ernie4_5_MoeMLP from .ernie45_moe import Ernie4_5_MoeMLP
from .interfaces import SupportsPP from .interfaces import SupportsPP
...@@ -91,9 +92,8 @@ class Ernie4_5_VLMoeAttention(nn.Module): ...@@ -91,9 +92,8 @@ class Ernie4_5_VLMoeAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_parameters: dict[str, Any],
head_dim: int | None = None, head_dim: int | None = None,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
freq_allocation: int = 20, freq_allocation: int = 20,
max_position_embeddings: int = 131072, max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05, rms_norm_eps: float = 1e-05,
...@@ -126,7 +126,6 @@ class Ernie4_5_VLMoeAttention(nn.Module): ...@@ -126,7 +126,6 @@ class Ernie4_5_VLMoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -155,7 +154,7 @@ class Ernie4_5_VLMoeAttention(nn.Module): ...@@ -155,7 +154,7 @@ class Ernie4_5_VLMoeAttention(nn.Module):
head_size=self.head_dim, head_size=self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
base=rope_theta, base=rope_parameters["rope_theta"],
is_neox_style=False, is_neox_style=False,
dtype=torch.get_default_dtype(), dtype=torch.get_default_dtype(),
mrope_section=[h_rope, w_rope, t_rope], mrope_section=[h_rope, w_rope, t_rope],
...@@ -413,8 +412,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module): ...@@ -413,8 +412,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 500000) set_default_rope_theta(config, default_theta=500000)
rope_scaling = getattr(config, "rope_scaling", None)
freq_allocation = getattr(config, "freq_allocation", 20) freq_allocation = getattr(config, "freq_allocation", 20)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072) max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
...@@ -423,8 +421,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module): ...@@ -423,8 +421,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None), head_dim=getattr(config, "head_dim", None),
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
freq_allocation=freq_allocation, freq_allocation=freq_allocation,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps, rms_norm_eps=config.rms_norm_eps,
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -113,8 +112,6 @@ class ExaoneAttention(nn.Module): ...@@ -113,8 +112,6 @@ class ExaoneAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -144,7 +141,6 @@ class ExaoneAttention(nn.Module): ...@@ -144,7 +141,6 @@ class ExaoneAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -173,8 +169,7 @@ class ExaoneAttention(nn.Module): ...@@ -173,8 +169,7 @@ class ExaoneAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
) )
self.attn = Attention( self.attn = Attention(
...@@ -207,8 +202,6 @@ class ExaoneBlockAttention(nn.Module): ...@@ -207,8 +202,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -221,8 +214,6 @@ class ExaoneBlockAttention(nn.Module): ...@@ -221,8 +214,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size=hidden_size, hidden_size=hidden_size,
num_heads=num_heads, num_heads=num_heads,
num_kv_heads=num_kv_heads, num_kv_heads=num_kv_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=bias, bias=bias,
...@@ -251,14 +242,6 @@ class ExaoneDecoderLayer(nn.Module): ...@@ -251,14 +242,6 @@ class ExaoneDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias # Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias # Support internlm/internlm-7b with bias
...@@ -272,8 +255,6 @@ class ExaoneDecoderLayer(nn.Module): ...@@ -272,8 +255,6 @@ class ExaoneDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
from collections.abc import Iterable from collections.abc import Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -52,6 +51,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -52,6 +51,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name, maybe_remap_kv_scale_name,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .utils import ( from .utils import (
...@@ -110,8 +110,6 @@ class Exaone4Attention(nn.Module): ...@@ -110,8 +110,6 @@ class Exaone4Attention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 1000000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -141,7 +139,6 @@ class Exaone4Attention(nn.Module): ...@@ -141,7 +139,6 @@ class Exaone4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -176,12 +173,12 @@ class Exaone4Attention(nn.Module): ...@@ -176,12 +173,12 @@ class Exaone4Attention(nn.Module):
# apply rotary embeddings to every layer in full attention models # apply rotary embeddings to every layer in full attention models
self.apply_rope_all_layers = "sliding_attention" not in config.layer_types self.apply_rope_all_layers = "sliding_attention" not in config.layer_types
set_default_rope_theta(config, default_theta=1000000)
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
) )
self.attn = Attention( self.attn = Attention(
...@@ -227,14 +224,6 @@ class Exaone4DecoderLayer(nn.Module): ...@@ -227,14 +224,6 @@ class Exaone4DecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias # Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias # Support internlm/internlm-7b with bias
...@@ -249,8 +238,6 @@ class Exaone4DecoderLayer(nn.Module): ...@@ -249,8 +238,6 @@ class Exaone4DecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -164,13 +164,12 @@ class FalconAttention(nn.Module): ...@@ -164,13 +164,12 @@ class FalconAttention(nn.Module):
) )
if self.use_rotary: if self.use_rotary:
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
......
...@@ -35,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -35,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
) )
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import ( from .interfaces import (
HasInnerState, HasInnerState,
...@@ -214,8 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module): ...@@ -214,8 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
prefix: str = "", prefix: str = "",
) -> None: ) -> None:
super().__init__() super().__init__()
rope_theta = getattr(config, "rope_theta", 1e11) set_default_rope_theta(config, default_theta=1e11)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
tp_size = get_tensor_model_parallel_world_size() tp_size = get_tensor_model_parallel_world_size()
...@@ -240,7 +240,6 @@ class FalconH1AttentionDecoderLayer(nn.Module): ...@@ -240,7 +240,6 @@ class FalconH1AttentionDecoderLayer(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"): if hasattr(config, "partial_rotary_factor"):
...@@ -254,8 +253,7 @@ class FalconH1AttentionDecoderLayer(nn.Module): ...@@ -254,8 +253,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
head_size=self.head_dim, head_size=self.head_dim,
rotary_dim=rotary_dim, rotary_dim=rotary_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
rope_scaling=rope_scaling, rope_parameters=config.rope_parameters,
base=rope_theta,
is_neox_style=True, is_neox_style=True,
dtype=None, # see impl of get_rope dtype=None, # see impl of get_rope
) )
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
from collections.abc import Iterable from collections.abc import Iterable
from functools import cache from functools import cache
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -127,8 +128,8 @@ class GemmaAttention(nn.Module): ...@@ -127,8 +128,8 @@ class GemmaAttention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
head_dim: int, head_dim: int,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
prefix: str = "", prefix: str = "",
...@@ -153,7 +154,6 @@ class GemmaAttention(nn.Module): ...@@ -153,7 +154,6 @@ class GemmaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
...@@ -176,7 +176,7 @@ class GemmaAttention(nn.Module): ...@@ -176,7 +176,7 @@ class GemmaAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=self.rope_theta, rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
...@@ -218,7 +218,7 @@ class GemmaDecoderLayer(nn.Module): ...@@ -218,7 +218,7 @@ class GemmaDecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim, head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings, max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta, rope_parameters=config.rope_parameters,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
......
...@@ -107,7 +107,6 @@ class Gemma2Attention(nn.Module): ...@@ -107,7 +107,6 @@ class Gemma2Attention(nn.Module):
num_kv_heads: int, num_kv_heads: int,
head_dim: int, head_dim: int,
max_position_embeddings: int, max_position_embeddings: int,
rope_theta: float,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
attn_logits_soft_cap: float | None = None, attn_logits_soft_cap: float | None = None,
...@@ -134,7 +133,6 @@ class Gemma2Attention(nn.Module): ...@@ -134,7 +133,6 @@ class Gemma2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.query_pre_attn_scalar**-0.5 self.scaling = config.query_pre_attn_scalar**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
...@@ -156,7 +154,7 @@ class Gemma2Attention(nn.Module): ...@@ -156,7 +154,7 @@ class Gemma2Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
is_neox_style=True, is_neox_style=True,
) )
...@@ -206,7 +204,6 @@ class Gemma2DecoderLayer(nn.Module): ...@@ -206,7 +204,6 @@ class Gemma2DecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim, head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings, max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
attn_logits_soft_cap=config.attn_logit_softcapping, attn_logits_soft_cap=config.attn_logit_softcapping,
......
...@@ -155,25 +155,28 @@ class Gemma3Attention(nn.Module): ...@@ -155,25 +155,28 @@ class Gemma3Attention(nn.Module):
self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
layer_idx = extract_layer_index(prefix) layer_idx = extract_layer_index(prefix)
self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" layer_type = config.layer_types[layer_idx]
self.is_sliding = layer_type == "sliding_attention"
sliding_window = config.sliding_window if self.is_sliding else None sliding_window = config.sliding_window if self.is_sliding else None
# Initialize the rotary embedding. # Initialize the rotary embedding.
if self.is_sliding: if layer_type in config.rope_parameters:
# Local attention. Override the values in config.json. # Transformers v5 rope config.
self.rope_theta = config.rope_local_base_freq rope_parameters = config.rope_parameters[layer_type]
self.rope_scaling = {"rope_type": "default"}
else: else:
# Transformers v4 rope config.
# Global attention. Use the values in config.json. # Global attention. Use the values in config.json.
self.rope_theta = config.rope_theta rope_parameters = config.rope_parameters.copy()
self.rope_scaling = config.rope_scaling # Local attention. Override the values in config.json.
if self.is_sliding:
rope_parameters["rope_theta"] = config.rope_local_base_freq
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=self.rope_theta, rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
rope_scaling=self.rope_scaling,
) )
if getattr(config, "is_causal", True): if getattr(config, "is_causal", True):
......
...@@ -332,18 +332,21 @@ class Gemma3nAttention(nn.Module): ...@@ -332,18 +332,21 @@ class Gemma3nAttention(nn.Module):
) )
layer_idx = extract_layer_index(prefix) layer_idx = extract_layer_index(prefix)
is_sliding = config.layer_types[layer_idx] == "sliding_attention" layer_type = config.layer_types[layer_idx]
is_sliding = layer_type == "sliding_attention"
self.sliding_window = config.sliding_window if is_sliding else None self.sliding_window = config.sliding_window if is_sliding else None
# Initialize the rotary embedding. # Initialize the rotary embedding.
if is_sliding: if layer_type in config.rope_parameters:
# Local attention. Override the values in config.json. # Transformers v5 rope config.
rope_theta = config.rope_local_base_freq rope_parameters = config.rope_parameters[layer_type]
rope_scaling = {"rope_type": "default"}
else: else:
# Transformers v4 rope config.
# Global attention. Use the values in config.json. # Global attention. Use the values in config.json.
rope_theta = config.rope_theta rope_parameters = config.rope_parameters.copy()
rope_scaling = config.rope_scaling # Local attention. Override the values in config.json.
if is_sliding:
rope_parameters["rope_theta"] = config.rope_local_base_freq
first_kv_shared_layer_idx = ( first_kv_shared_layer_idx = (
config.num_hidden_layers - config.num_kv_shared_layers config.num_hidden_layers - config.num_kv_shared_layers
...@@ -383,9 +386,8 @@ class Gemma3nAttention(nn.Module): ...@@ -383,9 +386,8 @@ class Gemma3nAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
is_neox_style=True, is_neox_style=True,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
......
...@@ -57,10 +57,8 @@ class Glm4Attention(nn.Module): ...@@ -57,10 +57,8 @@ class Glm4Attention(nn.Module):
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
head_dim: int | None = None, head_dim: int | None = None,
qkv_bias: bool = False, qkv_bias: bool = False,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "", prefix: str = "",
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
) -> None: ) -> None:
...@@ -86,7 +84,6 @@ class Glm4Attention(nn.Module): ...@@ -86,7 +84,6 @@ class Glm4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
self.head_dim, self.head_dim,
...@@ -107,8 +104,7 @@ class Glm4Attention(nn.Module): ...@@ -107,8 +104,7 @@ class Glm4Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.rotary_dim, rotary_dim=self.rotary_dim,
max_position=max_position, max_position=max_position,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
partial_rotary_factor=partial_rotary_factor, partial_rotary_factor=partial_rotary_factor,
is_neox_style=False, is_neox_style=False,
) )
...@@ -150,8 +146,6 @@ class Glm4DecoderLayer(nn.Module): ...@@ -150,8 +146,6 @@ class Glm4DecoderLayer(nn.Module):
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Glm4Attention( self.self_attn = Glm4Attention(
config=config, config=config,
...@@ -159,12 +153,10 @@ class Glm4DecoderLayer(nn.Module): ...@@ -159,12 +153,10 @@ class Glm4DecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
qkv_bias=getattr(config, "attention_bias", False), qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None), head_dim=getattr(config, "head_dim", None),
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
attn_type=AttentionType.DECODER, attn_type=AttentionType.DECODER,
) )
......
...@@ -703,7 +703,6 @@ class Glm4vVisionTransformer(nn.Module): ...@@ -703,7 +703,6 @@ class Glm4vVisionTransformer(nn.Module):
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim // 2, rotary_dim=head_dim // 2,
max_position=8192, max_position=8192,
base=10000.0,
is_neox_style=True, is_neox_style=True,
) )
self.blocks = nn.ModuleList( self.blocks = nn.ModuleList(
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
import typing import typing
from collections.abc import Callable, Iterable from collections.abc import Callable, Iterable
from itertools import islice from itertools import islice
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -233,8 +232,6 @@ class Glm4MoeAttention(nn.Module): ...@@ -233,8 +232,6 @@ class Glm4MoeAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072, max_position_embeddings: int = 131072,
head_dim: int | None = None, head_dim: int | None = None,
rms_norm_eps: float = 1e-05, rms_norm_eps: float = 1e-05,
...@@ -264,7 +261,6 @@ class Glm4MoeAttention(nn.Module): ...@@ -264,7 +261,6 @@ class Glm4MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = use_qk_norm self.use_qk_norm = use_qk_norm
...@@ -291,8 +287,7 @@ class Glm4MoeAttention(nn.Module): ...@@ -291,8 +287,7 @@ class Glm4MoeAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
partial_rotary_factor=partial_rotary_factor, partial_rotary_factor=partial_rotary_factor,
) )
self.attn = Attention( self.attn = Attention(
...@@ -341,8 +336,6 @@ class Glm4MoeDecoderLayer(nn.Module): ...@@ -341,8 +336,6 @@ class Glm4MoeDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072) max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
# DecoderLayers are created with `make_layers` which passes the prefix # DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index. # with the layer's index.
...@@ -354,8 +347,6 @@ class Glm4MoeDecoderLayer(nn.Module): ...@@ -354,8 +347,6 @@ class Glm4MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
head_dim=config.head_dim, head_dim=config.head_dim,
rms_norm_eps=config.rms_norm_eps, rms_norm_eps=config.rms_norm_eps,
......
...@@ -95,13 +95,12 @@ class GPTJAttention(nn.Module): ...@@ -95,13 +95,12 @@ class GPTJAttention(nn.Module):
scaling = self.head_size**-0.5 scaling = self.head_size**-0.5
assert getattr(config, "rotary", True) assert getattr(config, "rotary", True)
assert config.rotary_dim % 2 == 0 assert config.rotary_dim % 2 == 0
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_size, self.head_size,
rotary_dim=config.rotary_dim, rotary_dim=config.rotary_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
is_neox_style=False, is_neox_style=False,
) )
self.attn = Attention( self.attn = Attention(
......
...@@ -92,13 +92,12 @@ class GPTNeoXAttention(nn.Module): ...@@ -92,13 +92,12 @@ class GPTNeoXAttention(nn.Module):
scaling = self.head_size**-0.5 scaling = self.head_size**-0.5
rotary_dim = int(self.head_size * config.rotary_pct) rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0 assert rotary_dim % 2 == 0
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope( self.rotary_emb = get_rope(
self.head_size, self.head_size,
rotary_dim=rotary_dim, rotary_dim=rotary_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment