Unverified commit a8b70304, authored by Harry Mellor and committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
......@@ -88,8 +88,8 @@ class NemotronConfig(PretrainedConfig):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_parameters (`dict`, *optional*):
The parameters of the RoPE embeddings.
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`):
......@@ -132,8 +132,7 @@ class NemotronConfig(PretrainedConfig):
bos_token_id=2,
eos_token_id=3,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
rope_parameters=None,
partial_rotary_factor=0.5,
attention_bias=False,
attention_dropout=0.0,
......@@ -160,8 +159,13 @@ class NemotronConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.norm_eps = norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
# for backward compatibility
partial_rotary_factor = (
kwargs.get("rope_percent")
......@@ -169,7 +173,7 @@ class NemotronConfig(PretrainedConfig):
or partial_rotary_factor
)
self.partial_rotary_factor = partial_rotary_factor
self._rope_scaling_validation()
self._rope_parameters_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
......@@ -182,31 +186,29 @@ class NemotronConfig(PretrainedConfig):
**kwargs,
)
def _rope_scaling_validation(self):
def _rope_parameters_validation(self):
"""
Validate the `rope_scaling` configuration.
Validate the `rope_parameters` configuration.
"""
if self.rope_scaling is None:
if self.rope_parameters is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, "
f"`type` and `factor`, got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
"`rope_scaling`'s type field must be one of ['linear', "
f"'dynamic'], got {rope_scaling_type}"
)
if (
rope_scaling_factor is None
or not isinstance(rope_scaling_factor, float)
or rope_scaling_factor <= 1.0
):
rope_type: str | None = self.rope_parameters.get("rope_type", None)
factor: float | None = self.rope_parameters.get("factor", None)
if rope_type not in {"default", "linear", "dynamic"}:
raise ValueError(
"`rope_scaling`'s factor field must be a float > 1, got "
f"{rope_scaling_factor}"
"`rope_type` must be one of ['default', 'linear', 'dynamic'], "
f"got {rope_type}"
)
if rope_type != "default":
if factor is None:
raise ValueError(
"If `rope_type` is not 'default', `rope_parameters` "
"must include a `factor` field. Got `None`."
)
if not isinstance(factor, float) or factor <= 1.0:
raise ValueError(
"`rope_parameters`'s factor field must be a float > 1, got "
f"{factor}"
)
......@@ -24,8 +24,7 @@ class Olmo3Config(PretrainedConfig):
bos_token_id=None,
eos_token_id=50279,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
rope_parameters=None,
attention_bias=False,
attention_dropout=0.0,
rms_norm_eps=1e-5,
......@@ -63,8 +62,13 @@ class Olmo3Config(PretrainedConfig):
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
......
......@@ -66,13 +66,12 @@ class Qwen3NextConfig(PretrainedConfig):
relevant if `config.is_decoder=True`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
rope_parameters (`dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_theta` (`float`): The base period of the RoPE embeddings.
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
......@@ -199,8 +198,7 @@ class Qwen3NextConfig(PretrainedConfig):
rms_norm_eps=1e-6,
use_cache=True,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
rope_parameters=None,
partial_rotary_factor=0.25,
attention_bias=False,
attention_dropout=0.0,
......@@ -236,8 +234,13 @@ class Qwen3NextConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.partial_rotary_factor = partial_rotary_factor
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
......
......@@ -52,8 +52,7 @@ class Step3TextConfig(PretrainedConfig):
moe_intermediate_size: int = 5120,
moe_num_experts: int = 48,
moe_top_k: int = 3,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
max_position_embedding: int = 65536,
share_expert_dim: int = 5120,
share_q_dim: int = 2048,
......@@ -130,8 +129,13 @@ class Step3TextConfig(PretrainedConfig):
self.moe_intermediate_size = moe_intermediate_size
self.moe_num_experts = moe_num_experts
self.moe_top_k = moe_top_k
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 500000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.max_position_embedding = max_position_embedding
self.share_expert_dim = share_expert_dim
self.share_q_dim = share_q_dim
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment