Unverified Commit 6cbc1369 authored by Tom Aarsen, committed by GitHub

Fix RoPE config validation for FalconConfig + various config typos (#26929)

* Resolve incorrect ValueError in RoPE config for Falcon

* Add broken codeblock tag in Falcon Config

* Fix typo: an float -> a float

* Implement copy functionality for Fuyu and Persimmon for RoPE scaling validation

* Make style
parent a0fd3448
@@ -69,8 +69,8 @@ class OpenLlamaConfig(PretrainedConfig):
             Whether to tie weight embeddings
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -164,4 +164,4 @@ class OpenLlamaConfig(PretrainedConfig):
                     f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
                 )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
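For readers skimming the diff: the docstring and the validation above describe the same contract. Below is a minimal standalone sketch of that contract, with an illustrative dictionary; the helper name `validate_rope_scaling` is ours, not part of the library.

```python
# Minimal sketch of the validation described above: the dict must have exactly the
# keys "type" and "factor", the type must be "linear" or "dynamic", and the factor
# must be a float strictly greater than 1.
def validate_rope_scaling(rope_scaling):
    if rope_scaling is None:
        return
    if not isinstance(rope_scaling, dict) or len(rope_scaling) != 2:
        raise ValueError(
            f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {rope_scaling}"
        )
    rope_scaling_type = rope_scaling.get("type", None)
    rope_scaling_factor = rope_scaling.get("factor", None)
    if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
        raise ValueError(
            f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
        )
    if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
        raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")


validate_rope_scaling({"type": "linear", "factor": 2.0})  # passes silently
```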
@@ -79,8 +79,8 @@ class FalconConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -173,7 +173,7 @@ class FalconConfig(PretrainedConfig):
         if self.rope_scaling is None:
             return

-        if self.rotary:
+        if self.alibi:
             raise ValueError("`rope_scaling` is not supported when `alibi` is `True`.")
         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
@@ -188,4 +188,4 @@ class FalconConfig(PretrainedConfig):
                     f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
                 )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
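The substantive fix is in the middle hunk: `FalconConfig.rotary` is simply `not self.alibi`, so the old check raised exactly when rotary embeddings (and therefore RoPE scaling) were actually in use, rejecting every valid `rope_scaling` configuration. The check now fires only when ALiBi is enabled. A quick sketch of the corrected behaviour, assuming a transformers build that includes this commit and the `FalconConfig(alibi=..., rope_scaling=...)` keyword arguments:

```python
from transformers import FalconConfig

# With the fix, RoPE scaling is accepted whenever rotary embeddings are used (alibi=False)...
config = FalconConfig(alibi=False, rope_scaling={"type": "dynamic", "factor": 2.0})

# ...and rejected only when ALiBi positional biases are enabled instead.
try:
    FalconConfig(alibi=True, rope_scaling={"type": "dynamic", "factor": 2.0})
except ValueError as err:
    print(err)  # `rope_scaling` is not supported when `alibi` is `True`.
```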
@@ -72,8 +72,8 @@ class FuyuConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalFuyu/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -189,6 +189,7 @@ class FuyuConfig(PretrainedConfig):
             **kwargs,
         )

+    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
     def _rope_scaling_validation(self):
         """
         Validate the `rope_scaling` configuration.
@@ -208,4 +209,4 @@ class FuyuConfig(PretrainedConfig):
                     f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
                 )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
@@ -80,8 +80,8 @@ class GPTNeoXConfig(PretrainedConfig):
             speedup at large scales (e.g. 20B).
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -173,4 +173,4 @@ class GPTNeoXConfig(PretrainedConfig):
                     f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
                 )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
@@ -87,8 +87,8 @@ class LlamaConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
@@ -184,4 +184,4 @@ class LlamaConfig(PretrainedConfig):
                     f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
                 )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
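Note that the corrected wording ("a float > 1") still reflects a strict `isinstance(..., float)` check, so an integer factor is rejected even when it is greater than 1. A small illustration, assuming `LlamaConfig` from a release containing this validation:

```python
from transformers import LlamaConfig

# A float factor greater than 1 passes validation.
LlamaConfig(rope_scaling={"type": "linear", "factor": 4.0})

# An int factor fails the isinstance(..., float) check, even though 4 > 1.
try:
    LlamaConfig(rope_scaling={"type": "linear", "factor": 4})
except ValueError as err:
    print(err)  # `rope_scaling`'s factor field must be a float > 1, got 4
```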
@@ -65,8 +65,8 @@ class PersimmonConfig(PretrainedConfig):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
-            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
             these scaling strategies behave:
             https://www.reddit.com/r/LocalPersimmon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This
@@ -141,6 +141,7 @@ class PersimmonConfig(PretrainedConfig):
             **kwargs,
         )

+    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
     def _rope_scaling_validation(self):
         """
         Validate the `rope_scaling` configuration.
@@ -160,4 +161,4 @@ class PersimmonConfig(PretrainedConfig):
                     f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
                 )
         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
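The `# Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation` annotations added for Fuyu and Persimmon mark these methods as verbatim copies that the repository's copy-consistency tooling keeps in sync with the Llama original. The copied validation should therefore behave identically; a quick sanity check, again assuming a release containing this commit:

```python
from transformers import PersimmonConfig

# The copied validation rejects unsupported strategies with the same message as LlamaConfig.
try:
    PersimmonConfig(rope_scaling={"type": "xpos", "factor": 2.0})
except ValueError as err:
    print(err)  # `rope_scaling`'s type field must be one of ['linear', 'dynamic'], got xpos
```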