Unverified commit a8b70304, authored by Harry Mellor and committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
......@@ -42,6 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP
......@@ -57,14 +58,13 @@ class Qwen3Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict,
max_position: int = 4096 * 32,
head_dim: int | None = None,
rms_norm_eps: float = 1e-06,
qkv_bias: bool = False,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
......@@ -89,7 +89,6 @@ class Qwen3Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config
self.qkv_proj = QKVParallelLinear(
......@@ -113,8 +112,7 @@ class Qwen3Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
......@@ -166,9 +164,7 @@ class Qwen3DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1000000)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
......@@ -187,13 +183,12 @@ class Qwen3DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None),
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,
......
......@@ -216,8 +216,7 @@ class Qwen3MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
head_dim: int | None = None,
rms_norm_eps: float = 1e-06,
......@@ -247,7 +246,6 @@ class Qwen3MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.dual_chunk_attention_config = dual_chunk_attention_config
......@@ -273,8 +271,7 @@ class Qwen3MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
......@@ -326,8 +323,6 @@ class Qwen3MoeDecoderLayer(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
......@@ -336,8 +331,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),
......
......@@ -748,8 +748,7 @@ class Qwen3NextAttention(nn.Module):
head_size=self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
base=config.rope_theta,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
partial_rotary_factor=config.partial_rotary_factor,
dual_chunk_attention_config=self.dual_chunk_attention_config,
)
......
......@@ -338,7 +338,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
......
......@@ -345,7 +345,6 @@ class Qwen3_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
......
......@@ -54,6 +54,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
......@@ -112,11 +113,10 @@ class SeedOssAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
head_dim: int,
rope_parameters: dict,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
) -> None:
......@@ -140,7 +140,6 @@ class SeedOssAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
......@@ -163,8 +162,7 @@ class SeedOssAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
......@@ -200,9 +198,7 @@ class SeedOssDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1000000)
# By default, SeedOss uses causal attention as it is a
# decoder-only model.
......@@ -219,10 +215,9 @@ class SeedOssDecoderLayer(nn.Module):
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
)
......
......@@ -25,7 +25,6 @@
"""Inference-only Solar model compatible with HuggingFace weights."""
from collections.abc import Iterable
from typing import Any
import torch
from torch import nn
......@@ -111,8 +110,6 @@ class SolarAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
......@@ -142,7 +139,6 @@ class SolarAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
......@@ -166,8 +162,7 @@ class SolarAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
......@@ -202,15 +197,6 @@ class SolarDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
......@@ -224,8 +210,6 @@ class SolarDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
......
......@@ -153,7 +153,7 @@ class StablelmAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.config.max_position_embeddings,
base=self.config.rope_theta,
rope_parameters=self.config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
......
......@@ -91,7 +91,6 @@ class Starcoder2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = config.rope_theta
self.max_position_embeddings = config.max_position_embeddings
self.use_bias = config.use_bias
......@@ -115,7 +114,7 @@ class Starcoder2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=int(self.rope_theta),
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
......
......@@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.step3_vl import Step3TextConfig
from .interfaces import SupportsPP
from .utils import (
......@@ -144,9 +145,8 @@ class Step3TextAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
norm_eps: float,
rope_theta: int,
rope_parameters: dict[str, Any],
share_q_dim: int | None = None,
rope_scaling: dict[str, Any] | None = None,
max_position_embedding: int = 8192,
head_dim: int = 256,
cache_config: CacheConfig | None = None,
......@@ -198,8 +198,7 @@ class Step3TextAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embedding,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
scaling = self.head_dim**-0.5
self.attn = Attention(
......@@ -227,15 +226,13 @@ class Step3TextAttention(nn.Module):
class Step3TextDecoderLayer(nn.Module):
def __init__(
self,
config: ModelConfig,
config: Step3TextConfig,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
) -> None:
super().__init__()
config = config.hf_config
self.hidden_size = config.hidden_size
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Step3TextAttention(
hidden_size=self.hidden_size,
......@@ -247,8 +244,7 @@ class Step3TextDecoderLayer(nn.Module):
max_position_embedding=config.max_position_embedding,
head_dim=config.head_dim,
share_q_dim=config.share_q_dim,
rope_theta=config.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
)
......@@ -338,7 +334,7 @@ class Step3TextModel(nn.Module):
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
lambda prefix: Step3TextDecoderLayer(
config=vllm_config.model_config,
config=config,
cache_config=cache_config,
quant_config=quant_config,
prefix=prefix,
......
......@@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, Literal
import torch
from torch import nn
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from vllm.config.utils import getattr_iter
from vllm.logger import init_logger
......@@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool:
"""
text_config = vllm_config.model_config.hf_config.get_text_config()
# Dynamic rope scaling is not compatible with torch.compile
rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {}
return rope_scaling.get("rope_type") != "dynamic"
rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {}
if rope_parameters:
# Nest rope_parameters if not nested already to simplify logic
if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
rope_parameters = {"": rope_parameters}
return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values())
return True
......@@ -128,7 +128,6 @@ class Zamba2Attention(nn.Module):
tp_size = get_tensor_model_parallel_world_size()
self.config = config
self.num_hybrid_layers = num_hybrid_layers
self.rope_theta = config.rope_theta
self.attention_hidden_size = config.attention_hidden_size
self.total_num_attention_heads = config.num_attention_heads
......@@ -233,8 +232,7 @@ class Zamba2Attention(nn.Module):
head_size=self.attention_head_dim,
rotary_dim=self.attention_head_dim,
max_position=config.max_position_embeddings,
base=self.rope_theta,
rope_scaling=None,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
......
......@@ -7,8 +7,9 @@ import time
from collections.abc import Callable
from dataclasses import asdict
from functools import cache, partial
from importlib.metadata import version
from pathlib import Path
from typing import Any, Literal, TypeVar
from typing import Any, Literal, TypeAlias, TypeVar
import huggingface_hub
from huggingface_hub import (
......@@ -24,7 +25,9 @@ from huggingface_hub.utils import (
RepositoryNotFoundError,
RevisionNotFoundError,
)
from packaging.version import Version
from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
......@@ -390,21 +393,61 @@ def file_or_path_exists(
)
def patch_rope_scaling(config: PretrainedConfig) -> None:
def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None:
    """Ensure ``config.rope_parameters`` carries a ``rope_theta`` entry.

    Some model configs use RoPE without declaring a ``rope_theta``. This
    fills in ``default_theta`` for them, mutating ``config`` in place.

    Args:
        config: Config object to update. If it has no ``rope_parameters``
            attribute, or the attribute is ``None``, a new dict with
            ``rope_type`` set to ``"default"`` is attached first.
        default_theta: Value stored under ``rope_theta`` when that key is
            not already present. An existing value is never overwritten.
    """
    rope_params = getattr(config, "rope_parameters", None)
    if rope_params is None:
        # No RoPE parameters at all: start from the plain default variant.
        rope_params = {"rope_type": "default"}
        config.rope_parameters = rope_params
    # Only fills the key when it is missing; an explicit theta wins.
    rope_params.setdefault("rope_theta", default_theta)
def patch_rope_parameters(config: PretrainedConfig) -> None:
"""Provide backwards compatibility for RoPE."""
text_config = getattr(config, "text_config", None)
if text_config is not None:
patch_rope_scaling(text_config)
# Retrieve rope_parameters differently based on Transformers version
if Version(version("transformers")) >= Version("5.0.0.dev0"):
from transformers.modeling_rope_utils import RopeParameters
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None:
patch_rope_scaling_dict(rope_scaling)
rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr(
config, "rope_parameters", None
)
elif hasattr(config, "rope_parameters"):
# We are in Transformers v4 and rope_parameters
# has already been patched for this config
return
else:
# Convert Transformers v4 rope_theta and rope_scaling into rope_parameters
rope_theta: float | None = getattr(config, "rope_theta", None)
rope_scaling: dict | None = getattr(config, "rope_scaling", None)
rope_parameters = rope_scaling
# Move rope_theta into rope_parameters
if rope_theta is not None:
rope_parameters = rope_parameters or {"rope_type": "default"}
rope_parameters["rope_theta"] = rope_theta
# Add original_max_position_embeddings if present
if rope_parameters and (
ompe := getattr(config, "original_max_position_embeddings", None)
):
rope_parameters["original_max_position_embeddings"] = ompe
# Write back to config
config.rope_parameters = rope_parameters
# No RoPE parameters to patch
if rope_parameters is None:
return
# Handle nested rope_parameters in interleaved sliding attention models
if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
for rope_parameters_layer_type in rope_parameters.values():
patch_rope_parameters_dict(rope_parameters_layer_type)
else:
patch_rope_parameters_dict(rope_parameters)
def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
if "rope_type" in rope_scaling and "type" in rope_scaling:
rope_type = rope_scaling["rope_type"]
rope_type_legacy = rope_scaling["type"]
def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
if "rope_type" in rope_parameters and "type" in rope_parameters:
rope_type = rope_parameters["rope_type"]
rope_type_legacy = rope_parameters["type"]
if rope_type != rope_type_legacy:
raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern "
......@@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
"You should only specify one of them."
)
if "rope_type" not in rope_scaling and "type" in rope_scaling:
rope_scaling["rope_type"] = rope_scaling["type"]
if "rope_type" not in rope_parameters and "type" in rope_parameters:
rope_parameters["rope_type"] = rope_parameters["type"]
logger.info("Replacing legacy 'type' key with 'rope_type'")
if "rope_type" not in rope_scaling:
raise ValueError("rope_scaling should have a 'rope_type' key")
if "rope_type" not in rope_parameters:
raise ValueError("rope_parameters should have a 'rope_type' key")
if rope_scaling["rope_type"] == "su":
rope_scaling["rope_type"] = "longrope"
if rope_parameters["rope_type"] == "su":
rope_parameters["rope_type"] = "longrope"
logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
elif rope_scaling["rope_type"] == "mrope":
assert "mrope_section" in rope_scaling
rope_scaling["rope_type"] = "default"
elif rope_parameters["rope_type"] == "mrope":
assert "mrope_section" in rope_parameters
rope_parameters["rope_type"] = "default"
logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
def _uses_mrope(config: PretrainedConfig) -> bool:
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is None:
rope_parameters = getattr(config, "rope_parameters", None)
if rope_parameters is None:
return False
return "mrope_section" in rope_scaling
return "mrope_section" in rope_parameters
def uses_mrope(config: PretrainedConfig) -> bool:
......@@ -690,7 +733,14 @@ def get_config(
logger.debug("Overriding HF config with %s", hf_overrides_fn)
config = hf_overrides_fn(config)
patch_rope_scaling(config)
# Exhaustively patch RoPE parameters everywhere they might be
patch_rope_parameters(config)
patch_rope_parameters(config.get_text_config())
SubConfigs: TypeAlias = dict[str, PretrainedConfig]
sub_configs: SubConfigs | None = getattr(config, "sub_configs", None)
if sub_configs:
for sub_config in sub_configs:
patch_rope_parameters(getattr(config, sub_config))
if trust_remote_code:
maybe_register_config_serialize_by_value()
......
......@@ -24,7 +24,7 @@ class AfmoeConfig(PretrainedConfig):
rms_norm_eps: float = 1e-5,
use_cache: bool = True,
tie_word_embeddings: bool = False,
rope_theta: float = 10000.0,
rope_parameters: dict | None = None,
rope_scaling: dict | None = None,
num_experts: int = 64,
num_experts_per_tok: int = 6,
......@@ -56,7 +56,10 @@ class AfmoeConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
rope_theta = kwargs.pop("rope_theta", 10000.0)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.rope_scaling = rope_scaling
self.moe_intermediate_size = moe_intermediate_size
......
......@@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig):
The id of the "end-of-sequence" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
rope_theta (`float`, *optional*, defaults to 1000000.0):
The base period of the RoPE embeddings.
rope_parameters (`dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_theta` (`float`): The base period of the RoPE embeddings.
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
sliding_window (`int`, *optional*):
Sliding window attention window size. If not specified, will default to `4096`.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -132,7 +139,7 @@ class ArcticConfig(PretrainedConfig):
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
rope_theta=1e6,
rope_parameters: dict[str, Any] | None = None,
sliding_window=None,
attention_dropout=0.0,
num_experts_per_tok=1,
......@@ -165,7 +172,10 @@ class ArcticConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
rope_theta = kwargs.pop("rope_theta", 1e6)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
......@@ -25,8 +26,7 @@ class FlexOlmoConfig(PretrainedConfig):
bos_token_id=None,
eos_token_id=100257,
tie_word_embeddings=False,
rope_theta=500000.0,
rope_scaling=None,
rope_parameters: dict[str, Any] | None = None,
attention_bias=False,
attention_dropout=0.0,
num_experts_per_tok=5,
......@@ -62,8 +62,13 @@ class FlexOlmoConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 500000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
......@@ -73,5 +78,5 @@ class FlexOlmoConfig(PretrainedConfig):
self.norm_topk_prob = norm_topk_prob
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
if self.rope_parameters is not None and "type" in self.rope_parameters:
self.rope_parameters["rope_type"] = self.rope_parameters["type"]
......@@ -29,8 +29,7 @@ class KimiLinearConfig(PretrainedConfig):
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
rope_theta=10000.0,
rope_scaling=None,
rope_parameters=None,
tie_word_embeddings=False,
moe_intermediate_size: int | None = None,
moe_renormalize: bool = True,
......@@ -73,8 +72,13 @@ class KimiLinearConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
......@@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 1000000.0):
The base period of the RoPE embeddings.
rope_parameters (`dict`, *optional*):
The parameters of the RoPE embeddings.
max_position_embeddings (`int`, *optional*, defaults to 128000):
The maximum sequence length that this model might ever be used with.
use_cache (`bool`, *optional*, defaults to `True`):
......@@ -100,7 +101,7 @@ class Lfm2MoeConfig(PretrainedConfig):
bos_token_id: int = 1,
eos_token_id: int = 2,
tie_word_embeddings: bool = True,
rope_theta: float = 1000000.0,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 128_000,
use_cache: bool = True,
norm_eps: float = 0.00001,
......@@ -121,7 +122,10 @@ class Lfm2MoeConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.rope_theta = rope_theta
rope_theta = kwargs.pop("rope_theta", 1000000.0)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.max_position_embeddings = max_position_embeddings
self.use_cache = use_cache
self.norm_eps = norm_eps
......
......@@ -98,6 +98,6 @@ class MiDashengLMConfig(PretrainedConfig):
if text_config
else Qwen2_5OmniTextConfig()
)
self.text_config.rope_scaling = None # uses_mrope is false
self.text_config.rope_parameters = None # uses_mrope is false
self.audio_token_id = audio_token_id
super().__init__(**kwargs)
......@@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
"apply_scale": "apply_yarn_scaling",
}
yarn_config = config.get("yarn") or {}
config["rope_scaling"] = {
config["rope_parameters"] = {
"rope_type": "yarn",
"mscale_all_dim": 1,
}
for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config:
config["rope_scaling"][new_name] = yarn_config.pop(old_name)
config["rope_parameters"][new_name] = yarn_config.pop(old_name)
assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment