Commit a8b70304 (unverified), authored by Harry Mellor, committed by GitHub
Browse files

Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d44e9df7
...@@ -42,6 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -42,6 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2MLP as Qwen3MLP
...@@ -57,14 +58,13 @@ class Qwen3Attention(nn.Module): ...@@ -57,14 +58,13 @@ class Qwen3Attention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_parameters: dict,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
head_dim: int | None = None, head_dim: int | None = None,
rms_norm_eps: float = 1e-06, rms_norm_eps: float = 1e-06,
qkv_bias: bool = False, qkv_bias: bool = False,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "", prefix: str = "",
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None, dual_chunk_attention_config: dict[str, Any] | None = None,
...@@ -89,7 +89,6 @@ class Qwen3Attention(nn.Module): ...@@ -89,7 +89,6 @@ class Qwen3Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config self.dual_chunk_attention_config = dual_chunk_attention_config
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -113,8 +112,7 @@ class Qwen3Attention(nn.Module): ...@@ -113,8 +112,7 @@ class Qwen3Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=self.rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
) )
self.attn = Attention( self.attn = Attention(
...@@ -166,9 +164,7 @@ class Qwen3DecoderLayer(nn.Module): ...@@ -166,9 +164,7 @@ class Qwen3DecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0 set_default_rope_theta(config, default_theta=1000000)
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr( dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None config, "dual_chunk_attention_config", None
) )
...@@ -187,13 +183,12 @@ class Qwen3DecoderLayer(nn.Module): ...@@ -187,13 +183,12 @@ class Qwen3DecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rms_norm_eps=config.rms_norm_eps, rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False), qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None), head_dim=getattr(config, "head_dim", None),
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
rope_scaling=rope_scaling, rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
attn_type=attn_type, attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
......
...@@ -216,8 +216,7 @@ class Qwen3MoeAttention(nn.Module): ...@@ -216,8 +216,7 @@ class Qwen3MoeAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000, rope_parameters: dict[str, Any],
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
head_dim: int | None = None, head_dim: int | None = None,
rms_norm_eps: float = 1e-06, rms_norm_eps: float = 1e-06,
...@@ -247,7 +246,6 @@ class Qwen3MoeAttention(nn.Module): ...@@ -247,7 +246,6 @@ class Qwen3MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.dual_chunk_attention_config = dual_chunk_attention_config self.dual_chunk_attention_config = dual_chunk_attention_config
...@@ -273,8 +271,7 @@ class Qwen3MoeAttention(nn.Module): ...@@ -273,8 +271,7 @@ class Qwen3MoeAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
dual_chunk_attention_config=dual_chunk_attention_config, dual_chunk_attention_config=dual_chunk_attention_config,
) )
self.attn = Attention( self.attn = Attention(
...@@ -326,8 +323,6 @@ class Qwen3MoeDecoderLayer(nn.Module): ...@@ -326,8 +323,6 @@ class Qwen3MoeDecoderLayer(nn.Module):
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
dual_chunk_attention_config = getattr( dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None config, "dual_chunk_attention_config", None
...@@ -336,8 +331,7 @@ class Qwen3MoeDecoderLayer(nn.Module): ...@@ -336,8 +331,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps, rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False), qkv_bias=getattr(config, "attention_bias", False),
......
...@@ -748,8 +748,7 @@ class Qwen3NextAttention(nn.Module): ...@@ -748,8 +748,7 @@ class Qwen3NextAttention(nn.Module):
head_size=self.head_dim, head_size=self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
base=config.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=config.rope_scaling,
partial_rotary_factor=config.partial_rotary_factor, partial_rotary_factor=config.partial_rotary_factor,
dual_chunk_attention_config=self.dual_chunk_attention_config, dual_chunk_attention_config=self.dual_chunk_attention_config,
) )
......
...@@ -338,7 +338,6 @@ class Qwen3Omni_VisionTransformer(nn.Module): ...@@ -338,7 +338,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim // 2, rotary_dim=head_dim // 2,
max_position=8192, max_position=8192,
base=10000.0,
is_neox_style=True, is_neox_style=True,
) )
......
...@@ -345,7 +345,6 @@ class Qwen3_VisionTransformer(nn.Module): ...@@ -345,7 +345,6 @@ class Qwen3_VisionTransformer(nn.Module):
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim // 2, rotary_dim=head_dim // 2,
max_position=8192, max_position=8192,
base=10000.0,
is_neox_style=True, is_neox_style=True,
) )
......
...@@ -54,6 +54,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -54,6 +54,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name, maybe_remap_kv_scale_name,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .utils import ( from .utils import (
...@@ -112,11 +113,10 @@ class SeedOssAttention(nn.Module): ...@@ -112,11 +113,10 @@ class SeedOssAttention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
head_dim: int, head_dim: int,
rope_parameters: dict,
max_position: int = 4096 * 32, max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "", prefix: str = "",
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
) -> None: ) -> None:
...@@ -140,7 +140,6 @@ class SeedOssAttention(nn.Module): ...@@ -140,7 +140,6 @@ class SeedOssAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
hidden_size, hidden_size,
...@@ -163,8 +162,7 @@ class SeedOssAttention(nn.Module): ...@@ -163,8 +162,7 @@ class SeedOssAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position, max_position=max_position,
base=self.rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -200,9 +198,7 @@ class SeedOssDecoderLayer(nn.Module): ...@@ -200,9 +198,7 @@ class SeedOssDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0 set_default_rope_theta(config, default_theta=1000000)
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
# By default, SeedOss uses causal attention as it is a # By default, SeedOss uses causal attention as it is a
# decoder-only model. # decoder-only model.
...@@ -219,10 +215,9 @@ class SeedOssDecoderLayer(nn.Module): ...@@ -219,10 +215,9 @@ class SeedOssDecoderLayer(nn.Module):
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim, head_dim=config.head_dim,
rope_theta=rope_theta,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
rope_scaling=rope_scaling, rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
attn_type=attn_type, attn_type=attn_type,
) )
......
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
"""Inference-only Solar model compatible with HuggingFace weights.""" """Inference-only Solar model compatible with HuggingFace weights."""
from collections.abc import Iterable from collections.abc import Iterable
from typing import Any
import torch import torch
from torch import nn from torch import nn
...@@ -111,8 +110,6 @@ class SolarAttention(nn.Module): ...@@ -111,8 +110,6 @@ class SolarAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192, max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
bias: bool = False, bias: bool = False,
...@@ -142,7 +139,6 @@ class SolarAttention(nn.Module): ...@@ -142,7 +139,6 @@ class SolarAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear( self.qkv_proj = QKVParallelLinear(
...@@ -166,8 +162,7 @@ class SolarAttention(nn.Module): ...@@ -166,8 +162,7 @@ class SolarAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
base=rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
) )
self.attn = Attention( self.attn = Attention(
self.num_heads, self.num_heads,
...@@ -202,15 +197,6 @@ class SolarDecoderLayer(nn.Module): ...@@ -202,15 +197,6 @@ class SolarDecoderLayer(nn.Module):
) -> None: ) -> None:
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias # Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias # Support internlm/internlm-7b with bias
...@@ -224,8 +210,6 @@ class SolarDecoderLayer(nn.Module): ...@@ -224,8 +210,6 @@ class SolarDecoderLayer(nn.Module):
num_kv_heads=getattr( num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads config, "num_key_value_heads", config.num_attention_heads
), ),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
quant_config=quant_config, quant_config=quant_config,
bias=attention_bias, bias=attention_bias,
......
...@@ -153,7 +153,7 @@ class StablelmAttention(nn.Module): ...@@ -153,7 +153,7 @@ class StablelmAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.config.max_position_embeddings, max_position=self.config.max_position_embeddings,
base=self.config.rope_theta, rope_parameters=self.config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor, partial_rotary_factor=self.partial_rotary_factor,
) )
self.attn = Attention( self.attn = Attention(
......
...@@ -91,7 +91,6 @@ class Starcoder2Attention(nn.Module): ...@@ -91,7 +91,6 @@ class Starcoder2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = config.rope_theta
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.use_bias = config.use_bias self.use_bias = config.use_bias
...@@ -115,7 +114,7 @@ class Starcoder2Attention(nn.Module): ...@@ -115,7 +114,7 @@ class Starcoder2Attention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
base=int(self.rope_theta), rope_parameters=config.rope_parameters,
is_neox_style=True, is_neox_style=True,
) )
self.attn = Attention( self.attn = Attention(
......
...@@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
) )
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.step3_vl import Step3TextConfig
from .interfaces import SupportsPP from .interfaces import SupportsPP
from .utils import ( from .utils import (
...@@ -144,9 +145,8 @@ class Step3TextAttention(nn.Module): ...@@ -144,9 +145,8 @@ class Step3TextAttention(nn.Module):
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
norm_eps: float, norm_eps: float,
rope_theta: int, rope_parameters: dict[str, Any],
share_q_dim: int | None = None, share_q_dim: int | None = None,
rope_scaling: dict[str, Any] | None = None,
max_position_embedding: int = 8192, max_position_embedding: int = 8192,
head_dim: int = 256, head_dim: int = 256,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
...@@ -198,8 +198,7 @@ class Step3TextAttention(nn.Module): ...@@ -198,8 +198,7 @@ class Step3TextAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=max_position_embedding, max_position=max_position_embedding,
base=rope_theta, rope_parameters=rope_parameters,
rope_scaling=rope_scaling,
) )
scaling = self.head_dim**-0.5 scaling = self.head_dim**-0.5
self.attn = Attention( self.attn = Attention(
...@@ -227,15 +226,13 @@ class Step3TextAttention(nn.Module): ...@@ -227,15 +226,13 @@ class Step3TextAttention(nn.Module):
class Step3TextDecoderLayer(nn.Module): class Step3TextDecoderLayer(nn.Module):
def __init__( def __init__(
self, self,
config: ModelConfig, config: Step3TextConfig,
cache_config: CacheConfig | None = None, cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None, quant_config: QuantizationConfig | None = None,
prefix: str = "", prefix: str = "",
) -> None: ) -> None:
super().__init__() super().__init__()
config = config.hf_config
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Step3TextAttention( self.self_attn = Step3TextAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
...@@ -247,8 +244,7 @@ class Step3TextDecoderLayer(nn.Module): ...@@ -247,8 +244,7 @@ class Step3TextDecoderLayer(nn.Module):
max_position_embedding=config.max_position_embedding, max_position_embedding=config.max_position_embedding,
head_dim=config.head_dim, head_dim=config.head_dim,
share_q_dim=config.share_q_dim, share_q_dim=config.share_q_dim,
rope_theta=config.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
) )
...@@ -338,7 +334,7 @@ class Step3TextModel(nn.Module): ...@@ -338,7 +334,7 @@ class Step3TextModel(nn.Module):
self.start_layer, self.end_layer, self.layers = make_layers( self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers, config.num_hidden_layers,
lambda prefix: Step3TextDecoderLayer( lambda prefix: Step3TextDecoderLayer(
config=vllm_config.model_config, config=config,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=prefix, prefix=prefix,
......
...@@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, Literal ...@@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, Literal
import torch import torch
from torch import nn from torch import nn
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from vllm.config.utils import getattr_iter from vllm.config.utils import getattr_iter
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: ...@@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool:
""" """
text_config = vllm_config.model_config.hf_config.get_text_config() text_config = vllm_config.model_config.hf_config.get_text_config()
# Dynamic rope scaling is not compatible with torch.compile # Dynamic rope scaling is not compatible with torch.compile
rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {}
return rope_scaling.get("rope_type") != "dynamic" if rope_parameters:
# Nest rope_parameters if not nested already to simplify logic
if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
rope_parameters = {"": rope_parameters}
return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values())
return True
...@@ -128,7 +128,6 @@ class Zamba2Attention(nn.Module): ...@@ -128,7 +128,6 @@ class Zamba2Attention(nn.Module):
tp_size = get_tensor_model_parallel_world_size() tp_size = get_tensor_model_parallel_world_size()
self.config = config self.config = config
self.num_hybrid_layers = num_hybrid_layers self.num_hybrid_layers = num_hybrid_layers
self.rope_theta = config.rope_theta
self.attention_hidden_size = config.attention_hidden_size self.attention_hidden_size = config.attention_hidden_size
self.total_num_attention_heads = config.num_attention_heads self.total_num_attention_heads = config.num_attention_heads
...@@ -233,8 +232,7 @@ class Zamba2Attention(nn.Module): ...@@ -233,8 +232,7 @@ class Zamba2Attention(nn.Module):
head_size=self.attention_head_dim, head_size=self.attention_head_dim,
rotary_dim=self.attention_head_dim, rotary_dim=self.attention_head_dim,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
base=self.rope_theta, rope_parameters=config.rope_parameters,
rope_scaling=None,
is_neox_style=True, is_neox_style=True,
) )
......
...@@ -7,8 +7,9 @@ import time ...@@ -7,8 +7,9 @@ import time
from collections.abc import Callable from collections.abc import Callable
from dataclasses import asdict from dataclasses import asdict
from functools import cache, partial from functools import cache, partial
from importlib.metadata import version
from pathlib import Path from pathlib import Path
from typing import Any, Literal, TypeVar from typing import Any, Literal, TypeAlias, TypeVar
import huggingface_hub import huggingface_hub
from huggingface_hub import ( from huggingface_hub import (
...@@ -24,7 +25,9 @@ from huggingface_hub.utils import ( ...@@ -24,7 +25,9 @@ from huggingface_hub.utils import (
RepositoryNotFoundError, RepositoryNotFoundError,
RevisionNotFoundError, RevisionNotFoundError,
) )
from packaging.version import Version
from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.modeling_auto import ( from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
...@@ -390,21 +393,61 @@ def file_or_path_exists( ...@@ -390,21 +393,61 @@ def file_or_path_exists(
) )
def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None:
    """Make sure `config.rope_parameters` contains a `rope_theta` entry.

    Some model configs omit `rope_theta` even though the model uses RoPE.
    The config is modified in place:

    - if `rope_parameters` is absent or `None`, it is created as
      `{"rope_type": "default"}`;
    - if `rope_parameters` has no `rope_theta` key, `default_theta` is stored
      under that key. An existing `rope_theta` is never overwritten.

    Args:
        config: The config object to update in place.
        default_theta: Value to use when no `rope_theta` is present.
    """
    if getattr(config, "rope_parameters", None) is None:
        config.rope_parameters = {"rope_type": "default"}
    # Only fills the gap; a theta that is already configured wins.
    config.rope_parameters.setdefault("rope_theta", default_theta)
def patch_rope_parameters(config: PretrainedConfig) -> None:
    """Provide backwards compatibility for RoPE.

    The config is updated in place so that its RoPE settings live in
    `config.rope_parameters`, and each parameter dict is then handed to
    `patch_rope_parameters_dict`.

    - Transformers >= 5.0.0.dev0: `config.rope_parameters` is read as-is.
    - Transformers v4 with `rope_parameters` already set: the config is
      treated as already patched and nothing is done.
    - Transformers v4 otherwise: `rope_parameters` is built from
      `rope_scaling`, `rope_theta` and `original_max_position_embeddings`
      and written back to the config (possibly as `None`).
    """
    on_transformers_v5 = Version(version("transformers")) >= Version("5.0.0.dev0")

    if on_transformers_v5:
        from transformers.modeling_rope_utils import RopeParameters

        params: RopeParameters | dict[str, RopeParameters] | None = getattr(
            config, "rope_parameters", None
        )
    else:
        if hasattr(config, "rope_parameters"):
            # Transformers v4 and this config went through patching before.
            return

        # Fold the v4 fields (rope_theta, rope_scaling) into one dict.
        theta: float | None = getattr(config, "rope_theta", None)
        params = getattr(config, "rope_scaling", None)
        if theta is not None:
            if not params:
                params = {"rope_type": "default"}
            params["rope_theta"] = theta

        # original_max_position_embeddings is only carried over when there is
        # a non-empty parameter dict to attach it to.
        if params:
            original_max = getattr(config, "original_max_position_embeddings", None)
            if original_max:
                params["original_max_position_embeddings"] = original_max

        config.rope_parameters = params

    if params is None:
        # No RoPE parameters to patch.
        return

    # Interleaved sliding attention models nest one parameter dict per layer
    # type; everything else has a single flat dict.
    if set(params.keys()).issubset(ALLOWED_LAYER_TYPES):
        per_layer_params = params.values()
    else:
        per_layer_params = [params]
    for layer_params in per_layer_params:
        patch_rope_parameters_dict(layer_params)
def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
if "rope_type" in rope_scaling and "type" in rope_scaling: if "rope_type" in rope_parameters and "type" in rope_parameters:
rope_type = rope_scaling["rope_type"] rope_type = rope_parameters["rope_type"]
rope_type_legacy = rope_scaling["type"] rope_type_legacy = rope_parameters["type"]
if rope_type != rope_type_legacy: if rope_type != rope_type_legacy:
raise ValueError( raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern " f"Found conflicts between 'rope_type={rope_type}' (modern "
...@@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: ...@@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
"You should only specify one of them." "You should only specify one of them."
) )
if "rope_type" not in rope_scaling and "type" in rope_scaling: if "rope_type" not in rope_parameters and "type" in rope_parameters:
rope_scaling["rope_type"] = rope_scaling["type"] rope_parameters["rope_type"] = rope_parameters["type"]
logger.info("Replacing legacy 'type' key with 'rope_type'") logger.info("Replacing legacy 'type' key with 'rope_type'")
if "rope_type" not in rope_scaling: if "rope_type" not in rope_parameters:
raise ValueError("rope_scaling should have a 'rope_type' key") raise ValueError("rope_parameters should have a 'rope_type' key")
if rope_scaling["rope_type"] == "su": if rope_parameters["rope_type"] == "su":
rope_scaling["rope_type"] = "longrope" rope_parameters["rope_type"] = "longrope"
logger.warning("Replacing legacy rope_type 'su' with 'longrope'") logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
elif rope_scaling["rope_type"] == "mrope": elif rope_parameters["rope_type"] == "mrope":
assert "mrope_section" in rope_scaling assert "mrope_section" in rope_parameters
rope_scaling["rope_type"] = "default" rope_parameters["rope_type"] = "default"
logger.warning("Replacing legacy rope_type 'mrope' with 'default'") logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
def _uses_mrope(config: PretrainedConfig) -> bool: def _uses_mrope(config: PretrainedConfig) -> bool:
rope_scaling = getattr(config, "rope_scaling", None) rope_parameters = getattr(config, "rope_parameters", None)
if rope_scaling is None: if rope_parameters is None:
return False return False
return "mrope_section" in rope_scaling return "mrope_section" in rope_parameters
def uses_mrope(config: PretrainedConfig) -> bool: def uses_mrope(config: PretrainedConfig) -> bool:
...@@ -690,7 +733,14 @@ def get_config( ...@@ -690,7 +733,14 @@ def get_config(
logger.debug("Overriding HF config with %s", hf_overrides_fn) logger.debug("Overriding HF config with %s", hf_overrides_fn)
config = hf_overrides_fn(config) config = hf_overrides_fn(config)
patch_rope_scaling(config) # Exhaustively patch RoPE parameters everywhere they might be
patch_rope_parameters(config)
patch_rope_parameters(config.get_text_config())
SubConfigs: TypeAlias = dict[str, PretrainedConfig]
sub_configs: SubConfigs | None = getattr(config, "sub_configs", None)
if sub_configs:
for sub_config in sub_configs:
patch_rope_parameters(getattr(config, sub_config))
if trust_remote_code: if trust_remote_code:
maybe_register_config_serialize_by_value() maybe_register_config_serialize_by_value()
......
...@@ -24,7 +24,7 @@ class AfmoeConfig(PretrainedConfig): ...@@ -24,7 +24,7 @@ class AfmoeConfig(PretrainedConfig):
rms_norm_eps: float = 1e-5, rms_norm_eps: float = 1e-5,
use_cache: bool = True, use_cache: bool = True,
tie_word_embeddings: bool = False, tie_word_embeddings: bool = False,
rope_theta: float = 10000.0, rope_parameters: dict | None = None,
rope_scaling: dict | None = None, rope_scaling: dict | None = None,
num_experts: int = 64, num_experts: int = 64,
num_experts_per_tok: int = 6, num_experts_per_tok: int = 6,
...@@ -56,7 +56,10 @@ class AfmoeConfig(PretrainedConfig): ...@@ -56,7 +56,10 @@ class AfmoeConfig(PretrainedConfig):
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache self.use_cache = use_cache
self.rope_theta = rope_theta rope_theta = kwargs.pop("rope_theta", 10000.0)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.rope_scaling = rope_scaling self.rope_scaling = rope_scaling
self.moe_intermediate_size = moe_intermediate_size self.moe_intermediate_size = moe_intermediate_size
......
...@@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig): ...@@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig):
The id of the "end-of-sequence" token. The id of the "end-of-sequence" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`): tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied. Whether the model's input and output word embeddings should be tied.
rope_theta (`float`, *optional*, defaults to 1000000.0): rope_parameters (`dict`, *optional*):
The base period of the RoPE embeddings. Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_theta` (`float`): The base period of the RoPE embeddings.
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
sliding_window (`int`, *optional*): sliding_window (`int`, *optional*):
Sliding window attention window size. If not specified, will default to `4096`. Sliding window attention window size. If not specified, will default to `4096`.
attention_dropout (`float`, *optional*, defaults to 0.0): attention_dropout (`float`, *optional*, defaults to 0.0):
...@@ -132,7 +139,7 @@ class ArcticConfig(PretrainedConfig): ...@@ -132,7 +139,7 @@ class ArcticConfig(PretrainedConfig):
bos_token_id=1, bos_token_id=1,
eos_token_id=2, eos_token_id=2,
tie_word_embeddings=False, tie_word_embeddings=False,
rope_theta=1e6, rope_parameters: dict[str, Any] | None = None,
sliding_window=None, sliding_window=None,
attention_dropout=0.0, attention_dropout=0.0,
num_experts_per_tok=1, num_experts_per_tok=1,
...@@ -165,7 +172,10 @@ class ArcticConfig(PretrainedConfig): ...@@ -165,7 +172,10 @@ class ArcticConfig(PretrainedConfig):
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache self.use_cache = use_cache
self.rope_theta = rope_theta rope_theta = kwargs.pop("rope_theta", 1e6)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.attention_dropout = attention_dropout self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok self.num_experts_per_tok = num_experts_per_tok
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig from transformers.configuration_utils import PretrainedConfig
...@@ -25,8 +26,7 @@ class FlexOlmoConfig(PretrainedConfig): ...@@ -25,8 +26,7 @@ class FlexOlmoConfig(PretrainedConfig):
bos_token_id=None, bos_token_id=None,
eos_token_id=100257, eos_token_id=100257,
tie_word_embeddings=False, tie_word_embeddings=False,
rope_theta=500000.0, rope_parameters: dict[str, Any] | None = None,
rope_scaling=None,
attention_bias=False, attention_bias=False,
attention_dropout=0.0, attention_dropout=0.0,
num_experts_per_tok=5, num_experts_per_tok=5,
...@@ -62,8 +62,13 @@ class FlexOlmoConfig(PretrainedConfig): ...@@ -62,8 +62,13 @@ class FlexOlmoConfig(PretrainedConfig):
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache self.use_cache = use_cache
self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
self.rope_scaling = rope_scaling rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 500000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias self.attention_bias = attention_bias
self.attention_dropout = attention_dropout self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok self.num_experts_per_tok = num_experts_per_tok
...@@ -73,5 +78,5 @@ class FlexOlmoConfig(PretrainedConfig): ...@@ -73,5 +78,5 @@ class FlexOlmoConfig(PretrainedConfig):
self.norm_topk_prob = norm_topk_prob self.norm_topk_prob = norm_topk_prob
# Validate the correctness of rotary position embeddings parameters # Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'. # BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling: if self.rope_parameters is not None and "type" in self.rope_parameters:
self.rope_scaling["rope_type"] = self.rope_scaling["type"] self.rope_parameters["rope_type"] = self.rope_parameters["type"]
...@@ -29,8 +29,7 @@ class KimiLinearConfig(PretrainedConfig): ...@@ -29,8 +29,7 @@ class KimiLinearConfig(PretrainedConfig):
pad_token_id=0, pad_token_id=0,
bos_token_id=1, bos_token_id=1,
eos_token_id=2, eos_token_id=2,
rope_theta=10000.0, rope_parameters=None,
rope_scaling=None,
tie_word_embeddings=False, tie_word_embeddings=False,
moe_intermediate_size: int | None = None, moe_intermediate_size: int | None = None,
moe_renormalize: bool = True, moe_renormalize: bool = True,
...@@ -73,8 +72,13 @@ class KimiLinearConfig(PretrainedConfig): ...@@ -73,8 +72,13 @@ class KimiLinearConfig(PretrainedConfig):
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache self.use_cache = use_cache
self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
self.rope_scaling = rope_scaling rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.q_lora_rank = q_lora_rank self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank self.kv_lora_rank = kv_lora_rank
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig from transformers.configuration_utils import PretrainedConfig
...@@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig): ...@@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig):
End of stream token id. End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`): tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 1000000.0): rope_parameters (`dict`, *optional*):
The base period of the RoPE embeddings. The parameters of the RoPE embeddings.
max_position_embeddings (`int`, *optional*, defaults to 128000): max_position_embeddings (`int`, *optional*, defaults to 128000):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with.
use_cache (`bool`, *optional*, defaults to `True`): use_cache (`bool`, *optional*, defaults to `True`):
...@@ -100,7 +101,7 @@ class Lfm2MoeConfig(PretrainedConfig): ...@@ -100,7 +101,7 @@ class Lfm2MoeConfig(PretrainedConfig):
bos_token_id: int = 1, bos_token_id: int = 1,
eos_token_id: int = 2, eos_token_id: int = 2,
tie_word_embeddings: bool = True, tie_word_embeddings: bool = True,
rope_theta: float = 1000000.0, rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 128_000, max_position_embeddings: int = 128_000,
use_cache: bool = True, use_cache: bool = True,
norm_eps: float = 0.00001, norm_eps: float = 0.00001,
...@@ -121,7 +122,10 @@ class Lfm2MoeConfig(PretrainedConfig): ...@@ -121,7 +122,10 @@ class Lfm2MoeConfig(PretrainedConfig):
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.intermediate_size = intermediate_size self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers
self.rope_theta = rope_theta rope_theta = kwargs.pop("rope_theta", 1000000.0)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.use_cache = use_cache self.use_cache = use_cache
self.norm_eps = norm_eps self.norm_eps = norm_eps
......
...@@ -98,6 +98,6 @@ class MiDashengLMConfig(PretrainedConfig): ...@@ -98,6 +98,6 @@ class MiDashengLMConfig(PretrainedConfig):
if text_config if text_config
else Qwen2_5OmniTextConfig() else Qwen2_5OmniTextConfig()
) )
self.text_config.rope_scaling = None # uses_mrope is false self.text_config.rope_parameters = None # uses_mrope is false
self.audio_token_id = audio_token_id self.audio_token_id = audio_token_id
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict: ...@@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
"apply_scale": "apply_yarn_scaling", "apply_scale": "apply_yarn_scaling",
} }
yarn_config = config.get("yarn") or {} yarn_config = config.get("yarn") or {}
config["rope_scaling"] = { config["rope_parameters"] = {
"rope_type": "yarn", "rope_type": "yarn",
"mscale_all_dim": 1, "mscale_all_dim": 1,
} }
for old_name, new_name in yarn_config_map.items(): for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config: if old_name in yarn_config:
config["rope_scaling"][new_name] = yarn_config.pop(old_name) config["rope_parameters"][new_name] = yarn_config.pop(old_name)
assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}" assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment