# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional, Union

from transformers import PretrainedConfig


class StepConfig(PretrainedConfig):
    """Base configuration shared by the Step model variants.

    Every constructor argument is stored on the instance under the same
    name. Only the arguments with non-trivial handling are listed below.

    Args:
        moe_every_n_layer: MoE layer interval; 2 means 50% of the layers
            use MoE, interleaved with normal non-MoE layers.
        alibi_slopes: Optional list of slopes. When given, its length must
            equal ``num_attention_heads``.
        share_expert_dim: When ``None``, it is derived as
            ``moe_intermediate_size * moe_top_k``.
        bos_token_id: Forwarded to ``PretrainedConfig``; ``None`` is
            replaced by ``1``.
        eos_token_id: Forwarded to ``PretrainedConfig``; ``None`` is
            replaced by ``[2, 3]``.
        **kwargs: Passed through unchanged to ``PretrainedConfig``.

    Raises:
        ValueError: If ``alibi_slopes`` is provided and its length differs
            from ``num_attention_heads``.
    """

    model_type = "step"

    def __init__(
        self,
        hidden_size: int = 5120,
        intermediate_size: int = 13312,
        num_attention_heads: int = 40,
        num_attention_groups: int = 8,
        num_hidden_layers: int = 48,
        max_seq_len: int = 4096,
        vocab_size: int = 65536,
        rms_norm_eps: float = 1e-5,
        # 2 => every second layer is MoE, alternating with dense layers.
        moe_every_n_layer: int = 2,
        use_moe: bool = False,
        moe_intermediate_size: int = 10240,
        moe_num_experts: int = 16,
        moe_top_k: int = 4,
        max_pos_interp_ratio: float = 1,
        alibi_slopes: Optional[List[float]] = None,
        moe_layer_offset: int = 0,
        moe_dynamic_exp_p: float = 1.0,
        rope_theta: float = 500000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        head_dim: Optional[int] = None,
        max_position_embedding: int = 16384,
        share_expert_dim: Optional[int] = None,
        allgather_dtype: Optional[str] = None,
        share_q_dim: Optional[int] = None,
        norm_expert_weight: bool = True,
        bos_token_id: Optional[Union[List[int], int]] = None,
        eos_token_id: Optional[Union[List[int], int]] = None,
        **kwargs,
    ) -> None:
        # Assignment order is kept stable on purpose: it determines the key
        # order of the instance ``__dict__``.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps

        self.use_moe = use_moe
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_every_n_layer = moe_every_n_layer
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.max_pos_interp_ratio = max_pos_interp_ratio
        self.alibi_slopes = alibi_slopes
        self.moe_layer_offset = moe_layer_offset
        self.moe_dynamic_exp_p = moe_dynamic_exp_p

        # Tagged "for step2 mini" by the original author.
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.head_dim = head_dim
        self.max_position_embedding = max_position_embedding

        # Fall back to a size derived from the MoE settings when the caller
        # does not specify the shared-expert dimension.
        self.share_expert_dim = (
            self.moe_intermediate_size * self.moe_top_k
            if share_expert_dim is None
            else share_expert_dim
        )
        self.share_q_dim = share_q_dim
        self.norm_expert_weight = norm_expert_weight
        self.allgather_dtype = allgather_dtype

        # Validate before handing control to the base class.
        self._verify_slopes()

        # Resolve the special-token defaults up front, then delegate.
        if bos_token_id is None:
            bos_token_id = 1
        if eos_token_id is None:
            eos_token_id = [2, 3]
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    def _verify_slopes(self):
        """Check that ``alibi_slopes``, if set, has one entry per head.

        Raises:
            ValueError: If the number of slopes differs from
                ``num_attention_heads``.
        """
        slopes = self.alibi_slopes
        if slopes is not None and len(slopes) != self.num_attention_heads:
            raise ValueError(
                f"Number of alibi_slopes ({len(slopes)}) does not match num_attention_heads ({self.num_attention_heads})"
            )


class Step1Config(StepConfig):
    """``StepConfig`` registered under the ``"step1"`` model type."""

    model_type = "step1"


class Step2Config(StepConfig):
    """``StepConfig`` for ``"step2"`` with one extra stored flag.

    Args:
        use_offline_input_scales: Stored on the instance as-is.
        **kwargs: Forwarded to ``StepConfig``.
    """

    model_type = "step2"

    def __init__(self, use_offline_input_scales: bool = True, **kwargs):
        # Set the subclass-specific flag first, then run the shared setup.
        self.use_offline_input_scales = use_offline_input_scales
        super().__init__(**kwargs)


class Step2MiniConfig(StepConfig):
    """``StepConfig`` registered under the ``"step2_mini"`` model type."""

    model_type = "step2_mini"