"vscode:/vscode.git/clone" did not exist on "b5f882cc98e2c9c6dde7357dbac2ec0c2c57d8cd"
Unverified commit f77bce00, authored by Pranav and committed by GitHub
Browse files

[Model] Add Afmoe architecture implementation (#28332)


Signed-off-by: Maziyar Panahi <maziyar.panahi@iscpif.fr>
Signed-off-by: Pranav <veldurthipranav@gmail.com>
Co-authored-by: Maziyar Panahi <maziyar.panahi@iscpif.fr>
parent a289cc1d
......@@ -351,6 +351,7 @@ th {
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|-------------------|----------------------|---------------------------|
| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ |
| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
......
......@@ -173,6 +173,10 @@ class _HfExamplesInfo:
_TEXT_GENERATION_EXAMPLE_MODELS = {
# [Decoder-only]
"AfmoeForCausalLM": _HfExamplesInfo(
"arcee-ai/Trinity-Nano",
is_available_online=False,
),
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"),
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True),
"AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True),
......
This diff is collapsed.
......@@ -56,6 +56,7 @@ logger = init_logger(__name__)
_TEXT_GENERATION_MODELS = {
# [Decoder-only]
"AfmoeForCausalLM": ("afmoe", "AfmoeForCausalLM"),
"ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
"AquilaModel": ("llama", "LlamaForCausalLM"),
"AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2
......
......@@ -77,6 +77,7 @@ class LazyConfigDict(dict):
_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
afmoe="AfmoeConfig",
chatglm="ChatGLMConfig",
deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v32=DeepseekV3Config,
......
......@@ -7,6 +7,7 @@ Model configs may be defined in this directory for the following reasons:
- There is a need to override the existing config to support vLLM.
"""
from vllm.transformers_utils.configs.afmoe import AfmoeConfig
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
......@@ -40,6 +41,7 @@ from vllm.transformers_utils.configs.step3_vl import (
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
__all__ = [
"AfmoeConfig",
"ChatGLMConfig",
"DeepseekVLV2Config",
"DotsOCRConfig",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
class AfmoeConfig(PretrainedConfig):
    """Configuration holder for checkpoints whose ``model_type`` is ``"afmoe"``.

    Every keyword argument named in the signature is stored unchanged on the
    instance under the same attribute name, with two exceptions:

    * ``num_key_value_heads`` falls back to ``num_attention_heads`` when it
      is ``None`` (or any other falsy value, such as ``0``).
    * ``tie_word_embeddings`` is not assigned here; it is forwarded to
      ``PretrainedConfig.__init__`` together with any extra ``**kwargs``.

    No validation or derivation happens beyond the fallback above; in
    particular ``layer_types`` and ``rope_scaling`` stay ``None`` unless the
    caller supplies them.
    """

    model_type = "afmoe"

    def __init__(
        self,
        vocab_size: int = 200_192,
        hidden_size: int = 2048,
        intermediate_size: int = 6144,
        moe_intermediate_size: int = 1408,
        num_hidden_layers: int = 32,
        num_dense_layers: int = 1,
        num_attention_heads: int = 16,
        num_key_value_heads: int | None = None,
        head_dim: int = 128,
        hidden_act: str = "silu",
        max_position_embeddings: int = 131072,
        initializer_range: float = 0.02,
        rms_norm_eps: float = 1e-5,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        rope_theta: float = 10000.0,
        rope_scaling: dict | None = None,
        num_experts: int = 64,
        num_experts_per_tok: int = 6,
        num_shared_experts: int = 2,
        num_expert_groups: int = 1,
        num_limited_groups: int = 1,
        score_func: str = "sigmoid",
        route_norm: bool = True,
        route_scale: float = 1.0,
        global_attn_every_n_layers: int = 4,
        sliding_window: int = 2048,
        layer_types: list[str] | None = None,
        attention_dropout: float = 0.0,
        mup_enabled: bool = False,
        n_group: int = 1,
        topk_group: int = 1,
        **kwargs,
    ):
        # An unset (or otherwise falsy) KV-head count mirrors the number of
        # attention heads.
        kv_heads = num_key_value_heads
        if not kv_heads:
            kv_heads = num_attention_heads

        # NOTE: the section labels below are derived from the parameter names
        # only; this class does not interpret any of the values.

        # Vocabulary, widths and depth.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_dense_layers = num_dense_layers

        # Attention head layout.
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = kv_heads
        self.head_dim = head_dim

        # Activation, context length, init, norm and cache flags.
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        # Rotary-embedding parameters.
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # Expert and routing parameters.
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.num_shared_experts = num_shared_experts
        self.num_expert_groups = num_expert_groups
        self.num_limited_groups = num_limited_groups
        self.score_func = score_func
        self.route_norm = route_norm
        self.route_scale = route_scale

        # Per-layer attention pattern parameters.
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.sliding_window = sliding_window
        self.layer_types = layer_types
        self.attention_dropout = attention_dropout

        # Remaining flags and grouping parameters.
        self.mup_enabled = mup_enabled
        self.n_group = n_group
        self.topk_group = topk_group

        # The base class receives ``tie_word_embeddings`` and every keyword
        # this signature does not name explicitly.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
__all__ = ["AfmoeConfig"]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment