[Model] Lfm2Moe (#26344)

Signed-off-by: Paul Pak <paulpak58@gmail.com>

[Model] Lfm2Moe (#26344)
Signed-off-by: Paul Pak <paulpak58@gmail.com>
320feae6 · Paul Pak · GitHub · 1e4ecca1 · 320feae6 · 320feae6
Unverified Commit 320feae6 authored Oct 08, 2025 by Paul Pak Committed by GitHub Oct 07, 2025
8 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -390,6 +390,7 @@ th {
 | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ |
 | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Lfm2ForCausalLM`  | LFM2  | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Lfm2MoeForCausalLM`  | LFM2MoE  | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ |
 | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ |

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -321,6 +321,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Lfm2ForCausalLM": _HfExamplesInfo(
        "LiquidAI/LFM2-1.2B", min_transformers_version="4.54"
    ),
+    "Lfm2MoeForCausalLM": _HfExamplesInfo(
+        "LiquidAI/LFM2-8B-A1B", min_transformers_version="4.58"
+    ),
    "LlamaForCausalLM": _HfExamplesInfo(
        "meta-llama/Llama-3.2-1B-Instruct",
        extras={

--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -71,14 +71,14 @@ class Lfm2MLP(nn.Module):
            output_sizes=[ff_dim] * 2,
            bias=False,
            quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj",
+            prefix=f"{prefix}.w1",
        )
        self.w2 = RowParallelLinear(
            input_size=ff_dim,
            output_size=dim,
            bias=False,
            quant_config=quant_config,
-            prefix=f"{prefix}.down_proj",
+            prefix=f"{prefix}.w2",
        )
        self.act_fn = SiluAndMul()
@@ -484,17 +484,12 @@ class Lfm2ForCausalLM(
        quant_config = vllm_config.quant_config
        cache_config = vllm_config.cache_config
        lora_config = vllm_config.lora_config
-        scheduler_config = vllm_config.scheduler_config
        assert not cache_config.enable_prefix_caching, (
            "Lfm2 currently does not support prefix caching"
        )
        super().__init__()
        self.config = config
-        self.vllm_config = vllm_config
-        self.scheduler_config = scheduler_config
-        self.model_config = vllm_config.model_config
        self.model = Lfm2Model(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
        )

--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -119,6 +119,7 @@ _TEXT_GENERATION_MODELS = {
    "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
    "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
    "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"),
+    "Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"),
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),
    # For decapoda-research/llama-*

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -91,6 +91,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
    step3_vl="Step3VLConfig",
    step3_text="Step3TextConfig",
    qwen3_next="Qwen3NextConfig",
+    lfm2_moe="Lfm2MoeConfig",
 )
 _CONFIG_ATTRS_MAPPING: dict[str, str] = {

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -19,6 +19,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
 from vllm.transformers_utils.configs.falcon import RWConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
 from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
+from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig
 from vllm.transformers_utils.configs.medusa import MedusaConfig
 from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
 from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
@@ -46,6 +47,7 @@ __all__ = [
    "EAGLEConfig",
    "RWConfig",
    "JAISConfig",
+    "Lfm2MoeConfig",
    "MedusaConfig",
    "MiDashengLMConfig",
    "MLPSpeculatorConfig",

--- a/vllm/transformers_utils/configs/lfm2_moe.py
+++ b/vllm/transformers_utils/configs/lfm2_moe.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+from transformers.configuration_utils import PretrainedConfig
+class Lfm2MoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LFM2-8B-A1B model.
+    e.g. [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Lfm2Model`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 7168):
+            Dimension of the MLP representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 1792):
+            Intermediate size of the routed expert.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the conv layers.
+        conv_L_cache (`int`, *optional*, defaults to 3):
+            L_cache dim in the conv layers.
+        num_dense_layers (`int`, *optional*, defaults to 2):
+            Number of dense Lfm2MoeMLP layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+        num_experts_per_tok (`int`, *optional*, defaults to 4):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 32):
+            Number of routed experts.
+        use_expert_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use the expert bias on the routing weights.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts in MoE models.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        layer_types (`Optional`, *optional*):
+            Type of each layers.
+    ```python
+    >>> from transformers import Lfm2MoeModel, Lfm2MoeConfig
+    >>> # Initializing a LFM2 Moe model
+    >>> configuration = Lfm2MoeConfig()
+    >>> # Initializing a model from the LFM2-8B-A1B style configuration
+    >>> model = Lfm2MoeModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""  # noqa: E501
+    model_type = "lfm2_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size: int = 65536,
+        hidden_size: int = 2048,
+        intermediate_size: int = 7168,
+        moe_intermediate_size: int = 1792,
+        num_hidden_layers: int = 32,
+        pad_token_id: int = 0,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = True,
+        rope_theta: float = 1000000.0,
+        max_position_embeddings: int = 128_000,
+        use_cache: bool = True,
+        norm_eps: float = 0.00001,
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        conv_bias: bool = False,
+        conv_L_cache: int = 3,
+        num_dense_layers: int = 2,
+        num_experts_per_tok: int = 4,
+        num_experts: int = 32,
+        use_expert_bias: bool = True,
+        routed_scaling_factor: float = 1.0,
+        norm_topk_prob: bool = True,
+        layer_types: Optional[list[str]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.norm_eps = norm_eps
+        # attn operator config
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        # custom operator config
+        self.conv_bias = conv_bias
+        self.conv_L_cache = conv_L_cache
+        # moe config
+        self.num_dense_layers = num_dense_layers
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.use_expert_bias = use_expert_bias
+        self.routed_scaling_factor = routed_scaling_factor
+        self.norm_topk_prob = norm_topk_prob
+        self.layer_types = layer_types
+        tie_word_embeddings = kwargs.get(
+            "tie_embedding", tie_word_embeddings
+        )  # to fit original config keys
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+__all__ = ["Lfm2MoeConfig"]