Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
320feae6
Unverified
Commit
320feae6
authored
Oct 08, 2025
by
Paul Pak
Committed by
GitHub
Oct 07, 2025
Browse files
[Model] Lfm2Moe (#26344)
Signed-off-by:
Paul Pak
<
paulpak58@gmail.com
>
parent
1e4ecca1
Changes
8
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
967 additions
and
7 deletions
+967
-7
docs/models/supported_models.md
docs/models/supported_models.md
+1
-0
tests/models/registry.py
tests/models/registry.py
+3
-0
vllm/model_executor/models/lfm2.py
vllm/model_executor/models/lfm2.py
+2
-7
vllm/model_executor/models/lfm2_moe.py
vllm/model_executor/models/lfm2_moe.py
+797
-0
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+1
-0
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+1
-0
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+2
-0
vllm/transformers_utils/configs/lfm2_moe.py
vllm/transformers_utils/configs/lfm2_moe.py
+160
-0
No files found.
docs/models/supported_models.md
View file @
320feae6
...
@@ -390,6 +390,7 @@ th {
...
@@ -390,6 +390,7 @@ th {
|
`JAISLMHeadModel`
| Jais |
`inceptionai/jais-13b`
,
`inceptionai/jais-13b-chat`
,
`inceptionai/jais-30b-v3`
,
`inceptionai/jais-30b-chat-v3`
, etc. | | ✅︎ | ✅︎ |
|
`JAISLMHeadModel`
| Jais |
`inceptionai/jais-13b`
,
`inceptionai/jais-13b-chat`
,
`inceptionai/jais-30b-v3`
,
`inceptionai/jais-30b-chat-v3`
, etc. | | ✅︎ | ✅︎ |
|
`JambaForCausalLM`
| Jamba |
`ai21labs/AI21-Jamba-1.5-Large`
,
`ai21labs/AI21-Jamba-1.5-Mini`
,
`ai21labs/Jamba-v0.1`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`JambaForCausalLM`
| Jamba |
`ai21labs/AI21-Jamba-1.5-Large`
,
`ai21labs/AI21-Jamba-1.5-Mini`
,
`ai21labs/Jamba-v0.1`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`Lfm2ForCausalLM`
| LFM2 |
`LiquidAI/LFM2-1.2B`
,
`LiquidAI/LFM2-700M`
,
`LiquidAI/LFM2-350M`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`Lfm2ForCausalLM`
| LFM2 |
`LiquidAI/LFM2-1.2B`
,
`LiquidAI/LFM2-700M`
,
`LiquidAI/LFM2-350M`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`Lfm2MoeForCausalLM`
| LFM2MoE |
`LiquidAI/LFM2-8B-A1B-preview`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`LlamaForCausalLM`
| Llama 3.1, Llama 3, Llama 2, LLaMA, Yi |
`meta-llama/Meta-Llama-3.1-405B-Instruct`
,
`meta-llama/Meta-Llama-3.1-70B`
,
`meta-llama/Meta-Llama-3-70B-Instruct`
,
`meta-llama/Llama-2-70b-hf`
,
`01-ai/Yi-34B`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`LlamaForCausalLM`
| Llama 3.1, Llama 3, Llama 2, LLaMA, Yi |
`meta-llama/Meta-Llama-3.1-405B-Instruct`
,
`meta-llama/Meta-Llama-3.1-70B`
,
`meta-llama/Meta-Llama-3-70B-Instruct`
,
`meta-llama/Llama-2-70b-hf`
,
`01-ai/Yi-34B`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`MambaForCausalLM`
| Mamba |
`state-spaces/mamba-130m-hf`
,
`state-spaces/mamba-790m-hf`
,
`state-spaces/mamba-2.8b-hf`
, etc. | | ✅︎ | ✅︎ |
|
`MambaForCausalLM`
| Mamba |
`state-spaces/mamba-130m-hf`
,
`state-spaces/mamba-790m-hf`
,
`state-spaces/mamba-2.8b-hf`
, etc. | | ✅︎ | ✅︎ |
|
`Mamba2ForCausalLM`
| Mamba2 |
`mistralai/Mamba-Codestral-7B-v0.1`
, etc. | | ✅︎ | ✅︎ |
|
`Mamba2ForCausalLM`
| Mamba2 |
`mistralai/Mamba-Codestral-7B-v0.1`
, etc. | | ✅︎ | ✅︎ |
...
...
tests/models/registry.py
View file @
320feae6
...
@@ -321,6 +321,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -321,6 +321,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Lfm2ForCausalLM"
:
_HfExamplesInfo
(
"Lfm2ForCausalLM"
:
_HfExamplesInfo
(
"LiquidAI/LFM2-1.2B"
,
min_transformers_version
=
"4.54"
"LiquidAI/LFM2-1.2B"
,
min_transformers_version
=
"4.54"
),
),
"Lfm2MoeForCausalLM"
:
_HfExamplesInfo
(
"LiquidAI/LFM2-8B-A1B"
,
min_transformers_version
=
"4.58"
),
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
extras
=
{
extras
=
{
...
...
vllm/model_executor/models/lfm2.py
View file @
320feae6
...
@@ -71,14 +71,14 @@ class Lfm2MLP(nn.Module):
...
@@ -71,14 +71,14 @@ class Lfm2MLP(nn.Module):
output_sizes
=
[
ff_dim
]
*
2
,
output_sizes
=
[
ff_dim
]
*
2
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.
gate_up_proj
"
,
prefix
=
f
"
{
prefix
}
.
w1
"
,
)
)
self
.
w2
=
RowParallelLinear
(
self
.
w2
=
RowParallelLinear
(
input_size
=
ff_dim
,
input_size
=
ff_dim
,
output_size
=
dim
,
output_size
=
dim
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.
down_proj
"
,
prefix
=
f
"
{
prefix
}
.
w2
"
,
)
)
self
.
act_fn
=
SiluAndMul
()
self
.
act_fn
=
SiluAndMul
()
...
@@ -484,17 +484,12 @@ class Lfm2ForCausalLM(
...
@@ -484,17 +484,12 @@ class Lfm2ForCausalLM(
quant_config
=
vllm_config
.
quant_config
quant_config
=
vllm_config
.
quant_config
cache_config
=
vllm_config
.
cache_config
cache_config
=
vllm_config
.
cache_config
lora_config
=
vllm_config
.
lora_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
assert
not
cache_config
.
enable_prefix_caching
,
(
assert
not
cache_config
.
enable_prefix_caching
,
(
"Lfm2 currently does not support prefix caching"
"Lfm2 currently does not support prefix caching"
)
)
super
().
__init__
()
super
().
__init__
()
self
.
config
=
config
self
.
config
=
config
self
.
vllm_config
=
vllm_config
self
.
scheduler_config
=
scheduler_config
self
.
model_config
=
vllm_config
.
model_config
self
.
model
=
Lfm2Model
(
self
.
model
=
Lfm2Model
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
)
)
)
...
...
vllm/model_executor/models/lfm2_moe.py
0 → 100644
View file @
320feae6
This diff is collapsed.
Click to expand it.
vllm/model_executor/models/registry.py
View file @
320feae6
...
@@ -119,6 +119,7 @@ _TEXT_GENERATION_MODELS = {
...
@@ -119,6 +119,7 @@ _TEXT_GENERATION_MODELS = {
"JAISLMHeadModel"
:
(
"jais"
,
"JAISLMHeadModel"
),
"JAISLMHeadModel"
:
(
"jais"
,
"JAISLMHeadModel"
),
"JambaForCausalLM"
:
(
"jamba"
,
"JambaForCausalLM"
),
"JambaForCausalLM"
:
(
"jamba"
,
"JambaForCausalLM"
),
"Lfm2ForCausalLM"
:
(
"lfm2"
,
"Lfm2ForCausalLM"
),
"Lfm2ForCausalLM"
:
(
"lfm2"
,
"Lfm2ForCausalLM"
),
"Lfm2MoeForCausalLM"
:
(
"lfm2_moe"
,
"Lfm2MoeForCausalLM"
),
"LlamaForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"LlamaForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"Llama4ForCausalLM"
:
(
"llama4"
,
"Llama4ForCausalLM"
),
"Llama4ForCausalLM"
:
(
"llama4"
,
"Llama4ForCausalLM"
),
# For decapoda-research/llama-*
# For decapoda-research/llama-*
...
...
vllm/transformers_utils/config.py
View file @
320feae6
...
@@ -91,6 +91,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
...
@@ -91,6 +91,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
step3_vl
=
"Step3VLConfig"
,
step3_vl
=
"Step3VLConfig"
,
step3_text
=
"Step3TextConfig"
,
step3_text
=
"Step3TextConfig"
,
qwen3_next
=
"Qwen3NextConfig"
,
qwen3_next
=
"Qwen3NextConfig"
,
lfm2_moe
=
"Lfm2MoeConfig"
,
)
)
_CONFIG_ATTRS_MAPPING
:
dict
[
str
,
str
]
=
{
_CONFIG_ATTRS_MAPPING
:
dict
[
str
,
str
]
=
{
...
...
vllm/transformers_utils/configs/__init__.py
View file @
320feae6
...
@@ -19,6 +19,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
...
@@ -19,6 +19,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
from
vllm.transformers_utils.configs.falcon
import
RWConfig
from
vllm.transformers_utils.configs.falcon
import
RWConfig
from
vllm.transformers_utils.configs.jais
import
JAISConfig
from
vllm.transformers_utils.configs.jais
import
JAISConfig
from
vllm.transformers_utils.configs.kimi_vl
import
KimiVLConfig
from
vllm.transformers_utils.configs.kimi_vl
import
KimiVLConfig
from
vllm.transformers_utils.configs.lfm2_moe
import
Lfm2MoeConfig
from
vllm.transformers_utils.configs.medusa
import
MedusaConfig
from
vllm.transformers_utils.configs.medusa
import
MedusaConfig
from
vllm.transformers_utils.configs.midashenglm
import
MiDashengLMConfig
from
vllm.transformers_utils.configs.midashenglm
import
MiDashengLMConfig
from
vllm.transformers_utils.configs.mlp_speculator
import
MLPSpeculatorConfig
from
vllm.transformers_utils.configs.mlp_speculator
import
MLPSpeculatorConfig
...
@@ -46,6 +47,7 @@ __all__ = [
...
@@ -46,6 +47,7 @@ __all__ = [
"EAGLEConfig"
,
"EAGLEConfig"
,
"RWConfig"
,
"RWConfig"
,
"JAISConfig"
,
"JAISConfig"
,
"Lfm2MoeConfig"
,
"MedusaConfig"
,
"MedusaConfig"
,
"MiDashengLMConfig"
,
"MiDashengLMConfig"
,
"MLPSpeculatorConfig"
,
"MLPSpeculatorConfig"
,
...
...
vllm/transformers_utils/configs/lfm2_moe.py
0 → 100644
View file @
320feae6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
from
transformers.configuration_utils
import
PretrainedConfig
class Lfm2MoeConfig(PretrainedConfig):
    r"""
    Configuration holder for a [`Lfm2MoeModel`].

    The arguments describe the architecture of an LFM2 MoE model. Building
    the configuration with every default yields a setup similar to the
    LFM2-8B-A1B model, e.g.
    [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B).

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. See the [`PretrainedConfig`] documentation
    for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 65536):
            Vocabulary size of the LFM2 MoE model, i.e. the number of
            distinct tokens that can be represented by the `input_ids`
            passed when calling [`Lfm2MoeModel`].
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 7168):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1792):
            Intermediate size of the routed expert.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings. A `tie_embedding` entry in
            `kwargs` (the original config key) overrides this value.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        max_position_embeddings (`int`, *optional*, defaults to 128000):
            The maximum sequence length that this model might ever be
            used with.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            Number of key_value heads used to implement Grouped Query
            Attention. With `num_key_value_heads=num_attention_heads` the
            model uses Multi Head Attention (MHA), with
            `num_key_value_heads=1` it uses Multi Query Attention (MQA),
            otherwise GQA is used. When converting a multi-head checkpoint
            to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that
            group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245).
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the conv layers.
        conv_L_cache (`int`, *optional*, defaults to 3):
            L_cache dim in the conv layers.
        num_dense_layers (`int`, *optional*, defaults to 2):
            Number of dense Lfm2MoeMLP layers in the shallow layers
            (embed->dense->dense->...->dense->moe->moe...->lm_head).
        num_experts_per_tok (`int`, *optional*, defaults to 4):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 32):
            Number of routed experts.
        use_expert_bias (`bool`, *optional*, defaults to `True`):
            Whether to use the expert bias on the routing weights.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for routed experts in MoE models.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        layer_types (`list[str]`, *optional*):
            Type of each layer.

    ```python
    >>> from transformers import Lfm2MoeModel, Lfm2MoeConfig

    >>> # Initializing a LFM2 Moe model
    >>> configuration = Lfm2MoeConfig()

    >>> # Initializing a model from the LFM2-8B-A1B style configuration
    >>> model = Lfm2MoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "lfm2_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size: int = 65536,
        hidden_size: int = 2048,
        intermediate_size: int = 7168,
        moe_intermediate_size: int = 1792,
        num_hidden_layers: int = 32,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = True,
        rope_theta: float = 1000000.0,
        max_position_embeddings: int = 128_000,
        use_cache: bool = True,
        norm_eps: float = 0.00001,
        num_attention_heads: int = 32,
        num_key_value_heads: int = 8,
        conv_bias: bool = False,
        conv_L_cache: int = 3,
        num_dense_layers: int = 2,
        num_experts_per_tok: int = 4,
        num_experts: int = 32,
        use_expert_bias: bool = True,
        routed_scaling_factor: float = 1.0,
        norm_topk_prob: bool = True,
        layer_types: Optional[list[str]] = None,
        **kwargs,
    ):
        # General backbone hyper-parameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.use_cache = use_cache
        self.norm_eps = norm_eps

        # Attention operator settings.
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        # Custom (convolution) operator settings.
        self.conv_bias = conv_bias
        self.conv_L_cache = conv_L_cache

        # Mixture-of-experts settings.
        self.num_dense_layers = num_dense_layers
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.use_expert_bias = use_expert_bias
        self.routed_scaling_factor = routed_scaling_factor
        self.norm_topk_prob = norm_topk_prob
        self.layer_types = layer_types

        # Original configs spell the tying flag "tie_embedding"; when that key
        # is supplied it wins over the `tie_word_embeddings` argument. The key
        # is left in `kwargs` and is still forwarded to the base initializer.
        if "tie_embedding" in kwargs:
            tie_word_embeddings = kwargs["tie_embedding"]

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
# Names exported by this module via `from ... import *`.
__all__ = ["Lfm2MoeConfig"]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment