Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
43ecd0a9
Unverified
Commit
43ecd0a9
authored
Nov 06, 2025
by
Isotr0py
Committed by
GitHub
Nov 06, 2025
Browse files
[Chore] Clean up deepseek v2/v3 config copy (#28055)
Signed-off-by:
Isotr0py
<
mozf@mail2.sysu.edu.cn
>
parent
07d61451
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
15 additions
and
222 deletions
+15
-222
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek.py
+2
-1
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+2
-1
vllm/model_executor/models/kimi_vl.py
vllm/model_executor/models/kimi_vl.py
+1
-2
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+6
-4
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+0
-2
vllm/transformers_utils/configs/deepseek_v3.py
vllm/transformers_utils/configs/deepseek_v3.py
+0
-100
vllm/transformers_utils/configs/deepseek_vl2.py
vllm/transformers_utils/configs/deepseek_vl2.py
+1
-101
vllm/transformers_utils/configs/eagle.py
vllm/transformers_utils/configs/eagle.py
+2
-10
vllm/transformers_utils/configs/kimi_vl.py
vllm/transformers_utils/configs/kimi_vl.py
+1
-1
No files found.
vllm/model_executor/models/deepseek.py
View file @
43ecd0a9
...
...
@@ -292,6 +292,7 @@ class DeepseekDecoderLayer(nn.Module):
rope_theta
=
getattr
(
config
,
"rope_theta"
,
10000
)
rope_scaling
=
getattr
(
config
,
"rope_scaling"
,
None
)
max_position_embeddings
=
getattr
(
config
,
"max_position_embeddings"
,
8192
)
moe_layer_freq
=
getattr
(
config
,
"moe_layer_freq"
,
1
)
self
.
self_attn
=
DeepseekAttention
(
hidden_size
=
self
.
hidden_size
,
num_heads
=
config
.
num_attention_heads
,
...
...
@@ -306,7 +307,7 @@ class DeepseekDecoderLayer(nn.Module):
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
and
layer_idx
%
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekMoE
(
config
=
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
43ecd0a9
...
...
@@ -994,6 +994,7 @@ class DeepseekV2DecoderLayer(nn.Module):
rope_theta
=
getattr
(
config
,
"rope_theta"
,
10000
)
rope_scaling
=
getattr
(
config
,
"rope_scaling"
,
None
)
max_position_embeddings
=
getattr
(
config
,
"max_position_embeddings"
,
8192
)
moe_layer_freq
=
getattr
(
config
,
"moe_layer_freq"
,
1
)
# DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index.
layer_idx
=
int
(
prefix
.
split
(
sep
=
"."
)[
-
1
])
...
...
@@ -1024,7 +1025,7 @@ class DeepseekV2DecoderLayer(nn.Module):
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
and
layer_idx
%
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekV2MoE
(
config
=
config
,
...
...
vllm/model_executor/models/kimi_vl.py
View file @
43ecd0a9
...
...
@@ -50,7 +50,7 @@ from typing import Annotated, Any, Literal
import
torch
from
torch
import
nn
from
transformers
import
BatchFeature
from
transformers
import
BatchFeature
,
DeepseekV2Config
from
transformers.activations
import
GELUActivation
from
vllm.config
import
VllmConfig
...
...
@@ -91,7 +91,6 @@ from vllm.multimodal.processing import (
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
KimiVLConfig
,
MoonViTConfig
from
vllm.transformers_utils.configs.deepseek_vl2
import
DeepseekV2Config
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.utils
import
PPMissingLayer
,
is_pp_missing_parameter
,
maybe_prefix
...
...
vllm/transformers_utils/config.py
View file @
43ecd0a9
...
...
@@ -24,7 +24,7 @@ from huggingface_hub.utils import (
RepositoryNotFoundError
,
RevisionNotFoundError
,
)
from
transformers
import
GenerationConfig
,
PretrainedConfig
from
transformers
import
DeepseekV3Config
,
GenerationConfig
,
PretrainedConfig
from
transformers.models.auto.image_processing_auto
import
get_image_processor_config
from
transformers.models.auto.modeling_auto
import
(
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
,
...
...
@@ -68,16 +68,18 @@ def _get_hf_token() -> str | None:
class
LazyConfigDict
(
dict
):
def
__getitem__
(
self
,
key
):
if
isinstance
(
value
:
=
super
().
__getitem__
(
key
),
type
):
return
value
import
vllm.transformers_utils.configs
as
configs
return
getattr
(
configs
,
super
().
__getitem__
(
key
)
)
return
getattr
(
configs
,
value
)
_CONFIG_REGISTRY
:
dict
[
str
,
type
[
PretrainedConfig
]]
=
LazyConfigDict
(
chatglm
=
"ChatGLMConfig"
,
deepseek_vl_v2
=
"DeepseekVLV2Config"
,
deepseek_v3
=
"DeepseekV3Config"
,
deepseek_v32
=
"DeepseekV3Config"
,
deepseek_v32
=
DeepseekV3Config
,
flex_olmo
=
"FlexOlmoConfig"
,
kimi_linear
=
"KimiLinearConfig"
,
kimi_vl
=
"KimiVLConfig"
,
...
...
vllm/transformers_utils/configs/__init__.py
View file @
43ecd0a9
...
...
@@ -8,7 +8,6 @@ Model configs may be defined in this directory for the following reasons:
"""
from
vllm.transformers_utils.configs.chatglm
import
ChatGLMConfig
from
vllm.transformers_utils.configs.deepseek_v3
import
DeepseekV3Config
from
vllm.transformers_utils.configs.deepseek_vl2
import
DeepseekVLV2Config
from
vllm.transformers_utils.configs.dotsocr
import
DotsOCRConfig
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
...
...
@@ -43,7 +42,6 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig
__all__
=
[
"ChatGLMConfig"
,
"DeepseekVLV2Config"
,
"DeepseekV3Config"
,
"DotsOCRConfig"
,
"EAGLEConfig"
,
"FlexOlmoConfig"
,
...
...
vllm/transformers_utils/configs/deepseek_v3.py
deleted
100644 → 0
View file @
07d61451
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
class
DeepseekV3Config
(
PretrainedConfig
):
model_type
=
"deepseek_v3"
keys_to_ignore_at_inference
=
[
"past_key_values"
]
def
__init__
(
self
,
vocab_size
=
129280
,
hidden_size
=
7168
,
intermediate_size
=
18432
,
moe_intermediate_size
=
2048
,
num_hidden_layers
=
61
,
num_nextn_predict_layers
=
1
,
num_attention_heads
=
128
,
num_key_value_heads
=
128
,
n_shared_experts
=
1
,
n_routed_experts
=
256
,
ep_size
=
1
,
routed_scaling_factor
=
2.5
,
kv_lora_rank
=
512
,
q_lora_rank
=
1536
,
qk_rope_head_dim
=
64
,
v_head_dim
=
128
,
qk_nope_head_dim
=
128
,
topk_method
=
"noaux_tc"
,
n_group
=
8
,
topk_group
=
4
,
num_experts_per_tok
=
8
,
moe_layer_freq
=
1
,
first_k_dense_replace
=
3
,
norm_topk_prob
=
True
,
scoring_func
=
"sigmoid"
,
hidden_act
=
"silu"
,
max_position_embeddings
=
4096
,
initializer_range
=
0.02
,
rms_norm_eps
=
1e-6
,
use_cache
=
True
,
pad_token_id
=
None
,
bos_token_id
=
0
,
eos_token_id
=
1
,
tie_word_embeddings
=
False
,
rope_theta
=
10000.0
,
rope_scaling
=
None
,
attention_bias
=
False
,
attention_dropout
=
0.0
,
**
kwargs
,
):
self
.
vocab_size
=
vocab_size
self
.
max_position_embeddings
=
max_position_embeddings
self
.
hidden_size
=
hidden_size
self
.
intermediate_size
=
intermediate_size
self
.
moe_intermediate_size
=
moe_intermediate_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_nextn_predict_layers
=
num_nextn_predict_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
n_shared_experts
=
n_shared_experts
self
.
n_routed_experts
=
n_routed_experts
self
.
ep_size
=
ep_size
self
.
routed_scaling_factor
=
routed_scaling_factor
self
.
kv_lora_rank
=
kv_lora_rank
self
.
q_lora_rank
=
q_lora_rank
self
.
qk_rope_head_dim
=
qk_rope_head_dim
self
.
v_head_dim
=
v_head_dim
self
.
qk_nope_head_dim
=
qk_nope_head_dim
self
.
topk_method
=
topk_method
self
.
n_group
=
n_group
self
.
topk_group
=
topk_group
self
.
num_experts_per_tok
=
num_experts_per_tok
self
.
moe_layer_freq
=
moe_layer_freq
self
.
first_k_dense_replace
=
first_k_dense_replace
self
.
norm_topk_prob
=
norm_topk_prob
self
.
scoring_func
=
scoring_func
# for backward compatibility
if
num_key_value_heads
is
None
:
num_key_value_heads
=
num_attention_heads
self
.
num_key_value_heads
=
num_key_value_heads
self
.
hidden_act
=
hidden_act
self
.
initializer_range
=
initializer_range
self
.
rms_norm_eps
=
rms_norm_eps
self
.
use_cache
=
use_cache
self
.
rope_theta
=
rope_theta
self
.
rope_scaling
=
rope_scaling
self
.
attention_bias
=
attention_bias
self
.
attention_dropout
=
attention_dropout
super
().
__init__
(
pad_token_id
=
pad_token_id
,
bos_token_id
=
bos_token_id
,
eos_token_id
=
eos_token_id
,
tie_word_embeddings
=
tie_word_embeddings
,
**
kwargs
,
)
vllm/transformers_utils/configs/deepseek_vl2.py
View file @
43ecd0a9
...
...
@@ -3,7 +3,7 @@
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
from
transformers
.configuration_utils
import
PretrainedConfig
from
transformers
import
DeepseekV2Config
,
PretrainedConfig
class
VisionEncoderConfig
(
PretrainedConfig
):
...
...
@@ -87,106 +87,6 @@ class MlpProjectorConfig(PretrainedConfig):
super
().
__init__
(
**
kwargs
)
class
DeepseekV2Config
(
PretrainedConfig
):
model_type
=
"deepseek_v2"
keys_to_ignore_at_inference
=
[
"past_key_values"
]
def
__init__
(
self
,
vocab_size
=
102400
,
hidden_size
=
4096
,
intermediate_size
=
11008
,
moe_intermediate_size
=
1407
,
num_hidden_layers
=
30
,
num_attention_heads
=
32
,
num_key_value_heads
=
32
,
n_shared_experts
=
None
,
n_routed_experts
=
None
,
ep_size
=
1
,
routed_scaling_factor
=
1.0
,
kv_lora_rank
=
512
,
q_lora_rank
=
1536
,
qk_rope_head_dim
=
64
,
v_head_dim
=
128
,
qk_nope_head_dim
=
128
,
topk_method
=
"gready"
,
n_group
=
None
,
topk_group
=
None
,
num_experts_per_tok
=
None
,
moe_layer_freq
=
1
,
first_k_dense_replace
=
0
,
norm_topk_prob
=
False
,
scoring_func
=
"softmax"
,
aux_loss_alpha
=
0.001
,
seq_aux
=
True
,
hidden_act
=
"silu"
,
max_position_embeddings
=
2048
,
initializer_range
=
0.02
,
rms_norm_eps
=
1e-6
,
use_cache
=
True
,
pad_token_id
=
None
,
bos_token_id
=
100000
,
eos_token_id
=
100001
,
pretraining_tp
=
1
,
tie_word_embeddings
=
False
,
rope_theta
=
10000.0
,
rope_scaling
=
None
,
attention_bias
=
False
,
attention_dropout
=
0.0
,
use_mla
=
True
,
**
kwargs
,
):
self
.
vocab_size
=
vocab_size
self
.
max_position_embeddings
=
max_position_embeddings
self
.
hidden_size
=
hidden_size
self
.
intermediate_size
=
intermediate_size
self
.
moe_intermediate_size
=
moe_intermediate_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
n_shared_experts
=
n_shared_experts
self
.
n_routed_experts
=
n_routed_experts
self
.
ep_size
=
ep_size
self
.
routed_scaling_factor
=
routed_scaling_factor
self
.
kv_lora_rank
=
kv_lora_rank
self
.
q_lora_rank
=
q_lora_rank
self
.
qk_rope_head_dim
=
qk_rope_head_dim
self
.
v_head_dim
=
v_head_dim
self
.
qk_nope_head_dim
=
qk_nope_head_dim
self
.
topk_method
=
topk_method
self
.
n_group
=
n_group
self
.
topk_group
=
topk_group
self
.
num_experts_per_tok
=
num_experts_per_tok
self
.
moe_layer_freq
=
moe_layer_freq
self
.
first_k_dense_replace
=
first_k_dense_replace
self
.
norm_topk_prob
=
norm_topk_prob
self
.
scoring_func
=
scoring_func
self
.
aux_loss_alpha
=
aux_loss_alpha
self
.
seq_aux
=
seq_aux
# for backward compatibility
if
num_key_value_heads
is
None
:
num_key_value_heads
=
num_attention_heads
self
.
num_key_value_heads
=
num_key_value_heads
self
.
hidden_act
=
hidden_act
self
.
initializer_range
=
initializer_range
self
.
rms_norm_eps
=
float
(
rms_norm_eps
)
self
.
pretraining_tp
=
pretraining_tp
self
.
use_cache
=
use_cache
self
.
rope_theta
=
rope_theta
self
.
rope_scaling
=
rope_scaling
self
.
attention_bias
=
attention_bias
self
.
attention_dropout
=
attention_dropout
self
.
use_mla
=
use_mla
super
().
__init__
(
pad_token_id
=
pad_token_id
,
bos_token_id
=
bos_token_id
,
eos_token_id
=
eos_token_id
,
tie_word_embeddings
=
tie_word_embeddings
,
**
kwargs
,
)
class
DeepseekVLV2Config
(
PretrainedConfig
):
model_type
=
"deepseek_vl_v2"
vision_config
:
VisionEncoderConfig
...
...
vllm/transformers_utils/configs/eagle.py
View file @
43ecd0a9
...
...
@@ -3,9 +3,7 @@
import
os
from
transformers
import
AutoConfig
,
PretrainedConfig
from
vllm.transformers_utils.configs.deepseek_vl2
import
DeepseekV2Config
from
transformers
import
AutoConfig
,
DeepseekV2Config
,
PretrainedConfig
class
EAGLEConfig
(
PretrainedConfig
):
...
...
@@ -20,12 +18,6 @@ class EAGLEConfig(PretrainedConfig):
):
model_config
:
PretrainedConfig
|
DeepseekV2Config
|
None
if
isinstance
(
model
,
dict
):
archs
=
model
.
get
(
"architectures"
,
[])
target_archs
=
[
"DeepseekV2ForCausalLM"
,
"DeepseekV3ForCausalLM"
]
if
any
(
target_arch
in
archs
for
target_arch
in
target_archs
):
# AutoConfig does not support DeepSeek MoE models yet
model_config
=
DeepseekV2Config
(
**
model
)
else
:
model_config
=
AutoConfig
.
for_model
(
**
model
)
else
:
model_config
=
model
...
...
vllm/transformers_utils/configs/kimi_vl.py
View file @
43ecd0a9
...
...
@@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from
transformers
import
DeepseekV2Config
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.transformers_utils.configs.deepseek_vl2
import
DeepseekV2Config
from
vllm.transformers_utils.configs.moonvit
import
MoonViTConfig
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment