Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2805c93e
Commit
2805c93e
authored
Oct 15, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev' into v0.9.2-dev-ds
parents
c8de4a43
1b98d0bb
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
35 additions
and
5 deletions
+35
-5
README.md
README.md
+1
-0
setup.py
setup.py
+2
-2
vllm/config.py
vllm/config.py
+4
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+8
-0
vllm/envs.py
vllm/envs.py
+2
-2
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+18
-1
No files found.
README.md
View file @
2805c93e
...
@@ -19,6 +19,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
...
@@ -19,6 +19,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
| Qwen3MoeForCausalLM | QWen3MoE | Yes | - | - | v0.8.4 | Yes |
| Qwen3MoeForCausalLM | QWen3MoE | Yes | - | - | v0.8.4 | Yes |
| ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes | No | Yes | v0.5.0 | Yes |
| ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes | No | Yes | v0.5.0 | Yes |
| Glm4ForCausalLM | GLM-4-0414 | No/Yes | - | - | v0.8.5.post1 | Yes |
| Glm4ForCausalLM | GLM-4-0414 | No/Yes | - | - | v0.8.5.post1 | Yes |
| Glm4MoeForCausalLM | GLM-4.5,GLM-4.5-Air | No/Yes | - | - | v0.9.2 | Yes |
| DeepseekForCausalLM | Deepseek | Yes | No | - | v0.5.0 | Yes |
| DeepseekForCausalLM | Deepseek | Yes | No | - | v0.5.0 | Yes |
| DeepseekV2ForCausalLM | DeepSeek-V2 | Yes | No | - | v0.6.2 | Yes |
| DeepseekV2ForCausalLM | DeepSeek-V2 | Yes | No | - | v0.6.2 | Yes |
| DeepseekVLV2ForCausalLM | DeepSeek-VL2 | Yes | No | - | v0.7.2 | Yes |
| DeepseekVLV2ForCausalLM | DeepSeek-VL2 | Yes | No | - | v0.7.2 | Yes |
...
...
setup.py
View file @
2805c93e
...
@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
...
@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
if
sha
is
None
:
if
sha
is
None
:
sha
=
get_sha
(
vllm_root
)
sha
=
get_sha
(
vllm_root
)
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
version
=
'das.opt1.
rc2.
'
+
sha
[:
7
]
version
=
'das.opt1.'
+
sha
[:
7
]
else
:
else
:
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
version
=
'das.opt1
.rc2
'
version
=
'das.opt1'
# dtk version
# dtk version
...
...
vllm/config.py
View file @
2805c93e
...
@@ -418,6 +418,9 @@ class ModelConfig:
...
@@ -418,6 +418,9 @@ class ModelConfig:
- "transformers" will use the Transformers model implementation."""
- "transformers" will use the Transformers model implementation."""
override_attention_dtype
:
Optional
[
str
]
=
None
override_attention_dtype
:
Optional
[
str
]
=
None
"""Override dtype for attention"""
"""Override dtype for attention"""
enable_chunked_prefill
:
Optional
[
bool
]
=
None
"""If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens."""
def
compute_hash
(
self
)
->
str
:
def
compute_hash
(
self
)
->
str
:
"""
"""
...
@@ -448,6 +451,7 @@ class ModelConfig:
...
@@ -448,6 +451,7 @@ class ModelConfig:
factors
.
append
(
self
.
rope_theta
)
factors
.
append
(
self
.
rope_theta
)
# hf_config can control how the model looks!
# hf_config can control how the model looks!
factors
.
append
(
self
.
hf_config
.
to_json_string
())
factors
.
append
(
self
.
hf_config
.
to_json_string
())
factors
.
append
(
self
.
enable_chunked_prefill
)
str_factors
=
str
(
factors
)
str_factors
=
str
(
factors
)
assert_hashable
(
str_factors
)
assert_hashable
(
str_factors
)
return
hashlib
.
sha256
(
str
(
factors
).
encode
()).
hexdigest
()
return
hashlib
.
sha256
(
str
(
factors
).
encode
()).
hexdigest
()
...
...
vllm/engine/arg_utils.py
View file @
2805c93e
...
@@ -1004,6 +1004,7 @@ class EngineArgs:
...
@@ -1004,6 +1004,7 @@ class EngineArgs:
enable_sleep_mode
=
self
.
enable_sleep_mode
,
enable_sleep_mode
=
self
.
enable_sleep_mode
,
model_impl
=
self
.
model_impl
,
model_impl
=
self
.
model_impl
,
override_attention_dtype
=
self
.
override_attention_dtype
,
override_attention_dtype
=
self
.
override_attention_dtype
,
enable_chunked_prefill
=
self
.
enable_chunked_prefill
,
)
)
def
create_load_config
(
self
)
->
LoadConfig
:
def
create_load_config
(
self
)
->
LoadConfig
:
...
@@ -1593,6 +1594,9 @@ class EngineArgs:
...
@@ -1593,6 +1594,9 @@ class EngineArgs:
# For pooling tasks the default is False
# For pooling tasks the default is False
if
model_config
.
runner_type
!=
"pooling"
:
if
model_config
.
runner_type
!=
"pooling"
:
self
.
enable_chunked_prefill
=
True
self
.
enable_chunked_prefill
=
True
if
model_config
.
enable_chunked_prefill
is
not
None
and
\
model_config
.
enable_chunked_prefill
is
False
:
self
.
enable_chunked_prefill
=
False
if
self
.
enable_prefix_caching
is
None
:
if
self
.
enable_prefix_caching
is
None
:
self
.
enable_prefix_caching
=
True
self
.
enable_prefix_caching
=
True
else
:
else
:
...
@@ -1606,6 +1610,10 @@ class EngineArgs:
...
@@ -1606,6 +1610,10 @@ class EngineArgs:
action
=
"Enabling"
if
\
action
=
"Enabling"
if
\
incremental_prefill_supported
else
"Disabling"
incremental_prefill_supported
else
"Disabling"
if
model_config
.
enable_chunked_prefill
is
not
None
and
\
model_config
.
enable_chunked_prefill
is
False
:
self
.
enable_chunked_prefill
=
False
if
self
.
enable_chunked_prefill
is
None
:
if
self
.
enable_chunked_prefill
is
None
:
self
.
enable_chunked_prefill
=
incremental_prefill_supported
self
.
enable_chunked_prefill
=
incremental_prefill_supported
...
...
vllm/envs.py
View file @
2805c93e
...
@@ -1113,11 +1113,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1113,11 +1113,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use lightop moe_sum
# vLLM will use lightop moe_sum
"VLLM_USE_LIGHTOP_MOE_SUM"
:
"VLLM_USE_LIGHTOP_MOE_SUM"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_SUM"
,
"
Fals
e"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_SUM"
,
"
Tru
e"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use lightop moe_align_block_size
# vLLM will use lightop moe_align_block_size
"VLLM_USE_LIGHTOP_MOE_ALIGN"
:
"VLLM_USE_LIGHTOP_MOE_ALIGN"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
,
"
Fals
e"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
,
"
Tru
e"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use opt merge_attn_states, not triton
# vLLM will use opt merge_attn_states, not triton
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
2805c93e
...
@@ -251,6 +251,23 @@ class DeepseekV2MoE(nn.Module):
...
@@ -251,6 +251,23 @@ class DeepseekV2MoE(nn.Module):
else
:
else
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
router_logits
=
router_logits
)
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
final_hidden_states
+
shared_output
else
:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
if
self
.
tp_size
>
1
:
if
envs
.
VLLM_ENABLE_TBO
:
final_hidden_states
=
self
.
tbo_all_reduce
(
final_hidden_states
)
else
:
final_hidden_states
=
(
self
.
experts
.
maybe_all_reduce_tensor_model_parallel
(
final_hidden_states
))
if
not
self
.
use_mori_ep
:
if
not
self
.
use_mori_ep
:
if
self
.
tp_size
>
1
:
if
self
.
tp_size
>
1
:
...
@@ -721,7 +738,7 @@ class DeepseekV2DecoderLayer(nn.Module):
...
@@ -721,7 +738,7 @@ class DeepseekV2DecoderLayer(nn.Module):
residual
=
residual
residual
=
residual
)
)
residual
=
new_residual
residual
=
new_residual
if
hidden_states
.
dtype
==
torch
.
float16
:
if
hidden_states
.
dtype
==
torch
.
float16
:
# rmsnorm, and rmsnorm result would not be affected by scale.
# rmsnorm, and rmsnorm result would not be affected by scale.
hidden_states
*=
1.
/
self
.
routed_scaling_factor
hidden_states
*=
1.
/
self
.
routed_scaling_factor
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment