Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0e607f8e
Commit
0e607f8e
authored
Jan 14, 2026
by
zhuwenwen
Browse files
fix tests of kernels
set VLLM_USE_PD_SPLIT=1 update moe_align_block_size
parent
cbdc58ec
Changes
50
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
39 additions
and
32 deletions
+39
-32
tests/kernels/untest_flex_attention.py
tests/kernels/untest_flex_attention.py
+0
-0
tests/kernels/untest_onednn.py
tests/kernels/untest_onednn.py
+0
-0
tests/kernels/untest_shuffle_rows.py
tests/kernels/untest_shuffle_rows.py
+0
-0
tests/models/registry.py
tests/models/registry.py
+3
-3
vllm/envs.py
vllm/envs.py
+13
-1
vllm/model_executor/layers/fused_moe/moe_align_block_size.py
vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+2
-2
vllm/model_executor/layers/rotary_embedding/__init__.py
vllm/model_executor/layers/rotary_embedding/__init__.py
+3
-9
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+8
-8
vllm/utils/__init__.py
vllm/utils/__init__.py
+2
-2
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+8
-7
No files found.
tests/kernels/test_flex_attention.py
→
tests/kernels/
un
test_flex_attention.py
View file @
0e607f8e
File moved
tests/kernels/test_onednn.py
→
tests/kernels/
un
test_onednn.py
View file @
0e607f8e
File moved
tests/kernels/test_shuffle_rows.py
→
tests/kernels/
un
test_shuffle_rows.py
View file @
0e607f8e
File moved
tests/models/registry.py
View file @
0e607f8e
...
...
@@ -401,7 +401,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"royokong/e5-v"
)),
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"TIGER-Lab/VLM2Vec-Full"
),
trust_remote_code
=
True
),
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"MrLight/dse-qwen2-2b-mrl-v1"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"MrLight/dse-qwen2-2b-mrl-v1"
)
)
,
# noqa: E501
"PrithviGeoSpatialMAE"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
),
# noqa: E501
dtype
=
torch
.
float16
,
enforce_eager
=
True
,
...
...
@@ -656,9 +656,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"meituan-longcat/LongCat-Flash-Chat"
),
trust_remote_code
=
True
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"meituan-longcat/LongCat-Flash-Chat"
)),
"MiMoMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
)
)
,
"MiMoMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
),
trust_remote_code
=
True
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
)
)
,
"Qwen3NextMTP"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen3-Next-80B-A3B-Instruct"
),
min_transformers_version
=
"4.56.3"
),
}
...
...
vllm/envs.py
View file @
0e607f8e
...
...
@@ -233,6 +233,8 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_MOE_SUM
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_ALIGN
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
False
USE_FUSED_SILU_MUL_QUANT
:
bool
=
False
VLLM_USE_PD_SPLIT
:
bool
=
False
VLLM_USE_PP_SYNC
:
bool
=
False
VLLM_USE_PIECEWISE
:
bool
=
False
...
...
@@ -1635,9 +1637,19 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_MERGE_ATTN_STATES_OPT"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
# vllm will use rmsquant fused op
"USE_FUSED_RMS_QUANT"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_RMS_QUANT"
,
"0"
))),
# vllm will use silu_mul_quant fused op,
# This variable has a default value of true,
# but it is still controlled by CRQ and RQ.
"USE_FUSED_SILU_MUL_QUANT"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_SILU_MUL_QUANT"
,
"0"
))),
# vLLM will split prefill and decode, not mix up
"VLLM_USE_PD_SPLIT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"
Fals
e"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"
Tru
e"
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will sync to avoid pp vmfault
"VLLM_USE_PP_SYNC"
:
...
...
vllm/model_executor/layers/fused_moe/moe_align_block_size.py
View file @
0e607f8e
...
...
@@ -102,7 +102,7 @@ def moe_align_block_size(
expert_map
=
expert_map
,
expert_mask
=
expert_mask
,
num_local_tokens
=
None
,
Is_fuse_fill
=
Fals
e
)
Is_fuse_fill
=
Tru
e
)
else
:
if
envs
.
VLLM_USE_LIGHTOP_MOE_ALIGN
:
from
lightop
import
op
as
op
...
...
@@ -111,7 +111,7 @@ def moe_align_block_size(
expert_map
=
None
,
expert_mask
=
None
,
num_local_tokens
=
None
,
Is_fuse_fill
=
Fals
e
)
Is_fuse_fill
=
Tru
e
)
else
:
ops
.
moe_align_block_size
(
topk_ids
,
num_experts
,
block_size
,
sorted_ids
,
expert_ids
,
num_tokens_post_pad
)
...
...
vllm/model_executor/layers/rotary_embedding/__init__.py
View file @
0e607f8e
...
...
@@ -137,15 +137,9 @@ def get_rope(
scaling_alpha
,
dtype
)
elif
"factor"
in
rope_scaling
:
scaling_factor
=
rope_scaling
[
"factor"
]
scaling_alpha
=
rope_scaling
[
"alpha"
]
if
scaling_alpha
:
rotary_emb
=
DynamicNTKAlphaRotaryEmbedding
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
,
scaling_alpha
,
dtype
)
else
:
rotary_emb
=
DynamicNTKScalingRotaryEmbedding
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
,
scaling_factor
,
dtype
)
rotary_emb
=
DynamicNTKScalingRotaryEmbedding
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
,
scaling_factor
,
dtype
)
else
:
raise
ValueError
(
"Dynamic rope scaling must contain either "
"'alpha' or 'factor' field"
)
...
...
vllm/model_executor/model_loader/utils.py
View file @
0e607f8e
...
...
@@ -199,11 +199,11 @@ def _get_model_architecture(
if
not
envs
.
is_set
(
"VLLM_USE_OPT_CAT"
):
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
else
:
if
not
envs
.
is_set
(
"VLLM_USE_PD_SPLIT"
):
os
.
environ
[
'VLLM_USE_PD_SPLIT'
]
=
'1'
#
if not envs.is_set("VLLM_USE_PD_SPLIT"):
#
os.environ['VLLM_USE_PD_SPLIT'] = '1'
if
architectures
in
[[
'Qwen3MoeForCausalLM'
]]:
#
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
#
os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_ALIGN'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_SUM'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_FUSE_SILU_AND_MUL"
):
...
...
@@ -226,11 +226,11 @@ def _get_model_architecture(
if
not
envs
.
is_set
(
"VLLM_USE_OPT_CAT"
):
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
else
:
if
not
envs
.
is_set
(
"VLLM_USE_PD_SPLIT"
):
os
.
environ
[
'VLLM_USE_PD_SPLIT'
]
=
'1'
#
if not envs.is_set("VLLM_USE_PD_SPLIT"):
#
os.environ['VLLM_USE_PD_SPLIT'] = '1'
if
architectures
in
[[
'Qwen3MoeForCausalLM'
]]:
#
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
#
os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_ALIGN'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_SUM'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_FUSE_SILU_AND_MUL"
):
...
...
vllm/utils/__init__.py
View file @
0e607f8e
...
...
@@ -129,8 +129,8 @@ STR_DTYPE_TO_TORCH_DTYPE = {
"bfloat16"
:
torch
.
bfloat16
,
"float"
:
torch
.
float
,
"fp8"
:
torch
.
uint8
,
#
"fp8_e4m3": torch.uint8,
#
"fp8_e5m2": torch.uint8,
"fp8_e4m3"
:
torch
.
uint8
,
"fp8_e5m2"
:
torch
.
uint8
,
"int8"
:
torch
.
int8
,
"fp8_inc"
:
torch
.
float8_e4m3fn
,
"fp8_ds_mla"
:
torch
.
uint8
,
...
...
vllm/v1/core/sched/scheduler.py
View file @
0e607f8e
...
...
@@ -1089,14 +1089,15 @@ class Scheduler(SchedulerInterface):
def
schedule
(
self
)
->
SchedulerOutput
:
if
envs
.
VLLM_USE_PD_SPLIT
:
return
self
.
schedule_split_pd
()
else
:
if
self
.
connector
is
not
None
:
return
self
.
schedule_default
()
if
self
.
full_cuda_graph
and
self
.
use_mla
and
self
.
num_spec_tokens
>
0
:
return
self
.
schedule_split_pd
()
if
self
.
use_mla
:
if
self
.
full_cuda_graph
and
self
.
num_spec_tokens
>
0
:
return
self
.
schedule_split_pd
()
else
:
self
.
schedule_default
()
else
:
return
self
.
schedule_default
()
return
self
.
schedule_split_pd
()
else
:
return
self
.
schedule_default
()
def
_update_after_schedule
(
self
,
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment