Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0e35e124
Commit
0e35e124
authored
Nov 03, 2025
by
王敏
Browse files
[fix]修复mori报错
parent
d698d6f2
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
6 additions
and
4 deletions
+6
-4
vllm/config.py
vllm/config.py
+1
-1
vllm/distributed/device_communicators/cuda_communicator.py
vllm/distributed/device_communicators/cuda_communicator.py
+2
-0
vllm/model_executor/layers/fused_moe/mori_moe/layer.py
vllm/model_executor/layers/fused_moe/mori_moe/layer.py
+1
-1
vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
+2
-2
No files found.
vllm/config.py
View file @
0e35e124
...
@@ -4755,7 +4755,7 @@ class VllmConfig:
...
@@ -4755,7 +4755,7 @@ class VllmConfig:
batch_size_capture_list
=
[]
batch_size_capture_list
=
[]
if
self
.
model_config
is
not
None
and
\
if
self
.
model_config
is
not
None
and
\
not
self
.
model_config
.
enforce_eager
:
not
self
.
model_config
.
enforce_eager
:
if
self
.
model_config
.
use_mla
and
self
.
compilation_config
.
full_cuda_graph
and
self
.
scheduler_config
.
max_num_seqs
<=
512
:
if
self
.
model_config
.
use_mla
and
self
.
scheduler_config
.
max_num_seqs
<=
512
:
cuda_graph_sizes
=
[
self
.
scheduler_config
.
max_num_seqs
]
cuda_graph_sizes
=
[
self
.
scheduler_config
.
max_num_seqs
]
else
:
else
:
cuda_graph_sizes
=
self
.
scheduler_config
.
cuda_graph_sizes
cuda_graph_sizes
=
self
.
scheduler_config
.
cuda_graph_sizes
...
...
vllm/distributed/device_communicators/cuda_communicator.py
View file @
0e35e124
...
@@ -87,6 +87,8 @@ class CudaCommunicator(DeviceCommunicatorBase):
...
@@ -87,6 +87,8 @@ class CudaCommunicator(DeviceCommunicatorBase):
from
.all2all
import
DeepEPLLAll2AllManager
from
.all2all
import
DeepEPLLAll2AllManager
self
.
all2all_manager
=
DeepEPLLAll2AllManager
(
self
.
cpu_group
)
self
.
all2all_manager
=
DeepEPLLAll2AllManager
(
self
.
cpu_group
)
logger
.
info
(
"Using DeepEP Low-Latency all2all manager."
)
logger
.
info
(
"Using DeepEP Low-Latency all2all manager."
)
elif
all2all_backend
==
"mori"
:
pass
else
:
else
:
raise
ValueError
(
f
"Unknown all2all backend:
{
all2all_backend
}
"
)
raise
ValueError
(
f
"Unknown all2all backend:
{
all2all_backend
}
"
)
...
...
vllm/model_executor/layers/fused_moe/mori_moe/layer.py
View file @
0e35e124
...
@@ -369,7 +369,7 @@ class MoriMoE(FusedMoE):
...
@@ -369,7 +369,7 @@ class MoriMoE(FusedMoE):
apply_router_weight_on_input
=
self
.
apply_router_weight_on_input
,
apply_router_weight_on_input
=
self
.
apply_router_weight_on_input
,
use_nn_moe
=
self
.
use_nn_moe
,
use_nn_moe
=
self
.
use_nn_moe
,
num_local_tokens
=
dispatch_recv_num_token
,
num_local_tokens
=
dispatch_recv_num_token
,
config_select_bs
=
hidden_states
.
shape
[
0
]
*
self
.
ep_size
/
self
.
dp_size
,
expect_m
=
hidden_states
.
shape
[
0
],
scales
=
dispatch_scales
if
self
.
use_int8_dispatch
else
None
scales
=
dispatch_scales
if
self
.
use_int8_dispatch
else
None
)
)
...
...
vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
View file @
0e35e124
...
@@ -293,7 +293,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
...
@@ -293,7 +293,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
activation
:
str
=
"silu"
,
activation
:
str
=
"silu"
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
num_local_tokens
:
Optional
[
torch
.
Tensor
]
=
None
,
num_local_tokens
:
Optional
[
torch
.
Tensor
]
=
None
,
config_select_bs
:
Optional
[
int
]
=
None
,
expect_m
:
Optional
[
int
]
=
None
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
scales
:
Optional
[
torch
.
Tensor
]
=
None
,
scales
:
Optional
[
torch
.
Tensor
]
=
None
,
**
_
**
_
...
@@ -320,7 +320,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
...
@@ -320,7 +320,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
a2_scale
=
layer
.
w2_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
use_nn_moe
=
use_nn_moe
,
use_nn_moe
=
use_nn_moe
,
num_local_tokens
=
num_local_tokens
,
num_local_tokens
=
num_local_tokens
,
config_select_bs
=
config_select_bs
,
expect_m
=
expect_m
,
)
)
def
apply
(
def
apply
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment