Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6ef5d322
Commit
6ef5d322
authored
Mar 24, 2026
by
laibao
Browse files
fix(moe): 补齐非Marlin量化路径 shared_output/routed_scaling_factor 透传
parent
06185134
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
25 additions
and
6 deletions
+25
-6
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+5
-1
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+11
-4
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+4
-0
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+5
-1
No files found.
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
6ef5d322
...
@@ -1881,6 +1881,8 @@ def fused_experts_impl(
...
@@ -1881,6 +1881,8 @@ def fused_experts_impl(
a2_scale
=
a2_scale
,
a2_scale
=
a2_scale
,
block_shape
=
block_shape
,
block_shape
=
block_shape
,
use_nn_moe
=
False
,
use_nn_moe
=
False
,
routed_scaling_factor
=
routed_scaling_factor
,
shared_output
=
shared_output
,
i_q
=
i_q
,
i_q
=
i_q
,
i_s
=
i_s
i_s
=
i_s
)
)
...
@@ -1903,7 +1905,9 @@ def fused_experts_impl(
...
@@ -1903,7 +1905,9 @@ def fused_experts_impl(
a1_scale
=
a1_scale
,
a1_scale
=
a1_scale
,
a2_scale
=
a2_scale
,
a2_scale
=
a2_scale
,
block_shape
=
block_shape
,
block_shape
=
block_shape
,
use_nn_moe
=
False
use_nn_moe
=
False
,
routed_scaling_factor
=
routed_scaling_factor
,
shared_output
=
shared_output
)
)
if
use_int4_w4a16
:
if
use_int4_w4a16
:
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
6ef5d322
...
@@ -1111,7 +1111,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
...
@@ -1111,7 +1111,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
topk_ids
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
use_nn_moe
:
bool
|
None
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
use_fused_gate
:
bool
|
None
=
False
,
use_fused_gate
:
bool
|
None
=
False
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
assert
not
self
.
is_monolithic
assert
not
self
.
is_monolithic
assert
self
.
kernel
is
not
None
assert
self
.
kernel
is
not
None
...
@@ -1131,6 +1132,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
...
@@ -1131,6 +1132,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
quant_config
=
self
.
moe_quant_config
,
quant_config
=
self
.
moe_quant_config
,
use_fused_gate
=
use_fused_gate
,
use_fused_gate
=
use_fused_gate
,
use_nn_moe
=
False
,
use_nn_moe
=
False
,
shared_output
=
shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
)
@
property
@
property
...
@@ -1256,7 +1259,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
...
@@ -1256,7 +1259,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
topk_ids
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
use_nn_moe
:
bool
|
None
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
i_q
:
torch
.
Tensor
|
None
=
None
,
i_q
:
torch
.
Tensor
|
None
=
None
,
i_s
:
torch
.
Tensor
|
None
=
None
i_s
:
torch
.
Tensor
|
None
=
None
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe
import
fused_experts
...
@@ -1274,7 +1279,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
...
@@ -1274,7 +1279,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
quant_config
=
self
.
moe_quant_config
,
quant_config
=
self
.
moe_quant_config
,
use_nn_moe
=
use_nn_moe
,
use_nn_moe
=
use_nn_moe
,
i_q
=
i_q
,
i_q
=
i_q
,
i_s
=
i_s
i_s
=
i_s
,
shared_output
=
shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
)
...
@@ -2515,4 +2522,4 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
...
@@ -2515,4 +2522,4 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
@
property
@
property
def
supports_eplb
(
self
)
->
bool
:
def
supports_eplb
(
self
)
->
bool
:
return
False
return
False
\ No newline at end of file
vllm/model_executor/layers/quantization/fp8.py
View file @
6ef5d322
...
@@ -1028,6 +1028,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
...
@@ -1028,6 +1028,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
topk_ids
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
use_nn_moe
:
bool
|
None
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
use_fused_gate
:
bool
|
None
=
False
,
use_fused_gate
:
bool
|
None
=
False
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
assert
self
.
kernel
is
not
None
assert
self
.
kernel
is
not
None
assert
not
self
.
is_monolithic
assert
not
self
.
is_monolithic
...
@@ -1047,6 +1049,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
...
@@ -1047,6 +1049,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
quant_config
=
self
.
moe_quant_config
,
quant_config
=
self
.
moe_quant_config
,
use_fused_gate
=
use_fused_gate
,
use_fused_gate
=
use_fused_gate
,
use_nn_moe
=
False
,
use_nn_moe
=
False
,
shared_output
=
shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
)
...
...
vllm/model_executor/layers/quantization/slimquant_w4a8.py
View file @
6ef5d322
...
@@ -308,7 +308,9 @@ class SlimQuantW4A8Int8MoEMethod:
...
@@ -308,7 +308,9 @@ class SlimQuantW4A8Int8MoEMethod:
use_nn_moe
:
bool
|
None
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
use_fused_gate
:
bool
|
None
=
False
,
use_fused_gate
:
bool
|
None
=
False
,
i_q
:
torch
.
Tensor
|
None
=
None
,
i_q
:
torch
.
Tensor
|
None
=
None
,
i_s
:
torch
.
Tensor
|
None
=
None
i_s
:
torch
.
Tensor
|
None
=
None
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe
import
fused_experts
return
fused_experts
(
return
fused_experts
(
...
@@ -324,4 +326,6 @@ class SlimQuantW4A8Int8MoEMethod:
...
@@ -324,4 +326,6 @@ class SlimQuantW4A8Int8MoEMethod:
global_num_experts
=
layer
.
global_num_experts
,
global_num_experts
=
layer
.
global_num_experts
,
quant_config
=
self
.
moe_quant_config
,
quant_config
=
self
.
moe_quant_config
,
use_nn_moe
=
use_nn_moe
,
use_nn_moe
=
use_nn_moe
,
shared_output
=
shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment