Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b8ef3436
Commit
b8ef3436
authored
Dec 13, 2025
by
zhuwenwen
Browse files
fix optional error
parent
cffe15ef
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
35 additions
and
35 deletions
+35
-35
vllm/envs.py
vllm/envs.py
+2
-2
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+26
-26
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/layernorm.py
+2
-2
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+5
-5
No files found.
vllm/envs.py
View file @
b8ef3436
...
@@ -237,7 +237,7 @@ if TYPE_CHECKING:
...
@@ -237,7 +237,7 @@ if TYPE_CHECKING:
VLLM_COMPILE_CACHE_SAVE_FORMAT
:
Literal
[
"binary"
,
"unpacked"
]
=
"binary"
VLLM_COMPILE_CACHE_SAVE_FORMAT
:
Literal
[
"binary"
,
"unpacked"
]
=
"binary"
VLLM_USE_V2_MODEL_RUNNER
:
bool
=
False
VLLM_USE_V2_MODEL_RUNNER
:
bool
=
False
# add envs
# add envs
VLLM_OPTEST_URLS_PORT
:
Optional
[
int
]
=
None
VLLM_OPTEST_URLS_PORT
:
int
|
None
=
None
VLLM_OPTEST_MODELS_PATH
:
str
=
""
VLLM_OPTEST_MODELS_PATH
:
str
=
""
VLLM_USE_TRITON_PREFIX_FLASH_ATTN
:
bool
=
False
VLLM_USE_TRITON_PREFIX_FLASH_ATTN
:
bool
=
False
VLLM_USE_FLASH_MLA
:
bool
=
False
VLLM_USE_FLASH_MLA
:
bool
=
False
...
@@ -248,7 +248,7 @@ if TYPE_CHECKING:
...
@@ -248,7 +248,7 @@ if TYPE_CHECKING:
VLLM_SPEC_DECODE_EAGER
:
bool
=
False
VLLM_SPEC_DECODE_EAGER
:
bool
=
False
VLLM_PCIE_USE_CUSTOM_ALLREDUCE
:
bool
=
False
VLLM_PCIE_USE_CUSTOM_ALLREDUCE
:
bool
=
False
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX
:
int
=
16
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX
:
int
=
16
VLLM_ENFORCE_EAGER_BS_THRESHOLD
:
Optional
[
int
]
=
None
VLLM_ENFORCE_EAGER_BS_THRESHOLD
:
int
|
None
=
None
VLLM_HAS_CONTEXT_DEFAULT
:
bool
=
False
VLLM_HAS_CONTEXT_DEFAULT
:
bool
=
False
VLLM_USE_NN
:
bool
=
False
VLLM_USE_NN
:
bool
=
False
VLLM_ENABLE_TBO
:
bool
=
False
VLLM_ENABLE_TBO
:
bool
=
False
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
b8ef3436
...
@@ -960,24 +960,24 @@ def invoke_fused_moe_kernel(
...
@@ -960,24 +960,24 @@ def invoke_fused_moe_kernel(
)
)
if
use_moe_wna16_cuda
:
if
use_moe_wna16_cuda
:
bit
=
4
if
use_int4_w4a16
else
8
bit
=
4
if
use_int4_w4a16
else
8
ops
.
moe_wna16_gemm
(
ops
.
moe_wna16_gemm
(
A
,
A
,
C
,
C
,
B
,
B
,
B_scale
,
B_scale
,
B_zp
,
B_zp
,
topk_weights
if
mul_routed_weight
else
None
,
topk_weights
if
mul_routed_weight
else
None
,
sorted_token_ids
,
sorted_token_ids
,
expert_ids
,
expert_ids
,
num_tokens_post_padded
,
num_tokens_post_padded
,
top_k
,
top_k
,
config
[
"BLOCK_SIZE_M"
],
config
[
"BLOCK_SIZE_M"
],
config
[
"BLOCK_SIZE_N"
],
config
[
"BLOCK_SIZE_N"
],
config
[
"BLOCK_SIZE_K"
],
config
[
"BLOCK_SIZE_K"
],
bit
,
bit
,
)
)
return
return
if
os
.
environ
.
get
(
'AWQ_MOE_SZ'
)
==
'1'
:
if
os
.
environ
.
get
(
'AWQ_MOE_SZ'
)
==
'1'
:
fused_moe_kernel_awq
[
grid
](
fused_moe_kernel_awq
[
grid
](
...
@@ -1208,7 +1208,7 @@ def get_moe_configs(
...
@@ -1208,7 +1208,7 @@ def get_moe_configs(
dtype
:
str
|
None
,
dtype
:
str
|
None
,
block_n
:
int
|
None
=
None
,
block_n
:
int
|
None
=
None
,
block_k
:
int
|
None
=
None
,
block_k
:
int
|
None
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
dict
[
int
,
Any
]
|
None
:
)
->
dict
[
int
,
Any
]
|
None
:
"""
"""
Return optimized configurations for the fused MoE kernel.
Return optimized configurations for the fused MoE kernel.
...
@@ -1365,7 +1365,7 @@ def get_default_config(
...
@@ -1365,7 +1365,7 @@ def get_default_config(
topk
:
int
,
topk
:
int
,
dtype
:
str
|
None
,
dtype
:
str
|
None
,
block_shape
:
list
[
int
]
|
None
=
None
,
block_shape
:
list
[
int
]
|
None
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
dict
[
str
,
int
]:
)
->
dict
[
str
,
int
]:
if
vllm_is_batch_invariant
():
if
vllm_is_batch_invariant
():
config
=
{
config
=
{
...
@@ -1434,7 +1434,7 @@ def try_get_optimal_moe_config(
...
@@ -1434,7 +1434,7 @@ def try_get_optimal_moe_config(
dtype
:
str
|
None
,
dtype
:
str
|
None
,
M
:
int
,
M
:
int
,
block_shape
:
list
[
int
]
|
None
=
None
,
block_shape
:
list
[
int
]
|
None
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
dict
[
str
,
int
]:
)
->
dict
[
str
,
int
]:
from
vllm.model_executor.layers.fused_moe
import
get_config
from
vllm.model_executor.layers.fused_moe
import
get_config
...
@@ -1791,7 +1791,7 @@ def inplace_fused_experts(
...
@@ -1791,7 +1791,7 @@ def inplace_fused_experts(
block_shape
:
list
[
int
]
|
None
=
None
,
block_shape
:
list
[
int
]
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
None
:
)
->
None
:
fused_experts_impl
(
fused_experts_impl
(
hidden_states
,
hidden_states
,
...
@@ -1850,7 +1850,7 @@ def inplace_fused_experts_fake(
...
@@ -1850,7 +1850,7 @@ def inplace_fused_experts_fake(
block_shape
:
list
[
int
]
|
None
=
None
,
block_shape
:
list
[
int
]
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
None
:
)
->
None
:
pass
pass
...
@@ -1952,7 +1952,7 @@ def outplace_fused_experts_fake(
...
@@ -1952,7 +1952,7 @@ def outplace_fused_experts_fake(
block_shape
:
list
[
int
]
|
None
=
None
,
block_shape
:
list
[
int
]
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
return
torch
.
empty_like
(
hidden_states
)
return
torch
.
empty_like
(
hidden_states
)
...
@@ -2002,7 +2002,7 @@ def fused_experts(
...
@@ -2002,7 +2002,7 @@ def fused_experts(
allow_deep_gemm
:
bool
=
False
,
allow_deep_gemm
:
bool
=
False
,
allow_cutlass_block_scaled_grouped_gemm
:
bool
=
False
,
allow_cutlass_block_scaled_grouped_gemm
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
if
quant_config
is
None
:
if
quant_config
is
None
:
quant_config
=
FUSED_MOE_UNQUANTIZED_CONFIG
quant_config
=
FUSED_MOE_UNQUANTIZED_CONFIG
...
@@ -2145,7 +2145,7 @@ def fused_experts_impl(
...
@@ -2145,7 +2145,7 @@ def fused_experts_impl(
block_shape
:
list
[
int
]
|
None
=
None
,
block_shape
:
list
[
int
]
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w1_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
w2_bias
:
torch
.
Tensor
|
None
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
# Check constraints.
# Check constraints.
num_tokens
=
hidden_states
.
size
(
0
)
num_tokens
=
hidden_states
.
size
(
0
)
...
...
vllm/model_executor/layers/layernorm.py
View file @
b8ef3436
...
@@ -249,8 +249,8 @@ class RMSNorm(CustomOp):
...
@@ -249,8 +249,8 @@ class RMSNorm(CustomOp):
def
forward_apex
(
def
forward_apex
(
self
,
self
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
residual
:
Optional
[
torch
.
Tensor
]
=
None
,
residual
:
torch
.
Tensor
|
None
=
None
,
)
->
Union
[
torch
.
Tensor
,
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]
]
:
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
if
self
.
variance_size_override
is
not
None
:
if
self
.
variance_size_override
is
not
None
:
return
self
.
forward_native
(
x
,
residual
)
return
self
.
forward_native
(
x
,
residual
)
...
...
vllm/model_executor/layers/linear.py
View file @
b8ef3436
...
@@ -740,10 +740,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
...
@@ -740,10 +740,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
def
forward
(
def
forward
(
self
,
input_
,
self
,
input_
,
rms_weight
:
Optional
[
torch
.
Tensor
]
=
None
,
rms_weight
:
torch
.
Tensor
|
None
=
None
,
residual
:
Optional
[
torch
.
Tensor
]
=
None
,
residual
:
torch
.
Tensor
|
None
=
None
,
update_hd
:
Optional
[
bool
]
=
True
update_hd
:
bool
|
None
=
True
)
->
Union
[
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
Optional
[
Parameter
]
]]
:
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
Parameter
]
|
None
:
if
envs
.
USE_FUSED_RMS_QUANT
and
rms_weight
is
not
None
:
if
envs
.
USE_FUSED_RMS_QUANT
and
rms_weight
is
not
None
:
input_quant_args
=
None
input_quant_args
=
None
assert
residual
is
not
None
and
rms_weight
is
not
None
assert
residual
is
not
None
and
rms_weight
is
not
None
...
@@ -795,7 +795,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
...
@@ -795,7 +795,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
skip_bias_add
:
bool
=
False
,
skip_bias_add
:
bool
=
False
,
params_dtype
:
torch
.
dtype
|
None
=
None
,
params_dtype
:
torch
.
dtype
|
None
=
None
,
quant_config
:
QuantizationConfig
|
None
=
None
,
quant_config
:
QuantizationConfig
|
None
=
None
,
eps
:
Optional
[
float
]
=
1e-6
,
eps
:
float
|
None
=
1e-6
,
prefix
:
str
=
""
,
prefix
:
str
=
""
,
*
,
*
,
return_bias
:
bool
=
True
,
return_bias
:
bool
=
True
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment