Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a014d6a5
Commit
a014d6a5
authored
Nov 12, 2025
by
zhuwenwen
Browse files
update qwen3_moe of layernorm and activation
parent
8d6b0b0a
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
8 additions
and
8 deletions
+8
-8
vllm/attention/layer.py
vllm/attention/layer.py
+3
-3
vllm/envs.py
vllm/envs.py
+1
-1
vllm/model_executor/layers/activation.py
vllm/model_executor/layers/activation.py
+1
-1
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/layernorm.py
+1
-1
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+2
-2
No files found.
vllm/attention/layer.py
View file @
a014d6a5
...
@@ -29,7 +29,7 @@ try:
...
@@ -29,7 +29,7 @@ try:
tag_cudagraph_unsafe
=
(
torch
.
_C
.
Tag
.
cudagraph_unsafe
,
)
tag_cudagraph_unsafe
=
(
torch
.
_C
.
Tag
.
cudagraph_unsafe
,
)
except
AttributeError
:
except
AttributeError
:
tag_cudagraph_unsafe
=
()
# type: ignore[assignment]
tag_cudagraph_unsafe
=
()
# type: ignore[assignment]
class
Attention
(
nn
.
Module
):
class
Attention
(
nn
.
Module
):
"""Attention layer.
"""Attention layer.
...
@@ -220,8 +220,8 @@ class Attention(nn.Module):
...
@@ -220,8 +220,8 @@ class Attention(nn.Module):
output_shape
=
(
output_shape
output_shape
=
(
output_shape
if
output_shape
is
not
None
else
query
.
shape
)
if
output_shape
is
not
None
else
query
.
shape
)
output
=
torch
.
zeros
(
output_shape
,
output
=
torch
.
zeros
(
output_shape
,
dtype
=
query
.
dtype
,
dtype
=
query
.
dtype
,
device
=
query
.
device
)
device
=
query
.
device
)
hidden_size
=
output_shape
[
-
1
]
hidden_size
=
output_shape
[
-
1
]
# We skip reshaping query, key and value tensors for the MLA
# We skip reshaping query, key and value tensors for the MLA
# backend since these tensors have different semantics and are
# backend since these tensors have different semantics and are
...
...
vllm/envs.py
View file @
a014d6a5
...
@@ -1124,7 +1124,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1124,7 +1124,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vLLM will use lightop moe_align_block_size
# vLLM will use lightop moe_align_block_size
"VLLM_USE_LIGHTOP_MOE_ALIGN"
:
"VLLM_USE_LIGHTOP_MOE_ALIGN"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
,
"True"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use opt merge_attn_states, not triton
# vLLM will use opt merge_attn_states, not triton
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_MERGE_ATTN_STATES_OPT"
,
"True"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_MERGE_ATTN_STATES_OPT"
,
"True"
).
lower
()
in
...
...
vllm/model_executor/layers/activation.py
View file @
a014d6a5
...
@@ -77,7 +77,7 @@ class SiluAndMul(CustomOp):
...
@@ -77,7 +77,7 @@ class SiluAndMul(CustomOp):
"""PyTorch-native implementation equivalent to forward()."""
"""PyTorch-native implementation equivalent to forward()."""
if
not
torch
.
compiler
.
is_compiling
()
and
envs
.
VLLM_ENABLE_TBO
:
if
not
torch
.
compiler
.
is_compiling
()
and
envs
.
VLLM_ENABLE_TBO
:
return
self
.
forward_cuda
(
x
)
return
self
.
forward_cuda
(
x
)
elif
envs
.
VLLM_USE_OPT_OP
:
elif
not
torch
.
compiler
.
is_compiling
()
and
envs
.
VLLM_USE_OPT_OP
:
return
self
.
forward_cuda
(
x
)
return
self
.
forward_cuda
(
x
)
else
:
else
:
d
=
x
.
shape
[
-
1
]
//
2
d
=
x
.
shape
[
-
1
]
//
2
...
...
vllm/model_executor/layers/layernorm.py
View file @
a014d6a5
...
@@ -167,7 +167,7 @@ class RMSNorm(CustomOp):
...
@@ -167,7 +167,7 @@ class RMSNorm(CustomOp):
)
->
Union
[
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]]:
)
->
Union
[
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]]:
if
not
torch
.
compiler
.
is_compiling
()
and
envs
.
VLLM_ENABLE_TBO
:
if
not
torch
.
compiler
.
is_compiling
()
and
envs
.
VLLM_ENABLE_TBO
:
return
self
.
forward_cuda
(
x
,
residual
)
return
self
.
forward_cuda
(
x
,
residual
)
elif
envs
.
VLLM_USE_OPT_OP
:
elif
not
torch
.
compiler
.
is_compiling
()
and
envs
.
VLLM_USE_OPT_OP
:
return
self
.
forward_cuda
(
x
,
residual
)
return
self
.
forward_cuda
(
x
,
residual
)
else
:
else
:
orig_dtype
=
x
.
dtype
orig_dtype
=
x
.
dtype
...
...
vllm/model_executor/models/qwen3_moe.py
View file @
a014d6a5
...
@@ -234,7 +234,7 @@ class Qwen3MoeAttention(nn.Module):
...
@@ -234,7 +234,7 @@ class Qwen3MoeAttention(nn.Module):
if
envs
.
VLLM_USE_APEX_RN
:
if
envs
.
VLLM_USE_APEX_RN
:
q_by_head
=
self
.
q_norm
.
forward_apex
(
q_by_head
)
q_by_head
=
self
.
q_norm
.
forward_apex
(
q_by_head
)
else
:
else
:
q_by_head
=
self
.
q_norm
(
q_by_head
)
q_by_head
=
self
.
q_norm
.
forward_cuda
(
q_by_head
)
q
=
q_by_head
.
view
(
q
.
shape
)
q
=
q_by_head
.
view
(
q
.
shape
)
k_by_head
=
k
.
view
(
*
k
.
shape
[:
-
1
],
k
.
shape
[
-
1
]
//
self
.
head_dim
,
k_by_head
=
k
.
view
(
*
k
.
shape
[:
-
1
],
k
.
shape
[
-
1
]
//
self
.
head_dim
,
...
@@ -242,7 +242,7 @@ class Qwen3MoeAttention(nn.Module):
...
@@ -242,7 +242,7 @@ class Qwen3MoeAttention(nn.Module):
if
envs
.
VLLM_USE_APEX_RN
:
if
envs
.
VLLM_USE_APEX_RN
:
k_by_head
=
self
.
k_norm
.
forward_apex
(
k_by_head
)
k_by_head
=
self
.
k_norm
.
forward_apex
(
k_by_head
)
else
:
else
:
k_by_head
=
self
.
k_norm
(
k_by_head
)
k_by_head
=
self
.
k_norm
.
forward_cuda
(
k_by_head
)
k
=
k_by_head
.
view
(
k
.
shape
)
k
=
k_by_head
.
view
(
k
.
shape
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
attn_output
=
self
.
attn
(
q
,
k
,
v
)
attn_output
=
self
.
attn
(
q
,
k
,
v
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment